diff --git a/.venv/lib/python3.12/site-packages/quimb/tensor/circuit.py b/.venv/lib/python3.12/site-packages/quimb/tensor/circuit.py index 1bbf75f..b75c52d 100644 --- a/.venv/lib/python3.12/site-packages/quimb/tensor/circuit.py +++ b/.venv/lib/python3.12/site-packages/quimb/tensor/circuit.py @@ -1573,6 +1573,23 @@ def _combine_1q_gate_run(gates, array_fn=None): return Gate.from_raw(G, gates[0].qubits) +def _combine_2q_gate_run(gates, array_fn=None): + """Combine a run of two qubit gates in application order.""" + gates = tuple(gate for _, gate in gates) + G = gates[0].array + if array_fn is not None: + G = array_fn(G) + G = reshape(G, (4, 4)) + + for gate in gates[1:]: + Gi = gate.array + if array_fn is not None: + Gi = array_fn(Gi) + G = reshape(Gi, (4, 4)) @ G + + return Gate.from_raw(reshape(G, (2, 2, 2, 2)), gates[0].qubits) + + def _can_merge_1q_gate(gate): return ( (gate.controls is None) @@ -1583,48 +1600,96 @@ def _can_merge_1q_gate(gate): ) -def _iter_gates_with_merged_1q_runs(gates): +def _can_merge_2q_gate(gate): + return ( + (gate.controls is None) + and (not gate.special) + and (not gate.parametrize) + and (gate.qubits is not None) + and (len(gate.qubits) == 2) + ) + + +def _iter_gates_with_merged_runs(gates, merge_1q=True, merge_2q=True): """Yield ``(gate_to_apply, gates_to_record)``, merging adjacent runs of - single qubit gates that are not interrupted by any operation touching the - same qubit. + local gates that are not interrupted by any operation touching the same + qubits. """ - pending = {} + pending_1q = {} + pending_2q = {} def flush_qubit(q): - run = pending.pop(q, None) + run = pending_1q.pop(q, None) if run is None: return if len(run) == 1: return run[0][1], run return None, run + def flush_pair(pair): + run = pending_2q.pop(pair, None) + if run is None: + return + if len(run) == 1: + return run[0][1], run + return None, run + + def flush_touched(touched, keep_qubit=None, keep_pair=None): + for q in tuple(pending_1q): + if q == keep_qubit: + continue + if q in touched: + item = flush_qubit(q) + if item is not None: + yield item + + for pair in tuple(pending_2q): + if pair == keep_pair: + continue + if touched.intersection(pair): + item = flush_pair(pair) + if item is not None: + yield item + def flush_all(): - for q in tuple(pending): + for q in tuple(pending_1q): item = flush_qubit(q) if item is not None: yield item + for pair in tuple(pending_2q): + item = flush_pair(pair) + if item is not None: + yield item for i, gate in enumerate(gates): - if _can_merge_1q_gate(gate): + if merge_1q and _can_merge_1q_gate(gate): (q,) = gate.qubits - pending.setdefault(q, []).append((i, gate)) + yield from flush_touched({q}, keep_qubit=q) + pending_1q.setdefault(q, []).append((i, gate)) + continue + + if merge_2q and _can_merge_2q_gate(gate): + pair = gate.qubits + yield from flush_touched(set(pair), keep_pair=pair) + pending_2q.setdefault(pair, []).append((i, gate)) continue touched = set(gate.qubits or ()) if gate.controls: touched.update(gate.controls) - for q in tuple(pending): - if q in touched: - item = flush_qubit(q) - if item is not None: - yield item + yield from flush_touched(touched) yield gate, ((i, gate),) yield from flush_all() +_iter_gates_with_merged_1q_runs = functools.partial( + _iter_gates_with_merged_runs, merge_1q=True, merge_2q=False +) + + # --------------------------- main circuit class ---------------------------- # @@ -2103,6 +2168,24 @@ class Circuit: self._psi.gate_(G, gates[0][1].qubits, tags=tags, **opts) + def _apply_merged_2q_gate_run(self, gates, gate_number_offset=0, **gate_opts): + tags = tags_to_oset(gate_opts.pop("tags", None)) + for i, gate in gates: + tags |= self._gate_tags_for_record( + gate, gate_number=gate_number_offset + i + ) + + opts = {**self.gate_opts, **gate_opts} + + if self.convert_eager: + G = _combine_2q_gate_run( + gates, array_fn=self._maybe_convert_gate_array + ).array + else: + G = _combine_2q_gate_run(gates).array + + self._psi.gate_(G, gates[0][1].qubits, tags=tags, **opts) + def apply_gate( self, gate_id, @@ -2178,11 +2261,14 @@ class Circuit: Supplied to :meth:`~quimb.tensor.circuit.Circuit.apply_gate`. """ merge_1q = gate_opts.pop("merge_1q", "auto") + merge_2q = gate_opts.pop("merge_2q", "auto") if merge_1q == "auto": merge_1q = True + if merge_2q == "auto": + merge_2q = True - if merge_1q: + if merge_1q or merge_2q: gates = tuple( gate if isinstance(gate, Gate) else parse_to_gate(gate) for gate in gates @@ -2195,15 +2281,22 @@ class Circuit: pbar = _progbar(total=len(gates)) gate_number_offset = len(self._gates) - for gate, gates_to_record in _iter_gates_with_merged_1q_runs( - gates + for gate, gates_to_record in _iter_gates_with_merged_runs( + gates, merge_1q=merge_1q, merge_2q=merge_2q ): if gate is None: - self._apply_merged_1q_gate_run( - gates_to_record, - gate_number_offset=gate_number_offset, - **gate_opts, - ) + if len(gates_to_record[0][1].qubits) == 1: + self._apply_merged_1q_gate_run( + gates_to_record, + gate_number_offset=gate_number_offset, + **gate_opts, + ) + else: + self._apply_merged_2q_gate_run( + gates_to_record, + gate_number_offset=gate_number_offset, + **gate_opts, + ) else: self._apply_gate( gate, @@ -4892,11 +4985,16 @@ class CircuitMPS(Circuit): def apply_gates(self, gates, progbar=False, **gate_opts): merge_1q = gate_opts.pop("merge_1q", "auto") + merge_2q = gate_opts.pop("merge_2q", "auto") if merge_1q == "auto": merge_1q = True + if merge_2q == "auto": + # MPS truncation semantics are sensitive to when a 2q gate is + # materialized, so keep the default conservative here. + merge_2q = False - if merge_1q: + if merge_1q or merge_2q: gates = tuple( gate if isinstance(gate, Gate) else parse_to_gate(gate) for gate in gates @@ -4913,15 +5011,22 @@ class CircuitMPS(Circuit): ) gate_number_offset = len(self._gates) - for gate, gates_to_record in _iter_gates_with_merged_1q_runs( - gates + for gate, gates_to_record in _iter_gates_with_merged_runs( + gates, merge_1q=merge_1q, merge_2q=merge_2q ): if gate is None: - self._apply_merged_1q_gate_run( - gates_to_record, - gate_number_offset=gate_number_offset, - **gate_opts, - ) + if len(gates_to_record[0][1].qubits) == 1: + self._apply_merged_1q_gate_run( + gates_to_record, + gate_number_offset=gate_number_offset, + **gate_opts, + ) + else: + self._apply_merged_2q_gate_run( + gates_to_record, + gate_number_offset=gate_number_offset, + **gate_opts, + ) gate_for_progress = gates_to_record[-1][1] else: self._apply_gate( diff --git a/.venv/lib/python3.12/site-packages/quimb/tensor/tn1d/core.py b/.venv/lib/python3.12/site-packages/quimb/tensor/tn1d/core.py index a27060f..43ca8d8 100644 --- a/.venv/lib/python3.12/site-packages/quimb/tensor/tn1d/core.py +++ b/.venv/lib/python3.12/site-packages/quimb/tensor/tn1d/core.py @@ -5050,8 +5050,6 @@ class TNLinearOperator1D(spla.LinearOperator): if self.is_conj: T = T.conj() - print(T) - assert(0) return T.to_dense(self.left_inds, self.right_inds) def toarray(self): diff --git a/README.md b/README.md index 150b8e8..440a9fa 100644 --- a/README.md +++ b/README.md @@ -28,15 +28,24 @@ Currently, the supported tensor network libraries are: ## CPU expectation benchmarks -The current CPU expectation entrypoint is: +Use the library APIs directly: -```sh -python -u benchmark_cpu_expectation.py --ansatz mps --nqubits 40 --nlayers 10 --bond 2048 --circuits brickwall_cnot --observables ring_xz +```py +import qibotn + +records = qibotn.run_cpu_benchmark_cases( + ansatz="mps", + nqubits=40, + nlayers=10, + bond=2048, + circuits=("brickwall_cnot",), + observables=("ring_xz",), +) ``` -Use `--ansatz tn` for the generic TN path and `--mpi` under `mpiexec` for MPI runs. -Reusable circuit and observable builders live in `src/qibotn/benchmark_cases.py`; execution logic lives in `src/qibotn/expectation_runner.py`. -For Vidal/MPS 1D-chain scale tests, use `run_vidal_mps_cases.sh`. +For generic TN use `ansatz="tn"`. Contest/custom runners are available as +`qibotn.run_contest_tn_case`, `qibotn.run_custom_tn_expectation`, +`qibotn.run_contest_mps_case`, and `qibotn.run_vidal_validation_cases`. ## Installation diff --git a/benchmark_cpu_expectation.py b/benchmark_cpu_expectation.py deleted file mode 100644 index 3d5897d..0000000 --- a/benchmark_cpu_expectation.py +++ /dev/null @@ -1,285 +0,0 @@ -"""CLI for CPU TN/MPS expectation benchmarks.""" - -from __future__ import annotations - -import argparse -import os -import subprocess -from pathlib import Path -from urllib.parse import urlparse - -from qibotn.benchmark_cases import ( - CIRCUITS, - OBSERVABLES, - build_circuit, - observable_terms, - parse_names, - terms_to_dict, -) -from qibotn.expectation_runner import ( - ExpectationConfig, - exact_for_observable, - run_cpu_expectation, -) - - -def optional_int(text): - if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: - return None - return int(text) - - -def optional_float(text): - if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: - return None - return float(text) - - -def format_optional(value, fmt="g"): - return "None" if value is None else format(value, fmt) - - -def should_stop_dask(args): - return ( - not args.keep_dask - and args.tn_search_backend == "dask" - and args.dask_address is not None - and args.tn_load_tree is None - ) - - -def stop_dask_cluster(args, rank): - if rank != 0 or not should_stop_dask(args): - return - script = Path(__file__).resolve().parent / "tools" / "manage_tn_dask_cluster.sh" - if not script.exists(): - print(f"dask_stop_skipped reason=missing_script path={script}", flush=True) - return - - env = os.environ.copy() - parsed = urlparse(args.dask_address) - if parsed.hostname: - env.setdefault("SCHEDULER_HOST", parsed.hostname) - if parsed.port: - env.setdefault("SCHEDULER_PORT", str(parsed.port)) - - print("dask_stop_after_search start", flush=True) - subprocess.run([str(script), "stop"], cwd=str(script.parent.parent), env=env, check=False) - print("dask_stop_after_search done", flush=True) - - -def build_parallel_opts(args): - slicing_opts = {} - if args.tn_target_slices is not None: - slicing_opts["target_slices"] = args.tn_target_slices - if args.tn_target_size is not None: - slicing_opts["target_size"] = args.tn_target_size - - opts = { - "slicing_opts": slicing_opts or None, - "search_workers": args.tn_search_workers or args.torch_threads, - "max_repeats": args.tn_search_repeats, - "max_time": args.tn_search_time, - "print_stats": not args.no_tn_stats, - } - if args.tn_search_backend is not None: - opts["search_backend"] = args.tn_search_backend - if args.dask_address is not None: - opts["dask_address"] = args.dask_address - if args.tn_save_tree is not None: - opts["save_tree_path"] = args.tn_save_tree - if args.tn_load_tree is not None: - opts["load_tree_path"] = args.tn_load_tree - if args.tn_search_only: - opts["search_only"] = True - if args.tn_debug_trials: - opts["debug_trials"] = True - if args.tn_contract_implementation is not None: - opts["contract_implementation"] = args.tn_contract_implementation - if args.dask_close_workers: - opts["dask_close_workers"] = True - return opts - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--nqubits", type=int, default=40) - parser.add_argument("--nlayers", type=int, default=30) - parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024) - parser.add_argument("--cut-ratio", type=optional_float, default=1e-12) - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--torch-threads", type=int, default=8) - parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch") - parser.add_argument( - "--dtype", - choices=("complex128", "complex64"), - default="complex128", - ) - parser.add_argument("--ansatz", choices=("tn", "mps"), default=None) - parser.add_argument("--mps", action="store_true") - parser.add_argument("--mpi", action="store_true") - parser.add_argument("--exact", action="store_true") - parser.add_argument("--exact-max-qubits", type=int, default=24) - parser.add_argument("--circuits", nargs="+", default=["brickwall_cnot"]) - parser.add_argument("--observables", nargs="+", default=["ring_xz"]) - parser.add_argument("--pauli-pattern") - parser.add_argument("--tn-target-slices", type=int) - parser.add_argument("--tn-target-size", type=int,default=2**32) - parser.add_argument("--tn-search-workers", type=int) - parser.add_argument("--tn-search-repeats", type=int, default=128) - parser.add_argument("--tn-search-time", type=float, default=60.0) - parser.add_argument( - "--no-tn-stats", - action="store_true", - help="Do not print per-term TN search/contraction diagnostics.", - ) - parser.add_argument( - "--tn-search-backend", - choices=("processpool", "dask"), - default="dask", - help="Path-search backend. In MPI mode, dask search runs only on rank 0 and broadcasts the tree.", - ) - parser.add_argument( - "--dask-address", - help="Dask scheduler address, for example tcp://host:8786. If omitted with dask search, a local cluster is created.", - ) - parser.add_argument( - "--dask-close-workers", - action="store_true", - help="After dask path search, ask the scheduler to close all currently connected workers.", - ) - parser.add_argument( - "--keep-dask", - action="store_true", - help=( - "Keep an external dask cluster running after search. By default, " - "tools/manage_tn_dask_cluster.sh stop is called after search when " - "--dask-address is used." - ), - ) - parser.add_argument( - "--tn-save-tree", - help="Save searched cotengra contraction tree(s) to this pickle file.", - ) - parser.add_argument( - "--tn-load-tree", - help="Load cotengra contraction tree(s) from this pickle file and skip path search.", - ) - parser.add_argument( - "--tn-search-only", - action="store_true", - help="Only run path search and optional --tn-save-tree; skip contraction.", - ) - parser.add_argument( - "--tn-debug-trials", - action="store_true", - help="Print dask worker summary and per-trial worker start/done logs.", - ) - parser.add_argument( - "--tn-contract-implementation", - choices=("auto", "cotengra", "autoray", "cpp"), - help="cotengra contraction implementation for TN contraction.", - ) - args = parser.parse_args() - - ansatz = "mps" if args.mps else (args.ansatz or "tn") - circuits = parse_names(args.circuits, CIRCUITS, "circuits") - observables = [] if args.pauli_pattern else parse_names( - args.observables, OBSERVABLES, "observables" - ) - - rank = 0 - if args.mpi: - from mpi4py import MPI - - rank = MPI.COMM_WORLD.Get_rank() - - config = ExpectationConfig( - ansatz=ansatz, - mpi=args.mpi, - bond=args.bond, - cut_ratio=args.cut_ratio, - tensor_module="torch", - quimb_backend=args.quimb_backend, - dtype=args.dtype, - torch_threads=args.torch_threads, - parallel_opts=build_parallel_opts(args), - ) - - if rank == 0: - mode = "MPI" if args.mpi else "serial" - print( - f"backend=cpu ansatz={ansatz.upper()} mode={mode} " - f"nqubits={args.nqubits} nlayers={args.nlayers} " - f"bond={format_optional(args.bond)} " - f"cut_ratio={format_optional(args.cut_ratio)} seed={args.seed} " - f"quimb_backend={args.quimb_backend} dtype={args.dtype} " - f"torch_threads={args.torch_threads} " - f"tn_search_backend={args.tn_search_backend}" - ) - print("circuit observable exact value abs_error rel_error seconds") - - try: - for circuit_kind in circuits: - circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed) - named_observables = ( - [(f"pattern:{args.pauli_pattern}", {"pauli_string_pattern": args.pauli_pattern})] - if args.pauli_pattern - else [ - (obs_kind, terms_to_dict(observable_terms(obs_kind, args.nqubits))) - for obs_kind in observables - ] - ) - - for obs_name, observable in named_observables: - exact = None - if args.exact and rank == 0: - if args.nqubits > args.exact_max_qubits: - raise ValueError( - f"--exact is limited to {args.exact_max_qubits} qubits by default." - ) - exact = exact_for_observable(circuit, observable, args.nqubits) - - result = run_cpu_expectation(circuit, observable, config) - if args.mpi and result.rank != 0: - continue - - abs_error = float("nan") if exact is None else abs(result.value - exact) - rel_error = ( - float("nan") - if exact is None - else abs_error / max(abs(exact), 1e-15) - ) - exact_text = "nan" if exact is None else f"{exact:.16e}" - print( - f"{circuit_kind} {obs_name} {exact_text} {result.value:.16e} " - f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}" - ) - for stat in result.parallel_stats or (): - cost = stat["path_cost"] - search_stats = stat.get("search_stats", {}) - print( - "tn_term_summary " - f"term={stat.get('term_index', 0)} " - f"search_seconds={stat.get('search_seconds', float('nan')):.3f} " - f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} " - f"completed_trials={search_stats.get('completed_trials', 'na')} " - f"finite_trials={search_stats.get('finite_trials', 'na')} " - f"failed_trials={search_stats.get('failed_trials', 'na')} " - f"requested_trials={search_stats.get('requested_trials', 'na')} " - f"best_score={search_stats.get('best_score', float('nan')):.6g} " - f"slices={cost['nslices']} " - f"log10_flops={cost['log10_flops']:.3f} " - f"log10_write={cost['log10_write']:.3f} " - f"log2_size={cost['log2_size']:.3f} " - f"log10_combo={cost['log10_combo']:.3f} " - f"peak_memory_gib={cost['peak_memory_gib']:.6g} " - f"slicing_overhead={cost['slicing_overhead']:.6g} " - f"rank_slices={stat.get('rank_slices', 'na')}" - ) - finally: - stop_dask_cluster(args, rank) - - -if __name__ == "__main__": - main() diff --git a/docs/contest_runners.md b/docs/contest_runners.md index 5298328..406b68e 100644 --- a/docs/contest_runners.md +++ b/docs/contest_runners.md @@ -1,88 +1,12 @@ -# TN -```bash -# search + contract,Open MPI 多节点:每节点 2 rank,每 rank 绑定 1 个 NUMA。 -# MPI_HOSTS 里每个节点写 :2,MPI_RANKS = 节点数 * 2。 -# 每个 rank 使用 MPI_PE 个 core;这台 2-NUMA AMD 节点用 MPI_PE=128。 +# Contest Runners -NQUBITS=40 \ -TN_DEBUG_TRIALS=1 \ -SCHEDULER_HOST=10.20.1.100 \ -DASK_ADDRESS=tcp://10.20.1.100:8786 \ -WORKER_HOSTS="10.20.1.100 10.20.1.101 10.20.1.102 10.20.1.103" \ -CASE=main1 \ -OBSERVABLES=long_z_string \ -TORCH_THREADS=80 \ -MPI_PE=80 \ -MPI_MAP_BY=ppr:1:numa:PE=80 \ -MPI_BIND_TO=core \ -OMP_NUM_THREADS=80 \ -MKL_NUM_THREADS=80 \ -BLIS_NUM_THREADS=80 \ -MPI_HOSTS="node-0:2,node-1:2,node-2:2,node-3:2" \ -MPI_RANKS=8 \ -NWORKERS=96 \ -TN_TARGET_SIZE=17179869184 \ -tools/run_tn_dask_mpi_all.sh +The reusable implementations live in `src/qibotn/backends/`. -# 单独缩并contract计算 +- `qibotn.run_contest_tn_case`: quimb+torch TN search/contract cases. +- `qibotn.run_contest_mps_case`: Vidal/MPS contest expectation cases. +- `qibotn.run_vidal_mpi_contest_case`: direct Vidal MPI observable sweep. +- `qibotn.run_custom_tn_expectation`: custom quimb+torch TN cases. -mpirun --map-by ppr:1:numa:PE=80 --bind-to core --report-bindings \ - -x LD_PRELOAD=/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5 \ - -x BLIS_NUM_THREADS=80 \ - -x OMP_NUM_THREADS=80 \ - -x MKL_NUM_THREADS=80 \ - -x OMP_PROC_BIND=close \ - -x OMP_PLACES=cores \ - -np 8 \ - -host node-0:2,node-1:2,node-2:2,node-3:2 \ - .venv/bin/python -u tools/tn_contest_runner.py contract \ - --mpi \ - --case main1 \ - --nqubits 34 \ - --nlayers 20 \ - --observables long_z_string \ - --tree-dir trees/contest_tn \ - --torch-threads 80 \ - --dtype complex64 -``` - -# MPS -``` -cd /home/qibo/qibotn - -MPIEXEC=mpirun \ -MPI_HOSTS="node-2:4,node-3:4" \ -MPI_RANKS=8 \ -MPI_PE=48 \ -MPI_MAP_BY=ppr:2:numa:PE=48 \ -MPI_BIND_TO=core \ -MPI_REPORT_BINDINGS=1 \ -TORCH_THREADS=48 \ -OMP_NUM_THREADS=48 \ -MKL_NUM_THREADS=48 \ -BLIS_NUM_THREADS=48 \ -OBS_FILTER=ring_xz \ -MAIN1_NQ=128 \ -MAIN1_LAYERS=24 \ -MAIN1_BOND=1024 \ -tools/run_vidal_mpi_contest_cases.sh main1 - - - -MPIEXEC=mpirun \ -MPI_HOSTS="node-2:4" \ -MPI_RANKS=4 \ -MPI_PE=48 \ -MPI_MAP_BY=ppr:2:numa:PE=48 \ -MPI_BIND_TO=core \ -MPI_REPORT_BINDINGS=1 \ -TORCH_THREADS=48 \ -OMP_NUM_THREADS=48 \ -MKL_NUM_THREADS=48 \ -BLIS_NUM_THREADS=48 \ -OBS_FILTER=ring_xz \ -MAIN1_NQ=128 \ -MAIN1_LAYERS=24 \ -MAIN1_BOND=1024 \ -tools/run_vidal_mpi_contest_cases.sh main1 -``` +`src/qibotn/backends/quimb.py` holds the TN helpers, +`src/qibotn/backends/qmatchatea.py` holds the qmatchatea MPS helpers, +and `src/qibotn/backends/vidal.py` holds the Vidal helpers. diff --git a/docs/home.md b/docs/home.md new file mode 100644 index 0000000..a6bb8c1 --- /dev/null +++ b/docs/home.md @@ -0,0 +1,26 @@ +# qibotn + +Core reusable code lives under `src/qibotn/`. Prefer importing from `qibotn` +or `qibotn.backends.*`; benchmark and runner helpers have been folded into the +package instead of being kept as standalone scripts. + +- `backends/quimb.py`: TN + torch helpers for quimb. +- `backends/qmatchatea.py`: qmatchatea + torch MPS helpers. +- `backends/vidal.py`: Vidal + torch helpers. +- `contest_cases.py`: shared contest circuits, observables, and case specs. +- `torch_utils.py`: shared torch array/thread helpers. + +Quimb TN reusable entrypoints include `build_quimb_backend_circuit`, +`build_expectation_tn`, `run_quimb_torch_expectation`, +`compare_quimb_gate_merge`, `compare_quimb_gate_merge_expectation`, +`profile_quimb_torch_expectation`, and `time_quimb_contract_implementations`. + +Common public imports include `qibotn.cpu_expectation`, +`qibotn.mps_expectation`, `qibotn.run_qmatchatea_expectation`, +`qibotn.run_vidal_expectation`, `qibotn.build_contest_circuit`, and +`qibotn.build_contest_observable`. + +Former script entrypoints are available as importable functions: +`qibotn.run_cpu_benchmark_cases`, `qibotn.run_contest_tn_case`, +`qibotn.run_custom_tn_expectation`, `qibotn.run_contest_mps_case`, +`qibotn.run_vidal_mpi_contest_case`, and `qibotn.run_vidal_validation_cases`. diff --git a/docs/xianchang.md b/docs/xianchang.md deleted file mode 100644 index 57411cc..0000000 --- a/docs/xianchang.md +++ /dev/null @@ -1,42 +0,0 @@ -mpirun --map-by ppr:1:numa:PE=80 --bind-to core --report-bindings \ - -x LD_PRELOAD=/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5 \ - -x BLIS_NUM_THREADS=80 \ - -x OMP_NUM_THREADS=80 \ - -x MKL_NUM_THREADS=80 \ - -x OMP_PROC_BIND=close \ - -x OMP_PLACES=cores \ - -np 4 \ - -host node-0:2,node-1:2,node-2:2,node-3:2 \ - .venv/bin/python -u tools/tn_contest_runner.py contract \ - --mpi \ - --case main1 \ - --nqubits 34 \ - --nlayers 20 \ - --observables long_z_string \ - --tree-dir trees/contest_tn \ - --torch-threads 80 \ - --dtype complex64 - - -SEARCH_TIME=300 NQUBITS=40 TN_DEBUG_TRIALS=1 SCHEDULER_HOST=10.20.1.102 DASK_ADDRESS=tcp://10.20.1.102:8786 WORKER_HOSTS="10.20.1.102 10.20.1.103" CASE=main1 OBSERVABLES=long_z_string TORCH_THREADS=80 MPI_PE=80 MPI_MAP_BY=ppr:1:numa:PE=80 MPI_BIND_TO=core OMP_NUM_THREADS=80 MKL_NUM_THREADS=80 BLIS_NUM_THREADS=80 MPI_HOSTS="node-2:2,node-3:2" MPI_RANKS=4 NWORKERS=128 TN_TARGET_SIZE=17179869184 tools/run_tn_dask_mpi_all.sh - - -NQUBITS=40 \ -TN_DEBUG_TRIALS=1 \ -SCHEDULER_HOST=10.20.1.102 \ -DASK_ADDRESS=tcp://10.20.1.102:8786 \ -WORKER_HOSTS="10.20.1.102 10.20.1.103" \ -CASE=main1 \ -OBSERVABLES=long_z_string \ -TORCH_THREADS=80 \ -MPI_PE=80 \ -MPI_MAP_BY=ppr:1:numa:PE=80 \ -MPI_BIND_TO=core \ -OMP_NUM_THREADS=80 \ -MKL_NUM_THREADS=80 \ -BLIS_NUM_THREADS=80 \ -MPI_HOSTS="node-2:2,node-3:2" \ -MPI_RANKS=4 \ -NWORKERS=96 \ -TN_TARGET_SIZE=17179869184 \ -tools/run_tn_dask_mpi_all.sh \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 7ac26d8..6d668fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -60,7 +60,7 @@ mpmath==1.3.0 msgpack==1.1.2 networkx==3.6.1 numba==0.61.2 -numpy==2.0.1 +numpy @ file:///home/yx/numpy openqasm3==1.0.1 opt_einsum==3.4.0 optuna==4.8.0 @@ -93,7 +93,7 @@ python-multipart==0.0.26 PyYAML==6.0.3 qibo==0.3.2 qibojit==0.1.15 --e git+https://git.nudt.space/jaunatisblue/qibotn.git@4c7a10d026d514897dcc501b507fa604fb4e52d4#egg=qibotn +-e git+https://git.nudt.space/jaunatisblue/qibotn.git@eed42dcfa9739c609a58f7367fe403abf2e992a9#egg=qibotn qiskit==1.4.5 qmatchatea==1.5.8 qredtea==0.3.15 @@ -106,7 +106,7 @@ regex==2026.4.4 requests==2.33.1 rpds-py==0.30.0 rustworkx==0.17.1 -scipy==1.17.1 +scipy @ file:///home/yx/scipy setuptools==70.2.0 six==1.17.0 sniffio==1.3.1 @@ -118,13 +118,15 @@ stack-data==0.6.3 starlette==1.0.0 stevedore==5.7.0 symengine==0.13.0 -sympy==1.13.1 +sympy==1.14.0 tabulate==0.9.0 tblib==3.2.2 texttable==1.7.0 threadpoolctl==3.6.0 toolz==1.1.0 -torch @ file:///home/qibo/qibotn/wheels/torch-2.10.0a0+a36e1d3-cp312-cp312-linux_x86_64.whl +torch==2.11.0+cpu +torchaudio==2.11.0+cpu +torchvision==0.26.0+cpu tornado==6.5.5 tqdm==4.67.3 traitlets==5.14.3 @@ -135,4 +137,3 @@ uvicorn==0.46.0 wcwidth==0.6.0 webencodings==0.5.1 zict==3.0.0 - diff --git a/run_vidal_mps_cases.sh b/run_vidal_mps_cases.sh deleted file mode 100755 index 93d0268..0000000 --- a/run_vidal_mps_cases.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Focused Vidal/MPS expectation test cases for 1D chain circuits. -# -# These cases intentionally avoid qmatchatea and generic TN paths. They target -# the current supported scope: one-qubit gates, adjacent two-qubit gates, and -# Pauli-sum expectation values on a 1D chain. - -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd "$ROOT_DIR" - -PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" -MPIEXEC="${MPIEXEC:-mpiexec}" -HOSTFILE="${HOSTFILE:-hostfile}" - -THREADS="${THREADS:-32}" -MPI_RANKS="${MPI_RANKS:-16}" -MPI_THREADS="${MPI_THREADS:-12}" - -export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}" -export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}" -source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh" - -run() { - echo - echo "--------------------------------------------------------------------------------" - echo "$*" - echo "--------------------------------------------------------------------------------" - "$@" -} - -case "${1:-help}" in - smoke) - # Short correctness-oriented run. Useful before starting long jobs. - run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ - --mps \ - --nqubits 40 \ - --nlayers 10 \ - --bond 2048 \ - --torch-threads "$THREADS" \ - --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \ - --observables ring_xz open_zz range2_xx long_z_string - ;; - - convergence) - # Same circuit/observable, increasing bond. Check value convergence. - for bond in ${BONDS:-4096 16384 65536}; do - run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ - --mps \ - --nqubits "${NQ:-80}" \ - --nlayers "${LAYERS:-16}" \ - --bond "$bond" \ - --torch-threads "$THREADS" \ - --circuits "${CIRCUIT:-brickwall_cnot}" \ - --observables "${OBSERVABLE:-ring_xz}" - done - ;; - - single-long) - # Single long Vidal run. On node-3, a similar n=40,l=30,bond=2048 case - # took about 9 minutes for one expectation. This one is meant to be longer. - run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ - --mps \ - --nqubits "${NQ:-80}" \ - --nlayers "${LAYERS:-16}" \ - --bond "${BOND:-65536}" \ - --torch-threads "$THREADS" \ - --circuits "${CIRCUIT:-brickwall_cnot}" \ - --observables "${OBSERVABLE:-ring_xz}" - ;; - - suite-long) - # Application-style multi-circuit, multi-observable MPS run. - # This is intentionally multi-term and should run much longer than single-long. - run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ - --mps \ - --nqubits "${NQ:-80}" \ - --nlayers "${LAYERS:-16}" \ - --bond "${BOND:-65536}" \ - --torch-threads "$THREADS" \ - --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \ - --observables ring_xz open_zz mixed_local range2_xx long_z_string - ;; - - mpi-long) - # Multi-node Vidal segmented MPS run. Uses HOSTFILE. - run "$MPIEXEC" -hostfile "$HOSTFILE" -n "$MPI_RANKS" "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ - --mpi --mps \ - --nqubits "${NQ:-80}" \ - --nlayers "${LAYERS:-16}" \ - --bond "${BOND:-65536}" \ - --torch-threads "$MPI_THREADS" \ - --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \ - --observables ring_xz open_zz mixed_local range2_xx long_z_string - ;; - - stress) - # Heavier entanglement. Start only after single-long is stable. - run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ - --mps \ - --nqubits "${NQ:-80}" \ - --nlayers "${LAYERS:-18}" \ - --bond "${BOND:-262144}" \ - --torch-threads "${THREADS:-48}" \ - --circuits "${CIRCUIT:-rxx_rzz}" \ - --observables ring_xz open_zz range2_xx - ;; - - help|*) - cat <<'EOF' -Usage: ./run_vidal_mps_cases.sh [smoke|convergence|single-long|suite-long|mpi-long|stress] - -Common overrides: - PYTHON_BIN=.venv/bin/python - THREADS=32 - OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 - -Single-node scale overrides: - NQ=80 LAYERS=16 BOND=65536 - CIRCUIT=brickwall_cnot - OBSERVABLE=ring_xz - BONDS="4096 16384 65536" # for convergence mode - -Multi-node overrides: - HOSTFILE=hostfile - MPI_RANKS=16 MPI_THREADS=12 - -Recommended first runs: - ./run_vidal_mps_cases.sh smoke - ./run_vidal_mps_cases.sh convergence - ./run_vidal_mps_cases.sh single-long -EOF - ;; -esac diff --git a/src/qibotn/__init__.py b/src/qibotn/__init__.py index fb2c1f7..9a1ee8a 100644 --- a/src/qibotn/__init__.py +++ b/src/qibotn/__init__.py @@ -8,6 +8,108 @@ _LAZY_EXPORTS = { "cpu_expectation": ("qibotn.expectation_runner", "cpu_expectation"), "mps_expectation": ("qibotn.expectation_runner", "mps_expectation"), "cpu_runcard": ("qibotn.expectation_runner", "cpu_runcard"), + "ExpectationConfig": ("qibotn.expectation_runner", "ExpectationConfig"), + "exact_for_observable": ("qibotn.expectation_runner", "exact_for_observable"), + "run_cpu_expectation": ("qibotn.expectation_runner", "run_cpu_expectation"), + "cpu_benchmark_parallel_opts": ( + "qibotn.expectation_runner", + "cpu_benchmark_parallel_opts", + ), + "run_cpu_benchmark_cases": ( + "qibotn.expectation_runner", + "run_cpu_benchmark_cases", + ), + "build_benchmark_circuit": ("qibotn.benchmark_cases", "build_circuit"), + "benchmark_observable_terms": ("qibotn.benchmark_cases", "observable_terms"), + "exact_pauli_sum": ("qibotn.benchmark_cases", "exact_pauli_sum"), + "ring_xz_statevector_expectation": ( + "qibotn.benchmark_cases", + "ring_xz_statevector_expectation", + ), + "terms_to_dict": ("qibotn.benchmark_cases", "terms_to_dict"), + "build_contest_circuit": ("qibotn.contest_cases", "build_contest_circuit"), + "build_contest_observable": ( + "qibotn.contest_cases", + "build_contest_observable", + ), + "contest_cases": ("qibotn.contest_cases", "CASES"), + "analyze_contraction_tree": ("qibotn.parallel", "analyze_contraction_tree"), + "load_tree_payload": ("qibotn.parallel", "load_tree_payload"), + "save_tree_payload": ("qibotn.parallel", "save_tree_payload"), + "slice_tree_payload": ("qibotn.parallel", "slice_tree_payload"), + "make_qmatchatea_backend": ( + "qibotn.backends.qmatchatea", + "make_qmatchatea_backend", + ), + "build_qmatchatea_backend": ( + "qibotn.backends.qmatchatea", + "build_qmatchatea_backend", + ), + "benchmark_qmatchatea_svd_control": ( + "qibotn.backends.qmatchatea", + "benchmark_qmatchatea_svd_control", + ), + "run_qmatchatea_expectation": ( + "qibotn.backends.qmatchatea", + "run_qmatchatea_expectation", + ), + "exact_mps_expectation": ( + "qibotn.backends.qmatchatea", + "exact_mps_expectation", + ), + "make_vidal_backend": ("qibotn.backends.vidal", "make_vidal_backend"), + "compare_vidal_backend_qmatchatea": ( + "qibotn.backends.vidal", + "compare_vidal_backend_qmatchatea", + ), + "run_vidal_expectation": ("qibotn.backends.vidal", "run_vidal_expectation"), + "run_segmented_vidal_ring_xz": ( + "qibotn.backends.vidal", + "run_segmented_vidal_ring_xz", + ), + "build_expectation_tn": ("qibotn.backends.quimb", "build_expectation_tn"), + "build_quimb_circuit_stats": ( + "qibotn.backends.quimb", + "build_quimb_circuit_stats", + ), + "compare_quimb_gate_merge": ( + "qibotn.backends.quimb", + "compare_quimb_gate_merge", + ), + "compare_quimb_gate_merge_expectation": ( + "qibotn.backends.quimb", + "compare_quimb_gate_merge_expectation", + ), + "contract_tn": ("qibotn.backends.quimb", "contract_tn"), + "load_custom_case_module": ("qibotn.backends.quimb", "load_custom_case_module"), + "profile_quimb_torch_expectation": ( + "qibotn.backends.quimb", + "profile_quimb_torch_expectation", + ), + "qibo_circuit_to_quimb_torch": ( + "qibotn.backends.quimb", + "qibo_circuit_to_quimb_torch", + ), + "search_contraction_tree": ("qibotn.backends.quimb", "search_contraction_tree"), + "sorted_tree": ("qibotn.backends.quimb", "sorted_tree"), + "run_contest_tn_case": ("qibotn.backends.quimb", "run_contest_tn_case"), + "run_custom_tn_expectation": ( + "qibotn.backends.quimb", + "run_custom_tn_expectation", + ), + "time_quimb_contract_implementations": ( + "qibotn.backends.quimb", + "time_quimb_contract_implementations", + ), + "run_contest_mps_case": ("qibotn.backends.vidal", "run_contest_mps_case"), + "run_vidal_mpi_contest_case": ( + "qibotn.backends.vidal", + "run_vidal_mpi_contest_case", + ), + "run_vidal_validation_cases": ( + "qibotn.backends.vidal", + "run_vidal_validation_cases", + ), "pauli_pattern": ("qibotn.observables", "pauli_pattern"), "pauli_sum": ("qibotn.observables", "pauli_sum"), } diff --git a/src/qibotn/backends/cpu.py b/src/qibotn/backends/cpu.py index 91b0528..c724cd9 100644 --- a/src/qibotn/backends/cpu.py +++ b/src/qibotn/backends/cpu.py @@ -18,6 +18,7 @@ from qibotn.backends.vidal import ( _unsupported_reason, ) from qibotn.observables import check_observable +from qibotn.torch_utils import arrays_to_backend, torch_cpu_array, torch_dtype def _as_bool_or_dict(value, name): @@ -310,10 +311,12 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): def _quimb_backend(self): import qibotn.backends.quimb as qmb - return qmb.BACKENDS[self.quimb_backend]( + backend = qmb.BACKENDS[self.quimb_backend]( quimb_backend=self.quimb_backend, contraction_optimizer=self.contraction_optimizer, ) + backend.dtype = self.dtype + return backend def _bind_rank_to_numa_domain(self, rank): self.numa_domain = _bind_numa_node(rank) @@ -375,6 +378,12 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): dask_close_workers = bool(opts.get("dask_close_workers", False)) print_stats = bool(opts.get("print_stats", False)) debug_trials = bool(opts.get("debug_trials", False)) + search_seed = int(opts.get("search_seed", 0)) + merge_1q = opts.get("merge_1q", "auto") + merge_2q = opts.get("merge_2q", "auto") + sort_contract_indices = opts.get("sort_contract_indices", "auto") + if sort_contract_indices == "auto": + sort_contract_indices = self.quimb_backend == "torch" search_only = bool(opts.get("search_only", False)) save_tree_path = opts.get("save_tree_path") load_tree_path = opts.get("load_tree_path") @@ -382,6 +391,38 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): saved_trees = [] saved_costs = [] + def term_stats( + term_index, + factors, + path_cost, + search_stats, + tree_slices, + slice_assignment, + rank_slices, + search_seconds, + contract_seconds, + ): + return { + "term_index": term_index, + "term_factors": tuple(factors), + "path_cost": path_cost, + "search_stats": search_stats, + "tree_slices": tree_slices, + "slice_assignment": slice_assignment, + "rank_slices": rank_slices, + "search_seconds": search_seconds, + "contract_seconds": contract_seconds, + "search_workers": search_workers, + "search_repeats": search_repeats, + "search_time": search_time, + "search_backend": search_backend or method, + "search_seed": search_seed, + "merge_1q": merge_1q, + "merge_2q": merge_2q, + "dask_address": dask_address, + "numa_domain": getattr(self, "numa_domain", None), + } + if load_tree_path: with Path(load_tree_path).open("rb") as f: payload = pickle.load(f) @@ -396,6 +437,8 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): "max_bond": self.max_bond_dimension, "cutoff": self.cut_ratio, }, + merge_1q=merge_1q, + merge_2q=merge_2q, ) total_value = 0.0 + 0.0j @@ -415,6 +458,8 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): ) else: op, where = _pauli_term_to_dense_operator(factors) + if self.quimb_backend == "torch": + op = torch_cpu_array(op, dtype=torch_dtype(self.dtype)) tn = qc.local_expectation( op, where, @@ -455,10 +500,18 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): debug_trials=debug_trials, dask_close_workers=dask_close_workers, expected_workers=dask_expected_workers, + search_seed=search_seed, ) search_seconds = time.perf_counter() - search_start if tree is None: raise RuntimeError("Failed to find a contraction tree for CPU TN MPI.") + if sort_contract_indices and hasattr(tree, "sort_contraction_indices"): + tree.sort_contraction_indices( + priority=opts.get("sort_contract_indices_priority", "flops"), + make_output_contig=True, + make_contracted_contig=True, + reset=True, + ) if self.parallel_opts.get("contract_implementation") == "cpp": from qibotn.torch_contractor import prepare_torch_cpp_contractor @@ -490,23 +543,17 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): if search_only: self.parallel_stats.append( - { - "term_index": term_index, - "term_factors": tuple(factors), - "path_cost": path_cost, - "search_stats": search_stats, - "tree_slices": int(getattr(tree, "multiplicity", 1)), - "slice_assignment": "search_only", - "rank_slices": [], - "search_seconds": search_seconds, - "contract_seconds": 0.0, - "search_workers": search_workers, - "search_repeats": search_repeats, - "search_time": search_time, - "search_backend": search_backend or method, - "dask_address": dask_address, - "numa_domain": getattr(self, "numa_domain", None), - } + term_stats( + term_index, + factors, + path_cost, + search_stats, + int(getattr(tree, "multiplicity", 1)), + "search_only", + [], + search_seconds, + 0.0, + ) ) continue @@ -523,23 +570,17 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): flush=True, ) self.parallel_stats.append( - { - "term_index": term_index, - "term_factors": tuple(factors), - "path_cost": path_cost, - "search_stats": search_stats, - "tree_slices": 1, - "slice_assignment": "root", - "rank_slices": [1] + [0] * (size - 1), - "search_seconds": search_seconds, - "contract_seconds": contract_seconds, - "search_workers": search_workers, - "search_repeats": search_repeats, - "search_time": search_time, - "search_backend": search_backend or method, - "dask_address": dask_address, - "numa_domain": getattr(self, "numa_domain", None), - } + term_stats( + term_index, + factors, + path_cost, + search_stats, + 1, + "root", + [1] + [0] * (size - 1), + search_seconds, + contract_seconds, + ) ) total_value += coeff * complex(value) continue @@ -556,36 +597,31 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): flush=True, ) self.parallel_stats.append( - { - "term_index": term_index, - "term_factors": tuple(factors), - "path_cost": path_cost, - "search_stats": search_stats, - "tree_slices": int(getattr(tree, "multiplicity", 1)), - "slice_assignment": "local", - "rank_slices": [int(getattr(tree, "multiplicity", 1))], - "search_seconds": search_seconds, - "contract_seconds": contract_seconds, - "search_workers": search_workers, - "search_repeats": search_repeats, - "search_time": search_time, - "search_backend": search_backend or method, - "dask_address": dask_address, - "numa_domain": getattr(self, "numa_domain", None), - } + term_stats( + term_index, + factors, + path_cost, + search_stats, + int(getattr(tree, "multiplicity", 1)), + "local", + [int(getattr(tree, "multiplicity", 1))], + search_seconds, + contract_seconds, + ) ) total_value += coeff * complex(np.asarray(value).reshape(-1)[0]) continue contract_start = time.perf_counter() arrays = self._term_arrays(tn, backend) + contract_implementation = self._contract_implementation(backend) value, stats = parallel_contract( tree, arrays, method="mpi", comm=comm, return_stats=True, - implementation=self.parallel_opts.get("contract_implementation"), + implementation=contract_implementation, ) contract_seconds = time.perf_counter() - contract_start gathered_stats = comm.gather(stats, root=0) @@ -598,25 +634,17 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): flush=True, ) self.parallel_stats.append( - { - "term_index": term_index, - "term_factors": tuple(factors), - "path_cost": path_cost, - "search_stats": search_stats, - "tree_slices": stats.nslices, - "slice_assignment": stats.assignment, - "rank_slices": [ - item.local_slices for item in gathered_stats - ], - "search_seconds": search_seconds, - "contract_seconds": contract_seconds, - "search_workers": search_workers, - "search_repeats": search_repeats, - "search_time": search_time, - "search_backend": search_backend or method, - "dask_address": dask_address, - "numa_domain": getattr(self, "numa_domain", None), - } + term_stats( + term_index, + factors, + path_cost, + search_stats, + stats.nslices, + stats.assignment, + [item.local_slices for item in gathered_stats], + search_seconds, + contract_seconds, + ) ) total_value += coeff * complex(np.asarray(value).reshape(-1)[0]) @@ -644,18 +672,20 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): return np.nan if rank != 0 else float(np.real(total_value)) + def _contract_implementation(self, backend): + implementation = self.parallel_opts.get("contract_implementation") + if implementation is None and backend.backend == "torch": + return "autoray" + return implementation + def _contract_term_unsliced(self, tn, tree, backend): - contract_implementation = self.parallel_opts.get("contract_implementation") + contract_implementation = self._contract_implementation(backend) if contract_implementation == "cpp": if backend.backend != "torch": raise ValueError("contract_implementation='cpp' requires torch backend.") - from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype from qibotn.torch_contractor import contract_tree_cpp - arrays = [ - _torch_cpu_array(array, dtype=_torch_dtype(self.dtype)) - for array in tn.arrays - ] + arrays = arrays_to_backend(tn.arrays, "torch", dtype=self.dtype) nslices = int(getattr(tree, "multiplicity", 1)) if nslices > 1: total = None @@ -666,12 +696,10 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): return contract_tree_cpp(tree, arrays) if backend.backend == "torch": - from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype - for tensor in tn.tensors: - tensor._data = _torch_cpu_array( + tensor._data = torch_cpu_array( tensor._data, - dtype=_torch_dtype(self.dtype), + dtype=torch_dtype(self.dtype), ) return tn.contract( all, @@ -693,13 +721,9 @@ class CpuTensorNet(QibotnBackend, NumpyBackend): return None if user_slicing_opts is None else dict(user_slicing_opts) def _term_arrays(self, tn, backend): - if backend.backend == "torch": - from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype - - return [ - _torch_cpu_array(array, dtype=_torch_dtype(self.dtype)) - for array in tn.arrays - ] - from qibotn.backends.quimb import _numpy_dtype - - return [backend.engine.asarray(array, dtype=_numpy_dtype(self.dtype)) for array in tn.arrays] + return arrays_to_backend( + tn.arrays, + backend.backend, + engine=backend.engine, + dtype=self.dtype, + ) diff --git a/src/qibotn/backends/cutensornet_helpers.py b/src/qibotn/backends/cutensornet_helpers.py new file mode 100644 index 0000000..1ba4511 --- /dev/null +++ b/src/qibotn/backends/cutensornet_helpers.py @@ -0,0 +1,321 @@ +"""cuTensorNet circuit and MPS conversion helpers.""" + +from __future__ import annotations + +import numpy as np + +try: + import cupy as cp + import cuquantum.bindings.cutensornet as cutn + from cuquantum.tensornet import contract, contract_path + from cuquantum.tensornet.experimental import contract_decompose +except ImportError: # pragma: no cover - exercised on CPU-only installations + cp = None + cutn = None + contract = None + contract_path = None + contract_decompose = None + + +def _require_cupy(): + if cp is None: + raise ImportError( + "The cuQuantum circuit converter requires cupy. " + "Install the GPU dependencies or use the CPU backend." + ) + return cp + + +def _require_cutensornet(): + if cp is None or cutn is None: + raise ImportError( + "The cuQuantum MPS converter requires cupy and cuquantum. " + "Install the GPU dependencies or use the CPU backend." + ) + + +def _require_tensornet_mps(): + if cp is None or contract is None or contract_decompose is None: + raise ImportError( + "The cuQuantum MPS helpers require cupy and cuquantum. " + "Install the GPU dependencies or use the CPU backend." + ) + + +def _require_contract(): + if contract is None or contract_path is None: + raise ImportError( + "The cuQuantum MPS contraction helper requires cuquantum. " + "Install the GPU dependencies or use the CPU backend." + ) + + +class QiboCircuitToEinsum: + """Convert a Qibo circuit to cuQuantum interleaved TN operands.""" + + def __init__(self, circuit, dtype="complex128"): + self.backend = _require_cupy() + self.dtype = getattr(self.backend, dtype) + self.init_basis_map(self.backend, dtype) + self.init_intermediate_circuit(circuit) + self.circuit = circuit + + def state_vector_operands(self): + input_bitstring = "0" * len(self.active_qubits) + input_operands = self._get_bitstring_tensors(input_bitstring) + mode_labels, qubits_frontier, next_frontier = self._init_mode_labels_from_qubits( + self.active_qubits + ) + gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands( + self.gate_tensors, qubits_frontier, next_frontier + ) + operands = input_operands + gate_operands + mode_labels += gate_mode_labels + out_list = [qubits_frontier[key] for key in qubits_frontier] + operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y] + operand_exp_interleave.append(out_list) + return operand_exp_interleave + + def _init_mode_labels_from_qubits(self, qubits): + nqubits = len(qubits) + frontier_dict = {q: i for i, q in enumerate(qubits)} + mode_labels = [[i] for i in range(nqubits)] + return mode_labels, frontier_dict, nqubits + + def _get_bitstring_tensors(self, bitstring): + return [self.basis_map[ibit] for ibit in bitstring] + + def _parse_gates_to_mode_labels_operands(self, gates, qubits_frontier, next_frontier): + mode_labels = [] + operands = [] + for tensor, gate_qubits in gates: + operands.append(tensor) + input_mode_labels = [] + output_mode_labels = [] + for qubit in gate_qubits: + input_mode_labels.append(qubits_frontier[qubit]) + output_mode_labels.append(next_frontier) + qubits_frontier[qubit] = next_frontier + next_frontier += 1 + mode_labels.append(output_mode_labels + input_mode_labels) + return mode_labels, operands + + def op_shape_from_qubits(self, nqubits): + return (2, 2) * nqubits + + def init_intermediate_circuit(self, circuit): + self.gate_tensors = [] + gates_qubits = [] + for gate in circuit.queue: + gate_qubits = gate.control_qubits + gate.target_qubits + gates_qubits.extend(gate_qubits) + required_shape = self.op_shape_from_qubits(len(gate_qubits)) + self.gate_tensors.append( + ( + self.backend.asarray(gate.matrix(), dtype=self.dtype).reshape( + required_shape + ), + gate_qubits, + ) + ) + self.active_qubits = np.unique(gates_qubits) + + def init_basis_map(self, backend, dtype): + asarray = backend.asarray + self.basis_map = { + "0": asarray([1, 0], dtype=dtype), + "1": asarray([0, 1], dtype=dtype), + } + + def init_inverse_circuit(self, circuit): + self.gate_tensors_inverse = [] + gates_qubits_inverse = [] + for gate in circuit.queue: + gate_qubits = gate.control_qubits + gate.target_qubits + gates_qubits_inverse.extend(gate_qubits) + required_shape = self.op_shape_from_qubits(len(gate_qubits)) + self.gate_tensors_inverse.append( + (self.backend.asarray(gate.matrix()).reshape(required_shape), gate_qubits) + ) + self.active_qubits_inverse = np.unique(gates_qubits_inverse) + + def get_pauli_gates(self, pauli_map, dtype="complex128", backend=None): + if backend is None: + backend = _require_cupy() + asarray = backend.asarray + operand_map = { + "I": asarray([[1, 0], [0, 1]], dtype=dtype), + "X": asarray([[0, 1], [1, 0]], dtype=dtype), + "Y": asarray([[0, -1j], [1j, 0]], dtype=dtype), + "Z": asarray([[1, 0], [0, -1]], dtype=dtype), + } + gates = [] + for qubit, pauli_char in pauli_map.items(): + operand = operand_map.get(pauli_char) + if operand is None: + raise ValueError("pauli string character must be one of I/X/Y/Z") + gates.append((operand, (qubit,))) + return gates + + def expectation_operands(self, ham_gates): + input_bitstring = "0" * self.circuit.nqubits + input_operands = self._get_bitstring_tensors(input_bitstring) + mode_labels, qubits_frontier, next_frontier = self._init_mode_labels_from_qubits( + range(self.circuit.nqubits) + ) + gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands( + self.gate_tensors, qubits_frontier, next_frontier + ) + operands = input_operands + gate_operands + mode_labels += gate_mode_labels + + self.init_inverse_circuit(self.circuit.invert()) + next_frontier = max(qubits_frontier.values()) + 1 + gates_inverse = ham_gates + self.gate_tensors_inverse + gate_mode_labels_inverse, gate_operands_inverse = ( + self._parse_gates_to_mode_labels_operands( + gates_inverse, qubits_frontier, next_frontier + ) + ) + mode_labels = ( + mode_labels + + gate_mode_labels_inverse + + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)] + ) + operands = operands + gate_operands_inverse + operands[: self.circuit.nqubits] + operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y] + operand_exp_interleave.append([]) + return operand_exp_interleave + + +def initial_mps(num_qubits, dtype): + _require_tensornet_mps() + state_tensor = cp.asarray([1, 0], dtype=dtype).reshape(1, 2, 1) + return [state_tensor] * num_qubits + + +def mps_site_right_swap(mps_tensors, i, **kwargs): + _require_tensornet_mps() + left, _, right = contract_decompose( + "ipj,jqk->iqj,jpk", + *mps_tensors[i : i + 2], + algorithm=kwargs.get("algorithm", None), + options=kwargs.get("options", None), + ) + mps_tensors[i : i + 2] = (left, right) + return mps_tensors + + +def apply_mps_gate(mps_tensors, gate, qubits, **kwargs): + _require_tensornet_mps() + n_qubits = len(qubits) + if n_qubits == 1: + site = qubits[0] + mps_tensors[site] = contract( + "ipj,qp->iqj", + mps_tensors[site], + gate, + options=kwargs.get("options", None), + ) + elif n_qubits == 2: + left, right = qubits + if left > right: + return apply_mps_gate( + mps_tensors, gate.transpose(1, 0, 3, 2), (right, left), **kwargs + ) + if left + 1 == right: + a_tensor, _, b_tensor = contract_decompose( + "ipj,jqk,rspq->irj,jsk", + *mps_tensors[left : left + 2], + gate, + algorithm=kwargs.get("algorithm", None), + options=kwargs.get("options", None), + ) + mps_tensors[left : left + 2] = (a_tensor, b_tensor) + else: + mps_site_right_swap(mps_tensors, left, **kwargs) + apply_mps_gate(mps_tensors, gate, (left + 1, right), **kwargs) + mps_site_right_swap(mps_tensors, left, **kwargs) + else: + raise NotImplementedError("Only one- and two-qubit gates supported") + + +class QiboCircuitToMPS: + """Convert a Qibo circuit to a cuTensorNet MPS representation.""" + + def __init__(self, circ_qibo, gate_algo, dtype="complex128", rand_seed=0): + _require_cutensornet() + np.random.seed(rand_seed) + cp.random.seed(rand_seed) + self.num_qubits = circ_qibo.nqubits + self.handle = cutn.create() + self.dtype = dtype + self.mps_tensors = initial_mps(self.num_qubits, dtype=dtype) + circuitconvertor = QiboCircuitToEinsum(circ_qibo, dtype=dtype) + for gate, qubits in circuitconvertor.gate_tensors: + apply_mps_gate( + self.mps_tensors, + gate, + qubits, + algorithm=gate_algo, + options={"handle": self.handle}, + ) + + def __del__(self): + handle = getattr(self, "handle", None) + if cutn is not None and handle is not None: + cutn.destroy(handle) + + +class MPSContractionHelper: + """Contract cuTensorNet MPS tensors to norms, states, or expectations.""" + + def __init__(self, num_qubits): + self.num_qubits = num_qubits + self.bra_modes = [(2 * i, 2 * i + 1, 2 * i + 2) for i in range(num_qubits)] + offset = 2 * num_qubits + 1 + self.ket_modes = [ + (i + offset, 2 * i + 1, i + 1 + offset) for i in range(num_qubits) + ] + + def contract_norm(self, mps_tensors, options=None): + interleaved_inputs = [] + for i, tensor in enumerate(mps_tensors): + interleaved_inputs.extend( + [tensor, self.bra_modes[i], tensor.conj(), self.ket_modes[i]] + ) + interleaved_inputs.append([]) + return self._contract(interleaved_inputs, options=options).real + + def contract_state_vector(self, mps_tensors, options=None): + interleaved_inputs = [] + for i, tensor in enumerate(mps_tensors): + interleaved_inputs.extend([tensor, self.bra_modes[i]]) + output_modes = tuple([bra_modes[1] for bra_modes in self.bra_modes]) + interleaved_inputs.append(output_modes) + return self._contract(interleaved_inputs, options=options) + + def contract_expectation( + self, mps_tensors, operator, qubits, options=None, normalize=False + ): + interleaved_inputs = [] + extra_mode = 3 * self.num_qubits + 2 + operator_modes = [None] * len(qubits) + [self.bra_modes[q][1] for q in qubits] + qubits = list(qubits) + for i, tensor in enumerate(mps_tensors): + interleaved_inputs.extend([tensor, self.bra_modes[i]]) + ket_modes = self.ket_modes[i] + if i in qubits: + ket_modes = (ket_modes[0], extra_mode, ket_modes[2]) + operator_modes[qubits.index(i)] = extra_mode + extra_mode += 1 + interleaved_inputs.extend([tensor.conj(), ket_modes]) + interleaved_inputs.extend([operator, tuple(operator_modes)]) + interleaved_inputs.append([]) + norm = self.contract_norm(mps_tensors, options=options) if normalize else 1 + return self._contract(interleaved_inputs, options=options) / norm + + def _contract(self, interleaved_inputs, options=None): + _require_contract() + path = contract_path(*interleaved_inputs, options=options)[0] + return contract(*interleaved_inputs, options=options, optimize={"path": path}) diff --git a/src/qibotn/backends/qmatchatea.py b/src/qibotn/backends/qmatchatea.py index 41381dc..b76424f 100644 --- a/src/qibotn/backends/qmatchatea.py +++ b/src/qibotn/backends/qmatchatea.py @@ -1,6 +1,9 @@ """Implementation of Quantum Matcha Tea backend.""" +from __future__ import annotations + import re +import time from dataclasses import dataclass import numpy as np @@ -12,6 +15,7 @@ from qibo.config import raise_error from qmatchatea.utils import MPISettings from qibotn.backends.abstract import QibotnBackend +from qibotn.benchmark_cases import exact_pauli_sum from qibotn.observables import check_observable from qibotn.result import TensorNetworkResult @@ -364,3 +368,207 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend): use_itpo=False, ) return obs_sum + + +@dataclass(frozen=True) +class QMatchaTeaExpectationResult: + value: float + seconds: float + backend: object + + +@dataclass(frozen=True) +class QMatchaTeaBuildResult: + backend: object + build_seconds: float + + +@dataclass(frozen=True) +class QMatchaTeaSvdControlResult: + ctrl: str + contract_singvals: str + status: str + median_ms: float + min_ms: float + rel_error: float | None + kept: int | None + error: str + + +def make_qmatchatea_backend( + *, + bond=10, + cut_ratio=1e-9, + tensor_module="torch", + svd_control="E!", + compile_circuit=True, + track_memory=False, + mpi_approach="SR", + mpi_num_procs=1, + mpi_where_barriers=-1, + mpi_isometrization=-1, +): + backend = QMatchaTeaBackend() + backend.configure_tn_simulation( + ansatz="MPS", + max_bond_dimension=bond, + cut_ratio=cut_ratio, + svd_control=svd_control, + tensor_module=tensor_module, + compile_circuit=compile_circuit, + track_memory=track_memory, + mpi_approach=mpi_approach, + mpi_num_procs=mpi_num_procs, + mpi_where_barriers=mpi_where_barriers, + mpi_isometrization=mpi_isometrization, + ) + return backend + + +def build_qmatchatea_backend( + *, + bond=10, + cut_ratio=1e-9, + tensor_module="torch", + svd_control="E!", + compile_circuit=True, + track_memory=False, + mpi_approach="SR", + mpi_num_procs=1, + mpi_where_barriers=-1, + mpi_isometrization=-1, +): + start = time.perf_counter() + backend = make_qmatchatea_backend( + bond=bond, + cut_ratio=cut_ratio, + tensor_module=tensor_module, + svd_control=svd_control, + compile_circuit=compile_circuit, + track_memory=track_memory, + mpi_approach=mpi_approach, + mpi_num_procs=mpi_num_procs, + mpi_where_barriers=mpi_where_barriers, + mpi_isometrization=mpi_isometrization, + ) + return QMatchaTeaBuildResult(backend=backend, build_seconds=time.perf_counter() - start) + + +def exact_mps_expectation(circuit, observable, nqubits): + if isinstance(observable, dict) and "terms" in observable: + terms = [ + ( + term["coefficient"], + tuple((name, site) for name, site in term["operators"]), + ) + for term in observable["terms"] + ] + return exact_pauli_sum(circuit, terms, nqubits) + + hamiltonian = check_observable(observable, nqubits) + return float(hamiltonian.expectation_from_state(circuit().state(numpy=True)).real) + + +def run_qmatchatea_expectation( + circuit, + observable, + *, + bond=10, + cut_ratio=1e-9, + tensor_module="torch", + svd_control="E!", + compile_circuit=True, + preprocess=True, + track_memory=False, + mpi_approach="SR", + mpi_num_procs=1, + mpi_where_barriers=-1, + mpi_isometrization=-1, +): + built = build_qmatchatea_backend( + bond=bond, + cut_ratio=cut_ratio, + tensor_module=tensor_module, + svd_control=svd_control, + compile_circuit=compile_circuit, + track_memory=track_memory, + mpi_approach=mpi_approach, + mpi_num_procs=mpi_num_procs, + mpi_where_barriers=mpi_where_barriers, + mpi_isometrization=mpi_isometrization, + ) + start = time.perf_counter() + value = built.backend.expectation( + circuit, + observable, + preprocess=preprocess, + compile_circuit=compile_circuit, + ) + return QMatchaTeaExpectationResult( + value=float(np.real(value)), + seconds=time.perf_counter() - start, + backend=built.backend, + ) + + +def benchmark_qmatchatea_svd_control(matrix, *, ctrl, max_bond, contract_singvals, repeats): + import gc + import statistics + + import torch + + from qredtea.torchapi import QteaTorchTensor + + conv = qmatchatea.QCConvergenceParameters( + max_bond_dimension=max_bond, + cut_ratio=0.0, + svd_ctrl=ctrl, + ) + qtensor = QteaTorchTensor.from_elem_array(matrix, dtype=matrix.dtype, device="cpu") + + times = [] + rel_error = None + kept = None + status = "ok" + error = "" + + for i in range(repeats): + gc.collect() + if torch.cuda.is_available(): + torch.cuda.synchronize() + t0 = time.perf_counter() + try: + left, right, singvals, _ = qtensor.split_svd( + [0], + [1], + contract_singvals=contract_singvals, + conv_params=conv, + ) + except Exception as exc: # noqa: BLE001 + status = "error" + error = repr(exc) + break + if torch.cuda.is_available(): + torch.cuda.synchronize() + times.append(time.perf_counter() - t0) + + if i == repeats - 1: + left_matrix = left.elem.reshape(matrix.shape[0], -1) + right_matrix = right.elem.reshape(-1, matrix.shape[1]) + recon = left_matrix @ right_matrix + rel_error = ( + torch.linalg.vector_norm(matrix - recon) + / torch.linalg.vector_norm(matrix) + ).item() + kept = int(singvals.numel()) + + return QMatchaTeaSvdControlResult( + ctrl=ctrl, + contract_singvals=contract_singvals, + status=status, + median_ms=float("nan") if not times else statistics.median(times) * 1000, + min_ms=float("nan") if not times else min(times) * 1000, + rel_error=rel_error, + kept=kept, + error=error, + ) diff --git a/src/qibotn/backends/quimb.py b/src/qibotn/backends/quimb.py index 3d49b00..3d20cbb 100644 --- a/src/qibotn/backends/quimb.py +++ b/src/qibotn/backends/quimb.py @@ -1,6 +1,14 @@ +import copy +import importlib.util +import inspect +import json +import time from collections import Counter +from dataclasses import dataclass +from pathlib import Path from typing import Optional +import numpy as np import quimb as qu import quimb.tensor as qtn from qibo.config import raise_error @@ -8,7 +16,39 @@ from qibo.gates.abstract import ParametrizedGate from qibo.models import Circuit from qibotn.backends.abstract import QibotnBackend +from qibotn.observables import extract_gates_and_qubits +from qibotn.parallel import contraction_tree_costs, parallel_path_search from qibotn.result import TensorNetworkResult +from qibotn.torch_utils import ( + arrays_to_backend as _arrays_to_backend, + numpy_dtype as _numpy_dtype, + torch_cpu_array as _torch_cpu_array, + torch_dtype as _torch_dtype, +) + + +def _real_scalar(x): + return float(x.real) + + +def torch_contract_implementation(backend="torch", implementation=None): + if implementation is not None: + return implementation + return "autoray" if backend == "torch" else None + + +def _quimb_should_parametrize(gate): + """Use quimb parametrized tensors only for non-plain numeric parameters.""" + if not isinstance(gate, ParametrizedGate) or not getattr(gate, "trainable", True): + return False + for param in getattr(gate, "parameters", ()): + if isinstance(param, (int, float, complex, np.number)): + continue + if isinstance(param, np.ndarray) and param.ndim == 0: + continue + return True + return False + GATE_MAP = { "h": "H", @@ -20,6 +60,9 @@ GATE_MAP = { "rx": "RX", "ry": "RY", "rz": "RZ", + "rxx": "RXX", + "ryy": "RYY", + "rzz": "RZZ", "u3": "U3", "cx": "CX", "cnot": "CNOT", @@ -40,50 +83,6 @@ GATE_MAP = { PAULI_DENSE_MAX_QUBITS = 8 -def _torch_cpu_array(data, dtype=None): - """Convert array-like data to a contiguous CPU torch tensor.""" - import numpy as np - import torch - - if isinstance(data, torch.Tensor): - x = data - else: - array = np.asarray(data) - if any(stride < 0 for stride in array.strides): - array = np.ascontiguousarray(array) - x = torch.from_numpy(array) - - if x.device.type != "cpu": - x = x.cpu() - if dtype is not None and x.dtype != dtype: - x = x.to(dtype) - if not x.is_contiguous(): - x = x.contiguous() - return x - - -def _torch_dtype(dtype): - import torch - - if dtype in ("complex64", "single"): - return torch.complex64 - return torch.complex128 - - -def _numpy_dtype(dtype): - import numpy as np - - if dtype in ("complex64", "single"): - return np.complex64 - return np.complex128 - - -def _arrays_to_backend(arrays, backend, engine, dtype="complex128"): - if backend == "torch": - return [_torch_cpu_array(array, dtype=_torch_dtype(dtype)) for array in arrays] - return [engine.asarray(array, dtype=_numpy_dtype(dtype)) for array in arrays] - - def _pauli_term_to_dense_operator(factors): op = None where = [] @@ -101,45 +100,54 @@ def pauli_product_expectation_tn( simplify_atol=1e-12, simplify_equalize_norms=True, ): - """Build the scalar TN for ```` without dense Pauli strings.""" + """Build the scalar TN for ```` without dense Pauli strings. + + Use quimb's reverse-lightcone reduced-density TN for the Pauli support, + then attach one 2x2 operator tensor per acted-on site. This keeps long + Pauli products sparse without adding identity tensors outside the support. + """ import numpy as np + from autoray import infer_backend op_by_site = { int(qubit): qu.pauli(str(gate_name).lower()) for qubit, gate_name in factors if str(gate_name).upper() != "I" } - ket = quimb_circuit.get_psi_simplified( - seq=simplify_sequence, - atol=simplify_atol, - equalize_norms=simplify_equalize_norms, - ) - bra = ket.conj().reindex( - { - quimb_circuit.ket_site_ind(qubit): quimb_circuit.bra_site_ind(qubit) - for qubit in range(quimb_circuit.N) - } - ) + if not op_by_site: + return qtn.TensorNetwork( + [qtn.Tensor(data=np.asarray(1.0 + 0.0j), inds=())] + ) - tn = bra | ket - identity = np.eye(2, dtype=complex) - for qubit in range(quimb_circuit.N): - data = op_by_site.get(qubit, identity) - tn |= qtn.Tensor( - data=data, + where = tuple(sorted(op_by_site)) + fs_opts = { + "seq": simplify_sequence, + "atol": simplify_atol, + "equalize_norms": simplify_equalize_norms, + } + rho = quimb_circuit.get_rdm_lightcone_simplified( + where=where, + **fs_opts, + ) + rho_backend = infer_backend(rho.tensors[0].data) if rho.tensors else "numpy" + for qubit in where: + op = op_by_site[qubit] + if rho_backend == "torch": + dtype = getattr(quimb_circuit, "dtype", None) or "complex128" + op = _torch_cpu_array(op, dtype=_torch_dtype(dtype)) + rho |= qtn.Tensor( + data=op, inds=( quimb_circuit.bra_site_ind(qubit), quimb_circuit.ket_site_ind(qubit), ), ) - tn.full_simplify_( + rho.full_simplify_( output_inds=(), - seq=simplify_sequence, - atol=simplify_atol, - equalize_norms=simplify_equalize_norms, + **fs_opts, ) - return tn + return rho def pauli_product_expectation( @@ -156,7 +164,13 @@ def pauli_product_expectation( simplify_sequence=simplify_sequence, simplify_atol=simplify_atol, ) - return tn.contract(all, output_inds=(), optimize=optimize, backend=backend) + return tn.contract( + all, + output_inds=(), + optimize=optimize, + backend=backend, + implementation=torch_contract_implementation(backend), + ) def __init__(self, quimb_backend="torch", contraction_optimizer="auto-hq"): @@ -287,8 +301,14 @@ def execute_circuit( elif initial_state is not None: raise_error(ValueError, "Initial state not None supported only for MPS ansatz.") + gate_opts = { + "max_bond": self.max_bond_dimension, + "cutoff": self.svd_cutoff, + } circ_quimb = self.circuit_ansatz.from_openqasm2_str( - circuit.to_qasm(), psi0=initial_state, gate_opts={"max_bond": self.max_bond_dimension, "cutoff": self.svd_cutoff} + circuit.to_qasm(), + psi0=initial_state, + gate_opts=gate_opts, ) if nshots: @@ -390,7 +410,7 @@ def exp_value_observable_symbolic( expectation_value = expectation_value + coeff * exp_values - return self.real(expectation_value) + return _real_scalar(expectation_value) def _qibo_circuit_to_quimb( @@ -414,7 +434,19 @@ def _qibo_circuit_to_quimb( The converted circuit. """ nqubits = qibo_circ.nqubits + merge_1q = circuit_kwargs.pop("merge_1q", "auto") + merge_2q = circuit_kwargs.pop("merge_2q", "auto") + if self.backend == "torch": + circuit_kwargs.setdefault("to_backend", _torch_cpu_array) + circuit_kwargs.setdefault("convert_eager", True) + circuit_kwargs.setdefault("dtype", getattr(self, "dtype", "complex128")) circ = quimb_circuit_type(nqubits, **circuit_kwargs) + pending_gates = [] + + def flush_pending_gates(): + if pending_gates: + circ.apply_gates(pending_gates, merge_1q=merge_1q, merge_2q=merge_2q) + pending_gates.clear() for gate in qibo_circ.queue: gate_name = getattr(gate, "name", None) @@ -424,34 +456,39 @@ def _qibo_circuit_to_quimb( if gate_name == "cu1": theta = gate.parameters[0] c, t = gate.qubits - circ.apply_gate("RZ", theta / 2, c) - circ.apply_gate("RZ", theta / 2, t) - circ.apply_gate("CNOT", c, t) - circ.apply_gate("RZ", -theta / 2, t) - circ.apply_gate("CNOT", c, t) + pending_gates.extend( + ( + ("RZ", theta / 2, c), + ("RZ", theta / 2, t), + ("CNOT", c, t), + ("RZ", -theta / 2, t), + ("CNOT", c, t), + ) + ) continue if quimb_gate_name is None: if hasattr(gate, "matrix"): - circ.apply_gate_raw(gate.matrix(), getattr(gate, "qubits", ())) + pending_gates.append((gate.matrix(), *getattr(gate, "qubits", ()))) continue raise_error(ValueError, f"Gate {gate_name} not supported in Quimb backend.") params = getattr(gate, "parameters", ()) qubits = getattr(gate, "qubits", ()) - is_parametrized = isinstance(gate, ParametrizedGate) and getattr( - gate, "trainable", True - ) + is_parametrized = _quimb_should_parametrize(gate) if is_parametrized: - circ.apply_gate( - quimb_gate_name, *params, *qubits, parametrized=is_parametrized - ) - else: + flush_pending_gates() circ.apply_gate( quimb_gate_name, *params, *qubits, + parametrize=True, ) + continue + + pending_gates.append((quimb_gate_name, *params, *qubits)) + + flush_pending_gates() return circ @@ -509,7 +546,6 @@ def expectation(self, circuit, observable, parallel=None, parallel_opts=None): if parallel is None: # Use original implementation - from qibotn.observables import extract_gates_and_qubits all_terms = extract_gates_and_qubits(observable) qc = self._qibo_circuit_to_quimb( @@ -532,7 +568,8 @@ def expectation(self, circuit, observable, parallel=None, parallel_opts=None): else: op, where = _pauli_term_to_dense_operator(factors) val = qc.local_expectation( - op, where, + op, + where, backend=self.backend, optimize=self.contractions_optimizer, simplify_sequence="ADCRS", @@ -540,7 +577,7 @@ def expectation(self, circuit, observable, parallel=None, parallel_opts=None): ) exp_val += coeff * val - return self.real(exp_val) + return _real_scalar(exp_val) else: # Use parallel implementation @@ -549,10 +586,11 @@ def expectation(self, circuit, observable, parallel=None, parallel_opts=None): def _expectation_parallel(self, circuit, observable, method, opts): """Parallel expectation value computation.""" - from qibotn.observables import extract_gates_and_qubits - from qibotn.parallel import parallel_path_search, parallel_contract import torch + from qibotn.observables import extract_gates_and_qubits + from qibotn.parallel import parallel_contract, parallel_path_search + try: from mpi4py import MPI comm = MPI.COMM_WORLD if method == 'mpi' else None @@ -568,11 +606,16 @@ def _expectation_parallel(self, circuit, observable, method, opts): torch_threads = opts.get('torch_threads', None) slicing_opts = opts.get('slicing_opts', None) trial_timeout = opts.get('trial_timeout', None) + search_seed = opts.get('search_seed', 0) + merge_1q = opts.get("merge_1q", "auto") + merge_2q = opts.get("merge_2q", "auto") qc = self._qibo_circuit_to_quimb( circuit, quimb_circuit_type=self.circuit_ansatz, gate_opts={"max_bond": self.max_bond_dimension, "cutoff": self.svd_cutoff}, + merge_1q=merge_1q, + merge_2q=merge_2q, ) all_terms = extract_gates_and_qubits(observable) @@ -599,6 +642,7 @@ def _expectation_parallel(self, circuit, observable, method, opts): n_workers=search_workers, slicing_opts=slicing_opts, trial_timeout=trial_timeout, + search_seed=search_seed, ) if tree is None: @@ -611,7 +655,8 @@ def _expectation_parallel(self, circuit, observable, method, opts): if self.backend == "torch": for tensor in tn.tensors: tensor._data = _torch_cpu_array( - tensor._data, dtype=torch.complex128 + tensor._data, + dtype=_torch_dtype(getattr(self, "dtype", "complex128")), ) val = complex( tn.contract( @@ -619,6 +664,7 @@ def _expectation_parallel(self, circuit, observable, method, opts): output_inds=(), optimize=tree, backend="torch", + implementation=torch_contract_implementation(self.backend), ) ) else: @@ -637,10 +683,10 @@ def _expectation_parallel(self, circuit, observable, method, opts): all_exp = comm.gather(my_exp, root=0) if rank == 0: total_exp = sum(all_exp) - return self.real(total_exp) + return _real_scalar(total_exp) return 0.0 - return self.real(my_exp) + return _real_scalar(my_exp) CLASSES_ROOTS = {"numpy": "Numpy", "torch": "PyTorch", "jax": "Jax"} @@ -701,3 +747,876 @@ def __getattr__(name): return BACKENDS[name] except KeyError: raise AttributeError(f"module {__name__!r} has no attribute {name!r}") from None + + +@dataclass(frozen=True) +class CircuitBuildResult: + quimb_circuit: object + build_seconds: float + + +@dataclass(frozen=True) +class ExpectationTN: + coeff: complex + factors: tuple + tn: object + quimb_circuit: object + build_seconds: float + tn_seconds: float + + +@dataclass(frozen=True) +class TreeSearchResult: + tree: object + seconds: float + costs: dict + stats: dict + + +@dataclass(frozen=True) +class QuimbTorchRunResult: + built: ExpectationTN + search: TreeSearchResult + value: complex + contract_seconds: float + + +@dataclass(frozen=True) +class QuimbCircuitStats: + build_seconds: float + num_gates: int + num_tensors: int + num_indices: int + + +@dataclass(frozen=True) +class QuimbTNProfile: + value: complex + build_seconds: float + expectation_tn_seconds: float + search_seconds: float + contract_seconds: float + circuit_num_gates: int + circuit_num_tensors: int + tn_num_tensors: int + tn_num_indices: int + tn_outer_indices: int + search_costs: dict + search_stats: dict + + +@dataclass(frozen=True) +class QuimbContractTiming: + implementation: str | None + sort_indices: bool + value: complex + best_seconds: float + mean_seconds: float + + +@dataclass(frozen=True) +class QuimbGateMergeComparison: + merge_stats: QuimbCircuitStats + nomerge_stats: QuimbCircuitStats + tensor_reduction: float + build_speedup: float + + +@dataclass(frozen=True) +class QuimbGateMergeExpectationComparison: + merge: QuimbTorchRunResult + nomerge: QuimbTorchRunResult + value_diff: float + total_speedup: float + build_speedup: float + tensor_reduction: float + + +def make_quimb_backend( + *, + quimb_backend="torch", + contraction_optimizer="auto-hq", + dtype="complex128", +): + backend = BACKENDS[quimb_backend]( + quimb_backend=quimb_backend, + contraction_optimizer=contraction_optimizer, + ) + backend.dtype = dtype + return backend + + +def torch_quimb_backend(dtype="complex128", contraction_optimizer="auto-hq"): + return make_quimb_backend( + quimb_backend="torch", + contraction_optimizer=contraction_optimizer, + dtype=dtype, + ) + + +def build_quimb_backend_circuit( + circuit, + *, + quimb_backend="torch", + ansatz="tn", + dtype="complex128", + max_bond=None, + cutoff=1e-12, + merge_1q="auto", + merge_2q="auto", + contraction_optimizer="auto-hq", +): + backend = make_quimb_backend( + quimb_backend=quimb_backend, + contraction_optimizer=contraction_optimizer, + dtype=dtype, + ) + start = time.perf_counter() + backend.configure_tn_simulation( + ansatz="mps" if ansatz == "mps" else None, + max_bond_dimension=max_bond, + svd_cutoff=cutoff, + ) + qc = backend._qibo_circuit_to_quimb( + circuit, + quimb_circuit_type=backend.circuit_ansatz, + gate_opts={"max_bond": max_bond, "cutoff": cutoff}, + dtype=dtype, + merge_1q=merge_1q, + merge_2q=merge_2q, + ) + return CircuitBuildResult(qc, time.perf_counter() - start) + + +def quimb_circuit_stats(quimb_circuit, build_seconds=0.0): + return QuimbCircuitStats( + build_seconds=float(build_seconds), + num_gates=int(getattr(quimb_circuit, "num_gates", 0)), + num_tensors=len(quimb_circuit.psi.tensor_map), + num_indices=len(quimb_circuit.psi.ind_map), + ) + + +def build_quimb_circuit_stats(circuit, **kwargs): + built = build_quimb_backend_circuit(circuit, **kwargs) + return quimb_circuit_stats(built.quimb_circuit, built.build_seconds) + + +def compare_quimb_gate_merge(circuit, **kwargs): + merge_kwargs = dict(kwargs) + nomerge_kwargs = dict(kwargs) + merge_kwargs.update({"merge_1q": True, "merge_2q": True}) + nomerge_kwargs.update({"merge_1q": False, "merge_2q": False}) + merge_stats = build_quimb_circuit_stats(circuit, **merge_kwargs) + nomerge_stats = build_quimb_circuit_stats(circuit, **nomerge_kwargs) + tensor_reduction = ( + float(nomerge_stats.num_tensors) / max(float(merge_stats.num_tensors), 1.0) + ) + build_speedup = ( + float(nomerge_stats.build_seconds) / max(float(merge_stats.build_seconds), 1e-15) + ) + return QuimbGateMergeComparison( + merge_stats=merge_stats, + nomerge_stats=nomerge_stats, + tensor_reduction=tensor_reduction, + build_speedup=build_speedup, + ) + + +def build_quimb_torch_circuit( + circuit, + *, + ansatz="tn", + dtype="complex128", + max_bond=None, + cutoff=1e-12, + merge_1q="auto", + merge_2q="auto", + contraction_optimizer="auto-hq", +): + return build_quimb_backend_circuit( + circuit, + quimb_backend="torch", + ansatz=ansatz, + dtype=dtype, + max_bond=max_bond, + cutoff=cutoff, + merge_1q=merge_1q, + merge_2q=merge_2q, + contraction_optimizer=contraction_optimizer, + ) + + +def qibo_circuit_to_quimb_torch( + circuit, + *, + ansatz="tn", + dtype="complex128", + max_bond=None, + cutoff=1e-12, + merge_1q="auto", + merge_2q="auto", + contraction_optimizer="auto-hq", +): + return build_quimb_torch_circuit( + circuit, + ansatz=ansatz, + dtype=dtype, + max_bond=max_bond, + cutoff=cutoff, + merge_1q=merge_1q, + merge_2q=merge_2q, + contraction_optimizer=contraction_optimizer, + ).quimb_circuit + + +def pauli_term_expectation_tn( + quimb_circuit, + factors, + *, + dtype="complex128", + simplify_sequence="ADCRS", + simplify_atol=1e-12, +): + if len(factors) > PAULI_DENSE_MAX_QUBITS: + tn = pauli_product_expectation_tn( + quimb_circuit, + factors, + simplify_sequence=simplify_sequence, + simplify_atol=simplify_atol, + ) + else: + op, where = _pauli_term_to_dense_operator(factors) + op = _torch_cpu_array(op, dtype=_torch_dtype(dtype)) + tn = quimb_circuit.local_expectation( + op, + where, + rehearse="tn", + simplify_sequence=simplify_sequence, + simplify_atol=simplify_atol, + ) + ensure_torch_tn(tn, dtype=dtype) + return tn + + +def build_expectation_tn( + circuit, + observable, + *, + term_index=0, + ansatz="tn", + dtype="complex128", + max_bond=None, + cutoff=1e-12, + merge_1q="auto", + merge_2q="auto", + contraction_optimizer="auto-hq", +): + terms = extract_gates_and_qubits(observable) + coeff, factors = terms[term_index] + built = build_quimb_torch_circuit( + circuit, + ansatz=ansatz, + dtype=dtype, + max_bond=max_bond, + cutoff=cutoff, + merge_1q=merge_1q, + merge_2q=merge_2q, + contraction_optimizer=contraction_optimizer, + ) + start = time.perf_counter() + tn = pauli_term_expectation_tn(built.quimb_circuit, factors, dtype=dtype) + return ExpectationTN( + coeff=coeff, + factors=tuple(factors), + tn=tn, + quimb_circuit=built.quimb_circuit, + build_seconds=built.build_seconds, + tn_seconds=time.perf_counter() - start, + ) + + +def ensure_torch_tn(tn, dtype="complex128"): + target_dtype = _torch_dtype(dtype) + for tensor in tn.tensors: + tensor._data = _torch_cpu_array(tensor._data, dtype=target_dtype) + return tn + + +def term_arrays(tn, dtype="complex128"): + return [_torch_cpu_array(array, dtype=_torch_dtype(dtype)) for array in tn.arrays] + + +def search_contraction_tree( + tn, + *, + method="processpool", + total_repeats=128, + max_time=60, + n_workers=4, + slicing_opts=None, + trial_timeout=None, + search_backend=None, + dask_address=None, + debug_trials=False, + dask_close_workers=False, + expected_workers=None, + search_seed=0, + sort_indices=False, + sort_priority="flops", + dtype="complex128", +): + start = time.perf_counter() + tree = parallel_path_search( + tn, + tn.outer_inds(), + method=method, + total_repeats=total_repeats, + max_time=max_time, + n_workers=n_workers, + slicing_opts=slicing_opts, + trial_timeout=trial_timeout, + search_backend=search_backend, + dask_address=dask_address, + debug_trials=debug_trials, + dask_close_workers=dask_close_workers, + expected_workers=expected_workers, + search_seed=search_seed, + ) + if sort_indices and hasattr(tree, "sort_contraction_indices"): + tree.sort_contraction_indices( + priority=sort_priority, + make_output_contig=True, + make_contracted_contig=True, + reset=True, + ) + costs = contraction_tree_costs( + tree, + dtype_bytes=8 if dtype in ("complex64", "single", np.complex64) else 16, + ) + return TreeSearchResult( + tree=tree, + seconds=time.perf_counter() - start, + costs=costs, + stats=getattr(tree, "qibotn_search_stats", {}) or {}, + ) + + +def sorted_tree(tree, enabled=True, priority="flops"): + work_tree = copy.deepcopy(tree) + if enabled and hasattr(work_tree, "sort_contraction_indices"): + work_tree.sort_contraction_indices( + priority=priority, + make_output_contig=True, + make_contracted_contig=True, + reset=True, + ) + return work_tree + + +def contract_tn( + tn, + tree, + *, + dtype="complex128", + backend="torch", + implementation=None, +): + if backend == "torch": + ensure_torch_tn(tn, dtype=dtype) + return tn.contract( + all, + output_inds=(), + optimize=tree, + backend=backend, + implementation=torch_contract_implementation(backend, implementation), + ) + + +def run_quimb_backend_expectation( + circuit, + observable, + *, + quimb_backend="torch", + ansatz="tn", + dtype="complex128", + max_bond=None, + cutoff=1e-12, + contraction_optimizer="auto-hq", +): + backend = make_quimb_backend( + quimb_backend=quimb_backend, + contraction_optimizer=contraction_optimizer, + dtype=dtype, + ) + backend.configure_tn_simulation( + ansatz="mps" if ansatz == "mps" else None, + max_bond_dimension=max_bond, + svd_cutoff=cutoff, + ) + start = time.perf_counter() + value = backend.expectation(circuit, observable) + return value, time.perf_counter() - start + + +def run_quimb_torch_expectation( + circuit, + observable, + *, + term_index=0, + ansatz="tn", + dtype="complex128", + max_bond=None, + cutoff=1e-12, + merge_1q="auto", + merge_2q="auto", + contraction_optimizer="auto-hq", + search_method="processpool", + total_repeats=128, + max_time=60, + n_workers=4, + slicing_opts=None, + trial_timeout=None, + search_backend=None, + dask_address=None, + debug_trials=False, + dask_close_workers=False, + expected_workers=None, + search_seed=0, + sort_indices=False, + sort_priority="flops", + contract_backend="torch", + contract_implementation=None, +): + built = build_expectation_tn( + circuit, + observable, + term_index=term_index, + ansatz=ansatz, + dtype=dtype, + max_bond=max_bond, + cutoff=cutoff, + merge_1q=merge_1q, + merge_2q=merge_2q, + contraction_optimizer=contraction_optimizer, + ) + search = search_contraction_tree( + built.tn, + method=search_method, + total_repeats=total_repeats, + max_time=max_time, + n_workers=n_workers, + slicing_opts=slicing_opts, + trial_timeout=trial_timeout, + search_backend=search_backend, + dask_address=dask_address, + debug_trials=debug_trials, + dask_close_workers=dask_close_workers, + expected_workers=expected_workers, + search_seed=search_seed, + sort_indices=sort_indices, + sort_priority=sort_priority, + dtype=dtype, + ) + start = time.perf_counter() + value = contract_tn( + built.tn, + search.tree, + dtype=dtype, + backend=contract_backend, + implementation=contract_implementation, + ) + return QuimbTorchRunResult( + built=built, + search=search, + value=built.coeff * complex(value), + contract_seconds=time.perf_counter() - start, + ) + + +def profile_quimb_torch_expectation(circuit, observable, **kwargs): + result = run_quimb_torch_expectation(circuit, observable, **kwargs) + return QuimbTNProfile( + value=result.value, + build_seconds=result.built.build_seconds, + expectation_tn_seconds=result.built.tn_seconds, + search_seconds=result.search.seconds, + contract_seconds=result.contract_seconds, + circuit_num_gates=int(result.built.quimb_circuit.num_gates), + circuit_num_tensors=len(result.built.quimb_circuit.psi.tensor_map), + tn_num_tensors=len(result.built.tn.tensor_map), + tn_num_indices=len(result.built.tn.ind_map), + tn_outer_indices=len(result.built.tn.outer_inds()), + search_costs=result.search.costs, + search_stats=result.search.stats, + ) + + +def compare_quimb_gate_merge_expectation(circuit, observable, **kwargs): + """Run the quimb+torch expectation pipeline with gate merging on and off. + + Each variant builds its own tensor network and contraction tree. Trees are + structure-specific, so callers should compare the returned ``merge`` and + ``nomerge`` results rather than reusing a tree between variants. + """ + merge_kwargs = dict(kwargs) + nomerge_kwargs = dict(kwargs) + merge_kwargs.update({"merge_1q": True, "merge_2q": True}) + nomerge_kwargs.update({"merge_1q": False, "merge_2q": False}) + nomerge = run_quimb_torch_expectation(circuit, observable, **nomerge_kwargs) + merge = run_quimb_torch_expectation(circuit, observable, **merge_kwargs) + + merge_total = ( + merge.built.build_seconds + merge.built.tn_seconds + merge.search.seconds + + merge.contract_seconds + ) + nomerge_total = ( + nomerge.built.build_seconds + nomerge.built.tn_seconds + + nomerge.search.seconds + nomerge.contract_seconds + ) + return QuimbGateMergeExpectationComparison( + merge=merge, + nomerge=nomerge, + value_diff=abs(merge.value - nomerge.value), + total_speedup=nomerge_total / max(merge_total, 1e-15), + build_speedup=nomerge.built.build_seconds / max(merge.built.build_seconds, 1e-15), + tensor_reduction=( + len(nomerge.built.quimb_circuit.psi.tensor_map) + / max(len(merge.built.quimb_circuit.psi.tensor_map), 1) + ), + ) + + +def time_quimb_contract_implementations( + expectation_tn, + tree, + *, + dtype="complex128", + implementations=("autoray", "cotengra"), + sort_options=(False, True), + repeats=3, +): + timings = [] + for sort_indices in sort_options: + work_tree = sorted_tree(tree, sort_indices) + for implementation in implementations: + value = None + samples = [] + for _ in range(repeats): + start = time.perf_counter() + value = contract_tn( + expectation_tn, + work_tree, + dtype=dtype, + implementation=implementation, + ) + samples.append(time.perf_counter() - start) + timings.append( + QuimbContractTiming( + implementation=implementation, + sort_indices=bool(sort_indices), + value=complex(value), + best_seconds=min(samples), + mean_seconds=sum(samples) / len(samples), + ) + ) + return tuple(timings) + + +def quimb_torch_parallel_opts( + *, + target_slices=None, + target_size=None, + search_workers=None, + torch_threads=1, + search_repeats=128, + search_time=60.0, + search_seed=0, + merge_gates=True, + search_backend="processpool", + dask_address=None, + dask_expected_workers=None, + dask_close_workers=False, + debug_trials=False, + search_only=False, + save_tree_path=None, + load_tree_path=None, + print_stats=False, +): + slicing_opts = {} + if target_slices is not None: + slicing_opts["target_slices"] = target_slices + if target_size is not None: + slicing_opts["target_size"] = target_size + + opts = { + "slicing_opts": slicing_opts or None, + "search_workers": search_workers or torch_threads, + "max_repeats": search_repeats, + "max_time": search_time, + "search_seed": search_seed, + "merge_1q": merge_gates, + "merge_2q": merge_gates, + "print_stats": print_stats, + } + if search_backend is not None: + opts["search_backend"] = search_backend + if dask_address is not None: + opts["dask_address"] = dask_address + if dask_expected_workers is not None: + opts["dask_expected_workers"] = dask_expected_workers + if dask_close_workers: + opts["dask_close_workers"] = True + if debug_trials: + opts["debug_trials"] = True + if search_only: + opts["search_only"] = True + opts["save_tree_path"] = save_tree_path + elif load_tree_path is not None: + opts["load_tree_path"] = load_tree_path + return opts + + +def load_custom_case_module(path): + """Load a user-provided Python module with circuit/observable builders.""" + path = Path(path).resolve() + spec = importlib.util.spec_from_file_location(path.stem, path) + if spec is None or spec.loader is None: + raise RuntimeError(f"Cannot import case module from {path}.") + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def _call_builder(fn, **kwargs): + sig = inspect.signature(fn) + if any(p.kind == p.VAR_KEYWORD for p in sig.parameters.values()): + return fn(**kwargs) + return fn(**{name: value for name, value in kwargs.items() if name in sig.parameters}) + + +def load_custom_observable( + module, + *, + nqubits, + nlayers=0, + seed=42, + pauli_pattern=None, + observable_json=None, +): + """Load an observable from a custom module, JSON file, or Pauli pattern.""" + if pauli_pattern: + return {"pauli_string_pattern": pauli_pattern} + if observable_json: + with Path(observable_json).open(encoding="utf-8") as f: + return json.load(f) + if hasattr(module, "build_observable"): + return _call_builder( + module.build_observable, + nqubits=nqubits, + nlayers=nlayers, + seed=seed, + ) + if hasattr(module, "OBSERVABLE"): + return module.OBSERVABLE + raise ValueError( + "No observable supplied. Define build_observable/OBSERVABLE in the case " + "module, or pass pauli_pattern / observable_json." + ) + + +def run_custom_tn_expectation( + case_module, + *, + nqubits, + nlayers=0, + seed=42, + observable=None, + pauli_pattern=None, + observable_json=None, + mpi=False, + exact=False, + exact_max_qubits=24, + bond=1024, + cut_ratio=1e-12, + torch_threads=8, + quimb_backend="torch", + dtype="complex128", + parallel_opts=None, +): + """Run a quimb+torch TN expectation for a custom circuit module.""" + from qibotn.expectation_runner import ( + ExpectationConfig, + exact_for_observable, + run_cpu_expectation, + ) + + module = load_custom_case_module(case_module) + if not hasattr(module, "build_circuit"): + raise ValueError("case_module must define build_circuit.") + + circuit = _call_builder( + module.build_circuit, + nqubits=nqubits, + nlayers=nlayers, + seed=seed, + ) + if observable is None: + observable = load_custom_observable( + module, + nqubits=nqubits, + nlayers=nlayers, + seed=seed, + pauli_pattern=pauli_pattern, + observable_json=observable_json, + ) + + rank = 0 + if mpi: + from mpi4py import MPI + + rank = MPI.COMM_WORLD.Get_rank() + + exact_value = None + if exact and rank == 0: + if nqubits > exact_max_qubits: + raise ValueError(f"exact reference is limited to {exact_max_qubits} qubits.") + exact_value = exact_for_observable(circuit, observable, nqubits) + + config = ExpectationConfig( + ansatz="tn", + mpi=mpi, + bond=bond, + cut_ratio=cut_ratio, + tensor_module="torch", + quimb_backend=quimb_backend, + dtype=dtype, + torch_threads=torch_threads, + parallel_opts=parallel_opts or {}, + ) + result = run_cpu_expectation(circuit, observable, config) + if mpi and result.rank != 0: + return None + return { + "circuit": circuit, + "observable": observable, + "exact": exact_value, + "result": result, + "abs_error": None if exact_value is None else abs(result.value - exact_value), + "rel_error": ( + None + if exact_value is None + else abs(result.value - exact_value) / max(abs(exact_value), 1e-15) + ), + } + + +def run_contest_tn_case( + case_name, + obs_name, + *, + mode="contract", + tree_dir="trees/contest_tn", + nqubits=None, + nlayers=None, + seed=None, + mpi=False, + exact=False, + exact_max_qubits=24, + bond=1024, + cut_ratio=1e-12, + torch_threads=8, + quimb_backend="torch", + dtype="complex64", + target_slices=None, + target_size=2**34, + search_workers=None, + search_repeats=2048, + search_time=300.0, + search_seed=0, + merge_gates=True, + search_backend="dask", + dask_address=None, + dask_expected_workers=None, + dask_close_workers=False, + debug_trials=False, +): + """Run one shared contest-style quimb+torch TN search/contract case.""" + from qibotn.contest_cases import CASES, build_contest_circuit, build_contest_observable, tree_path + from qibotn.expectation_runner import ( + ExpectationConfig, + exact_for_observable, + run_cpu_expectation, + ) + + case = CASES[case_name] + nqubits = case.nqubits if nqubits is None else nqubits + nlayers = case.nlayers if nlayers is None else nlayers + seed = case.seed if seed is None else seed + target_slices = case.target_slices if target_slices is None else target_slices + + circuit = build_contest_circuit(case.circuit_kind, nqubits, nlayers, seed) + observable = build_contest_observable(obs_name, nqubits, seed) + path = tree_path(tree_dir, case_name, obs_name, nqubits, nlayers, target_slices, merge_gates) + path.parent.mkdir(parents=True, exist_ok=True) + if mode == "contract" and not path.exists(): + raise FileNotFoundError(f"Missing tree file: {path}. Run search first.") + + rank = 0 + if mpi: + from mpi4py import MPI + + rank = MPI.COMM_WORLD.Get_rank() + + exact_value = None + if exact and rank == 0 and mode != "search": + if nqubits > exact_max_qubits: + raise ValueError(f"exact reference is limited to {exact_max_qubits} qubits.") + exact_value = exact_for_observable(circuit, observable, nqubits) + + config = ExpectationConfig( + ansatz="tn", + mpi=mpi, + bond=bond, + cut_ratio=cut_ratio, + tensor_module="torch", + quimb_backend=quimb_backend, + dtype=dtype, + torch_threads=torch_threads, + parallel_opts=quimb_torch_parallel_opts( + target_slices=target_slices, + target_size=target_size, + search_workers=search_workers, + torch_threads=torch_threads, + search_repeats=search_repeats, + search_time=search_time, + search_seed=search_seed, + merge_gates=merge_gates, + search_backend=search_backend, + dask_address=dask_address, + dask_expected_workers=dask_expected_workers, + dask_close_workers=dask_close_workers, + debug_trials=debug_trials, + search_only=(mode == "search"), + save_tree_path=str(path), + load_tree_path=str(path), + print_stats=False, + ), + ) + result = run_cpu_expectation(circuit, observable, config) + if mpi and result.rank != 0: + return None + return { + "case": case, + "tree_path": path, + "circuit": circuit, + "observable": observable, + "exact": exact_value, + "result": result, + "abs_error": None if exact_value is None else abs(result.value - exact_value), + "rel_error": ( + None + if exact_value is None + else abs(result.value - exact_value) / max(abs(exact_value), 1e-15) + ), + } diff --git a/src/qibotn/backends/vidal.py b/src/qibotn/backends/vidal.py index 8fbd4d5..70b57b8 100644 --- a/src/qibotn/backends/vidal.py +++ b/src/qibotn/backends/vidal.py @@ -9,6 +9,7 @@ usable while the fast path is expanded. from __future__ import annotations import re +import time from dataclasses import dataclass import numpy as np @@ -475,3 +476,511 @@ class VidalBackend(QibotnBackend, NumpyBackend): return_array=return_array, **prob_kwargs, ) + + +@dataclass(frozen=True) +class VidalExpectationResult: + value: float + seconds: float + backend: object + + +@dataclass(frozen=True) +class VidalBackendComparisonResult: + circuit: object + observable: object + exact: float | None + qmatchatea: VidalExpectationResult | None + vidal: VidalExpectationResult + qmatchatea_error: float | None + vidal_error: float | None + + +@dataclass(frozen=True) +class VidalProfileResult: + value: float + trace_path: object + table_path: object + table: str + + +def make_vidal_backend( + *, + bond=10, + cut_ratio=1e-9, + tensor_module="torch", + compile_circuit=False, + mpi_approach="SR", + mpi_num_procs=1, + mpi_where_barriers=-1, + mpi_isometrization=-1, + mpi_term_batch_size=None, + fallback=True, +): + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=bond, + cut_ratio=cut_ratio, + tensor_module=tensor_module, + compile_circuit=compile_circuit, + mpi_approach=mpi_approach, + mpi_num_procs=mpi_num_procs, + mpi_where_barriers=mpi_where_barriers, + mpi_isometrization=mpi_isometrization, + mpi_term_batch_size=mpi_term_batch_size, + fallback=fallback, + ) + return backend + + +def run_vidal_expectation( + circuit, + observable, + *, + bond=10, + cut_ratio=1e-9, + tensor_module="torch", + compile_circuit=False, + preprocess=True, + mpi_approach="SR", + mpi_num_procs=1, + mpi_where_barriers=-1, + mpi_isometrization=-1, + mpi_term_batch_size=None, + fallback=True, +): + backend = make_vidal_backend( + bond=bond, + cut_ratio=cut_ratio, + tensor_module=tensor_module, + compile_circuit=compile_circuit, + mpi_approach=mpi_approach, + mpi_num_procs=mpi_num_procs, + mpi_where_barriers=mpi_where_barriers, + mpi_isometrization=mpi_isometrization, + mpi_term_batch_size=mpi_term_batch_size, + fallback=fallback, + ) + start = time.perf_counter() + value = backend.expectation( + circuit, + observable, + preprocess=preprocess, + compile_circuit=compile_circuit, + ) + return VidalExpectationResult( + value=float(np.real(value)), + seconds=time.perf_counter() - start, + backend=backend, + ) + + +def run_segmented_vidal_ring_xz( + circuit, + *, + max_bond=10, + cut_ratio=1e-9, + tensor_module="torch", + comm, +): + from qibotn.backends.vidal_mpi_segment import run_segment_vidal_mpi_ring_xz + + start = time.perf_counter() + value, timings = run_segment_vidal_mpi_ring_xz( + circuit, + max_bond=max_bond, + cut_ratio=cut_ratio, + tensor_module=tensor_module, + comm=comm, + ) + return VidalExpectationResult( + value=float(np.real(value)), + seconds=time.perf_counter() - start, + backend=timings, + ) + + +def compare_vidal_backend_qmatchatea( + circuit, + observable, + *, + bond=512, + cut_ratio=1e-12, + tensor_module="torch", + exact=None, + skip_qmatchatea=False, + qmatchatea_compile_circuit=True, + qmatchatea_svd_control="E!", + vidal_compile_circuit=True, + vidal_fallback=True, +): + qmatchatea_result = None + if not skip_qmatchatea: + qmatchatea_backend = QMatchaTeaBackend() + qmatchatea_backend.configure_tn_simulation( + ansatz="MPS", + max_bond_dimension=bond, + cut_ratio=cut_ratio, + svd_control=qmatchatea_svd_control, + tensor_module=tensor_module, + compile_circuit=qmatchatea_compile_circuit, + track_memory=False, + ) + start = time.perf_counter() + qmatchatea_value = qmatchatea_backend.expectation( + circuit, + observable, + preprocess=False, + compile_circuit=qmatchatea_compile_circuit, + ) + qmatchatea_result = VidalExpectationResult( + value=float(np.real(qmatchatea_value)), + seconds=time.perf_counter() - start, + backend=qmatchatea_backend, + ) + + vidal_backend = VidalBackend() + vidal_backend.configure_tn_simulation( + ansatz="MPS", + max_bond_dimension=bond, + cut_ratio=cut_ratio, + tensor_module=tensor_module, + compile_circuit=vidal_compile_circuit, + fallback=vidal_fallback, + ) + start = time.perf_counter() + vidal_value = vidal_backend.expectation( + circuit, + observable, + preprocess=False, + compile_circuit=vidal_compile_circuit, + ) + vidal_result = VidalExpectationResult( + value=float(np.real(vidal_value)), + seconds=time.perf_counter() - start, + backend=vidal_backend, + ) + + qmatchatea_error = None + vidal_error = None + if exact is not None: + if qmatchatea_result is not None: + qmatchatea_error = abs(qmatchatea_result.value - exact) + vidal_error = abs(vidal_result.value - exact) + + return VidalBackendComparisonResult( + circuit=circuit, + observable=observable, + exact=exact, + qmatchatea=qmatchatea_result, + vidal=vidal_result, + qmatchatea_error=qmatchatea_error, + vidal_error=vidal_error, + ) + + +def profile_vidal_expectation( + circuit, + observable, + *, + bond=512, + cut_ratio=1e-12, + torch_threads=32, + trace_path, + table_path, + profile_memory=False, + rows=60, +): + import torch + from torch.profiler import ProfilerActivity, profile + + from qibotn.expectation_runner import ExpectationConfig, run_cpu_expectation + + torch.set_num_threads(torch_threads) + config = ExpectationConfig( + ansatz="mps", + bond=bond, + cut_ratio=cut_ratio, + tensor_module="torch", + torch_threads=torch_threads, + ) + + with profile( + activities=[ProfilerActivity.CPU], + record_shapes=profile_memory, + profile_memory=profile_memory, + with_stack=profile_memory, + ) as prof: + result = run_cpu_expectation(circuit, observable, config) + + table = ( + f"expval={result.value:.16e}\n\n" + f"# sorted by self_cpu_time_total\n" + f"{prof.key_averages().table(sort_by='self_cpu_time_total', row_limit=rows)}\n\n" + f"# sorted by cpu_time_total\n" + f"{prof.key_averages().table(sort_by='cpu_time_total', row_limit=rows)}\n" + ) + table_path.parent.mkdir(parents=True, exist_ok=True) + table_path.write_text(table, encoding="utf-8") + prof.export_chrome_trace(str(trace_path)) + return VidalProfileResult( + value=result.value, + trace_path=trace_path, + table_path=table_path, + table=table, + ) + + +CONTEST_MPS_BONDS = {"main1": 512, "main2": 1024, "strong": 2048} +CONTEST_VIDAL_OBSERVABLES = ( + "boundary_ZZ_q1", + "boundary_ZZ_q2", + "boundary_ZZ_q3", + "long_Z_5_sites", + "mixed_XZYZX", + "ring_xz", + "open_zz", + "range2_xx", + "complex_iZ0", + "dense2_mid", + "dense3_spread", +) + + +def run_contest_mps_case( + case_name="main1", + *, + observables=None, + obs_filter="", + nqubits=None, + nlayers=None, + bond="case-default", + cut_ratio=1e-12, + seed=None, + torch_threads=8, + exact=False, + exact_max_qubits=24, +): + """Run a shared contest-style Vidal/MPS expectation case.""" + from qibotn.contest_cases import CASES, build_contest_circuit, build_contest_observable + from qibotn.expectation_runner import exact_for_observable + from qibotn.torch_utils import set_torch_threads + + from mpi4py import MPI + + set_torch_threads(torch_threads) + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() + case = CASES[case_name] + nqubits = case.nqubits if nqubits is None else nqubits + nlayers = case.nlayers if nlayers is None else nlayers + seed = case.seed if seed is None else seed + if bond == "case-default": + bond = CONTEST_MPS_BONDS.get(case_name, 1024) + if observables is None: + observables = tuple(x.strip() for x in obs_filter.split(",") if x.strip()) or case.observables + + circuit = build_contest_circuit(case.circuit_kind, nqubits, nlayers, seed) + records = [] + for obs_name in observables: + observable = build_contest_observable(obs_name, nqubits, seed) + exact_value = None + if exact and rank == 0: + if nqubits > exact_max_qubits: + raise ValueError(f"exact reference is limited to {exact_max_qubits} qubits.") + exact_value = exact_for_observable(circuit, observable, nqubits) + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=bond, + cut_ratio=cut_ratio, + tensor_module="torch", + mpi_approach="CT", + mpi_num_procs=size, + fallback=False, + ) + + comm.Barrier() + start = time.perf_counter() + value = backend.expectation( + circuit, + observable, + preprocess=True, + compile_circuit=False, + ) + seconds = time.perf_counter() - start + if rank == 0: + records.append( + { + "case": case, + "observable": obs_name, + "value": value, + "seconds": seconds, + "exact": exact_value, + "abs_error": None if exact_value is None else abs(value - exact_value), + "rel_error": ( + None + if exact_value is None + else abs(value - exact_value) / max(abs(exact_value), 1e-15) + ), + "truncation_error": backend.last_truncation_error, + "max_truncation_error": backend.last_max_truncation_error, + } + ) + return records + + +def run_vidal_mpi_contest_case( + *, + label, + kind, + nqubits, + nlayers, + bond, + cut_ratio, + seed, + torch_threads, + obs_filter="", +): + """Run the direct Vidal MPI contest observable sweep.""" + from qibotn.contest_cases import build_contest_circuit, build_contest_observable + from qibotn.torch_utils import set_torch_threads + + from mpi4py import MPI + + del label + set_torch_threads(torch_threads) + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() + circuit = build_contest_circuit(kind, nqubits, nlayers, seed) + names = CONTEST_VIDAL_OBSERVABLES + if obs_filter: + wanted = set(obs_filter.split(",")) + names = tuple(name for name in names if name in wanted) + if not names: + raise ValueError(f"obs_filter matched no observables: {obs_filter!r}") + + records = [] + for obs_name in names: + observable = build_contest_observable(obs_name, nqubits, seed) + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=bond, + cut_ratio=cut_ratio, + tensor_module="torch", + mpi_approach="CT", + mpi_num_procs=size, + fallback=False, + ) + comm.Barrier() + start = time.perf_counter() + value = backend.expectation( + circuit, + observable, + preprocess=True, + compile_circuit=False, + ) + seconds = time.perf_counter() - start + if rank == 0: + records.append( + { + "observable": obs_name, + "value": value, + "seconds": seconds, + "truncation_error": backend.last_truncation_error, + "max_truncation_error": backend.last_max_truncation_error, + } + ) + return records + + +def build_vidal_validation_circuit(kind, nqubits, nlayers, seed): + """Build the circuit family used by Vidal correctness checks.""" + from qibotn.benchmark_cases import build_circuit + + aliases = {"brickwall": "brickwall_cnot"} + return build_circuit(aliases.get(kind, kind), nqubits, nlayers, seed) + + +def run_vidal_validation_cases( + *, + nqubits=16, + nlayers=6, + bond=512, + seed=42, + tensor_module="torch", + torch_threads=32, + mpi=False, + circuits=("brickwall", "reversed_cnot", "rx_ry_cz"), + observables=("ring_xz", "open_zz", "mixed_local"), +): + """Run Vidal/TEBD correctness checks against dense statevector references.""" + from qibotn.benchmark_cases import exact_pauli_sum, observable_terms + from qibotn.backends.vidal_tebd import VidalTEBDExecutor + from qibotn.torch_utils import set_torch_threads + + set_torch_threads(torch_threads) + comm = None + rank = 0 + if mpi: + from mpi4py import MPI + + from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor + + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + else: + SegmentVidalMPIExecutor = None + + records = [] + for circuit_kind in circuits: + circuit = build_vidal_validation_circuit(circuit_kind, nqubits, nlayers, seed) + if rank == 0: + exact_values = { + obs: exact_pauli_sum(circuit, observable_terms(obs, nqubits), nqubits) + for obs in observables + } + else: + exact_values = None + if comm is not None: + exact_values = comm.bcast(exact_values, root=0) + + for obs_kind in observables: + terms = observable_terms(obs_kind, nqubits) + start = time.perf_counter() + if mpi: + executor = SegmentVidalMPIExecutor( + nqubits=nqubits, + max_bond=bond, + cut_ratio=1e-12, + tensor_module=tensor_module, + comm=comm, + ) + executor.run_circuit(circuit) + value = executor.expectation_pauli_sum_root(terms) + else: + executor = VidalTEBDExecutor( + nqubits=nqubits, + max_bond=bond, + cut_ratio=1e-12, + tensor_module=tensor_module, + ) + executor.run_circuit(circuit) + value = float(executor.expectation_pauli_sum(terms)) + if rank != 0: + continue + seconds = time.perf_counter() - start + exact = exact_values[obs_kind] + records.append( + { + "circuit": circuit_kind, + "observable": obs_kind, + "exact": exact, + "value": value, + "abs_error": abs(value - exact), + "seconds": seconds, + } + ) + return records diff --git a/src/qibotn/benchmark_cases.py b/src/qibotn/benchmark_cases.py index cee08dc..e3c3d25 100644 --- a/src/qibotn/benchmark_cases.py +++ b/src/qibotn/benchmark_cases.py @@ -12,6 +12,7 @@ CIRCUITS = ( "brickwall_cnot", "reversed_cnot", "shifted_cz", + "rx_ry_cz", "rxx_rzz", "swap_scramble", "ghz_ladder", @@ -49,14 +50,14 @@ def build_circuit(kind, nqubits, nlayers, seed): for qubit in range(nqubits): circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi))) circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi))) - if kind in ("rxx_rzz", "swap_scramble"): + if kind in ("rx_ry_cz", "rxx_rzz", "swap_scramble"): circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi))) if kind == "brickwall_cnot": add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=False) elif kind == "reversed_cnot": add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=True) - elif kind == "shifted_cz": + elif kind in ("shifted_cz", "rx_ry_cz"): for qubit in range(layer % 2, nqubits - 1, 2): circuit.add(gates.CZ(qubit, qubit + 1)) elif kind == "rxx_rzz": @@ -149,3 +150,22 @@ def exact_pauli_sum(circuit, terms, nqubits): raise ValueError(f"Unsupported Pauli {name!r}.") value += coeff * np.vdot(state[flipped], phase * state) return float(value.real) + + +def ring_xz_statevector_expectation(state, nqubits, chunk_size=1 << 20): + """Compute ``0.5 * sum_i X_i Z_(i+1)`` from a dense state vector.""" + state = np.asarray(state).reshape(-1) + value = 0.0 + for qubit in range(nqubits): + next_qubit = (qubit + 1) % nqubits + x_flip = 1 << (nqubits - 1 - qubit) + z_shift = nqubits - 1 - next_qubit + term = 0.0 + for start in range(0, state.size, chunk_size): + stop = min(start + chunk_size, state.size) + indices = np.arange(start, stop, dtype=np.int64) + z_bit = (indices >> z_shift) & 1 + z_phase = 1 - 2 * z_bit + term += np.vdot(state[indices ^ x_flip], z_phase * state[start:stop]).real + value += 0.5 * term + return float(value) diff --git a/src/qibotn/circuit_convertor.py b/src/qibotn/circuit_convertor.py deleted file mode 100644 index 900cdf7..0000000 --- a/src/qibotn/circuit_convertor.py +++ /dev/null @@ -1,263 +0,0 @@ -import numpy as np - -try: - import cupy as cp -except ImportError: # pragma: no cover - exercised on CPU-only installations - cp = None - - -def _require_cupy(): - if cp is None: - raise ImportError( - "The cuQuantum circuit converter requires cupy. " - "Install the GPU dependencies or use the CPU backend." - ) - return cp - -# Reference: https://github.com/NVIDIA/cuQuantum/tree/main/python/samples/cutensornet/circuit_converter - - -class QiboCircuitToEinsum: - """Convert a circuit to a Tensor Network (TN) representation. - - The circuit is first processed to an intermediate form by grouping each gate matrix - with its corresponding qubit it is acting on to a list. It is then converted to an - equivalent TN expression through the class function state_vector_operands() - following the Einstein summation convention in the interleave format. - - See document for detail of the format: https://docs.nvidia.com/cuda/cuquantum/python/api/generated/cuquantum.contract.html - - The output is to be used by cuQuantum's contract() for computation of the - state vectors of the circuit. - """ - - def __init__(self, circuit, dtype="complex128"): - self.backend = _require_cupy() - self.dtype = getattr(self.backend, dtype) - self.init_basis_map(self.backend, dtype) - self.init_intermediate_circuit(circuit) - self.circuit = circuit - - def state_vector_operands(self): - """Create the operands for dense vector computation in the interleave - format. - - Returns: - Operands for the contraction in the interleave format. - """ - input_bitstring = "0" * len(self.active_qubits) - - input_operands = self._get_bitstring_tensors(input_bitstring) - - ( - mode_labels, - qubits_frontier, - next_frontier, - ) = self._init_mode_labels_from_qubits(self.active_qubits) - - gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands( - self.gate_tensors, qubits_frontier, next_frontier - ) - - operands = input_operands + gate_operands - mode_labels += gate_mode_labels - - out_list = [] - for key in qubits_frontier: - out_list.append(qubits_frontier[key]) - - operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y] - operand_exp_interleave.append(out_list) - return operand_exp_interleave - - def _init_mode_labels_from_qubits(self, qubits): - n = len(qubits) - frontier_dict = {q: i for i, q in enumerate(qubits)} - mode_labels = [[i] for i in range(n)] - return mode_labels, frontier_dict, n - - def _get_bitstring_tensors(self, bitstring): - return [self.basis_map[ibit] for ibit in bitstring] - - def _parse_gates_to_mode_labels_operands( - self, gates, qubits_frontier, next_frontier - ): - mode_labels = [] - operands = [] - - for tensor, gate_qubits in gates: - operands.append(tensor) - input_mode_labels = [] - output_mode_labels = [] - for q in gate_qubits: - input_mode_labels.append(qubits_frontier[q]) - output_mode_labels.append(next_frontier) - qubits_frontier[q] = next_frontier - next_frontier += 1 - mode_labels.append(output_mode_labels + input_mode_labels) - return mode_labels, operands - - def op_shape_from_qubits(self, nqubits): - """Modify tensor to cuQuantum shape. - - Parameters: - nqubits (int): The number of qubits in quantum circuit. - - Returns: - (qubit_states,input_output) * nqubits - """ - return (2, 2) * nqubits - - def init_intermediate_circuit(self, circuit): - """Initialize the intermediate circuit representation. - - This method initializes the intermediate circuit representation by extracting gate matrices and qubit IDs - from the given quantum circuit. - - Parameters: - circuit (object): The quantum circuit object. - """ - self.gate_tensors = [] - gates_qubits = [] - - for gate in circuit.queue: - gate_qubits = gate.control_qubits + gate.target_qubits - gates_qubits.extend(gate_qubits) - - # self.gate_tensors is to extract into a list the gate matrix together with the qubit id that it is acting on - # https://github.com/NVIDIA/cuQuantum/blob/6b6339358f859ea930907b79854b90b2db71ab92/python/cuquantum/cutensornet/_internal/circuit_parser_utils_cirq.py#L32 - required_shape = self.op_shape_from_qubits(len(gate_qubits)) - self.gate_tensors.append( - ( - self.backend.asarray(gate.matrix(), dtype=self.dtype).reshape( - required_shape - ), - gate_qubits, - ) - ) - - # self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit. - self.active_qubits = np.unique(gates_qubits) - - def init_basis_map(self, backend, dtype): - """Initialize the basis map for the quantum circuit. - - This method initializes a basis map for the quantum circuit, which maps binary - strings representing qubit states to their corresponding quantum state vectors. - - Parameters: - backend (object): The backend object providing the array conversion method. - dtype (object): The data type for the quantum state vectors. - """ - asarray = backend.asarray - state_0 = asarray([1, 0], dtype=dtype) - state_1 = asarray([0, 1], dtype=dtype) - - self.basis_map = {"0": state_0, "1": state_1} - - def init_inverse_circuit(self, circuit): - """Initialize the inverse circuit representation. - - This method initializes the inverse circuit representation by extracting gate matrices and qubit IDs - from the given quantum circuit. - - Parameters: - circuit (object): The quantum circuit object. - """ - self.gate_tensors_inverse = [] - gates_qubits_inverse = [] - - for gate in circuit.queue: - gate_qubits = gate.control_qubits + gate.target_qubits - gates_qubits_inverse.extend(gate_qubits) - - # self.gate_tensors is to extract into a list the gate matrix together with the qubit id that it is acting on - # https://github.com/NVIDIA/cuQuantum/blob/6b6339358f859ea930907b79854b90b2db71ab92/python/cuquantum/cutensornet/_internal/circuit_parser_utils_cirq.py#L32 - required_shape = self.op_shape_from_qubits(len(gate_qubits)) - self.gate_tensors_inverse.append( - ( - self.backend.asarray(gate.matrix()).reshape(required_shape), - gate_qubits, - ) - ) - - # self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit. - self.active_qubits_inverse = np.unique(gates_qubits_inverse) - - def get_pauli_gates(self, pauli_map, dtype="complex128", backend=None): - """Populate the gates for all pauli operators. - - Parameters: - pauli_map: A dictionary mapping qubits to pauli operators. - dtype: Data type for the tensor operands. - backend: The package the tensor operands belong to. - - Returns: - A sequence of pauli gates. - """ - if backend is None: - backend = _require_cupy() - asarray = backend.asarray - pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype) - pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype) - pauli_y = asarray([[0, -1j], [1j, 0]], dtype=dtype) - pauli_z = asarray([[1, 0], [0, -1]], dtype=dtype) - - operand_map = {"I": pauli_i, "X": pauli_x, "Y": pauli_y, "Z": pauli_z} - gates = [] - for qubit, pauli_char in pauli_map.items(): - operand = operand_map.get(pauli_char) - if operand is None: - raise ValueError("pauli string character must be one of I/X/Y/Z") - gates.append((operand, (qubit,))) - return gates - - def expectation_operands(self, ham_gates): - """Create the operands for pauli string expectation computation in the - interleave format. - - Parameters: - ham_gates: A list of gates derived from Qibo hamiltonian object. - - Returns: - Operands for the contraction in the interleave format. - """ - input_bitstring = "0" * self.circuit.nqubits - - input_operands = self._get_bitstring_tensors(input_bitstring) - - ( - mode_labels, - qubits_frontier, - next_frontier, - ) = self._init_mode_labels_from_qubits(range(self.circuit.nqubits)) - - gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands( - self.gate_tensors, qubits_frontier, next_frontier - ) - - operands = input_operands + gate_operands - mode_labels += gate_mode_labels - - self.init_inverse_circuit(self.circuit.invert()) - - next_frontier = max(qubits_frontier.values()) + 1 - - gates_inverse = ham_gates + self.gate_tensors_inverse - - ( - gate_mode_labels_inverse, - gate_operands_inverse, - ) = self._parse_gates_to_mode_labels_operands( - gates_inverse, qubits_frontier, next_frontier - ) - mode_labels = ( - mode_labels - + gate_mode_labels_inverse - + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)] - ) - operands = operands + gate_operands_inverse + operands[: self.circuit.nqubits] - - operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y] - - return operand_exp_interleave diff --git a/src/qibotn/circuit_to_mps.py b/src/qibotn/circuit_to_mps.py deleted file mode 100644 index 48cf55d..0000000 --- a/src/qibotn/circuit_to_mps.py +++ /dev/null @@ -1,63 +0,0 @@ -import numpy as np - -from qibotn.circuit_convertor import QiboCircuitToEinsum -from qibotn.mps_utils import apply_gate, initial - -try: - import cupy as cp - import cuquantum.bindings.cutensornet as cutn -except ImportError: # pragma: no cover - exercised on CPU-only installations - cp = None - cutn = None - - -def _require_cuquantum(): - if cp is None or cutn is None: - raise ImportError( - "The cuQuantum MPS converter requires cupy and cuquantum. " - "Install the GPU dependencies or use the CPU backend." - ) - - -class QiboCircuitToMPS: - """A helper class to convert Qibo circuit to MPS. - - Parameters: - circ_qibo: The quantum circuit object. - gate_algo(dict): Dictionary for SVD and QR settings. - datatype (str): Either single ("complex64") or double (complex128) precision. - rand_seed(int): Seed for random number generator. - """ - - def __init__( - self, - circ_qibo, - gate_algo, - dtype="complex128", - rand_seed=0, - ): - _require_cuquantum() - np.random.seed(rand_seed) - cp.random.seed(rand_seed) - - self.num_qubits = circ_qibo.nqubits - self.handle = cutn.create() - self.dtype = dtype - self.mps_tensors = initial(self.num_qubits, dtype=dtype) - circuitconvertor = QiboCircuitToEinsum(circ_qibo, dtype=dtype) - - for gate, qubits in circuitconvertor.gate_tensors: - # mapping from qubits to qubit indices - # apply the gate in-place - apply_gate( - self.mps_tensors, - gate, - qubits, - algorithm=gate_algo, - options={"handle": self.handle}, - ) - - def __del__(self): - handle = getattr(self, "handle", None) - if cutn is not None and handle is not None: - cutn.destroy(handle) diff --git a/src/qibotn/contest_cases.py b/src/qibotn/contest_cases.py new file mode 100644 index 0000000..dfb7962 --- /dev/null +++ b/src/qibotn/contest_cases.py @@ -0,0 +1,241 @@ +"""Shared contest-style circuits and observables for qibotn tools.""" + +from __future__ import annotations + +import math +from dataclasses import dataclass +from pathlib import Path + +import numpy as np +from qibo import Circuit, gates, hamiltonians +from qibo.symbols import X, Y, Z +from qibotn.backends.quimb import quimb_torch_parallel_opts + + +@dataclass(frozen=True) +class CaseSpec: + circuit_kind: str + observables: tuple[str, ...] + nqubits: int + nlayers: int + seed: int + target_slices: int | None = None + + +CASES = { + "main1": CaseSpec( + circuit_kind="rxx_rzz_chain", + observables=("ring_xz",), + nqubits=37, + nlayers=20, + seed=31001, + target_slices=None, + ), + "main2": CaseSpec( + circuit_kind="scramble_chain", + observables=("open_zz", "range2_xx"), + nqubits=36, + nlayers=18, + seed=31002, + target_slices=None, + ), + "strong": CaseSpec( + circuit_kind="reversed_cnot", + observables=("ring_xz", "long_z_string"), + nqubits=40, + nlayers=24, + seed=41001, + target_slices=None, + ), +} + + +def _add_single_qubit_layer(circuit, nqubits, rng, include_rx=False): + for qubit in range(nqubits): + circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi))) + circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi))) + if include_rx: + circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi))) + + +def _add_brickwall(circuit, nqubits, gate, layer, reverse=False): + for qubit in range(0, nqubits - 1, 2): + if reverse and layer % 2: + circuit.add(gate(qubit + 1, qubit)) + else: + circuit.add(gate(qubit, qubit + 1)) + for qubit in range(1, nqubits - 1, 2): + if reverse and not layer % 2: + circuit.add(gate(qubit + 1, qubit)) + else: + circuit.add(gate(qubit, qubit + 1)) + + +def build_contest_circuit(kind, nqubits, nlayers, seed): + """Build one of the contest-style benchmark circuits.""" + rng = np.random.default_rng(seed) + circuit = Circuit(nqubits) + + if kind == "ghz_ladder": + circuit.add(gates.H(0)) + for qubit in range(nqubits - 1): + circuit.add(gates.CNOT(qubit, qubit + 1)) + return circuit + + for layer in range(nlayers): + if kind in {"brickwall_cnot", "reversed_cnot", "shifted_cz"}: + _add_single_qubit_layer(circuit, nqubits, rng) + elif kind in {"rxx_rzz", "swap_scramble"}: + _add_single_qubit_layer(circuit, nqubits, rng, include_rx=True) + elif kind in {"rxx_rzz_chain", "scramble_chain", "scramble"}: + _add_single_qubit_layer(circuit, nqubits, rng, include_rx=True) + else: + raise ValueError(f"Unknown circuit kind {kind!r}.") + + if kind == "brickwall_cnot": + _add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=False) + elif kind == "reversed_cnot": + _add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=True) + elif kind == "shifted_cz": + for qubit in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.CZ(qubit, qubit + 1)) + elif kind == "rxx_rzz": + for qubit in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7))) + circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7))) + elif kind == "swap_scramble": + for qubit in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.CZ(qubit, qubit + 1)) + if layer % 4 == 3: + circuit.add(gates.SWAP(qubit, qubit + 1)) + elif kind == "rxx_rzz_chain": + for qubit in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9))) + circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9))) + elif kind == "scramble_chain": + for qubit in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8))) + circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8))) + if layer % 5 == 4: + circuit.add(gates.SWAP(qubit, qubit + 1)) + elif kind == "scramble": + for qubit in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8))) + circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8))) + if layer % 5 == 4: + circuit.add(gates.SWAP(qubit, qubit + 1)) + + return circuit + + +def _dense_observable(nqubits, qubits, seed, dim): + del nqubits + rng = np.random.default_rng(seed) + raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim)) + matrix = (raw + raw.conj().T) / 2.0 + matrix = matrix / np.linalg.norm(matrix) + return {"matrix": matrix, "qubits": list(qubits)} + + +def build_contest_observable(kind, nqubits, seed=0): + """Build one of the shared contest observables.""" + q1 = nqubits // 4 + q2 = nqubits // 2 + q3 = (3 * nqubits) // 4 + last = nqubits - 1 + + if kind == "ring_xz": + form = 0 + for qubit in range(nqubits): + form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits) + return hamiltonians.SymbolicHamiltonian(form=form) + if kind == "open_zz": + form = 0 + for qubit in range(nqubits - 1): + form += (1.0 / max(1, nqubits - 1)) * Z(qubit) * Z(qubit + 1) + return hamiltonians.SymbolicHamiltonian(form=form) + if kind == "range2_xx": + form = 0 + for qubit in range(nqubits - 2): + form += (1.0 / max(1, nqubits - 2)) * X(qubit) * X(qubit + 2) + return hamiltonians.SymbolicHamiltonian(form=form) + if kind == "mixed_local": + form = 0.25 * X(0) - 0.5 * Z(last) + 0.125 * X(q1) * Z(q2) * Y(q3) + return hamiltonians.SymbolicHamiltonian(form=form) + if kind == "long_z_string": + stride = max(1, nqubits // 16) + form = None + for qubit in range(0, nqubits, stride): + form = Z(qubit) if form is None else form * Z(qubit) + return hamiltonians.SymbolicHamiltonian(form=form) + if kind == "boundary_ZZ_q1": + return hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1)) + if kind == "boundary_ZZ_q2": + return hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2)) + if kind == "boundary_ZZ_q3": + return hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3)) + if kind == "long_Z_5_sites": + return hamiltonians.SymbolicHamiltonian( + form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last) + ) + if kind == "mixed_XZYZX": + return hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last)) + if kind == "complex_iZ0": + return hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0)) + if kind == "dense2_mid": + return _dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4) + if kind == "dense3_spread": + return _dense_observable(nqubits, (q1, q2, q3), seed + 202, 8) + raise ValueError(f"Unknown observable kind {kind!r}.") + + +def tree_path(tree_dir, case_name, obs_name, nqubits, nlayers, target_slices, merge_gates=True): + slice_label = "auto" if target_slices is None else f"s{target_slices}" + merge_label = "merge" if merge_gates else "nomerge" + return ( + Path(tree_dir) + / f"{case_name}_{obs_name}_{nqubits}q{nlayers}l_{slice_label}_{merge_label}.pkl" + ) + + +def selected_observables(args, case): + if args.observables: + return tuple(args.observables) + if args.obs_filter: + return tuple(x.strip() for x in args.obs_filter.split(",") if x.strip()) + return case.observables + + +def apply_case_defaults(args): + case = CASES[args.case] + if args.nqubits is None: + args.nqubits = case.nqubits + if args.nlayers is None: + args.nlayers = case.nlayers + if args.seed is None: + args.seed = case.seed + if args.tn_target_slices is None: + args.tn_target_slices = case.target_slices + args.observables = selected_observables(args, case) + + +def build_parallel_opts(args, tree_file=None, search_only=False): + return quimb_torch_parallel_opts( + target_slices=args.tn_target_slices, + target_size=args.tn_target_size, + search_workers=args.tn_search_workers, + torch_threads=args.torch_threads, + search_repeats=args.tn_search_repeats, + search_time=args.tn_search_time, + search_seed=args.tn_search_seed, + merge_gates=args.merge_gates, + search_backend=args.tn_search_backend, + dask_address=args.dask_address, + dask_expected_workers=args.dask_expected_workers, + dask_close_workers=args.dask_close_workers, + debug_trials=args.tn_debug_trials, + search_only=search_only, + save_tree_path=str(tree_file) if tree_file is not None else None, + load_tree_path=str(tree_file) if tree_file is not None else None, + print_stats=False, + ) diff --git a/src/qibotn/eval.py b/src/qibotn/eval.py index f2fbf71..144e1f8 100644 --- a/src/qibotn/eval.py +++ b/src/qibotn/eval.py @@ -1,8 +1,10 @@ from mpi4py import MPI -from qibotn.circuit_convertor import QiboCircuitToEinsum -from qibotn.circuit_to_mps import QiboCircuitToMPS -from qibotn.mps_contraction_helper import MPSContractionHelper +from qibotn.backends.cutensornet_helpers import ( + MPSContractionHelper, + QiboCircuitToEinsum, + QiboCircuitToMPS, +) from qibotn.observables import ( build_observable, check_observable, diff --git a/src/qibotn/expectation_runner.py b/src/qibotn/expectation_runner.py index 9592974..59ef1b7 100644 --- a/src/qibotn/expectation_runner.py +++ b/src/qibotn/expectation_runner.py @@ -8,7 +8,15 @@ from dataclasses import dataclass import numpy as np from qibo.backends import construct_backend -from qibotn.benchmark_cases import exact_pauli_sum +from qibotn.benchmark_cases import ( + CIRCUITS, + OBSERVABLES, + build_circuit, + exact_pauli_sum, + observable_terms, + parse_names, + terms_to_dict, +) from qibotn.observables import check_observable @@ -77,6 +85,18 @@ class ExpectationResult: parallel_stats: list | None = None +@dataclass +class BenchmarkExpectationRecord: + circuit: str + observable: str + value: float + seconds: float + exact: float | None = None + abs_error: float | None = None + rel_error: float | None = None + parallel_stats: list | None = None + + def _config_from_kwargs(**kwargs): fields = ExpectationConfig.__dataclass_fields__ config_kwargs = {name: kwargs.pop(name) for name in list(kwargs) if name in fields} @@ -155,3 +175,148 @@ def mps_expectation(circuit, observable=None, *, return_result=False, **kwargs): return_result=return_result, **kwargs, ) + + +def cpu_benchmark_parallel_opts( + *, + target_slices=None, + target_size=2**32, + search_workers=None, + torch_threads=8, + search_repeats=128, + search_time=60.0, + search_backend="dask", + dask_address=None, + dask_close_workers=False, + save_tree_path=None, + load_tree_path=None, + search_only=False, + debug_trials=False, + contract_implementation=None, + print_stats=True, +): + """Build parallel TN options for the CPU expectation backend.""" + slicing_opts = {} + if target_slices is not None: + slicing_opts["target_slices"] = target_slices + if target_size is not None: + slicing_opts["target_size"] = target_size + + opts = { + "slicing_opts": slicing_opts or None, + "search_workers": search_workers or torch_threads, + "max_repeats": search_repeats, + "max_time": search_time, + "print_stats": print_stats, + } + if search_backend is not None: + opts["search_backend"] = search_backend + if dask_address is not None: + opts["dask_address"] = dask_address + if save_tree_path is not None: + opts["save_tree_path"] = save_tree_path + if load_tree_path is not None: + opts["load_tree_path"] = load_tree_path + if search_only: + opts["search_only"] = True + if debug_trials: + opts["debug_trials"] = True + if contract_implementation is not None: + opts["contract_implementation"] = contract_implementation + if dask_close_workers: + opts["dask_close_workers"] = True + return opts + + +def run_cpu_benchmark_cases( + *, + nqubits=40, + nlayers=30, + bond=1024, + cut_ratio=1e-12, + seed=42, + torch_threads=8, + quimb_backend="torch", + dtype="complex128", + ansatz="tn", + mpi=False, + exact=False, + exact_max_qubits=24, + circuits=("brickwall_cnot",), + observables=("ring_xz",), + pauli_pattern=None, + parallel_opts=None, +): + """Run the reusable CPU TN/MPS benchmark cases. + + This is the importable library entrypoint for reusable CPU benchmark cases. + """ + selected_circuits = parse_names(list(circuits), CIRCUITS, "circuits") + selected_observables = ( + [] + if pauli_pattern + else parse_names(list(observables), OBSERVABLES, "observables") + ) + + rank = 0 + if mpi: + from mpi4py import MPI + + rank = MPI.COMM_WORLD.Get_rank() + + config = ExpectationConfig( + ansatz=ansatz, + mpi=mpi, + bond=bond, + cut_ratio=cut_ratio, + tensor_module="torch", + quimb_backend=quimb_backend, + dtype=dtype, + torch_threads=torch_threads, + parallel_opts=parallel_opts or {}, + ) + + records = [] + for circuit_kind in selected_circuits: + circuit = build_circuit(circuit_kind, nqubits, nlayers, seed) + named_observables = ( + [(f"pattern:{pauli_pattern}", {"pauli_string_pattern": pauli_pattern})] + if pauli_pattern + else [ + (obs_kind, terms_to_dict(observable_terms(obs_kind, nqubits))) + for obs_kind in selected_observables + ] + ) + + for obs_name, observable in named_observables: + exact_value = None + if exact and rank == 0: + if nqubits > exact_max_qubits: + raise ValueError( + f"exact reference is limited to {exact_max_qubits} qubits." + ) + exact_value = exact_for_observable(circuit, observable, nqubits) + + result = run_cpu_expectation(circuit, observable, config) + if mpi and result.rank != 0: + continue + + abs_error = None if exact_value is None else abs(result.value - exact_value) + rel_error = ( + None + if exact_value is None + else abs_error / max(abs(exact_value), 1e-15) + ) + records.append( + BenchmarkExpectationRecord( + circuit=circuit_kind, + observable=obs_name, + value=result.value, + seconds=result.seconds, + exact=exact_value, + abs_error=abs_error, + rel_error=rel_error, + parallel_stats=result.parallel_stats, + ) + ) + return records diff --git a/src/qibotn/mps_contraction_helper.py b/src/qibotn/mps_contraction_helper.py deleted file mode 100644 index b44cfb2..0000000 --- a/src/qibotn/mps_contraction_helper.py +++ /dev/null @@ -1,131 +0,0 @@ -try: - from cuquantum.tensornet import contract, contract_path -except ImportError: # pragma: no cover - exercised on CPU-only installations - contract = None - contract_path = None - - -def _require_cuquantum(): - if contract is None or contract_path is None: - raise ImportError( - "The cuQuantum MPS contraction helper requires cuquantum. " - "Install the GPU dependencies or use the CPU backend." - ) - -# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb - - -class MPSContractionHelper: - """A helper class to compute various quantities for a given MPS. - - Interleaved format is used to construct the input args for `cuquantum.contract`. - - Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb - - The following compute quantities are supported: - - - the norm of the MPS. - - the equivalent state vector from the MPS. - - the expectation value for a given operator. - - the equivalent state vector after multiplying an MPO to an MPS. - - Parameters: - num_qubits: The number of qubits for the MPS. - """ - - def __init__(self, num_qubits): - self.num_qubits = num_qubits - self.bra_modes = [(2 * i, 2 * i + 1, 2 * i + 2) for i in range(num_qubits)] - offset = 2 * num_qubits + 1 - self.ket_modes = [ - (i + offset, 2 * i + 1, i + 1 + offset) for i in range(num_qubits) - ] - - def contract_norm(self, mps_tensors, options=None): - """Contract the corresponding tensor network to form the norm of the - MPS. - - Parameters: - mps_tensors: A list of rank-3 ndarray-like tensor objects. - The indices of the ith tensor are expected to be bonding index to the i-1 tensor, - the physical mode, and then the bonding index to the i+1th tensor. - options: Specify the contract and decompose options. - - Returns: - The norm of the MPS. - """ - interleaved_inputs = [] - for i, o in enumerate(mps_tensors): - interleaved_inputs.extend( - [o, self.bra_modes[i], o.conj(), self.ket_modes[i]] - ) - interleaved_inputs.append([]) # output - return self._contract(interleaved_inputs, options=options).real - - def contract_state_vector(self, mps_tensors, options=None): - """Contract the corresponding tensor network to form the state vector - representation of the MPS. - - Parameters: - mps_tensors: A list of rank-3 ndarray-like tensor objects. - The indices of the ith tensor are expected to be bonding index to the i-1 tensor, - the physical mode, and then the bonding index to the i+1th tensor. - options: Specify the contract and decompose options. - - Returns: - An ndarray-like object as the state vector. - """ - interleaved_inputs = [] - for i, o in enumerate(mps_tensors): - interleaved_inputs.extend([o, self.bra_modes[i]]) - output_modes = tuple([bra_modes[1] for bra_modes in self.bra_modes]) - interleaved_inputs.append(output_modes) # output - return self._contract(interleaved_inputs, options=options) - - def contract_expectation( - self, mps_tensors, operator, qubits, options=None, normalize=False - ): - """Contract the corresponding tensor network to form the expectation of - the MPS. - - Parameters: - mps_tensors: A list of rank-3 ndarray-like tensor objects. - The indices of the ith tensor are expected to be bonding index to the i-1 tensor, - the physical mode, and then the bonding index to the i+1th tensor. - operator: A ndarray-like tensor object. - The modes of the operator are expected to be output qubits followed by input qubits, e.g, - ``A, B, a, b`` where `a, b` denotes the inputs and `A, B'` denotes the outputs. - qubits: A sequence of integers specifying the qubits that the operator is acting on. - options: Specify the contract and decompose options. - normalize: Whether to scale the expectation value by the normalization factor. - - Returns: - An ndarray-like object as the state vector. - """ - - interleaved_inputs = [] - extra_mode = 3 * self.num_qubits + 2 - operator_modes = [None] * len(qubits) + [self.bra_modes[q][1] for q in qubits] - qubits = list(qubits) - for i, o in enumerate(mps_tensors): - interleaved_inputs.extend([o, self.bra_modes[i]]) - k_modes = self.ket_modes[i] - if i in qubits: - k_modes = (k_modes[0], extra_mode, k_modes[2]) - q = qubits.index(i) - operator_modes[q] = extra_mode # output modes - extra_mode += 1 - interleaved_inputs.extend([o.conj(), k_modes]) - interleaved_inputs.extend([operator, tuple(operator_modes)]) - interleaved_inputs.append([]) # output - if normalize: - norm = self.contract_norm(mps_tensors, options=options) - else: - norm = 1 - return self._contract(interleaved_inputs, options=options) / norm - - def _contract(self, interleaved_inputs, options=None): - _require_cuquantum() - path = contract_path(*interleaved_inputs, options=options)[0] - - return contract(*interleaved_inputs, options=options, optimize={"path": path}) diff --git a/src/qibotn/mps_utils.py b/src/qibotn/mps_utils.py deleted file mode 100644 index ff3d010..0000000 --- a/src/qibotn/mps_utils.py +++ /dev/null @@ -1,111 +0,0 @@ -try: - import cupy as cp - from cuquantum.tensornet import contract - from cuquantum.tensornet.experimental import contract_decompose -except ImportError: # pragma: no cover - exercised on CPU-only installations - cp = None - contract = None - contract_decompose = None - - -def _require_cuquantum(): - if cp is None or contract is None or contract_decompose is None: - raise ImportError( - "The cuQuantum MPS helpers require cupy and cuquantum. " - "Install the GPU dependencies or use the CPU backend." - ) - - -def initial(num_qubits, dtype): - r"""Generate the MPS with an initial state of :math:`\ket{00...00}` - - Parameters: - num_qubits: Number of qubits in the Quantum Circuit. - dtype: Either single ("complex64") or double (complex128) precision. - - Returns: - The initial MPS tensors. - """ - _require_cuquantum() - state_tensor = cp.asarray([1, 0], dtype=dtype).reshape(1, 2, 1) - mps_tensors = [state_tensor] * num_qubits - return mps_tensors - - -def mps_site_right_swap(mps_tensors, i, **kwargs): - """Perform the swap operation between the ith and i+1th MPS tensors. - - Parameters: - mps_tensors: Tensors representing MPS - i (int): index of the tensor to swap - - Returns: - The updated MPS tensors. - """ - _require_cuquantum() - # contraction followed by QR decomposition - a, _, b = contract_decompose( - "ipj,jqk->iqj,jpk", - *mps_tensors[i : i + 2], - algorithm=kwargs.get("algorithm", None), - options=kwargs.get("options", None), - ) - mps_tensors[i : i + 2] = (a, b) - return mps_tensors - - -def apply_gate(mps_tensors, gate, qubits, **kwargs): - """Apply the gate operand to the MPS tensors in-place. - - # Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb - - Parameters: - mps_tensors: A list of rank-3 ndarray-like tensor objects. - The indices of the ith tensor are expected to be the bonding index to the i-1 tensor, - the physical mode, and then the bonding index to the i+1th tensor. - gate: A ndarray-like tensor object representing the gate operand. - The modes of the gate is expected to be output qubits followed by input qubits, e.g, - ``A, B, a, b`` where ``a, b`` denotes the inputs and ``A, B`` denotes the outputs. - qubits: A sequence of integers denoting the qubits that the gate is applied onto. - algorithm: The contract and decompose algorithm to use for gate application. - Can be either a `dict` or a `ContractDecomposeAlgorithm`. - options: Specify the contract and decompose options. - - Returns: - The updated MPS tensors. - """ - - _require_cuquantum() - n_qubits = len(qubits) - if n_qubits == 1: - # single-qubit gate - i = qubits[0] - mps_tensors[i] = contract( - "ipj,qp->iqj", mps_tensors[i], gate, options=kwargs.get("options", None) - ) # in-place update - elif n_qubits == 2: - # two-qubit gate - i, j = qubits - if i > j: - # swap qubits order - return apply_gate(mps_tensors, gate.transpose(1, 0, 3, 2), (j, i), **kwargs) - elif i + 1 == j: - # two adjacent qubits - a, _, b = contract_decompose( - "ipj,jqk,rspq->irj,jsk", - *mps_tensors[i : i + 2], - gate, - algorithm=kwargs.get("algorithm", None), - options=kwargs.get("options", None), - ) - mps_tensors[i : i + 2] = (a, b) # in-place update - else: - # non-adjacent two-qubit gate - # step 1: swap i with i+1 - mps_site_right_swap(mps_tensors, i, **kwargs) - # step 2: apply gate to (i+1, j) pair. This amounts to a recursive swap until the two qubits are adjacent - apply_gate(mps_tensors, gate, (i + 1, j), **kwargs) - # step 3: swap back i and i+1 - mps_site_right_swap(mps_tensors, i, **kwargs) - else: - raise NotImplementedError("Only one- and two-qubit gates supported") diff --git a/src/qibotn/observables.py b/src/qibotn/observables.py index 7f3c242..b90a2da 100644 --- a/src/qibotn/observables.py +++ b/src/qibotn/observables.py @@ -35,7 +35,17 @@ def check_observable(observable, circuit_nqubit): if isinstance(observable, dict): return create_hamiltonian_from_dict(observable, circuit_nqubit) if isinstance(observable, hamiltonians.SymbolicHamiltonian): - return observable + if observable.nqubits == circuit_nqubit: + return observable + if observable.nqubits > circuit_nqubit: + raise ValueError( + "Observable has more qubits than the circuit: " + f"{observable.nqubits} > {circuit_nqubit}." + ) + return hamiltonians.SymbolicHamiltonian( + form=observable.form, + nqubits=circuit_nqubit, + ) try: return hamiltonians.SymbolicHamiltonian(form=observable) except Exception as exc: diff --git a/src/qibotn/parallel.py b/src/qibotn/parallel.py index 0fd577c..d7746b1 100644 --- a/src/qibotn/parallel.py +++ b/src/qibotn/parallel.py @@ -1,12 +1,16 @@ """Parallel path search and contraction utilities for tensor networks.""" +import importlib import os import pickle import signal import time -from math import log2, log10 -import numpy as np -from dataclasses import dataclass +from collections import Counter, defaultdict from concurrent.futures import ProcessPoolExecutor, TimeoutError, as_completed +from dataclasses import dataclass +from math import log2, log10 +from pathlib import Path + +import numpy as np try: from mpi4py import MPI @@ -40,6 +44,12 @@ def _optimizer_search_stats(opt): } +def _tree_search_stats(tree): + if tree is None: + return {} + return getattr(tree, "qibotn_search_stats", {}) or {} + + def _attach_search_stats(tree, opt): try: tree.qibotn_search_stats = _optimizer_search_stats(opt) @@ -48,6 +58,47 @@ def _attach_search_stats(tree, opt): return tree +def _search_seed_kwargs(optlib, seed): + if optlib == "random": + return {"seed": seed} + if optlib is None: + return {"sampler_opts": {"seed": seed}} + return {} + + +def _fallback_greedy_tree(tn, output_inds, slicing_opts=None, error=None): + import cotengra as ctg + + tree = tn.contraction_tree( + output_inds=output_inds, + optimize=ctg.GreedyOptimizer(), + ) + if slicing_opts: + target_size = slicing_opts.get("target_size") + target_slices = slicing_opts.get("target_slices") + if target_size is not None: + tree.slice_(target_size=target_size) + elif target_slices is not None: + tree.slice_(target_slices=target_slices) + try: + tree.qibotn_search_stats = { + "completed_trials": 0, + "finite_trials": 0, + "failed_trials": 0, + "requested_trials": 0, + "trial_seconds_sum": 0.0, + "best_score": float("nan"), + "best_flops": float("nan"), + "best_write": float("nan"), + "best_size": float("nan"), + "fallback": "greedy", + "fallback_error": repr(error) if error is not None else None, + } + except Exception: + pass + return tree + + def _dask_worker_slots(client): info = client.scheduler_info(n_workers=-1) workers = info.get("workers", {}) @@ -218,13 +269,18 @@ def _search_chunk( slicing_opts, optlib=None, ): - import random, cotengra as ctg + import random + import cotengra as ctg + + seed = int(seed) random.seed(seed) + np.random.seed(seed % (2**32)) tn = pickle.loads(tn_bytes) kwargs = {} if optlib is not None: kwargs["optlib"] = optlib + kwargs.update(_search_seed_kwargs(optlib, seed)) opt = ctg.HyperOptimizer( methods=SEARCH_METHODS, max_repeats=repeats, @@ -266,7 +322,15 @@ def _kill_pool(pool): pool.shutdown(wait=False) -def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts=None, trial_timeout=None): +def _serial_search( + tn_bytes, + output_inds, + repeats, + seed, + max_time, + slicing_opts=None, + trial_timeout=None, +): import time if trial_timeout is None: @@ -287,7 +351,13 @@ def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts= break timeout = min(trial_timeout, deadline - time.time()) pool = ProcessPoolExecutor(max_workers=1) - fut = pool.submit(_run_single_trial, tn_bytes, output_inds, seed * 10000 + i, slicing_opts) + fut = pool.submit( + _run_single_trial, + tn_bytes, + output_inds, + seed * 10000 + i, + slicing_opts, + ) try: cost, tree = fut.result(timeout=timeout) if cost < best_cost: @@ -304,15 +374,30 @@ def _split_repeats(total_repeats, n_workers): n_workers = max(1, int(n_workers)) total_repeats = max(1, int(total_repeats)) chunk, extra = divmod(total_repeats, n_workers) - return [chunk + (1 if i < extra else 0) for i in range(n_workers) if chunk + (1 if i < extra else 0) > 0] + return [ + chunk + (1 if i < extra else 0) + for i in range(n_workers) + if chunk + (1 if i < extra else 0) > 0 + ] -def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts=None, trial_timeout=None): +def _processpool_search( + tn, + output_inds, + total_repeats, + n_workers, + max_time, + slicing_opts=None, + trial_timeout=None, + search_seed=0, +): tn_bytes = pickle.dumps(tn) repeat_chunks = _split_repeats(total_repeats, n_workers) pool = ProcessPoolExecutor(max_workers=len(repeat_chunks)) futures = [] - for seed, repeats in enumerate(repeat_chunks): + errors = [] + for worker_id, repeats in enumerate(repeat_chunks): + seed = int(search_seed) + worker_id futures.append( pool.submit( _serial_search, @@ -334,14 +419,34 @@ def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, sli cost, tree = fut.result() if cost < best_cost: best_cost, best_tree = cost, tree - except Exception: - pass + except Exception as exc: + errors.append(repr(exc)) except TimeoutError: - pass + errors.append("TimeoutError()") finally: for fut in futures: fut.cancel() _kill_pool(pool) + if best_tree is None: + if errors: + print( + "qibotn_search_failed " + f"backend=processpool errors={errors[:3]} " + f"num_errors={len(errors)} fallback=greedy", + flush=True, + ) + else: + print( + "qibotn_search_failed " + "backend=processpool errors=[] fallback=greedy", + flush=True, + ) + return _fallback_greedy_tree( + tn, + output_inds, + slicing_opts=slicing_opts, + error=errors[:3], + ) return best_tree @@ -357,6 +462,7 @@ def _dask_search( debug_trials=False, close_workers=False, expected_workers=None, + search_seed=0, ): """Run one centralized cotengra hyper-optimizer over a dask pool. @@ -371,8 +477,14 @@ def _dask_search( "`pip install distributed` or the package extra that provides it." ) from exc + import random + import cotengra as ctg + search_seed = int(search_seed) + random.seed(search_seed) + np.random.seed(search_seed % (2**32)) + _patch_cotengra_dask_as_completed() _patch_cotengra_dask_submit(debug_trials=debug_trials) @@ -400,6 +512,7 @@ def _dask_search( kwargs = {} if optlib is not None: kwargs["optlib"] = optlib + kwargs.update(_search_seed_kwargs(optlib, search_seed)) retire_workers = [] try: @@ -470,10 +583,12 @@ def _mpi_search( dask_address=None, debug_trials=False, dask_close_workers=False, + search_seed=0, ): comm = MPI.COMM_WORLD rank, size = comm.Get_rank(), comm.Get_size() search_backend = search_backend or "processpool" + search_seed = int(search_seed) if search_backend == "dask": if not dask_address: @@ -496,6 +611,7 @@ def _mpi_search( n_workers=n_workers, debug_trials=debug_trials, close_workers=dask_close_workers, + search_seed=search_seed, ) payload = ("ok", tree) except Exception as exc: @@ -518,6 +634,7 @@ def _mpi_search( max_time, slicing_opts, trial_timeout, + search_seed=search_seed + rank * max(1, n_workers or 1), ) local_cost = local_tree.combo_cost(factor=256) if local_tree else float("inf") @@ -531,11 +648,22 @@ def _mpi_search( return comm.bcast(best_tree, root=0) -def parallel_path_search(tn, output_inds, method='processpool', total_repeats=1024, - max_time=300, n_workers=48, slicing_opts=None, - trial_timeout=None, search_backend=None, - dask_address=None, debug_trials=False, - dask_close_workers=False, expected_workers=None): +def parallel_path_search( + tn, + output_inds, + method="processpool", + total_repeats=1024, + max_time=300, + n_workers=48, + slicing_opts=None, + trial_timeout=None, + search_backend=None, + dask_address=None, + debug_trials=False, + dask_close_workers=False, + expected_workers=None, + search_seed=0, +): """Parallel contraction path search. Args: @@ -546,11 +674,32 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10 slicing_opts: cotengra slicing options for memory control trial_timeout: Per-trial timeout (seconds); kills and skips hung trials """ - if method == 'serial': + if method == "serial": tn_bytes = pickle.dumps(tn) - _, tree = _serial_search(tn_bytes, output_inds, total_repeats, 0, max_time, slicing_opts, trial_timeout) + try: + _, tree = _serial_search( + tn_bytes, + output_inds, + total_repeats, + search_seed, + max_time, + slicing_opts, + trial_timeout, + ) + except Exception as exc: + print( + "qibotn_search_failed " + f"backend=serial error={exc!r} fallback=greedy", + flush=True, + ) + return _fallback_greedy_tree( + tn, + output_inds, + slicing_opts=slicing_opts, + error=exc, + ) return tree - elif method == 'mpi': + if method == "mpi": if not _HAVE_MPI: raise ImportError("mpi4py not available") return _mpi_search( @@ -565,10 +714,20 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10 dask_address=dask_address, debug_trials=debug_trials, dask_close_workers=dask_close_workers, + search_seed=search_seed, ) - elif method == 'processpool': - return _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts, trial_timeout) - elif method == 'dask': + if method == "processpool": + return _processpool_search( + tn, + output_inds, + total_repeats, + n_workers, + max_time, + slicing_opts, + trial_timeout, + search_seed=search_seed, + ) + if method == "dask": return _dask_search( tn, output_inds, @@ -580,9 +739,9 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10 debug_trials=debug_trials, close_workers=dask_close_workers, expected_workers=expected_workers, + search_seed=search_seed, ) - else: - raise ValueError(f"Unknown method: {method}") + raise ValueError(f"Unknown method: {method}") def contraction_tree_costs(tree, dtype_bytes=16, combo_factor=256): @@ -615,6 +774,171 @@ def contraction_tree_costs(tree, dtype_bytes=16, combo_factor=256): } +def load_tree_payload(path, index=0): + with Path(path).open("rb") as f: + payload = pickle.load(f) + trees = payload["trees"] if isinstance(payload, dict) else payload + if not isinstance(trees, (list, tuple)): + trees = [trees] + return payload, trees[index] + + +def save_tree_payload(path, payload): + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("wb") as f: + pickle.dump(payload, f) + + +def slice_tree_payload(path, output_path, *, term=0, target_slices=2, max_repeats=64, seed=42): + payload, tree = load_tree_payload(path, index=term) + original_costs = contraction_tree_costs(tree) + sliced_tree = tree.slice( + target_slices=target_slices, + max_repeats=max_repeats, + seed=seed, + ) + sliced_costs = contraction_tree_costs(sliced_tree) + + if isinstance(payload, dict): + out_payload = dict(payload) + trees = payload["trees"] if isinstance(payload["trees"], (list, tuple)) else [payload["trees"]] + new_trees = list(trees) + new_trees[term] = sliced_tree + out_payload["trees"] = new_trees + out_payload["costs"] = [contraction_tree_costs(t) for t in new_trees] + out_payload["nterms"] = len(new_trees) + else: + trees = payload if isinstance(payload, (list, tuple)) else [payload] + new_trees = list(trees) + new_trees[term] = sliced_tree + out_payload = new_trees + + save_tree_payload(output_path, out_payload) + return TreePayloadSliceResult( + payload=payload, + tree=tree, + sliced_tree=sliced_tree, + original_costs=original_costs, + sliced_costs=sliced_costs, + ) + + +def _prod(values): + out = 1 + for value in values: + out *= int(value) + return out + + +def _broadcast_batch(a_batch, b_batch): + if a_batch == b_batch: + return _prod(a_batch) + if not a_batch: + return _prod(b_batch) + if not b_batch: + return _prod(a_batch) + ndim = max(len(a_batch), len(b_batch)) + a_batch = (1,) * (ndim - len(a_batch)) + tuple(a_batch) + b_batch = (1,) * (ndim - len(b_batch)) + tuple(b_batch) + return _prod(max(a, b) for a, b in zip(a_batch, b_batch)) + + +def analyze_contraction_tree(tree): + contract_mod = importlib.import_module("cotengra.contract") + contractions = contract_mod.extract_contractions(tree) + size_dict = tree.size_dict + ops = [] + counts = Counter() + + for op_index, (parent, left, right, tdot, arg, perm) in enumerate(contractions): + if left is None and right is None: + counts["preprocess"] += 1 + continue + + left_inds = tree.get_inds(left) + right_inds = tree.get_inds(right) + parent_inds = tree.get_inds(parent) + left_shape = tuple(size_dict[ix] for ix in left_inds) + right_shape = tuple(size_dict[ix] for ix in right_inds) + + if tdot: + parsed = contract_mod._parse_tensordot_axes_to_matmul( + arg, + left_shape, + right_shape, + ) + else: + parsed = contract_mod._parse_eq_to_batch_matmul( + arg, + left_shape, + right_shape, + ) + + ( + _eq_a, + _eq_b, + new_shape_a, + new_shape_b, + _new_shape_ab, + _perm_ab, + pure_multiplication, + ) = parsed + + matmul_shape = None + matmul_flops = 0 + if pure_multiplication: + kind = "mul" + else: + a_shape = tuple(new_shape_a or left_shape) + b_shape = tuple(new_shape_b or right_shape) + batch = _broadcast_batch(a_shape[:-2], b_shape[:-2]) + m, k, n = int(a_shape[-2]), int(a_shape[-1]), int(b_shape[-1]) + kind = "mm" if batch == 1 else "bmm" + matmul_shape = (batch, m, k, n) + matmul_flops = batch * m * k * n + + tree_flops = int(tree.get_flops(parent)) + out_size = int(tree.get_size(parent)) + ops.append( + ContractionOpInfo( + index=op_index, + kind=kind, + matmul_shape=matmul_shape, + matmul_flops=matmul_flops, + tree_flops=tree_flops, + out_size=out_size, + left_shape=left_shape, + right_shape=right_shape, + left_rank=len(left_inds), + right_rank=len(right_inds), + out_rank=len(parent_inds), + perm=perm, + ) + ) + counts[kind] += 1 + + nslices = int(getattr(tree, "multiplicity", 1)) + per_slice_flops = sum(op.tree_flops for op in ops) + per_slice_write = sum(op.out_size for op in ops) + max_out = max((op.out_size for op in ops), default=0) + dtype_bytes = 16 + return TreeInspectionResult( + tree=tree, + contractions=tuple(contractions), + operations=tuple(ops), + counts=dict(counts), + nslices=nslices, + per_slice_flops=per_slice_flops, + per_slice_write=per_slice_write, + max_output_size=max_out, + all_slice_flops=per_slice_flops * nslices, + all_slice_write=per_slice_write * nslices, + dtype_bytes=dtype_bytes, + max_output_gib=max_out * dtype_bytes / 1024**3, + ) + + @dataclass(frozen=True) class SlicePlan: """Slice ownership for one MPI rank.""" @@ -637,6 +961,49 @@ class SlicedContractStats: assignment: str +@dataclass(frozen=True) +class TreePayloadSliceResult: + """Result of slicing one tree stored in a tree payload.""" + + payload: object + tree: object + sliced_tree: object + original_costs: dict + sliced_costs: dict + + +@dataclass(frozen=True) +class ContractionOpInfo: + index: int + kind: str + matmul_shape: tuple | None + matmul_flops: int + tree_flops: int + out_size: int + left_shape: tuple + right_shape: tuple + left_rank: int + right_rank: int + out_rank: int + perm: object + + +@dataclass(frozen=True) +class TreeInspectionResult: + tree: object + contractions: tuple + operations: tuple + counts: dict + nslices: int + per_slice_flops: int + per_slice_write: int + max_output_size: int + all_slice_flops: int + all_slice_write: int + dtype_bytes: int + max_output_gib: float + + def mpi_slice_plan(nslices, rank, size, assignment="block"): """Return the contraction slice ids assigned to one MPI rank. diff --git a/src/qibotn/torch_utils.py b/src/qibotn/torch_utils.py new file mode 100644 index 0000000..98cd19c --- /dev/null +++ b/src/qibotn/torch_utils.py @@ -0,0 +1,90 @@ +"""Shared torch helpers for qibotn CPU tensor-network code.""" + +from __future__ import annotations + +import numpy as np + + +def torch_dtype(dtype): + """Return the torch dtype used by qibotn complex CPU contractions.""" + import torch + + if dtype in ("complex64", "single", np.complex64): + return torch.complex64 + return torch.complex128 + + +def numpy_dtype(dtype): + """Return the numpy dtype matching qibotn's complex dtype names.""" + if dtype in ("complex64", "single", np.complex64): + return np.complex64 + return np.complex128 + + +def torch_cpu_array(data, dtype=None): + """Convert array-like data to a contiguous CPU torch tensor. + + ``torch.from_numpy`` rejects negative strides and read-only arrays in common + quimb paths, so this helper normalizes both cases before handing data to + torch. + """ + import torch + + if isinstance(data, torch.Tensor): + tensor = data + else: + array = np.asarray(data) + if any(stride < 0 for stride in array.strides): + array = np.ascontiguousarray(array) + elif not array.flags.writeable: + array = array.copy() + tensor = torch.from_numpy(array) + + if tensor.device.type != "cpu": + tensor = tensor.cpu() + target_dtype = torch_dtype(dtype) if isinstance(dtype, str) else dtype + if target_dtype is not None and tensor.dtype != target_dtype: + tensor = tensor.to(target_dtype) + if not tensor.is_contiguous(): + tensor = tensor.contiguous() + return tensor + + +def arrays_to_torch(arrays, dtype="complex128"): + """Convert an iterable of arrays to CPU torch tensors.""" + target_dtype = torch_dtype(dtype) + return [torch_cpu_array(array, dtype=target_dtype) for array in arrays] + + +def arrays_to_numpy(arrays, dtype="complex128"): + """Convert an iterable of arrays to numpy arrays with qibotn dtype names.""" + target_dtype = numpy_dtype(dtype) + return [np.asarray(array, dtype=target_dtype) for array in arrays] + + +def arrays_to_backend(arrays, backend, engine=None, dtype="complex128"): + """Convert arrays to the backend representation used by quimb/cotengra.""" + if backend == "torch": + return arrays_to_torch(arrays, dtype=dtype) + if engine is not None: + return [engine.asarray(array, dtype=numpy_dtype(dtype)) for array in arrays] + return arrays_to_numpy(arrays, dtype=dtype) + + +def set_torch_threads(nthreads=None, interop_threads=None): + """Set torch CPU thread counts and return the active intra-op thread count.""" + import torch + + if nthreads is not None: + torch.set_num_threads(max(1, int(nthreads))) + if interop_threads is not None: + try: + torch.set_num_interop_threads(max(1, int(interop_threads))) + except RuntimeError: + pass + return torch.get_num_threads() + + +def is_torch_array(value): + """Return whether *value* looks like a torch tensor without importing torch.""" + return type(value).__module__.startswith("torch") diff --git a/tests/test_cpu_backend.py b/tests/test_cpu_backend.py index e5ea781..5041869 100644 --- a/tests/test_cpu_backend.py +++ b/tests/test_cpu_backend.py @@ -10,6 +10,11 @@ from qibotn.benchmark_cases import ( exact_pauli_sum, ) from qibotn import cpu_expectation, mps_expectation, pauli_pattern, pauli_sum +from qibotn.backends.quimb import ( + build_expectation_tn, + contract_tn, + search_contraction_tree, +) def build_circuit(nqubits=6): @@ -61,6 +66,31 @@ def test_public_cpu_expectation_api_matches_statevector(): assert math.isclose(value, exact, abs_tol=1e-12) +def test_public_quimb_torch_pipeline_matches_statevector(): + circuit = build_circuit(nqubits=4) + observable = hamiltonians.SymbolicHamiltonian(form=X(0) * Z(1)) + exact = exact_pauli_sum(circuit, [(1.0, (("X", 0), ("Z", 1)))], 4) + + built = build_expectation_tn( + circuit, + observable, + dtype="complex128", + merge_1q=True, + merge_2q=True, + ) + search = search_contraction_tree( + built.tn, + method="serial", + total_repeats=1, + max_time=30, + n_workers=1, + search_seed=0, + ) + value = built.coeff * complex(contract_tn(built.tn, search.tree)) + + assert math.isclose(value.real, exact, abs_tol=1e-12) + + def test_public_mps_expectation_api_accepts_pauli_pattern(): circuit = build_circuit() exact_hamiltonian = hamiltonians.SymbolicHamiltonian( diff --git a/tools/README.md b/tools/README.md deleted file mode 100644 index 284a712..0000000 --- a/tools/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# Tools - -Auxiliary scripts for profiling, legacy comparisons, and scale probes. - -The main CPU expectation entrypoint is `../benchmark_cpu_expectation.py`. -For the current Vidal/MPS 1D-chain tests, prefer `../run_vidal_mps_cases.sh`. - -Files here are intentionally secondary: - -- `compare_vidal_backend_qmatchatea.py`: diagnostic comparison against QMatchaTea. -- `profile_vidal_chrome.py`: PyTorch CPU profiler for the Vidal path. -- `run_cpu_single_cases.sh`: single-node scale probes. -- `run_cpu_large_cases.sh`: two-node MPI scale probes. -- `run_vidal_segment_mpi_scan.sh`: rank/thread scaling scan for Vidal segmented MPI. -- `baseline_mps_expectation.py`: legacy MPS comparison CLI kept for old commands. -- `benchmark_tn_mpi.py`, `benchmark_search.py`, `benchmark_slice.py`, `benchmark_contract_sliced.py`, `check_tree.py`: old TN path-search/slicing experiments. -- `qibojit_reference_expectation.py`: state-vector reference helper. -- `validate_vidal_mpi_correctness.py`: focused Vidal MPI correctness helper. -- `mpi_torch_thread_probe.py`: MPI + torch OpenMP affinity and threading probe. diff --git a/tools/baseline_mps_expectation.py b/tools/baseline_mps_expectation.py deleted file mode 100644 index ef12ae3..0000000 --- a/tools/baseline_mps_expectation.py +++ /dev/null @@ -1,201 +0,0 @@ -"""MPS expectation benchmark for qmatchatea and Vidal backends.""" - -import argparse -import json -import logging -import os -import socket -import time - -import numpy as np - -from qibotn.benchmark_cases import ( - build_circuit as build_benchmark_circuit, - exact_pauli_sum, - observable_terms, - terms_to_dict, -) -from qibotn.backends.qmatchatea import QMatchaTeaBackend -from qibotn.backends.vidal_tebd import run_vidal_ring_xz - - -def optional_int(text): - if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: - return None - return int(text) - - -def optional_float(text): - if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: - return None - return float(text) - - -def format_optional(value, fmt="g"): - return "None" if value is None else format(value, fmt) - - -def build_circuit(nqubits, nlayers, seed): - return build_benchmark_circuit("brickwall_cnot", nqubits, nlayers, seed) - - -def build_observable(nqubits): - return terms_to_dict(observable_terms("ring_xz", nqubits)) - - -def exact_expectation(circuit, nqubits): - return exact_pauli_sum(circuit, observable_terms("ring_xz", nqubits), nqubits) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--nqubits", type=int, default=40) - parser.add_argument("--nlayers", type=int, default=30) - parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=512) - parser.add_argument("--cut-ratio", type=optional_float, default=1e-12) - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="torch") - parser.add_argument("--torch-threads", type=int, default=32) - parser.add_argument( - "--executor", - choices=("qmatchatea", "vidal", "vidal-mpi"), - default="qmatchatea", - ) - parser.add_argument("--mpi-ct", action="store_true") - parser.add_argument("--mpi-barriers", type=int, default=-1) - parser.add_argument("--mpi-isometrization", type=int, default=-1) - parser.add_argument("--exact", action="store_true") - parser.add_argument("--exact-max-qubits", type=int, default=24) - parser.add_argument("--reference-file") - parser.add_argument( - "--mpi-rank-map", - action="store_true", - help="Print MPI rank, host, pid, and torch thread placement metadata.", - ) - args = parser.parse_args() - logging.getLogger("qibo.config").setLevel(logging.ERROR) - logging.getLogger("qtealeaves").setLevel(logging.ERROR) - import torch - - torch.set_num_threads(args.torch_threads) - rank = 0 - size = 1 - if args.mpi_ct: - from mpi4py import MPI - - rank = MPI.COMM_WORLD.Get_rank() - size = MPI.COMM_WORLD.Get_size() - if args.mpi_rank_map: - rank_info = { - "rank": rank, - "size": size, - "host": socket.gethostname(), - "pid": os.getpid(), - "torch_threads": args.torch_threads, - "omp_num_threads": os.environ.get("OMP_NUM_THREADS", ""), - "mkl_num_threads": os.environ.get("MKL_NUM_THREADS", ""), - } - rank_infos = MPI.COMM_WORLD.gather(rank_info, root=0) - if rank == 0: - print("mpi_rank_map") - for item in sorted(rank_infos, key=lambda row: row["rank"]): - print( - "rank={rank} size={size} host={host} pid={pid} " - "torch_threads={torch_threads} " - "OMP_NUM_THREADS={omp_num_threads} " - "MKL_NUM_THREADS={mkl_num_threads}".format(**item) - ) - - circuit = build_circuit(args.nqubits, args.nlayers, args.seed) - observable = build_observable(args.nqubits) - exact = None - if args.reference_file: - with open(args.reference_file, "r", encoding="utf-8") as f: - exact = float(json.load(f)["expectation"]) - elif args.exact: - if args.nqubits > args.exact_max_qubits: - raise ValueError( - f"--exact is limited to {args.exact_max_qubits} qubits by default." - ) - exact = exact_expectation(circuit, args.nqubits) - - if rank == 0: - if args.mpi_ct and args.executor in ("vidal", "vidal-mpi"): - mpi_label = f"VidalSegment/{size}" - else: - mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR" - print( - f"nqubits={args.nqubits} nlayers={args.nlayers} " - f"bond={format_optional(args.bond)} " - f"cut_ratio={format_optional(args.cut_ratio)} seed={args.seed} " - f"tensor_module={args.tensor_module} svd_control=E! " - f"compile_circuit=True mpi={mpi_label} executor={args.executor}" - ) - if exact is not None: - print(f"exact={exact:.16e}") - print("expval abs_error rel_error seconds") - - start = time.perf_counter() - timings = None - if args.executor in ("vidal", "vidal-mpi"): - if args.executor == "vidal-mpi" and not args.mpi_ct: - raise ValueError("--executor vidal-mpi requires --mpi-ct.") - if args.mpi_ct: - from qibotn.backends.vidal_mpi_segment import run_segment_vidal_mpi_ring_xz - - value, timings = run_segment_vidal_mpi_ring_xz( - circuit, - max_bond=args.bond, - cut_ratio=args.cut_ratio, - tensor_module=args.tensor_module, - comm=MPI.COMM_WORLD, - ) - else: - value = run_vidal_ring_xz( - circuit, - max_bond=args.bond, - cut_ratio=args.cut_ratio, - tensor_module=args.tensor_module, - ) - else: - backend = QMatchaTeaBackend() - backend.configure_tn_simulation( - ansatz="MPS", - max_bond_dimension=args.bond, - cut_ratio=args.cut_ratio, - svd_control="E!", - tensor_module=args.tensor_module, - compile_circuit=True, - track_memory=False, - mpi_approach="CT" if args.mpi_ct else "SR", - mpi_num_procs=size, - mpi_where_barriers=args.mpi_barriers if args.mpi_ct else -1, - mpi_isometrization=args.mpi_isometrization, - ) - value = backend.expectation( - circuit, - observable, - preprocess=False, - compile_circuit=True, - ) - max_timings = None - if timings: - max_timings = { - key: MPI.COMM_WORLD.reduce(local_value, op=MPI.MAX, root=0) - for key, local_value in timings.items() - } - if rank != 0: - return - value = float(np.real(value)) - elapsed = time.perf_counter() - start - abs_error = float("nan") if exact is None else abs(value - exact) - rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15) - print(f"{value:.16e} {abs_error:.6e} {rel_error:.6e} {elapsed:.3f}") - if max_timings: - print("timing_section max_seconds") - for key, max_value in max_timings.items(): - print(f"{key} {max_value:.6f}") - - -if __name__ == "__main__": - main() diff --git a/tools/benchmark_contract_sliced.py b/tools/benchmark_contract_sliced.py deleted file mode 100644 index a089546..0000000 --- a/tools/benchmark_contract_sliced.py +++ /dev/null @@ -1,56 +0,0 @@ -"""MPI parallel sliced contraction using pre-sliced tree.""" -import time, pickle, os -import numpy as np -from mpi4py import MPI - -NQUBITS, NLAYERS, NCORES = 25, 10, 48 - -comm = MPI.COMM_WORLD -rank, size = comm.Get_rank(), comm.Get_size() - -os.environ['OMP_NUM_THREADS'] = str(NCORES) -os.environ['MKL_NUM_THREADS'] = str(NCORES) - -import torch -import qibo, quimb as qu -from qibotn.observables import build_random_circuit - -torch.set_num_threads(NCORES) - -circuit = build_random_circuit(NQUBITS, NLAYERS) -qibo.set_backend("qibotn", platform="quimb") -backend = qibo.get_backend() -backend.configure_tn_simulation(ansatz="tn") -qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz) -tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn') - -if rank == 0: - with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'rb') as f: - tree = pickle.load(f) -else: - tree = None -tree = comm.bcast(tree, root=0) - -arrays = [torch.from_numpy(np.asarray(t._data)) for t in tn.tensors] -n_slices = tree.multiplicity - -if rank == 0: - print(f"Slices: {n_slices}, Ranks: {size}, " - f"Peak: {tree.max_size() * 16 / 1e9:.2f} GB, " - f"Threads/rank: {NCORES}, Backend: torch") - -t0 = time.time() -result = None -for i in range(rank, n_slices, size): - val = tree.contract_slice(arrays, i, backend='torch') - val_np = val.cpu().numpy().reshape(-1) - result = val_np if result is None else result + val_np - -if result is None: - result = np.zeros(1, dtype=np.complex128) - -total = np.zeros_like(result) if rank == 0 else None -comm.Reduce(result, total, root=0) - -if rank == 0: - print(f"Contract: {time.time() - t0:.4f}s Expectation: {0.5 * total[0].real:.10f}") diff --git a/tools/benchmark_qredtea_svd_controls.py b/tools/benchmark_qredtea_svd_controls.py deleted file mode 100644 index 4111c48..0000000 --- a/tools/benchmark_qredtea_svd_controls.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python -"""Benchmark qredtea/qtealeaves SVD control modes. - -This isolates the tensor split used by MPS updates: a rank-2 tensor is split -with singular values contracted either left or right, then reconstructed to -measure numerical error and timing. -""" - -from __future__ import annotations - -import argparse -import gc -import statistics -import time - -import torch - -import qmatchatea -from qredtea.torchapi import QteaTorchTensor - - -def _dtype(name: str): - return { - "complex64": torch.complex64, - "complex128": torch.complex128, - "float64": torch.float64, - "float32": torch.float32, - }[name] - - -def _random_matrix(shape, dtype, seed): - gen = torch.Generator(device="cpu") - gen.manual_seed(seed) - if dtype.is_complex: - real_dtype = torch.float32 if dtype == torch.complex64 else torch.float64 - real = torch.randn(shape, dtype=real_dtype, generator=gen) - imag = torch.randn(shape, dtype=real_dtype, generator=gen) - return torch.complex(real, imag).to(dtype) - return torch.randn(shape, dtype=dtype, generator=gen) - - -def _sync(): - if torch.cuda.is_available(): - torch.cuda.synchronize() - - -def run_one(matrix, ctrl, max_bond, contract_singvals, repeats): - conv = qmatchatea.QCConvergenceParameters( - max_bond_dimension=max_bond, - cut_ratio=0.0, - svd_ctrl=ctrl, - ) - qtensor = QteaTorchTensor.from_elem_array(matrix, dtype=matrix.dtype, device="cpu") - - times = [] - rel_error = None - kept = None - status = "ok" - error = "" - - for i in range(repeats): - gc.collect() - _sync() - t0 = time.perf_counter() - try: - left, right, singvals, _ = qtensor.split_svd( - [0], - [1], - contract_singvals=contract_singvals, - conv_params=conv, - ) - except Exception as exc: # noqa: BLE001 - benchmark should keep going - status = "error" - error = repr(exc) - break - _sync() - times.append(time.perf_counter() - t0) - - if i == repeats - 1: - left_matrix = left.elem.reshape(matrix.shape[0], -1) - right_matrix = right.elem.reshape(-1, matrix.shape[1]) - recon = left_matrix @ right_matrix - rel_error = ( - torch.linalg.vector_norm(matrix - recon) - / torch.linalg.vector_norm(matrix) - ).item() - kept = int(singvals.numel()) - - return { - "ctrl": ctrl, - "contract_singvals": contract_singvals, - "status": status, - "median_ms": float("nan") if not times else statistics.median(times) * 1000, - "min_ms": float("nan") if not times else min(times) * 1000, - "rel_error": rel_error, - "kept": kept, - "error": error, - } - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--shapes", nargs="+", default=("256x1024", "1024x256", "512x512")) - parser.add_argument("--max-bond", type=int, default=128) - parser.add_argument("--dtype", choices=("complex64", "complex128", "float32", "float64"), default="complex128") - parser.add_argument("--threads", type=int, default=8) - parser.add_argument("--repeats", type=int, default=3) - parser.add_argument( - "--controls", - nargs="+", - default=("A", "D", "V", "R", "E", "E!", "X", "X!"), - ) - args = parser.parse_args() - - torch.set_num_threads(args.threads) - dtype = _dtype(args.dtype) - - print( - "svd_benchmark " - f"dtype={args.dtype} threads={torch.get_num_threads()} " - f"max_bond={args.max_bond} repeats={args.repeats}", - flush=True, - ) - print( - "columns shape contract ctrl status median_ms min_ms kept rel_error error", - flush=True, - ) - - for shape_text in args.shapes: - m_text, n_text = shape_text.lower().split("x", 1) - shape = (int(m_text), int(n_text)) - matrix = _random_matrix(shape, dtype, seed=sum(shape)) - for contract_singvals in ("L", "R"): - for ctrl in args.controls: - result = run_one( - matrix, - ctrl=ctrl, - max_bond=args.max_bond, - contract_singvals=contract_singvals, - repeats=args.repeats, - ) - print( - f"row shape={shape_text} " - f"contract={contract_singvals} " - f"ctrl={ctrl} " - f"status={result['status']} " - f"median_ms={result['median_ms']:.3f} " - f"min_ms={result['min_ms']:.3f} " - f"kept={result['kept']} " - f"rel_error={result['rel_error']} " - f"error={result['error']}", - flush=True, - ) - - -if __name__ == "__main__": - main() diff --git a/tools/benchmark_search.py b/tools/benchmark_search.py deleted file mode 100644 index f0bc464..0000000 --- a/tools/benchmark_search.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Search contraction path and save.""" -import time, os, pickle -from qibotn.parallel import parallel_path_search -from qibotn.observables import build_random_circuit -import qibo, quimb as qu - -from mpi4py import MPI - -NQUBITS, NLAYERS, WORKERS = 20, 10, 96 - -comm = MPI.COMM_WORLD -rank, size = comm.Get_rank(), comm.Get_size() -method = 'mpi' if size > 1 else 'processpool' - -circuit = build_random_circuit(NQUBITS, NLAYERS) -qibo.set_backend("qibotn", platform="quimb") -backend = qibo.get_backend() -backend.configure_tn_simulation(ansatz="tn") -qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz) -tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn') - -if rank == 0: - print(f"Searching {NQUBITS}q {NLAYERS}l, method={method}, ranks={size}, workers/rank={WORKERS}...") -t0 = time.time() -tree = parallel_path_search(tn, tn.outer_inds(), method=method, - total_repeats=1024, max_time=300, n_workers=WORKERS,trial_timeout=60) -t_search = time.time() - t0 - -if rank == 0: - os.makedirs('data', exist_ok=True) - path = f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl" - with open(path, 'wb') as f: - pickle.dump(tree, f) - print(f"Search: {t_search:.2f}s Peak: {tree.max_size() * 16 / 1e9:.2f} GB Saved: {path}") diff --git a/tools/benchmark_slice.py b/tools/benchmark_slice.py deleted file mode 100644 index b398857..0000000 --- a/tools/benchmark_slice.py +++ /dev/null @@ -1,16 +0,0 @@ -"""Slice saved tree and save.""" -import pickle - -NQUBITS, NLAYERS = 25, 10 - -with open(f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl", 'rb') as f: - tree = pickle.load(f) - -print(f"Original peak: {tree.max_size() * 16 / 1e9:.2f} GB") - -tree_sliced = tree.slice_and_reconfigure(target_size=2**28) - -with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'wb') as f: - pickle.dump(tree_sliced, f) - -print(f"Sliced peak: {tree_sliced.max_size() * 16 / 1e9:.2f} GB Slices: {tree_sliced.multiplicity}") diff --git a/tools/benchmark_tn_mpi.py b/tools/benchmark_tn_mpi.py deleted file mode 100644 index 8dc80d1..0000000 --- a/tools/benchmark_tn_mpi.py +++ /dev/null @@ -1,378 +0,0 @@ -"""MPI-parallel TN benchmark: path search + contraction via MPI.""" -import json -import pickle -import time -import argparse -import numpy as np -import cotengra as ctg -import qibo -from qibo import Circuit, gates -from mpi4py import MPI -from concurrent.futures import ProcessPoolExecutor, as_completed -from qibotn.observables import check_observable, extract_gates_and_qubits - - -def _load_observable(observable_file=None, observable_json=None): - if observable_file: - with open(observable_file, "r", encoding="utf8") as f: - return json.load(f) - if observable_json: - return json.loads(observable_json) - return None - - -def _term_to_quimb_operator(term): - """Convert one extracted Hamiltonian term to a quimb operator.""" - import quimb as qu - - coeff = complex(term[0][2]) if term else 1.0 - op = None - where = [] - - for qubit, gate_name, _ in term: - qubit = int(qubit) - gate_name = str(gate_name).upper() - if gate_name == "I": - continue - where.append(qubit) - op = qu.pauli(gate_name.lower()) if op is None else op & qu.pauli(gate_name.lower()) - - return complex(coeff), op, tuple(where) - - -def _run_serial_search(tn_bytes, output_inds, repeats, seed, num_slices, n_ranks, max_time): - import pickle, cotengra as ctg, random - random.seed(seed) - tn = pickle.loads(tn_bytes) - opt = ctg.HyperOptimizer( - methods=['kahypar', 'kahypar-agglom', 'spinglass'], - max_repeats=repeats, - parallel=False, - minimize='combo-256', - max_time=max_time, - optlib="random", - slicing_opts={'target_size': 2**29, 'allow_outer': True}, - progbar=False, - ) - tree = tn.contraction_tree(optimize=opt, output_inds=output_inds) - return tree.combo_cost(factor=256), tree - - -def parallel_search(tn, output_inds, total_repeats, n_workers, num_slices, n_ranks, - timeout): - import pickle, os, signal - from concurrent.futures import ProcessPoolExecutor, as_completed - tn_bytes = pickle.dumps(tn) - if n_workers <= 1: - return _run_serial_search( - tn_bytes, output_inds, total_repeats, 0, num_slices, n_ranks, timeout - )[1] - repeats_per = max(1, total_repeats // n_workers) - best_cost, best_tree = float('inf'), None - - pool = ProcessPoolExecutor(max_workers=n_workers) - futures = [ - pool.submit(_run_serial_search, tn_bytes, output_inds, - repeats_per, seed, num_slices, n_ranks, timeout) - for seed in range(n_workers) - ] - try: - for fut in as_completed(futures, timeout=timeout + 5): - try: - cost, tree = fut.result() - if cost < best_cost: - best_cost, best_tree = cost, tree - except Exception as e: - print(f" [worker failed] {e}") - except TimeoutError: - pass - finally: - for fut in futures: - fut.cancel() - for pid in list(pool._processes.keys()): - try: - os.kill(pid, signal.SIGKILL) - except ProcessLookupError: - pass - pool.shutdown(wait=False) - - return best_tree - - -def make_circuit(circuit_type, nqubits, nlayers=1): - c = Circuit(nqubits) - if circuit_type == "qft": - from qibo.models import QFT - return QFT(nqubits) - elif circuit_type == "variational": - for layer in range(nlayers): - for q in range(nqubits): - c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi))) - offset = layer % 2 - for q in range(offset, nqubits - 1, 2): - c.add(gates.CZ(q, q + 1)) - elif circuit_type == "ghz": - c.add(gates.H(0)) - for q in range(nqubits - 1): - c.add(gates.CNOT(q, q + 1)) - elif circuit_type == "brickwork": - for q in range(nqubits): - c.add(gates.H(q)) - for layer in range(nlayers): - offset = layer % 2 - for q in range(offset, nqubits - 1, 2): - c.add(gates.CNOT(q, q + 1)) - c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi))) - c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi))) - else: - raise ValueError(f"Unknown circuit: {circuit_type}") - return c - - -def _contract_mpi(tree, arrays, comm, root=0): - rank = comm.Get_rank() - size = comm.Get_size() - is_torch = type(arrays[0]).__module__.startswith("torch") - - result_np = None - for i in range(rank, tree.multiplicity, size): - x = tree.contract_slice(arrays, i) - x_np = np.asfortranarray(x.detach().cpu().numpy() if is_torch else np.asarray(x)) - result_np = x_np if result_np is None else result_np + x_np - - if result_np is None: - result_np = np.zeros(1, dtype=np.complex128) - - result = np.zeros_like(result_np) if rank == root else None - comm.Reduce(result_np, result, root=root) - - if rank == root: - import torch - return torch.from_numpy(np.asarray(result)) if is_torch else result - return None - - -def run_mpi(circuit, nqubits, num_slices, total_repeats=1024, - load_path=None, save_path=None): - """Each MPI rank runs serial path search over total_repeats/size trials, - rank 0 picks the global best, then all ranks contract in parallel.""" - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - size = comm.Get_size() - - qibo.set_backend("qibotn", platform="quimb") - b = qibo.get_backend() - b.configure_tn_simulation(ansatz="tn") - - import torch - qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz, - gate_opts={"max_bond": None, "cutoff": 1e-10}) - qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex128) - - # --- path search: each rank serial, gather best to rank 0 --- - if load_path: - if rank == 0: - with open(load_path, "rb") as f: - saved = pickle.load(f) - tree, psi, t_search = saved["tree"], saved["psi"], 0.0 - print(f" [path loaded] {load_path}") - else: - tree = psi = None - t_search = 0.0 - else: - rank_repeats = max(1, total_repeats // size) - t0 = time.time() - # get TN object first (no contraction), then run parallel search - psi_tn = qc.to_dense(rehearse="tn") - local_tree = parallel_search( - psi_tn, psi_tn.outer_inds(), rank_repeats, n_workers=48, - num_slices=num_slices, n_ranks=size, timeout=600, - ) - t_search = time.time() - t0 - local_psi = psi_tn - - all_results = comm.gather((local_tree.combo_cost(factor=256), local_tree, local_psi), root=0) - if rank == 0: - _, tree, psi = min(all_results, key=lambda x: x[0]) - print(f" [path search] {t_search:.3f}s " - f"flops~2^{tree.contraction_cost(log=2):.2f} " - f"size~2^{tree.contraction_width():.2f} " - f"slices={tree.multiplicity}") - if save_path: - with open(save_path, "wb") as f: - pickle.dump({"tree": tree, "psi": psi}, f) - print(f" [path saved] {save_path}") - else: - tree = psi = None - - if save_path: - t_search = comm.bcast(t_search, root=0) - return None, t_search - - tree = comm.bcast(tree, root=0) - psi = comm.bcast(psi, root=0) - t_search = comm.bcast(t_search, root=0) - - # --- contraction: all ranks work in parallel --- - import torch - torch.set_num_threads(max(1, 96 // size)) - arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in psi.arrays] - t0 = time.time() - sv = _contract_mpi(tree, arrays, comm, root=0) - t_contract = time.time() - t0 - - if rank == 0: - print(f" [contraction] {t_contract:.3f}s") - return np.array(sv).reshape(-1), t_search + t_contract - return None, t_search + t_contract - - -def run_mpi_expval( - circuit, - nqubits, - observable=None, - total_repeats=1024, - search_workers=1, - search_timeout=300, -): - """Compute a Hamiltonian expectation value directly from TN via MPI. - MPI parallelizes over Hamiltonian terms; ProcessPool optionally helps - path search for each term.""" - import torch - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - size = comm.Get_size() - - qibo.set_backend("qibotn", platform="quimb") - b = qibo.get_backend() - b.configure_tn_simulation(ansatz="tn") - - observable = check_observable(observable, nqubits) - ham_gate_map = extract_gates_and_qubits(observable) - - qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz, - gate_opts={"max_bond": None, "cutoff": 1e-10}) - - my_terms = ham_gate_map[rank::size] - torch.set_num_threads(max(1, 96 // size)) - t0 = time.time() - - my_exp = 0.0 + 0.0j - for term in my_terms: - coeff, op, where = _term_to_quimb_operator(term) - if op is None: - my_exp += coeff - continue - tn = qc.local_expectation_tn(op, where=where) - if len(tn.outer_inds()) == 0: - val = complex(tn.contract()) - else: - tree = parallel_search( - tn, - tn.outer_inds(), - total_repeats, - n_workers=search_workers, - num_slices=1, - n_ranks=size, - timeout=search_timeout, - ) - if tree is None: - raise RuntimeError("Failed to find a contraction tree for expectation TN.") - arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in tn.arrays] - acc = sum(tree.contract_slice(arrays, i) for i in range(tree.multiplicity)) - val = complex(acc.item() if hasattr(acc, 'item') else acc) - my_exp += coeff * val - - t_total = time.time() - t0 - - all_results = comm.gather(my_exp, root=0) - if rank == 0: - total_exp = sum(all_results) - print(f"\n[TN expval] time={t_total:.4f}s expval={total_exp.real:.12f}") - return np.real_if_close(total_exp), t_total - return None, t_total - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--nqubits", type=int, default=30) - parser.add_argument("--circuit", type=str, default="qft", - choices=["qft", "variational", "ghz", "brickwork"]) - parser.add_argument("--nlayers", type=int, default=3) - parser.add_argument("--num-slices", type=int, default=1) - parser.add_argument("--total-repeats", type=int, default=1024) - parser.add_argument("--search-workers", type=int, default=1) - parser.add_argument("--search-timeout", type=int, default=300) - parser.add_argument("--observable-file", type=str, default=None) - parser.add_argument("--observable-json", type=str, default=None) - parser.add_argument("--save-path", type=str, default=None) - parser.add_argument("--load-path", type=str, default=None) - parser.add_argument("--no-compare", action="store_true") - parser.add_argument("--mode", type=str, default="sv", choices=["sv", "expval"]) - args = parser.parse_args() - - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - - if rank == 0: - print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, " - f"nlayers={args.nlayers}, ranks={comm.Get_size()}") - - np.random.seed(42) - circuit = make_circuit(args.circuit, args.nqubits, args.nlayers) - observable = _load_observable(args.observable_file, args.observable_json) - - if args.mode == "expval": - try: - expval, t_total = run_mpi_expval( - circuit, - args.nqubits, - observable=observable, - total_repeats=args.total_repeats, - search_workers=args.search_workers, - search_timeout=args.search_timeout, - ) - except Exception as e: - if rank == 0: - print(f"[FAILED] {e}") - raise - if rank == 0: - np.save(f"data/expval_tn_{args.circuit}{args.nqubits}.npy", np.asarray(expval)) - if not args.no_compare: - print("No built-in reference comparison for arbitrary observables.") - return - - try: - sv, t_total = run_mpi(circuit, args.nqubits, args.num_slices, - total_repeats=args.total_repeats, - load_path=args.load_path, save_path=args.save_path) - except Exception as e: - if rank == 0: - print(f"[FAILED] {e}") - raise - - if rank == 0 and sv is not None: - print(f"\n[quimb TN MPI] time={t_total:.4f}s shape={sv.shape}") - np.save(f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy", sv) - - if not args.no_compare: - from qibotn.bak.benchmark_tn import run_qibojit - import gc - np.random.seed(42) - circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers) - sv_ref, t_ref = run_qibojit(circuit_ref) - np.save(f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy", sv_ref) - print(f"[qibojit] time={t_ref:.4f}s") - # free memory before loading via mmap for expval comparison - del sv, sv_ref - gc.collect() - from compare_jit_tn_quimb import check_results - ref_path = f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy" - tn_path = f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy" - check_results(ref_path, tn_path, args.nqubits) - if t_total > 0: - print(f"Speedup : {t_ref/t_total:.2f}x") - - -if __name__ == "__main__": - main() diff --git a/tools/check_tree.py b/tools/check_tree.py deleted file mode 100644 index 935f952..0000000 --- a/tools/check_tree.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Check contraction tree statistics.""" -import pickle, sys - -path = sys.argv[1] if len(sys.argv) > 1 else "data/tree_q25_l10.pkl" -with open(path, 'rb') as f: - tree = pickle.load(f) - -# Intel 8558P: 96 cores, 2.1GHz, AVX-512 (16 FP64/cycle), FMA x2 -# complex128 multiply-add = 6 real FLOPs -CORES = 96 -FREQ = 2.1e9 -AVX512_FP64 = 16 -TFLOPS = CORES * FREQ * AVX512_FP64 * 2 / 1e12 # ~6.45 TFLOPS real FP64 -COMPLEX_FLOPS = TFLOPS / 6 # complex128 effective - -flops = tree.total_flops() -slices = tree.multiplicity -est_seconds = flops * slices / (COMPLEX_FLOPS * 1e12) - -print(f"File: {path}") -print(f"Peak memory (GB): {tree.max_size() * 16 / 1e9:.2f}") -print(f"Total FLOPs: {flops:.2e} x{slices} slices = {flops*slices:.2e}") -print(f"Contraction width: {tree.contraction_width()}") -print(f"Multiplicity (slices): {slices}") -print(f"Estimated time (96 cores): {est_seconds:.1f}s ({est_seconds/3600:.2f}h)") diff --git a/tools/compare_vidal_backend_qmatchatea.py b/tools/compare_vidal_backend_qmatchatea.py deleted file mode 100644 index b5050cf..0000000 --- a/tools/compare_vidal_backend_qmatchatea.py +++ /dev/null @@ -1,137 +0,0 @@ -"""Compare QMatchaTeaBackend with the VidalBackend fast path.""" - -from __future__ import annotations - -import argparse -import json -import math -import time - -import numpy as np -import torch -from qibo import Circuit, gates, hamiltonians -from qibo.symbols import X, Y, Z - -from qibotn.backends.qmatchatea import QMatchaTeaBackend -from qibotn.backends.vidal import VidalBackend - - -def build_circuit(nqubits, nlayers, seed, kind): - rng = np.random.default_rng(seed) - circuit = Circuit(nqubits) - for layer in range(nlayers): - for q in range(nqubits): - circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi))) - circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi))) - if kind == "brickwall": - for q in range(0, nqubits - 1, 2): - circuit.add(gates.CNOT(q, q + 1)) - for q in range(1, nqubits - 1, 2): - circuit.add(gates.CNOT(q, q + 1)) - elif kind == "shifted-cz": - for q in range(layer % 2, nqubits - 1, 2): - circuit.add(gates.CZ(q, q + 1)) - elif kind == "reversed-cnot": - for q in range(0, nqubits - 1, 2): - circuit.add(gates.CNOT(q + 1, q)) - for q in range(1, nqubits - 1, 2): - circuit.add(gates.CNOT(q, q + 1)) - else: - raise ValueError(f"Unknown circuit kind {kind!r}.") - return circuit - - -def build_observable(nqubits, kind): - form = 0 - if kind == "ring-xz": - for q in range(nqubits): - form += 0.5 * X(q) * Z((q + 1) % nqubits) - elif kind == "open-zz": - for q in range(nqubits - 1): - form += Z(q) * Z(q + 1) / (nqubits - 1) - elif kind == "mixed": - form += 0.25 * X(0) - 0.5 * Z(nqubits - 1) - for q in range(0, nqubits - 1, 3): - form += 0.125 * Y(q) * Y(q + 1) - else: - raise ValueError(f"Unknown observable kind {kind!r}.") - return hamiltonians.SymbolicHamiltonian(form=form) - - -def run_backend(backend, circuit, observable): - start = time.perf_counter() - value = backend.expectation(circuit, observable, preprocess=False, compile_circuit=True) - return float(np.real(value)), time.perf_counter() - start - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--nqubits", type=int, default=34) - parser.add_argument("--nlayers", type=int, default=20) - parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512) - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch") - parser.add_argument("--torch-threads", type=int, default=32) - parser.add_argument( - "--circuit-kind", - choices=("brickwall", "shifted-cz", "reversed-cnot"), - default="brickwall", - ) - parser.add_argument( - "--observable-kind", - choices=("ring-xz", "open-zz", "mixed"), - default="ring-xz", - ) - parser.add_argument("--reference-file") - parser.add_argument("--skip-qmatchatea", action="store_true") - args = parser.parse_args() - - torch.set_num_threads(args.torch_threads) - circuit = build_circuit(args.nqubits, args.nlayers, args.seed, args.circuit_kind) - observable = build_observable(args.nqubits, args.observable_kind) - - exact = None - if args.reference_file: - with open(args.reference_file, "r", encoding="utf-8") as f: - exact = float(json.load(f)["expectation"]) - - print( - f"nqubits={args.nqubits} nlayers={args.nlayers} bond={args.bond} " - f"circuit={args.circuit_kind} observable={args.observable_kind} " - f"tensor_module={args.tensor_module} torch_threads={args.torch_threads}" - ) - if exact is not None: - print(f"exact={exact:.16e}") - print("backend value abs_error seconds") - - if not args.skip_qmatchatea: - qmt = QMatchaTeaBackend() - qmt.configure_tn_simulation( - ansatz="MPS", - max_bond_dimension=args.bond, - cut_ratio=1e-12, - svd_control="E!", - tensor_module=args.tensor_module, - compile_circuit=True, - track_memory=False, - ) - value, seconds = run_backend(qmt, circuit, observable) - error = float("nan") if exact is None else abs(value - exact) - print(f"qmatchatea {value:.16e} {error:.6e} {seconds:.3f}") - - vidal = VidalBackend() - vidal.configure_tn_simulation( - ansatz="MPS", - max_bond_dimension=args.bond, - cut_ratio=1e-12, - tensor_module=args.tensor_module, - compile_circuit=True, - fallback=True, - ) - value, seconds = run_backend(vidal, circuit, observable) - error = float("nan") if exact is None else abs(value - exact) - print(f"vidal {value:.16e} {error:.6e} {seconds:.3f}") - - -if __name__ == "__main__": - main() diff --git a/tools/example_tn_case.py b/tools/example_tn_case.py deleted file mode 100644 index c35f057..0000000 --- a/tools/example_tn_case.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Example custom case for tools/run_tn_custom.py.""" - -from __future__ import annotations - -import math - -import numpy as np -from qibo import Circuit, gates - - -def build_circuit(nqubits, nlayers, seed): - rng = np.random.default_rng(seed) - circuit = Circuit(nqubits) - for layer in range(nlayers): - for qubit in range(nqubits): - circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi))) - circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi))) - for qubit in range(layer % 2, nqubits - 1, 2): - circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7))) - circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7))) - return circuit - - -def build_observable(nqubits, seed): - return { - "terms": [ - { - "coefficient": 1.0 / max(1, nqubits - 1), - "operators": [("Z", site), ("Z", site + 1)], - } - for site in range(nqubits - 1) - ] - } diff --git a/tools/inspect_contraction_tree.py b/tools/inspect_contraction_tree.py deleted file mode 100644 index a6422ba..0000000 --- a/tools/inspect_contraction_tree.py +++ /dev/null @@ -1,208 +0,0 @@ -"""Inspect cotengra contraction trees for dominant torch matmul shapes.""" - -from __future__ import annotations - -import argparse -import importlib -import math -import pickle -from collections import Counter, defaultdict -from pathlib import Path - - -def _prod(values): - out = 1 - for value in values: - out *= int(value) - return out - - -def _broadcast_batch(a_batch, b_batch): - if a_batch == b_batch: - return _prod(a_batch) - if not a_batch: - return _prod(b_batch) - if not b_batch: - return _prod(a_batch) - - ndim = max(len(a_batch), len(b_batch)) - a_batch = (1,) * (ndim - len(a_batch)) + tuple(a_batch) - b_batch = (1,) * (ndim - len(b_batch)) + tuple(b_batch) - return _prod(max(a, b) for a, b in zip(a_batch, b_batch)) - - -def _load_tree(path, index): - with Path(path).open("rb") as f: - payload = pickle.load(f) - trees = payload["trees"] if isinstance(payload, dict) else payload - if not isinstance(trees, (list, tuple)): - trees = [trees] - return trees[index] - - -def _analyze_tree(tree): - contract_mod = importlib.import_module("cotengra.contract") - contractions = contract_mod.extract_contractions(tree) - size_dict = tree.size_dict - ops = [] - counts = Counter() - - for op_index, (parent, left, right, tdot, arg, perm) in enumerate(contractions): - if left is None and right is None: - counts["preprocess"] += 1 - continue - - left_inds = tree.get_inds(left) - right_inds = tree.get_inds(right) - parent_inds = tree.get_inds(parent) - left_shape = tuple(size_dict[ix] for ix in left_inds) - right_shape = tuple(size_dict[ix] for ix in right_inds) - - if tdot: - parsed = contract_mod._parse_tensordot_axes_to_matmul( - arg, - left_shape, - right_shape, - ) - else: - parsed = contract_mod._parse_eq_to_batch_matmul( - arg, - left_shape, - right_shape, - ) - - ( - _eq_a, - _eq_b, - new_shape_a, - new_shape_b, - _new_shape_ab, - _perm_ab, - pure_multiplication, - ) = parsed - - matmul_shape = None - matmul_flops = 0 - if pure_multiplication: - kind = "mul" - else: - a_shape = tuple(new_shape_a or left_shape) - b_shape = tuple(new_shape_b or right_shape) - batch = _broadcast_batch(a_shape[:-2], b_shape[:-2]) - m, k, n = int(a_shape[-2]), int(a_shape[-1]), int(b_shape[-1]) - kind = "mm" if batch == 1 else "bmm" - matmul_shape = (batch, m, k, n) - matmul_flops = batch * m * k * n - - tree_flops = int(tree.get_flops(parent)) - out_size = int(tree.get_size(parent)) - ops.append( - { - "index": op_index, - "kind": kind, - "matmul_shape": matmul_shape, - "matmul_flops": matmul_flops, - "tree_flops": tree_flops, - "out_size": out_size, - "left_shape": left_shape, - "right_shape": right_shape, - "left_rank": len(left_inds), - "right_rank": len(right_inds), - "out_rank": len(parent_inds), - "perm": perm, - } - ) - counts[kind] += 1 - - return contractions, ops, counts - - -def _format_log(value, base): - return "-inf" if value <= 0 else f"{math.log(value, base):.3f}" - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("tree", help="Pickle file containing one tree or {'trees': [...]}.") - parser.add_argument("--index", type=int, default=0, help="Tree index in the file.") - parser.add_argument("--top", type=int, default=20, help="Number of top ops to print.") - parser.add_argument( - "--dtype-bytes", - type=int, - default=8, - help="Bytes per element for memory estimates, for example 8 for complex64.", - ) - args = parser.parse_args() - - tree = _load_tree(args.tree, args.index) - contractions, ops, counts = _analyze_tree(tree) - nslices = int(getattr(tree, "multiplicity", 1)) - per_slice_flops = sum(op["tree_flops"] for op in ops) - per_slice_write = sum(op["out_size"] for op in ops) - max_out = max((op["out_size"] for op in ops), default=0) - all_flops = per_slice_flops * nslices - all_write = per_slice_write * nslices - - print(f"tree={args.tree} index={args.index}") - print( - "summary " - f"slices={nslices} contractions={len(contractions)} " - f"counts={dict(counts)}" - ) - print( - "per_slice " - f"log10_flops={_format_log(per_slice_flops, 10)} " - f"log10_write={_format_log(per_slice_write, 10)} " - f"log2_max_output={_format_log(max_out, 2)} " - f"max_output_gib={max_out * args.dtype_bytes / 1024**3:.6g}" - ) - print( - "all_slices " - f"log10_flops={_format_log(all_flops, 10)} " - f"log10_write={_format_log(all_write, 10)}" - ) - - print(f"\ntop_{args.top}_ops_by_flops") - for op in sorted(ops, key=lambda item: item["tree_flops"], reverse=True)[: args.top]: - print( - f"op={op['index']} kind={op['kind']} " - f"flops={op['tree_flops']:.6e} out={op['out_size']:.6e} " - f"matmul={op['matmul_shape']} " - f"ranks=({op['left_rank']},{op['right_rank']}->{op['out_rank']}) " - f"lhs={op['left_shape']} rhs={op['right_shape']}" - ) - - by_shape = defaultdict(lambda: [0, 0, 0]) - for op in ops: - shape = op["matmul_shape"] - if shape is None: - continue - by_shape[shape][0] += 1 - by_shape[shape][1] += op["tree_flops"] - by_shape[shape][2] += op["out_size"] - - print(f"\ntop_{args.top}_matmul_shapes_by_flops") - for shape, (count, flops, out_size) in sorted( - by_shape.items(), - key=lambda item: item[1][1], - reverse=True, - )[: args.top]: - print( - f"shape={shape} count={count} " - f"flops={flops:.6e} output={out_size:.6e}" - ) - - print(f"\ntop_{args.top}_matmul_shapes_by_count") - for shape, (count, flops, out_size) in sorted( - by_shape.items(), - key=lambda item: item[1][0], - reverse=True, - )[: args.top]: - print( - f"shape={shape} count={count} " - f"flops={flops:.6e} output={out_size:.6e}" - ) - - -if __name__ == "__main__": - main() diff --git a/tools/manage_tn_dask_cluster.sh b/tools/manage_tn_dask_cluster.sh deleted file mode 100755 index 20c4e01..0000000 --- a/tools/manage_tn_dask_cluster.sh +++ /dev/null @@ -1,223 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Manage the dask cluster used by TN path search. -# -# Defaults target two servers: -# scheduler: 10.20.1.103:8786 -# workers: 10.20.1.103, 10.20.6.101 -# -# Usage: -# tools/manage_tn_dask_cluster.sh start -# tools/manage_tn_dask_cluster.sh status -# tools/manage_tn_dask_cluster.sh stop -# -# Common overrides: -# SCHEDULER_HOST=10.20.1.103 -# WORKER_HOSTS="10.20.1.103 10.20.6.101" -# NWORKERS=48 -# NTHREADS=1 -# ROOT_DIR=/home/qibo/qibotn -# PYTHON_BIN=.venv/bin/python - -ROOT_DIR="${ROOT_DIR:-/home/qibo/qibotn}" -PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" -SCHEDULER_HOST="${SCHEDULER_HOST:-10.20.1.103}" -SCHEDULER_PORT="${SCHEDULER_PORT:-8786}" -DASHBOARD_ADDRESS="${DASHBOARD_ADDRESS:-:8787}" -WORKER_HOSTS="${WORKER_HOSTS:-10.20.1.103 10.20.6.101}" -NWORKERS="${NWORKERS:-84}" -NTHREADS="${NTHREADS:-1}" -MEMORY_LIMIT="${MEMORY_LIMIT:-0}" -LOCAL_DIRECTORY="${LOCAL_DIRECTORY:-/tmp/qibotn-dask}" -LOG_DIR="${LOG_DIR:-$ROOT_DIR/logs/dask}" -SSH_BIN="${SSH_BIN:-ssh}" -DASK_WORKER_TTL="${DASK_WORKER_TTL:-24 hours}" -DASK_TICK_LIMIT="${DASK_TICK_LIMIT:-30 minutes}" -DASK_LOST_WORKER_TIMEOUT="${DASK_LOST_WORKER_TIMEOUT:-30 minutes}" - -SCHEDULER_ADDR="tcp://${SCHEDULER_HOST}:${SCHEDULER_PORT}" - -is_local_host() { - local host="$1" - [[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0 - [[ "$host" == "$(hostname)" ]] && return 0 - [[ "$host" == "$(hostname -f 2>/dev/null || true)" ]] && return 0 - hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx "$host" -} - -run_on_host() { - local host="$1" - shift - local cmd="$*" - if is_local_host "$host"; then - bash -lc "$cmd" - else - "$SSH_BIN" "$host" "bash -lc $(printf '%q' "$cmd")" - fi -} - -start_scheduler() { - local host="$SCHEDULER_HOST" - local log="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.log" - local pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid" - run_on_host "$host" " - set -euo pipefail - cd '$ROOT_DIR' - mkdir -p '$LOG_DIR' - if [[ -s '$pid_file' ]]; then - pid=\$(cat '$pid_file') - if kill -0 \"\$pid\" 2>/dev/null; then - echo \"scheduler already running on $host pid=\$pid\" - exit 0 - fi - fi - DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \ - DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \ - DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \ - setsid '$PYTHON_BIN' -m distributed.cli.dask_scheduler \ - --host '$SCHEDULER_HOST' \ - --port '$SCHEDULER_PORT' \ - --dashboard-address '$DASHBOARD_ADDRESS' \ - > '$log' 2>&1 < /dev/null & - pid=\$! - echo \"\$pid\" > '$pid_file' - echo \"scheduler host=$host pid=\$pid addr=$SCHEDULER_ADDR log=$log\" - " -} - -start_worker() { - local host="$1" - local log="$LOG_DIR/worker_${host}.log" - local pid_file="$LOG_DIR/worker_${host}.pid" - run_on_host "$host" " - set -euo pipefail - cd '$ROOT_DIR' - mkdir -p '$LOG_DIR' '$LOCAL_DIRECTORY' - if [[ -s '$pid_file' ]]; then - pid=\$(cat '$pid_file') - if kill -0 \"\$pid\" 2>/dev/null; then - echo \"worker already running on $host pid=\$pid\" - exit 0 - fi - fi - TCM_ENABLE=1 \ - DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \ - DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \ - DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \ - setsid '$PYTHON_BIN' -m distributed.cli.dask_worker \ - '$SCHEDULER_ADDR' \ - --host '$host' \ - --nworkers '$NWORKERS' \ - --nthreads '$NTHREADS' \ - --memory-limit '$MEMORY_LIMIT' \ - --local-directory '$LOCAL_DIRECTORY' \ - > '$log' 2>&1 < /dev/null & - pid=\$! - echo \"\$pid\" > '$pid_file' - echo \"worker host=$host pid=\$pid scheduler=$SCHEDULER_ADDR log=$log\" - " -} - -stop_host() { - local host="$1" - local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid" - local worker_pid_file="$LOG_DIR/worker_${host}.pid" - run_on_host "$host" " - set +e - for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do - [[ -f \"\$pid_file\" ]] || continue - if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then - continue - fi - pid=\$(cat \"\$pid_file\") - kill \"\$pid\" 2>/dev/null || true - rm -f \"\$pid_file\" - done - pkill -f '[d]istributed.cli.dask_worker.*$SCHEDULER_ADDR' - pkill -f '[d]istributed.cli.dask_scheduler.*--port $SCHEDULER_PORT' - true - " -} - -status_host() { - local host="$1" - local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid" - local worker_pid_file="$LOG_DIR/worker_${host}.pid" - echo "--------------------------------------------------------------------------------" - echo "host=$host" - run_on_host "$host" " - set +e - for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do - [[ -f \"\$pid_file\" ]] || continue - if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then - continue - fi - pid=\$(cat \"\$pid_file\") - if kill -0 \"\$pid\" 2>/dev/null; then - ps -p \"\$pid\" -o pid,ppid,stat,etime,cmd --no-headers - else - echo \"stale pid_file=\$pid_file pid=\$pid\" - fi - done - pgrep -af '[d]istributed.cli.dask' || true - " -} - -case "${1:-help}" in - start) - start_scheduler - sleep 2 - for host in $WORKER_HOSTS; do - start_worker "$host" - done - echo - echo "Dask scheduler: $SCHEDULER_ADDR" - echo "Dashboard: http://$SCHEDULER_HOST$DASHBOARD_ADDRESS" - ;; - stop) - for host in $WORKER_HOSTS; do - stop_host "$host" - done - stop_host "$SCHEDULER_HOST" - ;; - status) - status_host "$SCHEDULER_HOST" - for host in $WORKER_HOSTS; do - [[ "$host" == "$SCHEDULER_HOST" ]] && continue - status_host "$host" - done - ;; - restart) - "$0" stop - sleep 2 - "$0" start - ;; - help|*) - cat < 0 else float("inf") - print( - f"{label} wall={wall:.3f}s cpu={cpu:.3f}s cpu_over_wall={ratio:.2f} " - f"checksum={checksum:.6e}", - flush=True, - ) - - -def _visible_numa_nodes(): - nodes = [] - for path in sorted(Path("/sys/devices/system/node").glob("node[0-9]*")): - cpulist = path / "cpulist" - if cpulist.exists(): - nodes.append(f"{path.name}:{cpulist.read_text(encoding='utf-8').strip()}") - return ",".join(nodes) if nodes else "unknown" - - -def _dtype_nbytes(name): - return { - "float32": 4, - "float64": 8, - "complex64": 8, - "complex128": 16, - }[name] - - -def _format_gib(nbytes): - return f"{nbytes / (1024 ** 3):.2f}GiB" - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--threads", type=int, default=48) - parser.add_argument("--n", type=int, default=4096) - parser.add_argument("--iters", type=int, default=4) - parser.add_argument("--dtype", choices=("float32", "float64", "complex64", "complex128"), default="float32") - parser.add_argument("--op", choices=("matmul", "tensordot", "both"), default="both") - parser.add_argument( - "--affinity-only", - action="store_true", - help="Print MPI/torch placement diagnostics without allocating tensors.", - ) - args = parser.parse_args() - - os.environ.setdefault("OMP_NUM_THREADS", str(args.threads)) - os.environ.setdefault("MKL_NUM_THREADS", str(args.threads)) - os.environ.setdefault("OMP_PROC_BIND", "close") - os.environ.setdefault("OMP_PLACES", "cores") - - import torch - - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - size = comm.Get_size() - - torch.set_num_threads(args.threads) - try: - torch.set_num_interop_threads(1) - except Exception: - pass - - dtype = _dtype_from_name(args.dtype) - affinity = sorted(os.sched_getaffinity(0)) - allowed_list = "" - try: - with open("/proc/self/status", encoding="utf-8") as f: - for line in f: - if line.startswith("Cpus_allowed_list:"): - allowed_list = line.split(":", 1)[1].strip() - break - except OSError: - pass - - print( - f"rank={rank}/{size} host={socket.gethostname()} pid={os.getpid()} " - f"affinity_len={len(affinity)} allowed={allowed_list} " - f"torch_threads={torch.get_num_threads()} " - f"torch_interop={torch.get_num_interop_threads()} " - f"OMP_NUM_THREADS={os.environ.get('OMP_NUM_THREADS')} " - f"MKL_NUM_THREADS={os.environ.get('MKL_NUM_THREADS')} " - f"OMP_PROC_BIND={os.environ.get('OMP_PROC_BIND')} " - f"OMP_PLACES={os.environ.get('OMP_PLACES')} " - f"visible_numa={_visible_numa_nodes()}", - flush=True, - ) - - if rank == 0: - print(torch.__config__.parallel_info(), flush=True) - input_bytes = args.n * args.n * _dtype_nbytes(args.dtype) - min_live_bytes = 3 * input_bytes - print( - f"matrix_n={args.n} dtype={args.dtype} " - f"one_matrix={_format_gib(input_bytes)} " - f"approx_min_live_per_rank={_format_gib(min_live_bytes)} " - f"approx_min_live_all_ranks={_format_gib(min_live_bytes * size)}", - flush=True, - ) - comm.Barrier() - if args.affinity_only: - return - - a = _make_tensor((args.n, args.n), dtype) - b = _make_tensor((args.n, args.n), dtype) - - def run_matmul(): - value = (a @ b).sum() - return value.real.item() if value.is_complex() else value.item() - - def run_tensordot(): - value = torch.tensordot(a, b, dims=1) - value = value.sum() - return value.real.item() if value.is_complex() else value.item() - - if args.op in ("matmul", "both"): - _bench("matmul", run_matmul, args.iters) - if args.op in ("tensordot", "both"): - _bench("tensordot", run_tensordot, args.iters) - - -if __name__ == "__main__": - main() diff --git a/tools/mps_contest_runner.py b/tools/mps_contest_runner.py deleted file mode 100644 index 353cc3e..0000000 --- a/tools/mps_contest_runner.py +++ /dev/null @@ -1,313 +0,0 @@ -#!/usr/bin/env python -"""Contest-style multi-node Vidal/MPS expectation runner.""" - -from __future__ import annotations - -import argparse -import math -import sys -import time -from dataclasses import dataclass -from pathlib import Path - -import numpy as np -from mpi4py import MPI -from qibo import Circuit, gates, hamiltonians -from qibo.symbols import X, Y, Z - -ROOT = Path(__file__).resolve().parents[1] -SRC = ROOT / "src" -if str(SRC) not in sys.path: - sys.path.insert(0, str(SRC)) - -from qibotn.backends.vidal import VidalBackend # noqa: E402 -from qibotn.expectation_runner import exact_for_observable # noqa: E402 - - -@dataclass(frozen=True) -class CaseSpec: - circuit_kind: str - observables: tuple[str, ...] - nqubits: int - nlayers: int - bond: int | None - seed: int - - -CASES = { - "main1": CaseSpec( - circuit_kind="reversed_cnot", - observables=("ring_xz",), - nqubits=128, - nlayers=24, - bond=512, - seed=31001, - ), - "main2": CaseSpec( - circuit_kind="rxx_rzz", - observables=("open_zz", "range2_xx", "mixed_local"), - nqubits=128, - nlayers=32, - bond=1024, - seed=31002, - ), - "strong": CaseSpec( - circuit_kind="scramble", - observables=("ring_xz", "long_z_string", "dense3_spread"), - nqubits=256, - nlayers=48, - bond=2048, - seed=41001, - ), -} - - -def optional_int(text): - if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: - return None - return int(text) - - -def optional_float(text): - if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: - return None - return float(text) - - -def format_optional(value, fmt="g"): - return "None" if value is None else format(value, fmt) - - -def set_torch_threads(nthreads): - try: - import torch - - torch.set_num_threads(nthreads) - except Exception: - pass - - -def add_single_qubit_layer(circuit, nqubits, rng, include_rx=False): - for qubit in range(nqubits): - circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi))) - circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi))) - if include_rx: - circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi))) - - -def build_circuit(kind, nqubits, nlayers, seed): - rng = np.random.default_rng(seed) - circuit = Circuit(nqubits) - - for layer in range(nlayers): - if kind == "reversed_cnot": - add_single_qubit_layer(circuit, nqubits, rng) - for qubit in range(0, nqubits - 1, 2): - gate = gates.CNOT(qubit + 1, qubit) if layer % 2 else gates.CNOT(qubit, qubit + 1) - circuit.add(gate) - for qubit in range(1, nqubits - 1, 2): - gate = gates.CNOT(qubit + 1, qubit) if layer % 2 == 0 else gates.CNOT(qubit, qubit + 1) - circuit.add(gate) - - elif kind == "rxx_rzz": - add_single_qubit_layer(circuit, nqubits, rng, include_rx=True) - for qubit in range(layer % 2, nqubits - 1, 2): - circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9))) - circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9))) - - elif kind == "scramble": - add_single_qubit_layer(circuit, nqubits, rng, include_rx=True) - for qubit in range(layer % 2, nqubits - 1, 2): - circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8))) - circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8))) - if layer % 5 == 4: - circuit.add(gates.SWAP(qubit, qubit + 1)) - - else: - raise ValueError(f"Unknown circuit kind {kind!r}.") - - return circuit - - -def dense_observable(nqubits, qubits, seed, dim): - del nqubits - rng = np.random.default_rng(seed) - raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim)) - matrix = (raw + raw.conj().T) / 2.0 - matrix = matrix / np.linalg.norm(matrix) - return {"matrix": matrix, "qubits": list(qubits)} - - -def observable(kind, nqubits, seed): - q1 = nqubits // 4 - q2 = nqubits // 2 - q3 = (3 * nqubits) // 4 - last = nqubits - 1 - - if kind == "boundary_ZZ_q1": - return hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1)) - if kind == "boundary_ZZ_q2": - return hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2)) - if kind == "boundary_ZZ_q3": - return hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3)) - if kind == "long_Z_5_sites": - return hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last)) - if kind == "mixed_XZYZX": - return hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last)) - if kind == "ring_xz": - form = 0 - for qubit in range(nqubits): - form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits) - return hamiltonians.SymbolicHamiltonian(form=form) - if kind == "open_zz": - form = 0 - for qubit in range(nqubits - 1): - form += (1.0 / max(1, nqubits - 1)) * Z(qubit) * Z(qubit + 1) - return hamiltonians.SymbolicHamiltonian(form=form) - if kind == "range2_xx": - form = 0 - for qubit in range(nqubits - 2): - form += (1.0 / max(1, nqubits - 2)) * X(qubit) * X(qubit + 2) - return hamiltonians.SymbolicHamiltonian(form=form) - if kind == "mixed_local": - form = 0.25 * X(0) - 0.5 * Z(last) + 0.125 * X(q1) * Z(q2) * Y(q3) - return hamiltonians.SymbolicHamiltonian(form=form) - if kind == "complex_iZ0": - return hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0)) - if kind == "dense2_mid": - return dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4) - if kind == "dense3_spread": - return dense_observable(nqubits, (q1, q2, q3), seed + 202, 8) - raise ValueError(f"Unknown observable kind {kind!r}.") - - -def selected_observables(args, case): - if args.observables: - return tuple(args.observables) - if args.obs_filter: - return tuple(x.strip() for x in args.obs_filter.split(",") if x.strip()) - return case.observables - - -def apply_case_defaults(args): - case = CASES[args.case] - if args.nqubits is None: - args.nqubits = case.nqubits - if args.nlayers is None: - args.nlayers = case.nlayers - if args.bond == "case-default": - args.bond = case.bond - if args.seed is None: - args.seed = case.seed - args.observables = selected_observables(args, case) - - -def run_case(args): - set_torch_threads(args.torch_threads) - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - size = comm.Get_size() - - case = CASES[args.case] - circuit = build_circuit(case.circuit_kind, args.nqubits, args.nlayers, args.seed) - - if rank == 0: - print("=" * 88, flush=True) - print( - "backend=vidal_mps " - f"case={args.case} circuit={case.circuit_kind} ranks={size} " - f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} " - f"bond={format_optional(args.bond)} cut_ratio={format_optional(args.cut_ratio)} " - f"torch_threads={args.torch_threads} seed={args.seed} " - f"observables={','.join(args.observables)}", - flush=True, - ) - print("observable exact value abs_error rel_error seconds trunc_sum trunc_max status", flush=True) - - for obs_name in args.observables: - obs = observable(obs_name, args.nqubits, args.seed) - exact = None - if args.exact and rank == 0: - if args.nqubits > args.exact_max_qubits: - raise ValueError( - f"--exact is limited to {args.exact_max_qubits} qubits by default." - ) - exact = exact_for_observable(circuit, obs, args.nqubits) - - backend = VidalBackend() - backend.configure_tn_simulation( - max_bond_dimension=args.bond, - cut_ratio=args.cut_ratio, - tensor_module="torch", - mpi_approach="CT", - mpi_num_procs=size, - fallback=False, - ) - - comm.Barrier() - start = time.perf_counter() - try: - value = backend.expectation( - circuit, - obs, - preprocess=True, - compile_circuit=False, - ) - status = "ok" - except Exception as exc: - value = np.nan - status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0] - seconds = time.perf_counter() - start - - if rank == 0: - abs_error = float("nan") if exact is None else abs(value - exact) - rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15) - exact_text = "nan" if exact is None else f"{exact:.16e}" - print( - f"{obs_name} {exact_text} {value!r} " - f"{abs_error:.6e} {rel_error:.6e} {seconds:.3f} " - f"{backend.last_truncation_error:.6e} " - f"{backend.last_max_truncation_error:.6e} {status}", - flush=True, - ) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("mode", choices=("run", "validate", "list")) - parser.add_argument("--case", choices=sorted(CASES), default="main1") - parser.add_argument("--observables", nargs="+") - parser.add_argument("--obs-filter", default="") - parser.add_argument("--nqubits", type=int) - parser.add_argument("--nlayers", type=int) - parser.add_argument("--bond", "--bonds", dest="bond", default="case-default") - parser.add_argument("--cut-ratio", type=optional_float, default=1e-12) - parser.add_argument("--seed", type=int) - parser.add_argument("--torch-threads", type=int, default=8) - parser.add_argument("--exact", action="store_true") - parser.add_argument("--exact-max-qubits", type=int, default=24) - args = parser.parse_args() - - if args.mode == "list": - for name, case in CASES.items(): - print( - f"{name}: circuit={case.circuit_kind} " - f"observables={','.join(case.observables)} " - f"nqubits={case.nqubits} nlayers={case.nlayers} " - f"bond={case.bond} seed={case.seed}" - ) - return - - apply_case_defaults(args) - if isinstance(args.bond, str): - args.bond = optional_int(args.bond) - - if args.mode == "validate": - args.exact = True - args.nqubits = min(args.nqubits, args.exact_max_qubits) - - run_case(args) - - -if __name__ == "__main__": - main() diff --git a/tools/profile_vidal_chrome.py b/tools/profile_vidal_chrome.py deleted file mode 100644 index bf22276..0000000 --- a/tools/profile_vidal_chrome.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Chrome trace profiler for the VidalBackend fast path.""" - -from __future__ import annotations - -import argparse -from pathlib import Path - -import torch -from torch.profiler import ProfilerActivity, profile - -from qibotn.benchmark_cases import build_circuit, terms_to_dict, observable_terms -from qibotn.expectation_runner import ExpectationConfig, run_cpu_expectation - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--nqubits", type=int, default=34) - parser.add_argument("--nlayers", type=int, default=20) - parser.add_argument("--bond", type=int, default=512) - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--torch-threads", type=int, default=32) - parser.add_argument("--cut-ratio", type=float, default=1e-12) - parser.add_argument("--profile-memory", action="store_true") - parser.add_argument("--rows", type=int, default=60) - args = parser.parse_args() - - torch.set_num_threads(args.torch_threads) - - prefix = f"profiles/vidal_n{args.nqubits}_l{args.nlayers}_b{args.bond}_t{args.torch_threads}" - trace_path = Path(f"{prefix}.json") - table_path = Path(f"{prefix}.txt") - trace_path.parent.mkdir(parents=True, exist_ok=True) - - circuit = build_circuit("brickwall_cnot", args.nqubits, args.nlayers, args.seed) - observable = terms_to_dict(observable_terms("ring_xz", args.nqubits)) - config = ExpectationConfig( - ansatz="mps", - bond=args.bond, - cut_ratio=args.cut_ratio, - tensor_module="torch", - torch_threads=args.torch_threads, - ) - - print( - f"profile vidal nqubits={args.nqubits} nlayers={args.nlayers} " - f"bond={args.bond} threads={args.torch_threads}" - ) - - with profile( - activities=[ProfilerActivity.CPU], - record_shapes=args.profile_memory, - profile_memory=args.profile_memory, - with_stack=args.profile_memory, - ) as prof: - result = run_cpu_expectation(circuit, observable, config) - - table = ( - f"expval={result.value:.16e}\n\n" - f"# sorted by self_cpu_time_total\n" - f"{prof.key_averages().table(sort_by='self_cpu_time_total', row_limit=args.rows)}\n\n" - f"# sorted by cpu_time_total\n" - f"{prof.key_averages().table(sort_by='cpu_time_total', row_limit=args.rows)}\n" - ) - - print(table, end="") - table_path.write_text(table, encoding="utf-8") - prof.export_chrome_trace(str(trace_path)) - print(f"trace={trace_path}\ntable={table_path}") - - -if __name__ == "__main__": - main() diff --git a/tools/qibojit_reference_expectation.py b/tools/qibojit_reference_expectation.py deleted file mode 100644 index 429855a..0000000 --- a/tools/qibojit_reference_expectation.py +++ /dev/null @@ -1,109 +0,0 @@ -"""Compute and cache a qibojit state-vector reference for the ring-XZ observable.""" - -import argparse -import json -import math -import time -from pathlib import Path - -import numpy as np -import qibo -from qibo import Circuit, gates - - -def build_circuit(nqubits, nlayers, seed): - rng = np.random.default_rng(seed) - circuit = Circuit(nqubits) - for _ in range(nlayers): - for qubit in range(nqubits): - circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi))) - circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi))) - for qubit in range(0, nqubits - 1, 2): - circuit.add(gates.CNOT(qubit, qubit + 1)) - for qubit in range(1, nqubits - 1, 2): - circuit.add(gates.CNOT(qubit, qubit + 1)) - return circuit - - -def ring_xz_expectation(state, nqubits, chunk_size): - value = 0.0 - for qubit in range(nqubits): - next_qubit = (qubit + 1) % nqubits - x_flip = 1 << (nqubits - 1 - qubit) - z_shift = nqubits - 1 - next_qubit - term = 0.0 - for start in range(0, state.size, chunk_size): - stop = min(start + chunk_size, state.size) - indices = np.arange(start, stop, dtype=np.int64) - z_bit = (indices >> z_shift) & 1 - z_phase = 1 - 2 * z_bit - term += np.vdot(state[indices ^ x_flip], z_phase * state[start:stop]).real - value += 0.5 * term - return float(value) - - -def default_output_path(nqubits, nlayers, seed): - return Path("references") / ( - f"qibojit_ring_xz_n{nqubits}_l{nlayers}_seed{seed}.json" - ) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--nqubits", type=int, default=32) - parser.add_argument("--nlayers", type=int, default=3) - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--output") - parser.add_argument("--force", action="store_true") - parser.add_argument("--allow-large", action="store_true") - parser.add_argument("--max-state-gb", type=float, default=32.0) - parser.add_argument("--chunk-size", type=int, default=1 << 20) - args = parser.parse_args() - - output = Path(args.output) if args.output else default_output_path( - args.nqubits, args.nlayers, args.seed - ) - if output.exists() and not args.force: - with open(output, "r", encoding="utf-8") as f: - data = json.load(f) - print(f"loaded {output}") - print(f"expectation={float(data['expectation']):.16e}") - return - - state_gb = (2**args.nqubits) * np.dtype(np.complex128).itemsize / (1024**3) - if state_gb > args.max_state_gb and not args.allow_large: - raise MemoryError( - f"Estimated state vector alone is {state_gb:.1f} GiB. " - "Pass --allow-large after confirming the node has enough memory." - ) - - qibo.set_backend("qibojit") - circuit = build_circuit(args.nqubits, args.nlayers, args.seed) - - start = time.perf_counter() - state = circuit().state(numpy=True).reshape(-1) - expectation = ring_xz_expectation(state, args.nqubits, args.chunk_size) - elapsed = time.perf_counter() - start - - data = { - "backend": "qibojit", - "observable": "0.5 * sum_i X_i Z_((i+1) mod n)", - "nqubits": args.nqubits, - "nlayers": args.nlayers, - "seed": args.seed, - "expectation": expectation, - "seconds": elapsed, - "state_vector_gib_estimate": state_gb, - } - output.parent.mkdir(parents=True, exist_ok=True) - with open(output, "w", encoding="utf-8") as f: - json.dump(data, f, indent=2, sort_keys=True) - f.write("\n") - - print(f"saved {output}") - print(f"expectation={expectation:.16e}") - print(f"seconds={elapsed:.3f}") - - -if __name__ == "__main__": - main() diff --git a/tools/qibotn_torch_mt_env.sh b/tools/qibotn_torch_mt_env.sh deleted file mode 100644 index 838cdef..0000000 --- a/tools/qibotn_torch_mt_env.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash -# Shared runtime setup for CPU torch TN/MPS runs. -# -# This makes AOCL BLIS use the multithreaded library when available, which is -# required for complex64 tensordot/cgemm to actually use all cores on this host. - -QIBOTN_BLIS_MT="${QIBOTN_BLIS_MT:-/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5}" - -export BLIS_NUM_THREADS="${BLIS_NUM_THREADS:-${OMP_NUM_THREADS:-1}}" - -if [[ -f "$QIBOTN_BLIS_MT" ]]; then - case ":${LD_PRELOAD:-}:" in - *":$QIBOTN_BLIS_MT:"*) - ;; - *) - export LD_PRELOAD="${LD_PRELOAD:+$LD_PRELOAD:}$QIBOTN_BLIS_MT" - ;; - esac -fi - -export OMP_PROC_BIND="${OMP_PROC_BIND:-close}" -export OMP_PLACES="${OMP_PLACES:-cores}" diff --git a/tools/run_cpu_large_cases.sh b/tools/run_cpu_large_cases.sh deleted file mode 100755 index 59be311..0000000 --- a/tools/run_cpu_large_cases.sh +++ /dev/null @@ -1,128 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Large CPU expectation benchmarks for two-server runs. -# -# Defaults assume two Intel Xeon Platinum 8558P servers with about 500 GiB RAM -# each. Override HOSTFILE, PYTHON_BIN, MPIEXEC, or the per-case knobs below as -# needed. - -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -cd "$ROOT_DIR" - -PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" -MPIEXEC="${MPIEXEC:-mpiexec}" -HOSTFILE="${HOSTFILE:-hostfile}" - -MPS_RANKS="${MPS_RANKS:-8}" -MPS_THREADS="${MPS_THREADS:-12}" -TN_RANKS="${TN_RANKS:-12}" -TN_THREADS="${TN_THREADS:-8}" - -export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}" -export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}" -source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh" - -run_mpi() { - local ranks="$1" - shift - "$MPIEXEC" -hostfile "$HOSTFILE" -n "$ranks" "$PYTHON_BIN" "$@" -} - -run_case() { - local title="$1" - shift - echo - echo "================================================================================" - echo "$title" - echo "================================================================================" - echo "HOSTFILE=$HOSTFILE PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC" - echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS" - echo "$*" - "$@" -} - -case "${1:-help}" in - smoke) - run_case "MPS MPI smoke: n=40 layers=30 bond=2048" \ - run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \ - --mpi --mps \ - --nqubits "${MPS_SMOKE_NQ:-40}" \ - --nlayers "${MPS_SMOKE_LAYERS:-30}" \ - --bond "${MPS_SMOKE_BOND:-2048}" \ - --torch-threads "$MPS_THREADS" \ - --circuits brickwall_cnot reversed_cnot shifted_cz \ - --observables ring_xz open_zz range2_xx - - run_case "TN MPI smoke: n=32 layers=16 target_slices=12" \ - run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \ - --mpi \ - --nqubits "${TN_SMOKE_NQ:-32}" \ - --nlayers "${TN_SMOKE_LAYERS:-16}" \ - --torch-threads "$TN_THREADS" \ - --circuits brickwall_cnot shifted_cz rxx_rzz \ - --observables ring_xz open_zz range2_xx \ - --tn-target-slices "${TN_SMOKE_SLICES:-12}" - ;; - - mps-long) - run_case "MPS MPI long: n=64 layers=48 bond=4096" \ - run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \ - --mpi --mps \ - --nqubits "${MPS_LONG_NQ:-64}" \ - --nlayers "${MPS_LONG_LAYERS:-48}" \ - --bond "${MPS_LONG_BOND:-4096}" \ - --torch-threads "$MPS_THREADS" \ - --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \ - --observables ring_xz open_zz mixed_local range2_xx - ;; - - mps-pressure) - run_case "MPS MPI pressure: n=80 layers=64 bond=4096" \ - run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \ - --mpi --mps \ - --nqubits "${MPS_PRESSURE_NQ:-80}" \ - --nlayers "${MPS_PRESSURE_LAYERS:-64}" \ - --bond "${MPS_PRESSURE_BOND:-4096}" \ - --torch-threads "$MPS_THREADS" \ - --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz swap_scramble \ - --observables ring_xz open_zz mixed_local range2_xx long_z_string - ;; - - tn-long) - run_case "TN MPI long: n=36 layers=20 target_slices=24" \ - run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \ - --mpi \ - --nqubits "${TN_LONG_NQ:-36}" \ - --nlayers "${TN_LONG_LAYERS:-20}" \ - --torch-threads "$TN_THREADS" \ - --circuits brickwall_cnot shifted_cz rxx_rzz \ - --observables ring_xz open_zz range2_xx \ - --tn-target-slices "${TN_LONG_SLICES:-24}" - ;; - - all) - "$0" smoke - "$0" mps-long - "$0" tn-long - ;; - - help|*) - cat >&2 <<'EOF' -Usage: tools/run_cpu_large_cases.sh [smoke|mps-long|mps-pressure|tn-long|all] - -Common overrides: - HOSTFILE=hostfile - PYTHON_BIN=.venv/bin/python - MPIEXEC=mpiexec - MPS_RANKS=8 MPS_THREADS=12 - TN_RANKS=12 TN_THREADS=8 - -Scale overrides: - MPS_LONG_NQ=64 MPS_LONG_LAYERS=48 MPS_LONG_BOND=4096 - MPS_PRESSURE_NQ=80 MPS_PRESSURE_LAYERS=64 MPS_PRESSURE_BOND=4096 - TN_LONG_NQ=36 TN_LONG_LAYERS=20 TN_LONG_SLICES=24 -EOF - exit 2 - ;; -esac diff --git a/tools/run_cpu_single_cases.sh b/tools/run_cpu_single_cases.sh deleted file mode 100755 index b7f23e7..0000000 --- a/tools/run_cpu_single_cases.sh +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Single-node CPU scale probes for expectation benchmarks. -# -# Intended for one 96-core / ~500 GiB RAM node. The default "probe" mode runs -# moderate MPS and TN cases first. Larger modes are available after checking -# runtime and memory from the probe output. - -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -cd "$ROOT_DIR" - -PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" -PYTHON_FLAGS="${PYTHON_FLAGS:--u}" -MPIEXEC="${MPIEXEC:-mpiexec}" -TIME_BIN="${TIME_BIN:-/usr/bin/time}" - -MPS_RANKS="${MPS_RANKS:-8}" -MPS_THREADS="${MPS_THREADS:-12}" -TN_RANKS="${TN_RANKS:-8}" -TN_THREADS="${TN_THREADS:-12}" - -export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}" -export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}" -source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh" - -estimate_mps_memory() { - local nqubits="$1" - local bond="$2" - "$PYTHON_BIN" - "$nqubits" "$bond" "$MPS_RANKS" <<'PY' -import sys -n = int(sys.argv[1]) -chi = int(sys.argv[2]) -ranks = int(sys.argv[3]) -resident = n * 2 * chi * chi * 16 -per_rank = resident / ranks -print( - "MPS rough resident memory: " - f"total={resident / 1024**3:.1f} GiB " - f"per_rank={per_rank / 1024**3:.1f} GiB " - "(temporary eig/SVD workspaces are additional)" -) -PY -} - -run_timed() { - echo - echo "--------------------------------------------------------------------------------" - echo "$*" - echo "--------------------------------------------------------------------------------" - "$TIME_BIN" -v "$@" -} - -run_mps_case() { - local label="$1" - local nqubits="$2" - local nlayers="$3" - local bond="$4" - shift 4 - echo - echo "================================================================================" - echo "$label" - echo "================================================================================" - echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC" - echo "MPS_RANKS=$MPS_RANKS MPS_THREADS=$MPS_THREADS" - echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS" - estimate_mps_memory "$nqubits" "$bond" - run_timed "$MPIEXEC" -n "$MPS_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \ - --mpi --mps \ - --nqubits "$nqubits" \ - --nlayers "$nlayers" \ - --bond "$bond" \ - --torch-threads "$MPS_THREADS" \ - "$@" -} - -run_tn_case() { - local label="$1" - local nqubits="$2" - local nlayers="$3" - shift 3 - echo - echo "================================================================================" - echo "$label" - echo "================================================================================" - echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC" - echo "TN_RANKS=$TN_RANKS TN_THREADS=$TN_THREADS" - echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS" - echo "TN memory is contraction-tree dependent; increase --tn-target-slices if RSS is high." - run_timed "$MPIEXEC" -n "$TN_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \ - --mpi \ - --nqubits "$nqubits" \ - --nlayers "$nlayers" \ - --torch-threads "$TN_THREADS" \ - "$@" -} - -case "${1:-help}" in - probe) - run_mps_case "MPS probe: n=40 layers=30 bond=2048" 40 30 2048 \ - --circuits brickwall_cnot \ - --observables ring_xz - - run_tn_case "TN probe: n=28 layers=12 target_slices=8" 28 12 \ - --circuits brickwall_cnot \ - --observables ring_xz \ - --tn-target-slices 8 - ;; - - mps-medium) - run_mps_case "MPS medium: n=56 layers=40 bond=3072" 56 40 3072 \ - --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \ - --observables ring_xz open_zz mixed_local range2_xx - ;; - - mps-long) - run_mps_case "MPS long: n=64 layers=48 bond=4096" 64 48 4096 \ - --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \ - --observables ring_xz open_zz mixed_local range2_xx - ;; - - tn-medium) - run_tn_case "TN medium: n=32 layers=16 target_slices=16" 32 16 \ - --circuits brickwall_cnot shifted_cz rxx_rzz \ - --observables ring_xz open_zz range2_xx \ - --tn-target-slices 16 - ;; - - tn-long) - run_tn_case "TN long: n=36 layers=20 target_slices=32" 36 20 \ - --circuits brickwall_cnot shifted_cz rxx_rzz \ - --observables ring_xz open_zz range2_xx \ - --tn-target-slices 32 - ;; - - help|*) - cat >&2 <<'EOF' -Usage: tools/run_cpu_single_cases.sh [probe|mps-medium|mps-long|tn-medium|tn-long] - -Common overrides: - PYTHON_BIN=.venv/bin/python - MPIEXEC=mpiexec - MPS_RANKS=8 MPS_THREADS=12 - TN_RANKS=8 TN_THREADS=12 - OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 -EOF - exit 2 - ;; -esac diff --git a/tools/run_tn_custom.py b/tools/run_tn_custom.py deleted file mode 100644 index 049ebed..0000000 --- a/tools/run_tn_custom.py +++ /dev/null @@ -1,243 +0,0 @@ -#!/usr/bin/env python -"""Run TN expectation for a user-provided circuit and observable. - -The case module should define: - - def build_circuit(nqubits, nlayers, seed): ... - def build_observable(nqubits, seed): ... - -``build_observable`` may return a Qibo SymbolicHamiltonian/form or the qibotn -dict form: - - {"terms": [ - {"coefficient": 1.0, "operators": [("X", 0), ("Z", 1)]}, - ]} - -For a single repeated Pauli string, pass ``--pauli-pattern`` instead of -defining ``build_observable``. -""" - -from __future__ import annotations - -import argparse -import importlib.util -import inspect -import json -import sys -from pathlib import Path - -ROOT = Path(__file__).resolve().parents[1] -SRC = ROOT / "src" -if str(SRC) not in sys.path: - sys.path.insert(0, str(SRC)) - -from qibotn.expectation_runner import ( # noqa: E402 - ExpectationConfig, - exact_for_observable, - run_cpu_expectation, -) - - -def optional_int(text): - if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: - return None - return int(text) - - -def optional_float(text): - if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: - return None - return float(text) - - -def load_module(path): - path = Path(path).resolve() - spec = importlib.util.spec_from_file_location(path.stem, path) - if spec is None or spec.loader is None: - raise RuntimeError(f"Cannot import case module from {path}.") - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module - - -def call_builder(fn, **kwargs): - sig = inspect.signature(fn) - if any(p.kind == p.VAR_KEYWORD for p in sig.parameters.values()): - return fn(**kwargs) - accepted = { - name: value - for name, value in kwargs.items() - if name in sig.parameters - } - return fn(**accepted) - - -def load_observable(args, module): - if args.pauli_pattern: - return {"pauli_string_pattern": args.pauli_pattern} - if args.observable_json: - with Path(args.observable_json).open() as f: - return json.load(f) - if hasattr(module, "build_observable"): - return call_builder( - module.build_observable, - nqubits=args.nqubits, - nlayers=args.nlayers, - seed=args.seed, - ) - if hasattr(module, "OBSERVABLE"): - return module.OBSERVABLE - raise ValueError( - "No observable supplied. Define build_observable/OBSERVABLE in the case " - "module, or pass --pauli-pattern / --observable-json." - ) - - -def build_parallel_opts(args): - slicing_opts = {} - if args.tn_target_slices is not None: - slicing_opts["target_slices"] = args.tn_target_slices - if args.tn_target_size is not None: - slicing_opts["target_size"] = args.tn_target_size - - opts = { - "slicing_opts": slicing_opts or None, - "search_workers": args.tn_search_workers or args.torch_threads, - "max_repeats": args.tn_search_repeats, - "max_time": args.tn_search_time, - "print_stats": not args.no_tn_stats, - } - if args.tn_search_backend is not None: - opts["search_backend"] = args.tn_search_backend - if args.dask_address is not None: - opts["dask_address"] = args.dask_address - if args.dask_close_workers: - opts["dask_close_workers"] = True - if args.tn_save_tree is not None: - opts["save_tree_path"] = args.tn_save_tree - if args.tn_load_tree is not None: - opts["load_tree_path"] = args.tn_load_tree - if args.tn_search_only: - opts["search_only"] = True - return opts - - -def main(): - parser = argparse.ArgumentParser( - description="Run CPU TN expectation for a custom qibo circuit module." - ) - parser.add_argument("case_module", help="Python file defining build_circuit.") - parser.add_argument("--nqubits", type=int, required=True) - parser.add_argument("--nlayers", type=int, default=0) - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--mpi", action="store_true") - parser.add_argument("--exact", action="store_true") - parser.add_argument("--exact-max-qubits", type=int, default=24) - parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024) - parser.add_argument("--cut-ratio", type=optional_float, default=1e-12) - parser.add_argument("--torch-threads", type=int, default=8) - parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch") - parser.add_argument("--dtype", choices=("complex128", "complex64"), default="complex128") - parser.add_argument("--pauli-pattern") - parser.add_argument("--observable-json") - parser.add_argument("--tn-target-slices", type=int) - parser.add_argument("--tn-target-size", type=int, default=2**32) - parser.add_argument("--tn-search-workers", type=int) - parser.add_argument("--tn-search-repeats", type=int, default=128) - parser.add_argument("--tn-search-time", type=float, default=60.0) - parser.add_argument("--tn-search-backend", choices=("processpool", "dask")) - parser.add_argument("--dask-address") - parser.add_argument("--dask-close-workers", action="store_true") - parser.add_argument("--tn-save-tree") - parser.add_argument("--tn-load-tree") - parser.add_argument("--tn-search-only", action="store_true") - parser.add_argument("--no-tn-stats", action="store_true") - args = parser.parse_args() - - rank = 0 - if args.mpi: - from mpi4py import MPI - - rank = MPI.COMM_WORLD.Get_rank() - - module = load_module(args.case_module) - if not hasattr(module, "build_circuit"): - raise ValueError("case_module must define build_circuit.") - - circuit = call_builder( - module.build_circuit, - nqubits=args.nqubits, - nlayers=args.nlayers, - seed=args.seed, - ) - observable = load_observable(args, module) - - config = ExpectationConfig( - ansatz="tn", - mpi=args.mpi, - bond=args.bond, - cut_ratio=args.cut_ratio, - tensor_module="torch", - quimb_backend=args.quimb_backend, - dtype=args.dtype, - torch_threads=args.torch_threads, - parallel_opts=build_parallel_opts(args), - ) - - if rank == 0: - mode = "MPI" if args.mpi else "serial" - print( - f"backend=cpu ansatz=TN mode={mode} case={Path(args.case_module).name} " - f"nqubits={args.nqubits} nlayers={args.nlayers} seed={args.seed} " - f"quimb_backend={args.quimb_backend} dtype={args.dtype} " - f"torch_threads={args.torch_threads}", - flush=True, - ) - print("observable exact value abs_error rel_error seconds", flush=True) - - exact = None - if args.exact and rank == 0: - if args.nqubits > args.exact_max_qubits: - raise ValueError( - f"--exact is limited to {args.exact_max_qubits} qubits by default." - ) - exact = exact_for_observable(circuit, observable, args.nqubits) - - result = run_cpu_expectation(circuit, observable, config) - if args.mpi and result.rank != 0: - return - - abs_error = float("nan") if exact is None else abs(result.value - exact) - rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15) - exact_text = "nan" if exact is None else f"{exact:.16e}" - print( - f"custom {exact_text} {result.value:.16e} " - f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}", - flush=True, - ) - - for stat in result.parallel_stats or (): - cost = stat["path_cost"] - search_stats = stat.get("search_stats", {}) - print( - "tn_term_summary " - f"term={stat.get('term_index', 0)} " - f"search_seconds={stat.get('search_seconds', float('nan')):.3f} " - f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} " - f"completed_trials={search_stats.get('completed_trials', 'na')} " - f"finite_trials={search_stats.get('finite_trials', 'na')} " - f"failed_trials={search_stats.get('failed_trials', 'na')} " - f"requested_trials={search_stats.get('requested_trials', 'na')} " - f"best_score={search_stats.get('best_score', float('nan')):.6g} " - f"slices={cost.get('slices')} " - f"log10_flops={cost.get('log10_flops', float('nan')):.3f} " - f"log10_write={cost.get('log10_write', float('nan')):.3f} " - f"log2_size={cost.get('log2_size', float('nan')):.3f} " - f"peak_memory_gib={cost.get('peak_memory_gib', float('nan')):.3g} " - f"rank_slices={stat.get('rank_slices')}", - flush=True, - ) - - -if __name__ == "__main__": - main() diff --git a/tools/run_tn_dask_mpi_all.sh b/tools/run_tn_dask_mpi_all.sh deleted file mode 100755 index b4ba0d1..0000000 --- a/tools/run_tn_dask_mpi_all.sh +++ /dev/null @@ -1,260 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -cd "$ROOT_DIR" - -CASE="${CASE:-main1}" -OBSERVABLES="${OBSERVABLES:-long_z_string}" -NQUBITS="${NQUBITS:-34}" -NLAYERS="${NLAYERS:-20}" -TORCH_THREADS="${TORCH_THREADS:-48}" -SEARCH_REPEATS="${SEARCH_REPEATS:-2048}" -SEARCH_TIME="${SEARCH_TIME:-300}" -TN_TARGET_SIZE="${TN_TARGET_SIZE:-17179869184}" -TN_TARGET_SLICES="${TN_TARGET_SLICES:-}" - -PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" -DTYPE="${DTYPE:-complex64}" -TREE_DIR="${TREE_DIR:-trees/contest_tn}" -DASK_ADDRESS="${DASK_ADDRESS:-tcp://10.20.1.103:8786}" -DASK_EXPECTED_WORKERS="${DASK_EXPECTED_WORKERS:-}" -DASK_WAIT_FOR_WORKERS="${DASK_WAIT_FOR_WORKERS:-1}" -DASK_WAIT_TIMEOUT="${DASK_WAIT_TIMEOUT:-600}" -TN_DEBUG_TRIALS="${TN_DEBUG_TRIALS:-0}" -MPIEXEC="${MPIEXEC:-mpirun}" -MPIEXEC_FULL="${MPIEXEC_FULL:-}" -MPI_HOSTS="${MPI_HOSTS:-}" -MPI_HOSTFILE="${MPI_HOSTFILE:-${HOSTFILE:-}}" -MPI_RANKS="${MPI_RANKS:-}" -MPI_PE="${MPI_PE:-$TORCH_THREADS}" -MPI_MAP_BY="${MPI_MAP_BY:-ppr:1:numa:PE=$MPI_PE}" -MPI_BIND_TO="${MPI_BIND_TO:-core}" -MPI_REPORT_BINDINGS="${MPI_REPORT_BINDINGS:-0}" -MPI_EXPORT_ENV="${MPI_EXPORT_ENV:-1}" -TN_CONTRACT_ENV_CHECK="${TN_CONTRACT_ENV_CHECK:-1}" -SYNC_TREES="${SYNC_TREES:-1}" -SYNC_HOSTS="${SYNC_HOSTS:-${WORKER_HOSTS:-}}" -SSH_BIN="${SSH_BIN:-ssh}" -DASK_CLUSTER_MANAGED="${DASK_CLUSTER_MANAGED:-0}" - -export TCM_ENABLE="${TCM_ENABLE:-1}" -export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$TORCH_THREADS}" -export MKL_NUM_THREADS="${MKL_NUM_THREADS:-$TORCH_THREADS}" -source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh" - -tn_slice_args=(--tn-target-size "$TN_TARGET_SIZE") -if [[ -n "$TN_TARGET_SLICES" ]]; then - tn_slice_args+=(--tn-target-slices "$TN_TARGET_SLICES") -fi - -cleanup_dask_cluster() { - local status=$? - if [[ "$DASK_CLUSTER_MANAGED" == "1" ]]; then - set +e - tools/manage_tn_dask_cluster.sh stop >/dev/null 2>&1 || true - fi - exit "$status" -} - -trap cleanup_dask_cluster EXIT INT TERM HUP - -sum_host_slots() { - local hosts="$1" - local total=0 - local item slots - IFS=',' read -r -a host_items <<< "$hosts" - for item in "${host_items[@]}"; do - if [[ "$item" == *:* ]]; then - slots="${item##*:}" - else - slots=1 - fi - total=$((total + slots)) - done - echo "$total" -} - -count_hosts() { - local hosts="$1" - local count=0 - local item - IFS=' ' read -r -a host_items <<< "$hosts" - for item in "${host_items[@]}"; do - [[ -n "$item" ]] && count=$((count + 1)) - done - echo "$count" -} - -wait_for_dask_workers() { - [[ "$DASK_WAIT_FOR_WORKERS" == "1" ]] || return 0 - local expected="$DASK_EXPECTED_WORKERS" - if [[ -z "$expected" && -n "$WORKER_HOSTS" ]]; then - expected=$(( $(count_hosts "$WORKER_HOSTS") * NWORKERS )) - fi - if [[ -z "$expected" || "$expected" -le 0 ]]; then - return 0 - fi - - echo "Waiting for Dask workers: expected=$expected timeout=${DASK_WAIT_TIMEOUT}s" - "$PYTHON_BIN" - "$DASK_ADDRESS" "$expected" "$DASK_WAIT_TIMEOUT" <<'PY' -import sys -import time -from distributed import Client - -address, expected, timeout = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]) -deadline = time.time() + timeout -client = Client(address) -try: - while True: - info = client.scheduler_info(n_workers=-1) - workers = info.get("workers", {}) - count = len(workers) - if count >= expected: - print(f"dask_workers_ready count={count} expected={expected}", flush=True) - break - if time.time() >= deadline: - print( - f"dask_workers_wait_timeout count={count} expected={expected}", - flush=True, - ) - break - time.sleep(2) -finally: - client.close() -PY -} - -append_mpi_env_args() { - [[ "$MPI_EXPORT_ENV" == "1" ]] || return 0 - mpi_prefix+=( - -x "LD_PRELOAD=${LD_PRELOAD:-}" - -x "BLIS_NUM_THREADS=$BLIS_NUM_THREADS" - -x "OMP_NUM_THREADS=$OMP_NUM_THREADS" - -x "MKL_NUM_THREADS=$MKL_NUM_THREADS" - -x "OMP_PROC_BIND=$OMP_PROC_BIND" - -x "OMP_PLACES=$OMP_PLACES" - ) -} - -build_mpi_prefix() { - if [[ -n "$MPIEXEC_FULL" ]]; then - # shellcheck disable=SC2206 - mpi_prefix=($MPIEXEC_FULL) - append_mpi_env_args - return - fi - - local ranks="$MPI_RANKS" - if [[ -z "$ranks" && -n "$MPI_HOSTS" ]]; then - ranks="$(sum_host_slots "$MPI_HOSTS")" - fi - if [[ -z "$ranks" ]]; then - ranks=2 - fi - - mpi_prefix=( - "$MPIEXEC" - --map-by "$MPI_MAP_BY" - --bind-to "$MPI_BIND_TO" - -np "$ranks" - ) - if [[ "$MPI_REPORT_BINDINGS" == "1" ]]; then - mpi_prefix+=(--report-bindings) - fi - append_mpi_env_args - if [[ -n "$MPI_HOSTS" ]]; then - mpi_prefix+=(-host "$MPI_HOSTS") - elif [[ -n "$MPI_HOSTFILE" ]]; then - mpi_prefix+=(-hostfile "$MPI_HOSTFILE") - fi -} - -is_local_host() { - local host="$1" - [[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0 - [[ "$host" == "$(hostname)" ]] && return 0 - [[ "$host" == "$(hostname -f 2>/dev/null || true)" ]] && return 0 - hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx "$host" -} - -sync_trees_to_hosts() { - [[ "$SYNC_TREES" == "1" ]] || return 0 - [[ -n "$SYNC_HOSTS" ]] || return 0 - - local src_dir="$TREE_DIR" - local dst_dir="$TREE_DIR" - if [[ "$TREE_DIR" != /* ]]; then - src_dir="$ROOT_DIR/$TREE_DIR" - dst_dir="$ROOT_DIR/$TREE_DIR" - fi - - for host in $SYNC_HOSTS; do - is_local_host "$host" && continue - echo "Sync tree dir to $host:$dst_dir" - "$SSH_BIN" "$host" "mkdir -p $(printf '%q' "$dst_dir")" - if command -v rsync >/dev/null 2>&1; then - rsync -a "$src_dir/" "$host:$dst_dir/" - else - scp -q "$src_dir"/*.pkl "$host:$dst_dir/" - fi - done -} - -tools/manage_tn_dask_cluster.sh start -DASK_CLUSTER_MANAGED=1 -wait_for_dask_workers - -echo "Search with dask: $DASK_ADDRESS" -search_args=( - --case "$CASE" - --nqubits "$NQUBITS" - --nlayers "$NLAYERS" - --observables $OBSERVABLES - --tree-dir "$TREE_DIR" - --dask-address "$DASK_ADDRESS" - --torch-threads "$TORCH_THREADS" - --dtype "$DTYPE" - --tn-search-repeats "$SEARCH_REPEATS" - --tn-search-time "$SEARCH_TIME" - "${tn_slice_args[@]}" -) -if [[ -n "$DASK_EXPECTED_WORKERS" ]]; then - search_args+=(--dask-expected-workers "$DASK_EXPECTED_WORKERS") -fi -if [[ "$TN_DEBUG_TRIALS" == "1" ]]; then - search_args+=(--tn-debug-trials) -fi -"$PYTHON_BIN" -u tools/tn_contest_runner.py search "${search_args[@]}" - -sync_trees_to_hosts - -build_mpi_prefix -echo "Contract with MPI: ${mpi_prefix[*]}" -if [[ "$TN_CONTRACT_ENV_CHECK" == "1" ]]; then - "${mpi_prefix[@]}" "$PYTHON_BIN" -c "from mpi4py import MPI; import os; \ -import torch; \ -rank = MPI.COMM_WORLD.Get_rank(); \ -blis = []; \ -[blis.append(line.strip().split()[-1]) for line in open('/proc/self/maps') if 'libblis' in line and line.strip().split()[-1] not in blis]; \ -print('tn_contract_env ' + \ - f'rank={rank} ' + \ - f'LD_PRELOAD={os.environ.get(\"LD_PRELOAD\", \"\")} ' + \ - f'BLIS_NUM_THREADS={os.environ.get(\"BLIS_NUM_THREADS\", \"\")} ' + \ - f'OMP_NUM_THREADS={os.environ.get(\"OMP_NUM_THREADS\", \"\")} ' + \ - f'MKL_NUM_THREADS={os.environ.get(\"MKL_NUM_THREADS\", \"\")} ' + \ - f'OMP_PROC_BIND={os.environ.get(\"OMP_PROC_BIND\", \"\")} ' + \ - f'OMP_PLACES={os.environ.get(\"OMP_PLACES\", \"\")} ' + \ - f'torch_threads={torch.get_num_threads()} ' + \ - f'blis={\";\".join(blis) if blis else \"missing\"}', flush=True)" -fi -"${mpi_prefix[@]}" "$PYTHON_BIN" -u tools/tn_contest_runner.py contract \ - --mpi \ - --case "$CASE" \ - --nqubits "$NQUBITS" \ - --nlayers "$NLAYERS" \ - --observables $OBSERVABLES \ - --tree-dir "$TREE_DIR" \ - --torch-threads "$TORCH_THREADS" \ - --dtype "$DTYPE" \ - "${tn_slice_args[@]}" diff --git a/tools/run_vidal_mpi_contest_cases.sh b/tools/run_vidal_mpi_contest_cases.sh deleted file mode 100755 index cee84a4..0000000 --- a/tools/run_vidal_mpi_contest_cases.sh +++ /dev/null @@ -1,414 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# Contest-style Vidal/MPI MPS cases. -# -# Usage: -# tools/run_vidal_mpi_contest_cases.sh main1 -# tools/run_vidal_mpi_contest_cases.sh main2 -# tools/run_vidal_mpi_contest_cases.sh strong -# tools/run_vidal_mpi_contest_cases.sh all -# -# Common overrides: -# PYTHON_BIN=.venv/bin/python -# MPIEXEC=mpirun -# MPI_HOSTS="node-1:2,node-2:2,node-3:2,node-0:2" -# MPI_RANKS=8 -# MPI_PE=128 -# MPI_MAP_BY=ppr:1:numa:PE=128 -# MPI_BIND_TO=core -# MPIEXEC_FULL="mpirun --map-by ppr:1:numa:PE=128 --bind-to core -np 8 -host node-1:2,node-2:2,node-3:2,node-0:2" -# HOSTFILE=hostfile # optional; used only if the file exists -# RANKS=8 # fallback if MPI_RANKS is not set -# TORCH_THREADS=8 -# CUT_RATIO=1e-12 -# OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0" -# -# Per-case overrides: -# MAIN1_NQ=128 MAIN1_LAYERS=50 MAIN1_BOND=1024 MAIN1_SEED=31001 -# MAIN2_NQ=128 MAIN2_LAYERS=64 MAIN2_BOND=2048 MAIN2_SEED=31002 -# STRONG_NQ=256 STRONG_LAYERS=64 STRONG_BOND=2048 STRONG_SEED=41001 - -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -cd "$ROOT_DIR" - -PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" -MPIEXEC="${MPIEXEC:-mpirun}" -MPIEXEC_FULL="${MPIEXEC_FULL:-}" -MPI_HOSTS="${MPI_HOSTS:-}" -MPI_HOSTFILE="${MPI_HOSTFILE:-${HOSTFILE:-}}" -MPI_RANKS="${MPI_RANKS:-${RANKS:-}}" -RANKS="${RANKS:-4}" -TORCH_THREADS="${TORCH_THREADS:-1}" -MPI_PE="${MPI_PE:-$TORCH_THREADS}" -MPI_MAP_BY="${MPI_MAP_BY:-ppr:1:numa:PE=$MPI_PE}" -MPI_BIND_TO="${MPI_BIND_TO:-core}" -MPI_REPORT_BINDINGS="${MPI_REPORT_BINDINGS:-0}" -MPI_EXPORT_ENV="${MPI_EXPORT_ENV:-1}" -CUT_RATIO="${CUT_RATIO:-1e-12}" -OBS_FILTER="${OBS_FILTER:-}" -export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$TORCH_THREADS}" -export MKL_NUM_THREADS="${MKL_NUM_THREADS:-$TORCH_THREADS}" -source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh" - -RUNNER_DIR="$ROOT_DIR/.tmp" -mkdir -p "$RUNNER_DIR" -RUNNER="$(mktemp "$RUNNER_DIR/qibotn_vidal_contest.XXXXXX.py")" -cleanup() { - rm -f "$RUNNER" -} -trap cleanup EXIT - -cat > "$RUNNER" <<'PY' -from __future__ import annotations - -import argparse -import math -import time - -import numpy as np -from mpi4py import MPI -from qibo import Circuit, gates, hamiltonians -from qibo.symbols import X, Y, Z - -from qibotn.backends.vidal import VidalBackend - - -def set_torch_threads(nthreads): - try: - import torch - - torch.set_num_threads(nthreads) - except Exception: - pass - - -def build_circuit(kind, nqubits, nlayers, seed): - rng = np.random.default_rng(seed) - circuit = Circuit(nqubits) - - for layer in range(nlayers): - for q in range(nqubits): - circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi))) - circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi))) - if kind in ("rxx_rzz", "scramble"): - circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi))) - - if kind == "reversed_cnot": - for q in range(0, nqubits - 1, 2): - circuit.add(gates.CNOT(q + 1, q) if layer % 2 else gates.CNOT(q, q + 1)) - for q in range(1, nqubits - 1, 2): - circuit.add(gates.CNOT(q + 1, q) if layer % 2 == 0 else gates.CNOT(q, q + 1)) - elif kind == "rxx_rzz": - for q in range(layer % 2, nqubits - 1, 2): - circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.9, 0.9))) - circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.9, 0.9))) - elif kind == "scramble": - for q in range(layer % 2, nqubits - 1, 2): - circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.8, 0.8))) - circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.8, 0.8))) - if layer % 5 == 4: - circuit.add(gates.SWAP(q, q + 1)) - else: - raise ValueError(f"Unknown circuit kind {kind!r}.") - - return circuit - - -def ring_xz(nqubits): - form = 0 - for q in range(nqubits): - form += 0.5 * X(q) * Z((q + 1) % nqubits) - return hamiltonians.SymbolicHamiltonian(form=form) - - -def open_zz(nqubits): - form = 0 - for q in range(nqubits - 1): - form += (1.0 / (nqubits - 1)) * Z(q) * Z(q + 1) - return hamiltonians.SymbolicHamiltonian(form=form) - - -def range2_xx(nqubits): - form = 0 - for q in range(nqubits - 2): - form += (1.0 / (nqubits - 2)) * X(q) * X(q + 2) - return hamiltonians.SymbolicHamiltonian(form=form) - - -def dense_observable(nqubits, qubits, seed, dim): - rng = np.random.default_rng(seed) - raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim)) - matrix = (raw + raw.conj().T) / 2.0 - matrix = matrix / np.linalg.norm(matrix) - return {"matrix": matrix, "qubits": list(qubits)} - - -def observables_for_case(nqubits, seed): - q1 = nqubits // 4 - q2 = nqubits // 2 - q3 = (3 * nqubits) // 4 - last = nqubits - 1 - - return [ - ("boundary_ZZ_q1", hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))), - ("boundary_ZZ_q2", hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))), - ("boundary_ZZ_q3", hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))), - ( - "long_Z_5_sites", - hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last)), - ), - ( - "mixed_XZYZX", - hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last)), - ), - ("ring_xz", ring_xz(nqubits)), - ("open_zz", open_zz(nqubits)), - ("range2_xx", range2_xx(nqubits)), - ("complex_iZ0", hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))), - ("dense2_mid", dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)), - ("dense3_spread", dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)), - ] - - -def run_case(args): - set_torch_threads(args.torch_threads) - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - size = comm.Get_size() - - circuit = build_circuit(args.kind, args.nqubits, args.nlayers, args.seed) - observables = observables_for_case(args.nqubits, args.seed) - if args.obs_filter: - wanted = set(args.obs_filter.split(",")) - observables = [(name, obs) for name, obs in observables if name in wanted] - if not observables: - raise ValueError(f"OBS_FILTER matched no observables: {args.obs_filter!r}") - - if rank == 0: - print("=" * 88, flush=True) - print( - "case " - f"label={args.label} kind={args.kind} ranks={size} " - f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} " - f"bond={args.bond} cut_ratio={args.cut_ratio:g} " - f"torch_threads={args.torch_threads} seed={args.seed} " - f"obs_filter={args.obs_filter or 'all'}", - flush=True, - ) - print( - "observable value seconds trunc_sum trunc_max status", - flush=True, - ) - - for obs_name, observable in observables: - backend = VidalBackend() - backend.configure_tn_simulation( - max_bond_dimension=args.bond, - cut_ratio=args.cut_ratio, - tensor_module="torch", - mpi_approach="CT", - mpi_num_procs=size, - fallback=False, - ) - - comm.Barrier() - start = time.perf_counter() - try: - value = backend.expectation( - circuit, - observable, - preprocess=True, - compile_circuit=False, - ) - status = "ok" - except Exception as exc: # pragma: no cover - printed for manual runs - value = np.nan - status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0] - seconds = time.perf_counter() - start - - if rank == 0: - print( - f"{obs_name} {value!r} {seconds:.3f} " - f"{backend.last_truncation_error:.6e} " - f"{backend.last_max_truncation_error:.6e} {status}", - flush=True, - ) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--label", required=True) - parser.add_argument("--kind", choices=("reversed_cnot", "rxx_rzz", "scramble"), required=True) - parser.add_argument("--nqubits", type=int, required=True) - parser.add_argument("--nlayers", type=int, required=True) - parser.add_argument("--bond", type=int, required=True) - parser.add_argument("--cut-ratio", type=float, required=True) - parser.add_argument("--seed", type=int, required=True) - parser.add_argument("--torch-threads", type=int, required=True) - parser.add_argument("--obs-filter", default="") - run_case(parser.parse_args()) - - -if __name__ == "__main__": - main() -PY - -sum_host_slots() { - local hosts="$1" - local total=0 - local item slots - IFS=',' read -r -a host_items <<< "$hosts" - for item in "${host_items[@]}"; do - if [[ "$item" == *:* ]]; then - slots="${item##*:}" - else - slots=1 - fi - total=$((total + slots)) - done - echo "$total" -} - -append_mpi_env_args() { - [[ "$MPI_EXPORT_ENV" == "1" ]] || return 0 - mpi_prefix+=( - -x "LD_PRELOAD=${LD_PRELOAD:-}" - -x "BLIS_NUM_THREADS=$BLIS_NUM_THREADS" - -x "OMP_NUM_THREADS=$OMP_NUM_THREADS" - -x "MKL_NUM_THREADS=$MKL_NUM_THREADS" - -x "OMP_PROC_BIND=$OMP_PROC_BIND" - -x "OMP_PLACES=$OMP_PLACES" - ) -} - -build_mpi_prefix() { - if [[ -n "$MPIEXEC_FULL" ]]; then - # shellcheck disable=SC2206 - mpi_prefix=($MPIEXEC_FULL) - append_mpi_env_args - return - fi - - local ranks="$MPI_RANKS" - if [[ -z "$ranks" && -n "$MPI_HOSTS" ]]; then - ranks="$(sum_host_slots "$MPI_HOSTS")" - fi - if [[ -z "$ranks" ]]; then - ranks="$RANKS" - fi - - mpi_prefix=( - "$MPIEXEC" - --map-by "$MPI_MAP_BY" - --bind-to "$MPI_BIND_TO" - -np "$ranks" - ) - if [[ "$MPI_REPORT_BINDINGS" == "1" ]]; then - mpi_prefix+=(--report-bindings) - fi - append_mpi_env_args - if [[ -n "$MPI_HOSTS" ]]; then - mpi_prefix+=(-host "$MPI_HOSTS") - elif [[ -n "$MPI_HOSTFILE" ]]; then - mpi_prefix+=(-hostfile "$MPI_HOSTFILE") - fi -} - -build_mpi_prefix - -run_case() { - local label="$1" - local kind="$2" - local nq="$3" - local layers="$4" - local bond="$5" - local seed="$6" - - echo - echo "Running $label: kind=$kind nqubits=$nq layers=$layers bond=$bond seed=$seed" - echo "MPI: ${mpi_prefix[*]}" - "${mpi_prefix[@]}" "$PYTHON_BIN" -u "$ROOT_DIR/tools/vidal_mpi_contest_runner.py" \ - --label "$label" \ - --kind "$kind" \ - --nqubits "$nq" \ - --nlayers "$layers" \ - --bond "$bond" \ - --cut-ratio "$CUT_RATIO" \ - --seed "$seed" \ - --torch-threads "$TORCH_THREADS" \ - --obs-filter "$(tr ' ' ',' <<< "$OBS_FILTER")" -} - -case "${1:-help}" in - main1) - run_case \ - "main1-reversed-cnot" \ - "reversed_cnot" \ - "${MAIN1_NQ:-128}" \ - "${MAIN1_LAYERS:-50}" \ - "${MAIN1_BOND:-1024}" \ - "${MAIN1_SEED:-31001}" - ;; - main2) - run_case \ - "main2-rxx-rzz" \ - "rxx_rzz" \ - "${MAIN2_NQ:-128}" \ - "${MAIN2_LAYERS:-64}" \ - "${MAIN2_BOND:-2048}" \ - "${MAIN2_SEED:-31002}" - ;; - strong) - run_case \ - "strong-scramble" \ - "scramble" \ - "${STRONG_NQ:-256}" \ - "${STRONG_LAYERS:-64}" \ - "${STRONG_BOND:-2048}" \ - "${STRONG_SEED:-41001}" - ;; - all) - "$0" main1 - "$0" main2 - "$0" strong - ;; - smoke) - MAIN1_NQ="${MAIN1_NQ:-32}" \ - MAIN1_LAYERS="${MAIN1_LAYERS:-6}" \ - MAIN1_BOND="${MAIN1_BOND:-128}" \ - "$0" main1 - ;; - help|*) - cat >&2 <<'EOF' -Usage: tools/run_vidal_mpi_contest_cases.sh [main1|main2|strong|all|smoke] - -Cases: - main1 128 qubits, 50 layers, reversed-CNOT brickwall, chi=1024 - main2 128 qubits, 64 layers, RXX/RZZ brickwall, chi=2048 - strong 256 qubits, 64 layers, RXX/RZZ + periodic SWAP scramble, chi=2048 - smoke Small syntax/runtime check of main1 - -Common overrides: - PYTHON_BIN=.venv/bin/python - MPIEXEC=mpiexec - MPI_HOSTS="node-1:2,node-2:2,node-3:2,node-0:2" - MPI_RANKS=8 - MPI_PE=128 - MPI_MAP_BY=ppr:1:numa:PE=128 - MPI_BIND_TO=core - MPIEXEC_FULL="mpirun --map-by ppr:1:numa:PE=128 --bind-to core -np 8 -host node-1:2,node-2:2,node-3:2,node-0:2" - HOSTFILE=hostfile - RANKS=8 - TORCH_THREADS=8 - CUT_RATIO=1e-12 - OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0" - -Per-case overrides: - MAIN1_NQ=128 MAIN1_LAYERS=50 MAIN1_BOND=1024 MAIN1_SEED=31001 - MAIN2_NQ=128 MAIN2_LAYERS=64 MAIN2_BOND=2048 MAIN2_SEED=31002 - STRONG_NQ=256 STRONG_LAYERS=64 STRONG_BOND=2048 STRONG_SEED=41001 -EOF - exit 2 - ;; -esac diff --git a/tools/run_vidal_segment_mpi_scan.sh b/tools/run_vidal_segment_mpi_scan.sh deleted file mode 100755 index 49dc138..0000000 --- a/tools/run_vidal_segment_mpi_scan.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -NQ="${NQ:-34}" -LAYERS="${LAYERS:-20}" -BOND="${BOND:-512}" -SEED="${SEED:-42}" -RANKS="${RANKS:-1 2 4}" -THREADS="${THREADS:-32 32 16}" -PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" -MPIEXEC="${MPIEXEC:-mpiexec}" -CIRCUIT="${CIRCUIT:-brickwall_cnot}" -OBSERVABLE="${OBSERVABLE:-ring_xz}" -EXACT="${EXACT:-0}" - -ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" -cd "$ROOT_DIR" - -if [[ "${1:-help}" != "run" ]]; then - cat >&2 <<'EOF' -Usage: tools/run_vidal_segment_mpi_scan.sh run - -Overrides: - NQ=34 LAYERS=20 BOND=512 SEED=42 - RANKS="1 2 4" THREADS="32 32 16" - CIRCUIT=brickwall_cnot OBSERVABLE=ring_xz - EXACT=1 - PYTHON_BIN=.venv/bin/python MPIEXEC=mpiexec -EOF - if [[ "${1:-help}" == "help" ]]; then - exit 0 - fi - exit 2 -fi - -read -r -a ranks <<< "$RANKS" -read -r -a threads <<< "$THREADS" - -if [[ "${#ranks[@]}" != "${#threads[@]}" ]]; then - echo "RANKS and THREADS must have the same number of entries." >&2 - exit 2 -fi - -common=( - --nqubits "$NQ" - --nlayers "$LAYERS" - --bond "$BOND" - --seed "$SEED" - --mps - --circuits "$CIRCUIT" - --observables "$OBSERVABLE" -) - -if [[ "$EXACT" == "1" ]]; then - common+=(--exact) -fi - -for idx in "${!ranks[@]}"; do - nrank="${ranks[$idx]}" - nthr="${threads[$idx]}" - if [[ "$nrank" == "1" ]]; then - echo "== Vidal serial ranks=1 torch_threads=$nthr ==" - "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ - "${common[@]}" --torch-threads "$nthr" - else - echo "== Vidal segmented MPI ranks=$nrank torch_threads=$nthr ==" - "$MPIEXEC" -n "$nrank" "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ - "${common[@]}" --torch-threads "$nthr" --mpi - fi -done diff --git a/tools/slice_existing_tree.py b/tools/slice_existing_tree.py deleted file mode 100644 index 4e94e9c..0000000 --- a/tools/slice_existing_tree.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Slice an existing saved cotengra tree without re-running path search.""" - -from __future__ import annotations - -import argparse -import pickle -from pathlib import Path - -from qibotn.parallel import contraction_tree_costs - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("input", help="Input pickle saved by --tn-save-tree.") - parser.add_argument("output", help="Output pickle path.") - parser.add_argument("--term", type=int, default=0) - parser.add_argument("--target-slices", type=int, default=2) - parser.add_argument("--max-repeats", type=int, default=64) - parser.add_argument("--seed", type=int, default=42) - args = parser.parse_args() - - input_path = Path(args.input) - output_path = Path(args.output) - with input_path.open("rb") as f: - payload = pickle.load(f) - - trees = payload["trees"] if isinstance(payload, dict) else payload - if not isinstance(trees, (list, tuple)): - trees = [trees] - tree = trees[args.term] - - print("original", contraction_tree_costs(tree), flush=True) - sliced = tree.slice( - target_slices=args.target_slices, - max_repeats=args.max_repeats, - seed=args.seed, - ) - print("sliced", contraction_tree_costs(sliced), flush=True) - print(f"sliced_inds={sliced.sliced_inds}", flush=True) - - new_trees = list(trees) - new_trees[args.term] = sliced - - if isinstance(payload, dict): - out_payload = dict(payload) - out_payload["trees"] = new_trees - out_payload["costs"] = [contraction_tree_costs(t) for t in new_trees] - out_payload["nterms"] = len(new_trees) - else: - out_payload = new_trees - - output_path.parent.mkdir(parents=True, exist_ok=True) - with output_path.open("wb") as f: - pickle.dump(out_payload, f) - print(f"saved {output_path}", flush=True) - - -if __name__ == "__main__": - main() diff --git a/tools/tn_contest_runner.py b/tools/tn_contest_runner.py deleted file mode 100644 index 06ff913..0000000 --- a/tools/tn_contest_runner.py +++ /dev/null @@ -1,443 +0,0 @@ -#!/usr/bin/env python -"""Contest-style CPU TN path search and contraction runner. - -This file is intentionally self-contained: define contest circuits and -observables here, run path search once, then load the saved trees for repeated -MPI contractions. -""" - -from __future__ import annotations - -import argparse -import math -import os -import subprocess -import sys -from dataclasses import dataclass -from pathlib import Path -from urllib.parse import urlparse - -import numpy as np -from qibo import Circuit, gates, hamiltonians -from qibo.symbols import X, Y, Z - -ROOT = Path(__file__).resolve().parents[1] -SRC = ROOT / "src" -if str(SRC) not in sys.path: - sys.path.insert(0, str(SRC)) - -from qibotn.expectation_runner import ( # noqa: E402 - ExpectationConfig, - exact_for_observable, - run_cpu_expectation, -) - - -@dataclass(frozen=True) -class CaseSpec: - circuit_kind: str - observables: tuple[str, ...] - nqubits: int - nlayers: int - seed: int - target_slices: int | None = None - - -CASES = { - "main1": CaseSpec( - circuit_kind="rxx_rzz_chain", - observables=("ring_xz",), - nqubits=37, - nlayers=20, - seed=31001, - target_slices=None, - ), - "main2": CaseSpec( - circuit_kind="scramble_chain", - observables=("open_zz", "range2_xx"), - nqubits=36, - nlayers=18, - seed=31002, - target_slices=None, - ), - "strong": CaseSpec( - circuit_kind="reversed_cnot", - observables=("ring_xz", "long_z_string"), - nqubits=40, - nlayers=24, - seed=41001, - target_slices=None, - ), -} - - -def optional_int(text): - if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: - return None - return int(text) - - -def optional_float(text): - if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: - return None - return float(text) - - -def set_torch_threads(nthreads): - try: - import torch - - torch.set_num_threads(nthreads) - except Exception: - pass - - -def add_single_qubit_layer(circuit, nqubits, rng, include_rx=False): - for qubit in range(nqubits): - circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi))) - circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi))) - if include_rx: - circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi))) - - -def build_circuit(kind, nqubits, nlayers, seed): - """Define contest circuits here.""" - rng = np.random.default_rng(seed) - circuit = Circuit(nqubits) - - for layer in range(nlayers): - if kind == "rxx_rzz_chain": - add_single_qubit_layer(circuit, nqubits, rng, include_rx=True) - for qubit in range(layer % 2, nqubits - 1, 2): - circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9))) - circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9))) - - elif kind == "scramble_chain": - add_single_qubit_layer(circuit, nqubits, rng, include_rx=True) - for qubit in range(layer % 2, nqubits - 1, 2): - circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8))) - circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8))) - if layer % 5 == 4: - circuit.add(gates.SWAP(qubit, qubit + 1)) - - elif kind == "reversed_cnot": - add_single_qubit_layer(circuit, nqubits, rng) - for qubit in range(0, nqubits - 1, 2): - gate = gates.CNOT(qubit + 1, qubit) if layer % 2 else gates.CNOT(qubit, qubit + 1) - circuit.add(gate) - for qubit in range(1, nqubits - 1, 2): - gate = gates.CNOT(qubit + 1, qubit) if layer % 2 == 0 else gates.CNOT(qubit, qubit + 1) - circuit.add(gate) - - else: - raise ValueError(f"Unknown circuit kind {kind!r}.") - - return circuit - - -def pauli_sum_observable(kind, nqubits, seed): - """Define contest observables here. - - TN path currently expects Pauli products / SymbolicHamiltonian terms. - Keep production contest observables Hermitian unless complex output is - explicitly required by the scoring rule. - """ - del seed - if kind == "ring_xz": - form = 0 - for qubit in range(nqubits): - form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits) - return hamiltonians.SymbolicHamiltonian(form=form) - - if kind == "open_zz": - form = 0 - for qubit in range(nqubits - 1): - form += (1.0 / max(1, nqubits - 1)) * Z(qubit) * Z(qubit + 1) - return hamiltonians.SymbolicHamiltonian(form=form) - - if kind == "range2_xx": - form = 0 - for qubit in range(nqubits - 2): - form += (1.0 / max(1, nqubits - 2)) * X(qubit) * X(qubit + 2) - return hamiltonians.SymbolicHamiltonian(form=form) - - if kind == "long_z_string": - stride = max(1, nqubits // 16) - form = None - for qubit in range(0, nqubits, stride): - form = Z(qubit) if form is None else form * Z(qubit) - return hamiltonians.SymbolicHamiltonian(form=form) - - if kind == "mixed_local": - q1 = nqubits // 4 - q2 = nqubits // 2 - q3 = (3 * nqubits) // 4 - form = 0.25 * X(0) - 0.5 * Z(nqubits - 1) - form += 0.125 * X(q1) * Z(q2) * Y(q3) - return hamiltonians.SymbolicHamiltonian(form=form) - - raise ValueError(f"Unknown observable kind {kind!r}.") - - -def tree_path(tree_dir, case_name, obs_name, nqubits, nlayers, target_slices): - slice_label = "auto" if target_slices is None else f"s{target_slices}" - return ( - Path(tree_dir) - / f"{case_name}_{obs_name}_{nqubits}q{nlayers}l_{slice_label}.pkl" - ) - - -def build_parallel_opts(args, tree_file=None, search_only=False): - slicing_opts = {} - if args.tn_target_slices is not None: - slicing_opts["target_slices"] = args.tn_target_slices - if args.tn_target_size is not None: - slicing_opts["target_size"] = args.tn_target_size - - opts = { - "slicing_opts": slicing_opts or None, - "search_workers": args.tn_search_workers or args.torch_threads, - "max_repeats": args.tn_search_repeats, - "max_time": args.tn_search_time, - "print_stats": False, - } - if args.tn_search_backend is not None: - opts["search_backend"] = args.tn_search_backend - if args.dask_address is not None: - opts["dask_address"] = args.dask_address - if args.dask_expected_workers is not None: - opts["dask_expected_workers"] = args.dask_expected_workers - if args.dask_close_workers: - opts["dask_close_workers"] = True - if args.tn_debug_trials: - opts["debug_trials"] = True - if search_only: - opts["search_only"] = True - opts["save_tree_path"] = str(tree_file) - elif tree_file is not None: - opts["load_tree_path"] = str(tree_file) - return opts - - -def run_one(args, case_name, obs_name, mode): - case = CASES[case_name] - circuit = build_circuit(case.circuit_kind, args.nqubits, args.nlayers, args.seed) - observable = pauli_sum_observable(obs_name, args.nqubits, args.seed) - path = tree_path( - args.tree_dir, - case_name, - obs_name, - args.nqubits, - args.nlayers, - args.tn_target_slices, - ) - path.parent.mkdir(parents=True, exist_ok=True) - - rank = 0 - if args.mpi: - from mpi4py import MPI - - rank = MPI.COMM_WORLD.Get_rank() - - if rank == 0: - print("=" * 88, flush=True) - print( - f"mode={mode} case={case_name} circuit={case.circuit_kind} " - f"observable={obs_name} nqubits={args.nqubits} nlayers={args.nlayers} " - f"seed={args.seed} gates={len(circuit.queue)} tree={path}", - flush=True, - ) - - if mode == "contract" and not path.exists(): - raise FileNotFoundError(f"Missing tree file: {path}. Run search first.") - - exact = None - if args.exact and rank == 0 and mode != "search": - if args.nqubits > args.exact_max_qubits: - raise ValueError( - f"--exact is limited to {args.exact_max_qubits} qubits by default." - ) - exact = exact_for_observable(circuit, observable, args.nqubits) - - config = ExpectationConfig( - ansatz="tn", - mpi=args.mpi, - bond=args.bond, - cut_ratio=args.cut_ratio, - tensor_module="torch", - quimb_backend=args.quimb_backend, - dtype=args.dtype, - torch_threads=args.torch_threads, - parallel_opts=build_parallel_opts( - args, - tree_file=path, - search_only=(mode == "search"), - ), - ) - result = run_cpu_expectation(circuit, observable, config) - if args.mpi and result.rank != 0: - return - - if mode == "search": - print(f"searched observable={obs_name} tree={path}", flush=True) - else: - abs_error = float("nan") if exact is None else abs(result.value - exact) - rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15) - exact_text = "nan" if exact is None else f"{exact:.16e}" - print( - f"result observable={obs_name} exact={exact_text} " - f"value={result.value:.16e} abs_error={abs_error:.6e} " - f"rel_error={rel_error:.6e} seconds={result.seconds:.3f}", - flush=True, - ) - - for stat in result.parallel_stats or (): - cost = stat["path_cost"] - search_stats = stat.get("search_stats", {}) - print( - "tn_term_summary " - f"observable={obs_name} " - f"term={stat.get('term_index', 0)} " - f"search_seconds={stat.get('search_seconds', float('nan')):.3f} " - f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} " - f"completed_trials={search_stats.get('completed_trials', 'na')} " - f"finite_trials={search_stats.get('finite_trials', 'na')} " - f"failed_trials={search_stats.get('failed_trials', 'na')} " - f"requested_trials={search_stats.get('requested_trials', 'na')} " - f"best_score={search_stats.get('best_score', float('nan')):.6g} " - f"slices={cost.get('nslices')} " - f"log10_flops={cost.get('log10_flops', float('nan')):.3f} " - f"log10_write={cost.get('log10_write', float('nan')):.3f} " - f"log2_size={cost.get('log2_size', float('nan')):.3f} " - f"peak_memory_gib={cost.get('peak_memory_gib', float('nan')):.3g} " - f"rank_slices={stat.get('rank_slices')}", - flush=True, - ) - - -def selected_observables(args, case): - if args.observables: - return tuple(args.observables) - if args.obs_filter: - return tuple(x.strip() for x in args.obs_filter.split(",") if x.strip()) - return case.observables - - -def apply_case_defaults(args): - case = CASES[args.case] - if args.nqubits is None: - args.nqubits = case.nqubits - if args.nlayers is None: - args.nlayers = case.nlayers - if args.seed is None: - args.seed = case.seed - if args.tn_target_slices is None: - args.tn_target_slices = case.target_slices - args.observables = selected_observables(args, case) - - -def stop_dask_cluster(args): - if args.keep_dask or args.tn_search_backend != "dask" or not args.dask_address: - return - if args.mpi: - from mpi4py import MPI - - if MPI.COMM_WORLD.Get_rank() != 0: - return - script = ROOT / "tools" / "manage_tn_dask_cluster.sh" - if not script.exists(): - print(f"dask_stop_skipped reason=missing_script path={script}", flush=True) - return - - env = os.environ.copy() - parsed = urlparse(args.dask_address) - if parsed.hostname: - env.setdefault("SCHEDULER_HOST", parsed.hostname) - if parsed.port: - env.setdefault("SCHEDULER_PORT", str(parsed.port)) - - print("dask_stop_after_search start", flush=True) - subprocess.run([str(script), "stop"], cwd=str(ROOT), env=env, check=False) - print("dask_stop_after_search done", flush=True) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("mode", choices=("search", "contract", "all", "validate", "list")) - parser.add_argument("--case", choices=sorted(CASES), default="main1") - parser.add_argument("--observables", nargs="+") - parser.add_argument("--obs-filter", default="") - parser.add_argument("--tree-dir", default="trees/contest_tn") - parser.add_argument("--nqubits", type=int) - parser.add_argument("--nlayers", type=int) - parser.add_argument("--seed", type=int) - parser.add_argument("--mpi", action="store_true") - parser.add_argument("--exact", action="store_true") - parser.add_argument("--exact-max-qubits", type=int, default=24) - parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024) - parser.add_argument("--cut-ratio", type=optional_float, default=1e-12) - parser.add_argument("--torch-threads", type=int, default=8) - parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch") - parser.add_argument("--dtype", choices=("complex128", "complex64"), default="complex64") - parser.add_argument("--tn-target-slices", type=int) - parser.add_argument("--tn-target-size", type=int, default=2**34) - parser.add_argument("--tn-search-workers", type=int) - parser.add_argument("--tn-search-repeats", type=int, default=2048) - parser.add_argument("--tn-search-time", type=float, default=300.0) - parser.add_argument( - "--tn-search-backend", - choices=("processpool", "dask"), - default="dask", - help=( - "Path-search backend. Defaults to dask. Without --dask-address, " - "non-MPI search starts a local dask cluster." - ), - ) - parser.add_argument("--dask-address") - parser.add_argument("--dask-expected-workers", type=int) - parser.add_argument("--dask-close-workers", action="store_true") - parser.add_argument( - "--keep-dask", - action="store_true", - help=( - "Keep an external dask cluster running after search. By default, " - "tools/manage_tn_dask_cluster.sh stop is called after search when " - "--dask-address is used." - ), - ) - parser.add_argument( - "--tn-debug-trials", - action="store_true", - help="Print dask worker summary and per-trial start/done logs.", - ) - parser.add_argument("--no-tn-stats", action="store_true") - args = parser.parse_args() - - if args.mode == "list": - for name, case in CASES.items(): - print( - f"{name}: circuit={case.circuit_kind} " - f"observables={','.join(case.observables)} " - f"nqubits={case.nqubits} nlayers={case.nlayers} " - f"seed={case.seed} target_slices={case.target_slices}" - ) - return - - apply_case_defaults(args) - set_torch_threads(args.torch_threads) - - modes = ("search", "contract") if args.mode == "all" else (args.mode,) - if args.mode == "validate": - args.exact = True - args.nqubits = min(args.nqubits, args.exact_max_qubits) - modes = ("search", "contract") - - for mode in modes: - for obs_name in args.observables: - run_one(args, args.case, obs_name, mode) - if mode == "search": - stop_dask_cluster(args) - - -if __name__ == "__main__": - main() diff --git a/tools/torch_profile_tn_complex64.py b/tools/torch_profile_tn_complex64.py deleted file mode 100644 index b7392f9..0000000 --- a/tools/torch_profile_tn_complex64.py +++ /dev/null @@ -1,114 +0,0 @@ -"""Run the 34q/20L TN complex64 benchmark under torch.profiler briefly.""" - -from __future__ import annotations - -import argparse -import os -import signal -import sys -from pathlib import Path - -from mpi4py import MPI - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--seconds", type=float, default=30.0) - parser.add_argument("--out-dir", default="torch_profiles/tn_complex64") - parser.add_argument("--torch-threads", type=int, default=48) - args = parser.parse_args() - - repo_root = Path(__file__).resolve().parents[1] - os.chdir(repo_root) - sys.path.insert(0, str(repo_root)) - - import torch - from torch.profiler import ProfilerActivity, profile - - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - size = comm.Get_size() - out_dir = Path(args.out_dir) - if rank == 0: - out_dir.mkdir(parents=True, exist_ok=True) - comm.Barrier() - - torch.set_num_threads(args.torch_threads) - - def run_benchmark(): - import benchmark_cpu_expectation - - sys.argv = [ - "benchmark_cpu_expectation.py", - "--mpi", - "--ansatz", - "tn", - "--nqubits", - "34", - "--nlayers", - "20", - "--circuits", - "rxx_rzz", - "--pauli-pattern", - "XZ", - "--tn-load-tree", - "trees/rxx_rzz_34q20l_s4.pkl", - "--quimb-backend", - "torch", - "--torch-threads", - str(args.torch_threads), - "--dtype", - "complex64", - ] - benchmark_cpu_expectation.main() - - trace_path = out_dir / f"rank{rank}_trace.json" - stacks_path = out_dir / f"rank{rank}_stacks.txt" - summary_path = out_dir / f"rank{rank}_summary.txt" - - prof = profile( - activities=[ProfilerActivity.CPU], - record_shapes=True, - profile_memory=True, - with_stack=True, - ) - - class ProfileTimeout(Exception): - pass - - def alarm_handler(signum, frame): - raise ProfileTimeout() - - old_handler = signal.signal(signal.SIGALRM, alarm_handler) - signal.setitimer(signal.ITIMER_REAL, args.seconds) - try: - with prof: - try: - run_benchmark() - except ProfileTimeout: - pass - finally: - signal.setitimer(signal.ITIMER_REAL, 0) - signal.signal(signal.SIGALRM, old_handler) - - prof.export_chrome_trace(str(trace_path)) - try: - prof.export_stacks(str(stacks_path), "self_cpu_time_total") - except Exception as exc: # pragma: no cover - diagnostic only - stacks_path.write_text(f"export_stacks failed: {exc}\n", encoding="utf-8") - - summary = prof.key_averages(group_by_stack_n=5).table( - sort_by="self_cpu_time_total", - row_limit=40, - ) - summary_path.write_text(summary, encoding="utf-8") - - print( - f"torch_profile_done rank={rank}/{size} " - f"trace={trace_path} summary={summary_path}", - flush=True, - ) - - -if __name__ == "__main__": - main() diff --git a/tools/validate_vidal_mpi_correctness.py b/tools/validate_vidal_mpi_correctness.py deleted file mode 100644 index bce8e2d..0000000 --- a/tools/validate_vidal_mpi_correctness.py +++ /dev/null @@ -1,202 +0,0 @@ -"""Correctness checks for the Vidal/TEBD MPS fast path. - -The cases here intentionally cover more than the benchmark ring-XZ observable: -different nearest-neighbor gate orientations and several Pauli-sum observables. -Run serially to compare qibojit/statevector vs Vidal, or under MPI to compare -the segmented Vidal executor. -""" - -from __future__ import annotations - -import argparse -import math -import time - -import numpy as np -import torch -from qibo import Circuit, gates - -from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor -from qibotn.backends.vidal_tebd import VidalTEBDExecutor - - -def build_circuit(kind, nqubits, nlayers, seed): - rng = np.random.default_rng(seed) - circuit = Circuit(nqubits) - for layer in range(nlayers): - for q in range(nqubits): - circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi))) - circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi))) - if kind == "rx_ry_cz": - circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi))) - - if kind in ("brickwall", "reversed_cnot"): - for q in range(0, nqubits - 1, 2): - if kind == "reversed_cnot" and (layer % 2): - circuit.add(gates.CNOT(q + 1, q)) - else: - circuit.add(gates.CNOT(q, q + 1)) - for q in range(1, nqubits - 1, 2): - if kind == "reversed_cnot" and not (layer % 2): - circuit.add(gates.CNOT(q + 1, q)) - else: - circuit.add(gates.CNOT(q, q + 1)) - elif kind == "rx_ry_cz": - for q in range(layer % 2, nqubits - 1, 2): - circuit.add(gates.CZ(q, q + 1)) - else: - raise ValueError(f"Unknown circuit kind {kind!r}.") - return circuit - - -def observable_terms(kind, nqubits): - if kind == "ring_xz": - return [ - (0.5, (("X", site), ("Z", (site + 1) % nqubits))) - for site in range(nqubits) - ] - if kind == "open_zz": - return [ - (1.0 / (nqubits - 1), (("Z", site), ("Z", site + 1))) - for site in range(nqubits - 1) - ] - if kind == "mixed_local": - terms = [(0.25, (("X", 0),)), (-0.5, (("Z", nqubits - 1),))] - terms += [ - (0.125, (("Y", site), ("Y", site + 1))) - for site in range(0, nqubits - 1, 3) - ] - return terms - raise ValueError(f"Unknown observable kind {kind!r}.") - - -def exact_pauli_sum(circuit, terms, nqubits): - state = circuit().state(numpy=True).reshape(-1) - indices = np.arange(state.size, dtype=np.int64) - value = 0.0 + 0.0j - for coeff, ops in terms: - flipped = indices.copy() - phase = np.ones(state.size, dtype=np.complex128) - for name, site in ops: - shift = nqubits - 1 - site - bit = (indices >> shift) & 1 - name = name.upper() - if name == "X": - flipped ^= 1 << shift - elif name == "Y": - flipped ^= 1 << shift - phase *= 1j * (1 - 2 * bit) - elif name == "Z": - phase *= 1 - 2 * bit - elif name != "I": - raise ValueError(f"Unsupported Pauli {name!r}.") - value += coeff * np.vdot(state[flipped], phase * state) - return float(value.real) - - -def run_vidal(circuit, terms, nqubits, bond, tensor_module): - executor = VidalTEBDExecutor( - nqubits=nqubits, - max_bond=bond, - cut_ratio=1e-12, - tensor_module=tensor_module, - ) - executor.run_circuit(circuit) - return float(executor.expectation_pauli_sum(terms)) - - -def run_segment_mpi(circuit, terms, nqubits, bond, tensor_module, comm): - executor = SegmentVidalMPIExecutor( - nqubits=nqubits, - max_bond=bond, - cut_ratio=1e-12, - tensor_module=tensor_module, - comm=comm, - ) - executor.run_circuit(circuit) - return executor.expectation_pauli_sum_root(terms) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--nqubits", type=int, default=16) - parser.add_argument("--nlayers", type=int, default=6) - parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512) - parser.add_argument("--seed", type=int, default=42) - parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch") - parser.add_argument("--torch-threads", type=int, default=32) - parser.add_argument("--mpi", action="store_true") - parser.add_argument( - "--circuits", - nargs="+", - default=("brickwall", "reversed_cnot", "rx_ry_cz"), - ) - parser.add_argument( - "--observables", - nargs="+", - default=("ring_xz", "open_zz", "mixed_local"), - ) - args = parser.parse_args() - - torch.set_num_threads(args.torch_threads) - comm = None - rank = 0 - size = 1 - if args.mpi: - from mpi4py import MPI - - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - size = comm.Get_size() - - if rank == 0: - mode = f"vidal-segment-mpi/{size}" if args.mpi else "vidal" - print( - f"mode={mode} nqubits={args.nqubits} nlayers={args.nlayers} " - f"bond={args.bond} tensor_module={args.tensor_module}" - ) - print("circuit observable exact value abs_error seconds") - - for circuit_kind in args.circuits: - circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed) - exact = None - if rank == 0: - exact_values = { - obs: exact_pauli_sum( - circuit, observable_terms(obs, args.nqubits), args.nqubits - ) - for obs in args.observables - } - else: - exact_values = None - if comm is not None: - exact_values = comm.bcast(exact_values, root=0) - - for obs_kind in args.observables: - terms = observable_terms(obs_kind, args.nqubits) - start = time.perf_counter() - if args.mpi: - value = run_segment_mpi( - circuit, - terms, - args.nqubits, - args.bond, - args.tensor_module, - comm, - ) - else: - value = run_vidal( - circuit, terms, args.nqubits, args.bond, args.tensor_module - ) - if rank != 0: - continue - elapsed = time.perf_counter() - start - exact = exact_values[obs_kind] - print( - f"{circuit_kind} {obs_kind} {exact:.16e} {value:.16e} " - f"{abs(value - exact):.6e} {elapsed:.3f}" - ) - - -if __name__ == "__main__": - main() diff --git a/tools/vidal_mpi_contest_runner.py b/tools/vidal_mpi_contest_runner.py deleted file mode 100644 index 405f47c..0000000 --- a/tools/vidal_mpi_contest_runner.py +++ /dev/null @@ -1,209 +0,0 @@ -from __future__ import annotations - -import argparse -import math -import time - -import numpy as np -from mpi4py import MPI -from qibo import Circuit, gates, hamiltonians -from qibo.symbols import X, Y, Z - -from qibotn.backends.vidal import VidalBackend - - -def optional_int(text): - if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: - return None - return int(text) - - -def optional_float(text): - if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}: - return None - return float(text) - - -def format_optional(value, fmt="g"): - return "None" if value is None else format(value, fmt) - - -def set_torch_threads(nthreads): - try: - import torch - - torch.set_num_threads(nthreads) - except Exception: - pass - - -def build_circuit(kind, nqubits, nlayers, seed): - rng = np.random.default_rng(seed) - circuit = Circuit(nqubits) - - for layer in range(nlayers): - for q in range(nqubits): - circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi))) - circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi))) - if kind in ("rxx_rzz", "scramble"): - circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi))) - - if kind == "reversed_cnot": - for q in range(0, nqubits - 1, 2): - circuit.add(gates.CNOT(q + 1, q) if layer % 2 else gates.CNOT(q, q + 1)) - for q in range(1, nqubits - 1, 2): - circuit.add(gates.CNOT(q + 1, q) if layer % 2 == 0 else gates.CNOT(q, q + 1)) - elif kind == "rxx_rzz": - for q in range(layer % 2, nqubits - 1, 2): - circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.9, 0.9))) - circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.9, 0.9))) - elif kind == "scramble": - for q in range(layer % 2, nqubits - 1, 2): - circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.8, 0.8))) - circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.8, 0.8))) - if layer % 5 == 4: - circuit.add(gates.SWAP(q, q + 1)) - else: - raise ValueError(f"Unknown circuit kind {kind!r}.") - - return circuit - - -def ring_xz(nqubits): - form = 0 - for q in range(nqubits): - form += 0.5 * X(q) * Z((q + 1) % nqubits) - return hamiltonians.SymbolicHamiltonian(form=form) - - -def open_zz(nqubits): - form = 0 - for q in range(nqubits - 1): - form += (1.0 / (nqubits - 1)) * Z(q) * Z(q + 1) - return hamiltonians.SymbolicHamiltonian(form=form) - - -def range2_xx(nqubits): - form = 0 - for q in range(nqubits - 2): - form += (1.0 / (nqubits - 2)) * X(q) * X(q + 2) - return hamiltonians.SymbolicHamiltonian(form=form) - - -def dense_observable(nqubits, qubits, seed, dim): - rng = np.random.default_rng(seed) - raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim)) - matrix = (raw + raw.conj().T) / 2.0 - matrix = matrix / np.linalg.norm(matrix) - return {"matrix": matrix, "qubits": list(qubits)} - - -def observables_for_case(nqubits, seed): - q1 = nqubits // 4 - q2 = nqubits // 2 - q3 = (3 * nqubits) // 4 - last = nqubits - 1 - - return [ - ("boundary_ZZ_q1", hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))), - ("boundary_ZZ_q2", hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))), - ("boundary_ZZ_q3", hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))), - ( - "long_Z_5_sites", - hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last)), - ), - ( - "mixed_XZYZX", - hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last)), - ), - ("ring_xz", ring_xz(nqubits)), - ("open_zz", open_zz(nqubits)), - ("range2_xx", range2_xx(nqubits)), - ("complex_iZ0", hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))), - ("dense2_mid", dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)), - ("dense3_spread", dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)), - ] - - -def run_case(args): - set_torch_threads(args.torch_threads) - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - size = comm.Get_size() - - circuit = build_circuit(args.kind, args.nqubits, args.nlayers, args.seed) - observables = observables_for_case(args.nqubits, args.seed) - if args.obs_filter: - wanted = set(args.obs_filter.split(",")) - observables = [(name, obs) for name, obs in observables if name in wanted] - if not observables: - raise ValueError(f"OBS_FILTER matched no observables: {args.obs_filter!r}") - - if rank == 0: - print("=" * 88, flush=True) - print( - "case " - f"label={args.label} kind={args.kind} ranks={size} " - f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} " - f"bond={format_optional(args.bond)} " - f"cut_ratio={format_optional(args.cut_ratio)} " - f"torch_threads={args.torch_threads} seed={args.seed} " - f"obs_filter={args.obs_filter or 'all'}", - flush=True, - ) - print( - "observable value seconds trunc_sum trunc_max status", - flush=True, - ) - - for obs_name, observable in observables: - backend = VidalBackend() - backend.configure_tn_simulation( - max_bond_dimension=args.bond, - cut_ratio=args.cut_ratio, - tensor_module="torch", - mpi_approach="CT", - mpi_num_procs=size, - fallback=False, - ) - - comm.Barrier() - start = time.perf_counter() - try: - value = backend.expectation( - circuit, - observable, - preprocess=True, - compile_circuit=False, - ) - status = "ok" - except Exception as exc: # pragma: no cover - printed for manual runs - value = np.nan - status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0] - seconds = time.perf_counter() - start - - if rank == 0: - print( - f"{obs_name} {value!r} {seconds:.3f} " - f"{backend.last_truncation_error:.6e} " - f"{backend.last_max_truncation_error:.6e} {status}", - flush=True, - ) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--label", required=True) - parser.add_argument("--kind", choices=("reversed_cnot", "rxx_rzz", "scramble"), required=True) - parser.add_argument("--nqubits", type=int, required=True) - parser.add_argument("--nlayers", type=int, required=True) - parser.add_argument("--bond", type=optional_int, required=True) - parser.add_argument("--cut-ratio", type=optional_float, required=True) - parser.add_argument("--seed", type=int, required=True) - parser.add_argument("--torch-threads", type=int, required=True) - parser.add_argument("--obs-filter", default="") - run_case(parser.parse_args()) - - -if __name__ == "__main__": - main() diff --git a/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl b/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl deleted file mode 100644 index e41f1f5..0000000 Binary files a/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl and /dev/null differ