决赛现场脚本
Some checks failed
Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled
Tests / check (push) Has been cancelled
Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled

This commit is contained in:
jaunatisblue
2026-05-18 01:37:19 +08:00
parent 4c7a10d026
commit ef3d7e9ee6
26 changed files with 894 additions and 62 deletions

View File

@@ -1,30 +1,40 @@
# TN
```bash
# qibotn目录下
I_MPI_FABRICS=shm:ofi \
I_MPI_OFI_PROVIDER=tcp \
FI_PROVIDER=tcp \
# search + contract，Open MPI 多节点:每节点 2 rank，每 rank 绑定 1 个 NUMA。
# MPI_HOSTS 里每个节点写 :2，MPI_RANKS = 节点数 * 2。
# 每个 rank 使用 MPI_PE 个 core，这台 2-NUMA AMD 节点用 MPI_PE=128。
NQUBITS=40 \
TN_DEBUG_TRIALS=1 \
SCHEDULER_HOST=10.20.1.100 \
DASK_ADDRESS=tcp://10.20.1.100:8786 \
WORKER_HOSTS="10.20.1.100 10.20.1.101 10.20.1.102 10.20.1.103" \
CASE=main1 \
OBSERVABLES=long_z_string \
NQUBITS=34 \
NLAYERS=20 \
TORCH_THREADS=48 \
SEARCH_REPEATS=2048 \
SEARCH_TIME=300 \
SCHEDULER_HOST=10.20.1.103 \
WORKER_HOSTS="10.20.1.103 10.20.6.101" \
DASK_ADDRESS="tcp://10.20.1.103:8786" \
NWORKERS=84 \
NTHREADS=1 \
MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2" \
TORCH_THREADS=80 \
MPI_PE=80 \
MPI_MAP_BY=ppr:1:numa:PE=80 \
MPI_BIND_TO=core \
OMP_NUM_THREADS=80 \
MKL_NUM_THREADS=80 \
BLIS_NUM_THREADS=80 \
MPI_HOSTS="node-0:2,node-1:2,node-2:2,node-3:2" \
MPI_RANKS=8 \
NWORKERS=96 \
TN_TARGET_SIZE=17179869184 \
tools/run_tn_dask_mpi_all.sh
# 单独缩并contract计算
I_MPI_FABRICS=shm:ofi \
I_MPI_OFI_PROVIDER=tcp \
FI_PROVIDER=tcp \
mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \
mpirun --map-by ppr:1:numa:PE=80 --bind-to core --report-bindings \
-x LD_PRELOAD=/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5 \
-x BLIS_NUM_THREADS=80 \
-x OMP_NUM_THREADS=80 \
-x MKL_NUM_THREADS=80 \
-x OMP_PROC_BIND=close \
-x OMP_PLACES=cores \
-np 8 \
-host node-0:2,node-1:2,node-2:2,node-3:2 \
.venv/bin/python -u tools/tn_contest_runner.py contract \
--mpi \
--case main1 \
@@ -32,22 +42,47 @@ mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \
--nlayers 20 \
--observables long_z_string \
--tree-dir trees/contest_tn \
--torch-threads 48 \
--torch-threads 80 \
--dtype complex64
```
# MPS
```
cd /home/yx/qibotn
cd /home/qibo/qibotn
I_MPI_FABRICS=shm:ofi \
I_MPI_OFI_PROVIDER=tcp \
FI_PROVIDER=tcp \
MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2" \
MPIEXEC=mpirun \
MPI_HOSTS="node-2:4,node-3:4" \
MPI_RANKS=8 \
MPI_PE=48 \
MPI_MAP_BY=ppr:2:numa:PE=48 \
MPI_BIND_TO=core \
MPI_REPORT_BINDINGS=1 \
TORCH_THREADS=48 \
OMP_NUM_THREADS=48 \
MKL_NUM_THREADS=48 \
BLIS_NUM_THREADS=48 \
OBS_FILTER=ring_xz \
MAIN1_NQ=128 \
MAIN1_LAYERS=24 \
MAIN1_BOND=1024 \
tools/run_vidal_mpi_contest_cases.sh main1
```
MPIEXEC=mpirun \
MPI_HOSTS="node-2:4" \
MPI_RANKS=4 \
MPI_PE=48 \
MPI_MAP_BY=ppr:2:numa:PE=48 \
MPI_BIND_TO=core \
MPI_REPORT_BINDINGS=1 \
TORCH_THREADS=48 \
OMP_NUM_THREADS=48 \
MKL_NUM_THREADS=48 \
BLIS_NUM_THREADS=48 \
OBS_FILTER=ring_xz \
MAIN1_NQ=128 \
MAIN1_LAYERS=24 \
MAIN1_BOND=1024 \
tools/run_vidal_mpi_contest_cases.sh main1
```

42
docs/xianchang.md Normal file
View File

@@ -0,0 +1,42 @@
mpirun --map-by ppr:1:numa:PE=80 --bind-to core --report-bindings \
-x LD_PRELOAD=/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5 \
-x BLIS_NUM_THREADS=80 \
-x OMP_NUM_THREADS=80 \
-x MKL_NUM_THREADS=80 \
-x OMP_PROC_BIND=close \
-x OMP_PLACES=cores \
-np 4 \
-host node-0:2,node-1:2,node-2:2,node-3:2 \
.venv/bin/python -u tools/tn_contest_runner.py contract \
--mpi \
--case main1 \
--nqubits 34 \
--nlayers 20 \
--observables long_z_string \
--tree-dir trees/contest_tn \
--torch-threads 80 \
--dtype complex64
SEARCH_TIME=300 NQUBITS=40 TN_DEBUG_TRIALS=1 SCHEDULER_HOST=10.20.1.102 DASK_ADDRESS=tcp://10.20.1.102:8786 WORKER_HOSTS="10.20.1.102 10.20.1.103" CASE=main1 OBSERVABLES=long_z_string TORCH_THREADS=80 MPI_PE=80 MPI_MAP_BY=ppr:1:numa:PE=80 MPI_BIND_TO=core OMP_NUM_THREADS=80 MKL_NUM_THREADS=80 BLIS_NUM_THREADS=80 MPI_HOSTS="node-2:2,node-3:2" MPI_RANKS=4 NWORKERS=128 TN_TARGET_SIZE=17179869184 tools/run_tn_dask_mpi_all.sh
NQUBITS=40 \
TN_DEBUG_TRIALS=1 \
SCHEDULER_HOST=10.20.1.102 \
DASK_ADDRESS=tcp://10.20.1.102:8786 \
WORKER_HOSTS="10.20.1.102 10.20.1.103" \
CASE=main1 \
OBSERVABLES=long_z_string \
TORCH_THREADS=80 \
MPI_PE=80 \
MPI_MAP_BY=ppr:1:numa:PE=80 \
MPI_BIND_TO=core \
OMP_NUM_THREADS=80 \
MKL_NUM_THREADS=80 \
BLIS_NUM_THREADS=80 \
MPI_HOSTS="node-2:2,node-3:2" \
MPI_RANKS=4 \
NWORKERS=96 \
TN_TARGET_SIZE=17179869184 \
tools/run_tn_dask_mpi_all.sh

View File

@@ -1,2 +1,4 @@
10.20.1.103:2
10.20.6.101:2
10.20.1.100
10.20.1.101
10.20.1.102
10.20.1.103

138
requirements.txt Normal file
View File

@@ -0,0 +1,138 @@
alembic==1.18.4
annotated-types==0.7.0
antlr4-python3-runtime==4.13.2
anyio==4.13.0
asttokens==3.0.1
attrs==26.1.0
autoray==0.8.10
beautifulsoup4==4.14.3
certifi==2026.4.22
cffi==2.0.0
charset-normalizer==3.4.7
click==8.3.3
cloudpickle==3.1.2
cma==3.4.0
colorlog==6.10.1
contourpy==1.3.3
cotengra==0.7.5
coverage==7.13.5
cryptography==47.0.0
cycler==0.12.1
cytoolz==1.1.0
dask==2026.3.0
decorator==5.2.1
dill==0.4.1
distributed==2026.3.0
executing==2.2.1
filelock==3.25.2
fonttools==4.62.1
fsspec==2026.2.0
greenlet==3.3.2
h11==0.16.0
h5py==3.16.0
html5lib==1.1
httpcore==1.0.9
httpx==0.27.2
httpx-sse==0.4.3
idna==3.13
igraph==1.0.0
iniconfig==2.3.0
ipython==8.39.0
jedi==0.19.2
Jinja2==3.1.6
joblib==1.5.3
jsonschema==4.26.0
jsonschema-specifications==2025.9.1
kahypar==1.3.7
kiwisolver==1.5.0
llvmlite==0.44.0
locket==1.0.0
lxml==6.1.0
Mako==1.3.10
markdownify==1.2.2
MarkupSafe==3.0.3
matplotlib==3.10.8
matplotlib-inline==0.2.1
mcp==1.27.0
mcp-server-fetch==2025.4.7
mpi4py==4.1.1
mpmath==1.3.0
msgpack==1.1.2
networkx==3.6.1
numba==0.61.2
numpy==2.0.1
openqasm3==1.0.1
opt_einsum==3.4.0
optuna==4.8.0
packaging==26.0
parso==0.8.6
partd==1.4.2
pexpect==4.9.0
pillow==12.2.0
pluggy==1.6.0
prompt_toolkit==3.0.52
Protego==0.6.0
protobuf==7.34.1
psutil==5.9.8
ptyprocess==0.7.0
pure_eval==0.2.3
py-spy==0.4.2
pycparser==3.0
pydantic==2.13.3
pydantic-settings==2.14.0
pydantic_core==2.46.3
Pygments==2.20.0
PyJWT==2.12.1
pyparsing==3.3.2
pytest==9.0.3
pytest-cov==7.1.0
pytest-env==1.6.0
python-dateutil==2.9.0.post0
python-dotenv==1.2.2
python-multipart==0.0.26
PyYAML==6.0.3
qibo==0.3.2
qibojit==0.1.15
-e git+https://git.nudt.space/jaunatisblue/qibotn.git@4c7a10d026d514897dcc501b507fa604fb4e52d4#egg=qibotn
qiskit==1.4.5
qmatchatea==1.5.8
qredtea==0.3.15
qtealeaves==1.7.32
quimb==1.13.0
ray==2.55.1
readabilipy==0.3.0
referencing==0.37.0
regex==2026.4.4
requests==2.33.1
rpds-py==0.30.0
rustworkx==0.17.1
scipy==1.17.1
setuptools==70.2.0
six==1.17.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.8.3
SQLAlchemy==2.0.49
sse-starlette==3.4.1
stack-data==0.6.3
starlette==1.0.0
stevedore==5.7.0
symengine==0.13.0
sympy==1.13.1
tabulate==0.9.0
tblib==3.2.2
texttable==1.7.0
threadpoolctl==3.6.0
toolz==1.1.0
torch @ file:///home/qibo/qibotn/wheels/torch-2.10.0a0+a36e1d3-cp312-cp312-linux_x86_64.whl
tornado==6.5.5
tqdm==4.67.3
traitlets==5.14.3
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.6.3
uvicorn==0.46.0
wcwidth==0.6.0
webencodings==0.5.1
zict==3.0.0

View File

@@ -20,6 +20,7 @@ MPI_THREADS="${MPI_THREADS:-12}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
run() {
echo

View File

@@ -420,6 +420,7 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
search_time = opts.get("max_time", 60)
search_backend = opts.get("search_backend")
dask_address = opts.get("dask_address")
dask_expected_workers = opts.get("dask_expected_workers")
dask_close_workers = bool(opts.get("dask_close_workers", False))
print_stats = bool(opts.get("print_stats", False))
debug_trials = bool(opts.get("debug_trials", False))
@@ -502,6 +503,7 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
dask_address=dask_address,
debug_trials=debug_trials,
dask_close_workers=dask_close_workers,
expected_workers=dask_expected_workers,
)
search_seconds = time.perf_counter() - search_start
if tree is None:

View File

@@ -356,6 +356,7 @@ def _dask_search(
optlib=None,
debug_trials=False,
close_workers=False,
expected_workers=None,
):
"""Run one centralized cotengra hyper-optimizer over a dask pool.
@@ -403,6 +404,8 @@ def _dask_search(
retire_workers = []
try:
workers, worker_slots = _dask_worker_slots(client)
if expected_workers is not None:
worker_slots = max(worker_slots, int(expected_workers))
if close_workers:
retire_workers = list(workers)
if debug_trials:
@@ -532,7 +535,7 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
max_time=300, n_workers=48, slicing_opts=None,
trial_timeout=None, search_backend=None,
dask_address=None, debug_trials=False,
dask_close_workers=False):
dask_close_workers=False, expected_workers=None):
"""Parallel contraction path search.
Args:
@@ -576,6 +579,7 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
n_workers=n_workers,
debug_trials=debug_trials,
close_workers=dask_close_workers,
expected_workers=expected_workers,
)
else:
raise ValueError(f"Unknown method: {method}")

View File

@@ -16,3 +16,4 @@ Files here are intentionally secondary:
- `benchmark_tn_mpi.py`, `benchmark_search.py`, `benchmark_slice.py`, `benchmark_contract_sliced.py`, `check_tree.py`: old TN path-search/slicing experiments.
- `qibojit_reference_expectation.py`: state-vector reference helper.
- `validate_vidal_mpi_correctness.py`: focused Vidal MPI correctness helper.
- `mpi_torch_thread_probe.py`: MPI + torch OpenMP affinity and threading probe.

View File

@@ -0,0 +1,157 @@
#!/usr/bin/env python
"""Benchmark qredtea/qtealeaves SVD control modes.
This isolates the tensor split used by MPS updates: a rank-2 tensor is split
with singular values contracted either left or right, then reconstructed to
measure numerical error and timing.
"""
from __future__ import annotations
import argparse
import gc
import statistics
import time
import torch
import qmatchatea
from qredtea.torchapi import QteaTorchTensor
def _dtype(name: str):
return {
"complex64": torch.complex64,
"complex128": torch.complex128,
"float64": torch.float64,
"float32": torch.float32,
}[name]
def _random_matrix(shape, dtype, seed):
gen = torch.Generator(device="cpu")
gen.manual_seed(seed)
if dtype.is_complex:
real_dtype = torch.float32 if dtype == torch.complex64 else torch.float64
real = torch.randn(shape, dtype=real_dtype, generator=gen)
imag = torch.randn(shape, dtype=real_dtype, generator=gen)
return torch.complex(real, imag).to(dtype)
return torch.randn(shape, dtype=dtype, generator=gen)
def _sync():
if torch.cuda.is_available():
torch.cuda.synchronize()
def run_one(matrix, ctrl, max_bond, contract_singvals, repeats):
    """Time ``split_svd`` on ``matrix`` for one SVD control mode.

    The split is repeated ``repeats`` times. After the final repetition the
    two factors are multiplied back together to obtain a relative
    reconstruction error and the number of singular values returned. If a
    split raises, timing stops and the result carries status ``"error"`` with
    the exception's ``repr``.

    Returns a dict with keys ``ctrl``, ``contract_singvals``, ``status``,
    ``median_ms``, ``min_ms``, ``rel_error``, ``kept`` and ``error``. The two
    timing fields are NaN when no repetition completed.
    """
    conv_params = qmatchatea.QCConvergenceParameters(
        max_bond_dimension=max_bond,
        cut_ratio=0.0,
        svd_ctrl=ctrl,
    )
    wrapped = QteaTorchTensor.from_elem_array(matrix, dtype=matrix.dtype, device="cpu")

    durations = []
    rel_error = None
    kept = None
    status = "ok"
    error = ""
    final_index = repeats - 1

    for attempt in range(repeats):
        gc.collect()
        _sync()
        started = time.perf_counter()
        try:
            left, right, singvals, _ = wrapped.split_svd(
                [0],
                [1],
                contract_singvals=contract_singvals,
                conv_params=conv_params,
            )
        except Exception as exc:  # noqa: BLE001 - benchmark should keep going
            status = "error"
            error = repr(exc)
            break
        _sync()
        durations.append(time.perf_counter() - started)

        if attempt != final_index:
            continue
        # Only the last repetition pays for the reconstruction check.
        n_rows, n_cols = matrix.shape[0], matrix.shape[1]
        rebuilt = left.elem.reshape(n_rows, -1) @ right.elem.reshape(-1, n_cols)
        residual_norm = torch.linalg.vector_norm(matrix - rebuilt)
        rel_error = (residual_norm / torch.linalg.vector_norm(matrix)).item()
        kept = int(singvals.numel())

    if durations:
        median_ms = statistics.median(durations) * 1000
        min_ms = min(durations) * 1000
    else:
        median_ms = float("nan")
        min_ms = float("nan")

    return {
        "ctrl": ctrl,
        "contract_singvals": contract_singvals,
        "status": status,
        "median_ms": median_ms,
        "min_ms": min_ms,
        "rel_error": rel_error,
        "kept": kept,
        "error": error,
    }
def main():
    """Command-line entry point for the SVD control-mode benchmark.

    For every requested ``MxN`` shape a seeded random matrix is generated
    (seed = M + N) and ``run_one`` is called for each singular-value
    contraction side ("L", then "R") and each control mode. One ``row`` line
    is printed per run, after a header line and a column-name line.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--shapes", nargs="+", default=("256x1024", "1024x256", "512x512"))
    cli.add_argument("--max-bond", type=int, default=128)
    cli.add_argument("--dtype", choices=("complex64", "complex128", "float32", "float64"), default="complex128")
    cli.add_argument("--threads", type=int, default=8)
    cli.add_argument("--repeats", type=int, default=3)
    cli.add_argument(
        "--controls",
        nargs="+",
        default=("A", "D", "V", "R", "E", "E!", "X", "X!"),
    )
    args = cli.parse_args()

    torch.set_num_threads(args.threads)
    dtype = _dtype(args.dtype)

    banner = (
        "svd_benchmark "
        f"dtype={args.dtype} threads={torch.get_num_threads()} "
        f"max_bond={args.max_bond} repeats={args.repeats}"
    )
    print(banner, flush=True)
    print(
        "columns shape contract ctrl status median_ms min_ms kept rel_error error",
        flush=True,
    )

    for shape_text in args.shapes:
        rows_text, cols_text = shape_text.lower().split("x", 1)
        dims = (int(rows_text), int(cols_text))
        matrix = _random_matrix(dims, dtype, seed=sum(dims))
        for side in ("L", "R"):
            for ctrl in args.controls:
                outcome = run_one(
                    matrix,
                    ctrl=ctrl,
                    max_bond=args.max_bond,
                    contract_singvals=side,
                    repeats=args.repeats,
                )
                line = (
                    f"row shape={shape_text} "
                    f"contract={side} "
                    f"ctrl={ctrl} "
                    f"status={outcome['status']} "
                    f"median_ms={outcome['median_ms']:.3f} "
                    f"min_ms={outcome['min_ms']:.3f} "
                    f"kept={outcome['kept']} "
                    f"rel_error={outcome['rel_error']} "
                    f"error={outcome['error']}"
                )
                print(line, flush=True)
if __name__ == "__main__":
main()

View File

@@ -17,10 +17,10 @@ set -euo pipefail
# WORKER_HOSTS="10.20.1.103 10.20.6.101"
# NWORKERS=48
# NTHREADS=1
# ROOT_DIR=/home/yx/qibotn
# ROOT_DIR=/home/qibo/qibotn
# PYTHON_BIN=.venv/bin/python
ROOT_DIR="${ROOT_DIR:-/home/yx/qibotn}"
ROOT_DIR="${ROOT_DIR:-/home/qibo/qibotn}"
PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
SCHEDULER_HOST="${SCHEDULER_HOST:-10.20.1.103}"
SCHEDULER_PORT="${SCHEDULER_PORT:-8786}"

View File

@@ -0,0 +1,182 @@
#!/usr/bin/env python
"""Probe MPI rank placement and whether torch CPU ops use multiple threads.
Run this under mpirun/mpiexec to check:
* which CPUs each rank is allowed to run on,
* whether torch sees the requested intra-op thread count, and
* whether a large CPU tensor op actually consumes more CPU time than wall time.
The script is intentionally small and self-contained so it can be used to debug
MPI launcher affinity and torch OpenMP behavior independently from the TN code
path.
"""
from __future__ import annotations
import argparse
import os
import socket
import time
from pathlib import Path
from mpi4py import MPI
def _dtype_from_name(name):
import torch
mapping = {
"float32": torch.float32,
"float64": torch.float64,
"complex64": torch.complex64,
"complex128": torch.complex128,
}
return mapping[name]
def _make_tensor(shape, dtype):
import torch
if dtype in (torch.complex64, torch.complex128):
base = torch.float32 if dtype == torch.complex64 else torch.float64
return torch.complex(
torch.randn(shape, dtype=base),
torch.randn(shape, dtype=base),
)
return torch.randn(shape, dtype=dtype)
def _bench(label, fn, iters, warmup=2):
for _ in range(warmup):
fn()
start_wall = time.perf_counter()
start_cpu = time.process_time()
checksum = 0.0
for _ in range(iters):
value = fn()
checksum += float(value)
wall = time.perf_counter() - start_wall
cpu = time.process_time() - start_cpu
ratio = cpu / wall if wall > 0 else float("inf")
print(
f"{label} wall={wall:.3f}s cpu={cpu:.3f}s cpu_over_wall={ratio:.2f} "
f"checksum={checksum:.6e}",
flush=True,
)
def _visible_numa_nodes():
nodes = []
for path in sorted(Path("/sys/devices/system/node").glob("node[0-9]*")):
cpulist = path / "cpulist"
if cpulist.exists():
nodes.append(f"{path.name}:{cpulist.read_text(encoding='utf-8').strip()}")
return ",".join(nodes) if nodes else "unknown"
def _dtype_nbytes(name):
return {
"float32": 4,
"float64": 8,
"complex64": 8,
"complex128": 16,
}[name]
def _format_gib(nbytes):
return f"{nbytes / (1024 ** 3):.2f}GiB"
def main():
    """Print per-rank placement diagnostics, then optionally time torch CPU ops.

    Every rank prints its host, PID, CPU affinity, torch thread settings, the
    relevant OpenMP/MKL environment variables and the NUMA layout read from
    sysfs. Rank 0 also prints torch's parallel info and an estimate of the
    matrix memory footprint. After a barrier, unless ``--affinity-only`` was
    given, each rank times ``matmul`` and/or ``tensordot`` on two random
    ``n x n`` tensors.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--threads", type=int, default=48)
    cli.add_argument("--n", type=int, default=4096)
    cli.add_argument("--iters", type=int, default=4)
    cli.add_argument("--dtype", choices=("float32", "float64", "complex64", "complex128"), default="float32")
    cli.add_argument("--op", choices=("matmul", "tensordot", "both"), default="both")
    cli.add_argument(
        "--affinity-only",
        action="store_true",
        help="Print MPI/torch placement diagnostics without allocating tensors.",
    )
    args = cli.parse_args()

    # These defaults are applied before torch is imported below; values that
    # are already present in the environment are left untouched.
    thread_text = str(args.threads)
    for key, fallback in (
        ("OMP_NUM_THREADS", thread_text),
        ("MKL_NUM_THREADS", thread_text),
        ("OMP_PROC_BIND", "close"),
        ("OMP_PLACES", "cores"),
    ):
        os.environ.setdefault(key, fallback)

    import torch

    world = MPI.COMM_WORLD
    rank = world.Get_rank()
    size = world.Get_size()

    torch.set_num_threads(args.threads)
    try:
        torch.set_num_interop_threads(1)
    except Exception:
        # Best effort: the probe continues with whatever inter-op setting is active.
        pass

    dtype = _dtype_from_name(args.dtype)
    cpu_ids = sorted(os.sched_getaffinity(0))

    allowed_list = ""
    try:
        with open("/proc/self/status", encoding="utf-8") as status_file:
            for entry in status_file:
                if not entry.startswith("Cpus_allowed_list:"):
                    continue
                allowed_list = entry.split(":", 1)[1].strip()
                break
    except OSError:
        pass

    placement = (
        f"rank={rank}/{size} host={socket.gethostname()} pid={os.getpid()} "
        f"affinity_len={len(cpu_ids)} allowed={allowed_list} "
        f"torch_threads={torch.get_num_threads()} "
        f"torch_interop={torch.get_num_interop_threads()} "
        f"OMP_NUM_THREADS={os.environ.get('OMP_NUM_THREADS')} "
        f"MKL_NUM_THREADS={os.environ.get('MKL_NUM_THREADS')} "
        f"OMP_PROC_BIND={os.environ.get('OMP_PROC_BIND')} "
        f"OMP_PLACES={os.environ.get('OMP_PLACES')} "
        f"visible_numa={_visible_numa_nodes()}"
    )
    print(placement, flush=True)

    if rank == 0:
        print(torch.__config__.parallel_info(), flush=True)
        one_matrix_bytes = args.n * args.n * _dtype_nbytes(args.dtype)
        # Two inputs plus one result per rank.
        live_bytes = 3 * one_matrix_bytes
        print(
            f"matrix_n={args.n} dtype={args.dtype} "
            f"one_matrix={_format_gib(one_matrix_bytes)} "
            f"approx_min_live_per_rank={_format_gib(live_bytes)} "
            f"approx_min_live_all_ranks={_format_gib(live_bytes * size)}",
            flush=True,
        )

    world.Barrier()
    if args.affinity_only:
        return

    dims = (args.n, args.n)
    a = _make_tensor(dims, dtype)
    b = _make_tensor(dims, dtype)

    def _as_real_scalar(value):
        # Complex results contribute only their real part to the checksum.
        return value.real.item() if value.is_complex() else value.item()

    def run_matmul():
        return _as_real_scalar((a @ b).sum())

    def run_tensordot():
        return _as_real_scalar(torch.tensordot(a, b, dims=1).sum())

    selected = args.op
    if selected in ("matmul", "both"):
        _bench("matmul", run_matmul, args.iters)
    if selected in ("tensordot", "both"):
        _bench("tensordot", run_tensordot, args.iters)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,22 @@
#!/usr/bin/env bash
# Shared runtime setup for CPU torch TN/MPS runs.
#
# This makes AOCL BLIS use the multithreaded library when available, which is
# required for complex64 tensordot/cgemm to actually use all cores on this host.
# Path of the multithreaded AOCL BLIS library; callers may override it.
: "${QIBOTN_BLIS_MT:=/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5}"

# BLIS follows OMP_NUM_THREADS unless BLIS_NUM_THREADS was set explicitly.
: "${BLIS_NUM_THREADS:=${OMP_NUM_THREADS:-1}}"
export BLIS_NUM_THREADS

# Preload the library only if it exists and is not already listed as a
# colon-separated LD_PRELOAD entry.
if [[ -f "$QIBOTN_BLIS_MT" && ":${LD_PRELOAD:-}:" != *":$QIBOTN_BLIS_MT:"* ]]; then
  export LD_PRELOAD="${LD_PRELOAD:+$LD_PRELOAD:}$QIBOTN_BLIS_MT"
fi

# OpenMP placement defaults; values already set by the caller are kept.
: "${OMP_PROC_BIND:=close}"
: "${OMP_PLACES:=cores}"
export OMP_PROC_BIND OMP_PLACES

View File

@@ -21,6 +21,7 @@ TN_THREADS="${TN_THREADS:-8}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
run_mpi() {
local ranks="$1"

View File

@@ -22,6 +22,7 @@ TN_THREADS="${TN_THREADS:-12}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
estimate_mps_memory() {
local nqubits="$1"

View File

@@ -11,25 +11,165 @@ NLAYERS="${NLAYERS:-20}"
TORCH_THREADS="${TORCH_THREADS:-48}"
SEARCH_REPEATS="${SEARCH_REPEATS:-2048}"
SEARCH_TIME="${SEARCH_TIME:-300}"
TN_TARGET_SIZE="${TN_TARGET_SIZE:-8589934592}"
TN_TARGET_SIZE="${TN_TARGET_SIZE:-17179869184}"
TN_TARGET_SLICES="${TN_TARGET_SLICES:-}"
PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
DTYPE="${DTYPE:-complex64}"
TREE_DIR="${TREE_DIR:-trees/contest_tn}"
DASK_ADDRESS="${DASK_ADDRESS:-tcp://10.20.1.103:8786}"
MPIEXEC_FULL="${MPIEXEC_FULL:-mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2}"
DASK_EXPECTED_WORKERS="${DASK_EXPECTED_WORKERS:-}"
DASK_WAIT_FOR_WORKERS="${DASK_WAIT_FOR_WORKERS:-1}"
DASK_WAIT_TIMEOUT="${DASK_WAIT_TIMEOUT:-600}"
TN_DEBUG_TRIALS="${TN_DEBUG_TRIALS:-0}"
MPIEXEC="${MPIEXEC:-mpirun}"
MPIEXEC_FULL="${MPIEXEC_FULL:-}"
MPI_HOSTS="${MPI_HOSTS:-}"
MPI_HOSTFILE="${MPI_HOSTFILE:-${HOSTFILE:-}}"
MPI_RANKS="${MPI_RANKS:-}"
MPI_PE="${MPI_PE:-$TORCH_THREADS}"
MPI_MAP_BY="${MPI_MAP_BY:-ppr:1:numa:PE=$MPI_PE}"
MPI_BIND_TO="${MPI_BIND_TO:-core}"
MPI_REPORT_BINDINGS="${MPI_REPORT_BINDINGS:-0}"
MPI_EXPORT_ENV="${MPI_EXPORT_ENV:-1}"
TN_CONTRACT_ENV_CHECK="${TN_CONTRACT_ENV_CHECK:-1}"
SYNC_TREES="${SYNC_TREES:-1}"
SYNC_HOSTS="${SYNC_HOSTS:-${WORKER_HOSTS:-}}"
SSH_BIN="${SSH_BIN:-ssh}"
DASK_CLUSTER_MANAGED="${DASK_CLUSTER_MANAGED:-0}"
export TCM_ENABLE="${TCM_ENABLE:-1}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$TORCH_THREADS}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-$TORCH_THREADS}"
source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
tn_slice_args=(--tn-target-size "$TN_TARGET_SIZE")
if [[ -n "$TN_TARGET_SLICES" ]]; then
tn_slice_args+=(--tn-target-slices "$TN_TARGET_SLICES")
fi
#######################################
# Exit handler: stop the Dask cluster if this script started it, then exit
# with the status that was current when the handler was entered.
# Globals:   DASK_CLUSTER_MANAGED (read) - "1" once the cluster was started here
# Returns:   does not return; always exits the shell
#######################################
cleanup_dask_cluster() {
  local status=$?
  # This handler is registered for EXIT as well as for signals and ends in
  # `exit`. Disarm the traps first so it cannot re-enter itself and stop the
  # cluster a second time.
  trap - EXIT INT TERM HUP
  if [[ "${DASK_CLUSTER_MANAGED:-0}" == "1" ]]; then
    set +e
    # Best effort: a failing stop must not replace the original exit status.
    tools/manage_tn_dask_cluster.sh stop >/dev/null 2>&1 || true
  fi
  exit "$status"
}
trap cleanup_dask_cluster EXIT INT TERM HUP
#######################################
# Sum the slot counts of a comma-separated host list.
# Arguments: $1 - host list such as "node-0:2,node-1:2"; an entry without
#                 ":N" counts as one slot
# Outputs:   the total on stdout; a diagnostic on stderr for a bad entry
# Returns:   0 on success, 1 if a slot count is not a non-negative integer
#######################################
sum_host_slots() {
  local hosts="$1"
  local total=0
  local item slots
  # Keep the scratch array local so it cannot clobber a caller's variable.
  local -a host_items=()
  IFS=',' read -r -a host_items <<< "$hosts"
  for item in "${host_items[@]}"; do
    if [[ "$item" == *:* ]]; then
      slots="${item##*:}"
    else
      slots=1
    fi
    if [[ ! "$slots" =~ ^[0-9]+$ ]]; then
      printf 'sum_host_slots: invalid slot count in host entry "%s"\n' "$item" >&2
      return 1
    fi
    # Force base 10 so a count like "08" is not parsed as octal.
    total=$((total + 10#$slots))
  done
  echo "$total"
}
#######################################
# Count the entries of a space-separated host list.
# Arguments: $1 - host list such as "10.20.1.102 10.20.1.103"
# Outputs:   the number of non-empty entries on stdout
#######################################
count_hosts() {
  local hosts="$1"
  local count=0
  local item
  # Keep the scratch array local so it cannot clobber a caller's variable.
  local -a host_items=()
  IFS=' ' read -r -a host_items <<< "$hosts"
  for item in "${host_items[@]}"; do
    if [[ -n "$item" ]]; then
      count=$((count + 1))
    fi
  done
  echo "$count"
}
#######################################
# Poll the Dask scheduler until the expected number of workers has
# registered or the timeout elapses. On timeout the polling script prints a
# message and still finishes normally.
# Globals (read):
#   DASK_WAIT_FOR_WORKERS  - polling happens only when this is "1"
#   DASK_EXPECTED_WORKERS  - explicit worker count; if empty, it is derived
#                            as (number of WORKER_HOSTS entries) * NWORKERS
#   WORKER_HOSTS, NWORKERS - optional; used only for that derivation
#   PYTHON_BIN, DASK_ADDRESS, DASK_WAIT_TIMEOUT
# Returns:   0 when there is nothing to wait for; otherwise the exit status
#            of the polling Python process
#######################################
wait_for_dask_workers() {
  [[ "${DASK_WAIT_FOR_WORKERS:-}" == "1" ]] || return 0
  local expected="${DASK_EXPECTED_WORKERS:-}"
  # Read the optional inputs with defaults so an unset variable does not
  # abort the script under `set -u`.
  local worker_hosts="${WORKER_HOSTS:-}"
  local workers_per_host="${NWORKERS:-}"
  local host_count
  if [[ -z "$expected" && -n "$worker_hosts" && -n "$workers_per_host" ]]; then
    host_count="$(count_hosts "$worker_hosts")"
    expected=$(( host_count * workers_per_host ))
  fi
  if [[ -z "$expected" || "$expected" -le 0 ]]; then
    return 0
  fi
  echo "Waiting for Dask workers: expected=$expected timeout=${DASK_WAIT_TIMEOUT}s"
  "$PYTHON_BIN" - "$DASK_ADDRESS" "$expected" "$DASK_WAIT_TIMEOUT" <<'PY'
import sys
import time
from distributed import Client
address, expected, timeout = sys.argv[1], int(sys.argv[2]), int(sys.argv[3])
deadline = time.time() + timeout
client = Client(address)
try:
    while True:
        info = client.scheduler_info(n_workers=-1)
        workers = info.get("workers", {})
        count = len(workers)
        if count >= expected:
            print(f"dask_workers_ready count={count} expected={expected}", flush=True)
            break
        if time.time() >= deadline:
            print(
                f"dask_workers_wait_timeout count={count} expected={expected}",
                flush=True,
            )
            break
        time.sleep(2)
finally:
    client.close()
PY
}
#######################################
# Append `-x NAME=value` launcher arguments for the threading-related
# environment variables to the mpi_prefix array.
# Globals:   MPI_EXPORT_ENV (read) - nothing is appended unless it is "1"
#            LD_PRELOAD, BLIS_NUM_THREADS, OMP_NUM_THREADS, MKL_NUM_THREADS,
#            OMP_PROC_BIND, OMP_PLACES (read)
#            mpi_prefix (appended to)
# Returns:   0
#######################################
append_mpi_env_args() {
  if [[ "$MPI_EXPORT_ENV" != "1" ]]; then
    return 0
  fi
  # LD_PRELOAD may be unset; it is then forwarded as an empty value.
  mpi_prefix+=(-x "LD_PRELOAD=${LD_PRELOAD:-}")
  local var_name
  for var_name in BLIS_NUM_THREADS OMP_NUM_THREADS MKL_NUM_THREADS \
    OMP_PROC_BIND OMP_PLACES; do
    mpi_prefix+=(-x "${var_name}=${!var_name}")
  done
}
#######################################
# Build the MPI launcher command line into the mpi_prefix array.
# If MPIEXEC_FULL is set, it is word-split and used as the launcher command.
# Otherwise the command is assembled from MPIEXEC, MPI_MAP_BY, MPI_BIND_TO
# and a rank count taken from MPI_RANKS, else from the slot total of
# MPI_HOSTS, else 2. MPI_HOSTS takes precedence over MPI_HOSTFILE.
# Globals:   MPIEXEC_FULL, MPIEXEC, MPI_RANKS, MPI_HOSTS, MPI_HOSTFILE,
#            MPI_MAP_BY, MPI_BIND_TO, MPI_REPORT_BINDINGS (read)
#            mpi_prefix (written)
#######################################
build_mpi_prefix() {
  if [[ -n "$MPIEXEC_FULL" ]]; then
    # Word-split the user-supplied command, but with pathname expansion off so
    # a token such as "*" is passed through instead of matching local files.
    local restore_glob=1
    if [[ $- == *f* ]]; then
      restore_glob=0
    fi
    set -f
    # shellcheck disable=SC2206
    mpi_prefix=($MPIEXEC_FULL)
    if (( restore_glob )); then
      set +f
    fi
    append_mpi_env_args
    return
  fi
  local ranks="$MPI_RANKS"
  if [[ -z "$ranks" && -n "$MPI_HOSTS" ]]; then
    ranks="$(sum_host_slots "$MPI_HOSTS")"
  fi
  if [[ -z "$ranks" ]]; then
    ranks=2
  fi
  mpi_prefix=(
    "$MPIEXEC"
    --map-by "$MPI_MAP_BY"
    --bind-to "$MPI_BIND_TO"
    -np "$ranks"
  )
  if [[ "$MPI_REPORT_BINDINGS" == "1" ]]; then
    mpi_prefix+=(--report-bindings)
  fi
  append_mpi_env_args
  if [[ -n "$MPI_HOSTS" ]]; then
    mpi_prefix+=(-host "$MPI_HOSTS")
  elif [[ -n "$MPI_HOSTFILE" ]]; then
    mpi_prefix+=(-hostfile "$MPI_HOSTFILE")
  fi
}
is_local_host() {
local host="$1"
[[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0
@@ -62,25 +202,52 @@ sync_trees_to_hosts() {
}
tools/manage_tn_dask_cluster.sh start
DASK_CLUSTER_MANAGED=1
wait_for_dask_workers
echo "Search with dask: $DASK_ADDRESS"
"$PYTHON_BIN" -u tools/tn_contest_runner.py search \
--case "$CASE" \
--nqubits "$NQUBITS" \
--nlayers "$NLAYERS" \
--observables $OBSERVABLES \
--tree-dir "$TREE_DIR" \
--dask-address "$DASK_ADDRESS" \
--torch-threads "$TORCH_THREADS" \
--dtype "$DTYPE" \
--tn-search-repeats "$SEARCH_REPEATS" \
--tn-search-time "$SEARCH_TIME" \
search_args=(
--case "$CASE"
--nqubits "$NQUBITS"
--nlayers "$NLAYERS"
--observables $OBSERVABLES
--tree-dir "$TREE_DIR"
--dask-address "$DASK_ADDRESS"
--torch-threads "$TORCH_THREADS"
--dtype "$DTYPE"
--tn-search-repeats "$SEARCH_REPEATS"
--tn-search-time "$SEARCH_TIME"
"${tn_slice_args[@]}"
)
if [[ -n "$DASK_EXPECTED_WORKERS" ]]; then
search_args+=(--dask-expected-workers "$DASK_EXPECTED_WORKERS")
fi
if [[ "$TN_DEBUG_TRIALS" == "1" ]]; then
search_args+=(--tn-debug-trials)
fi
"$PYTHON_BIN" -u tools/tn_contest_runner.py search "${search_args[@]}"
sync_trees_to_hosts
echo "Contract with MPI: $MPIEXEC_FULL"
read -r -a mpi_prefix <<< "$MPIEXEC_FULL"
build_mpi_prefix
echo "Contract with MPI: ${mpi_prefix[*]}"
if [[ "$TN_CONTRACT_ENV_CHECK" == "1" ]]; then
"${mpi_prefix[@]}" "$PYTHON_BIN" -c "from mpi4py import MPI; import os; \
import torch; \
rank = MPI.COMM_WORLD.Get_rank(); \
blis = []; \
[blis.append(line.strip().split()[-1]) for line in open('/proc/self/maps') if 'libblis' in line and line.strip().split()[-1] not in blis]; \
print('tn_contract_env ' + \
f'rank={rank} ' + \
f'LD_PRELOAD={os.environ.get(\"LD_PRELOAD\", \"\")} ' + \
f'BLIS_NUM_THREADS={os.environ.get(\"BLIS_NUM_THREADS\", \"\")} ' + \
f'OMP_NUM_THREADS={os.environ.get(\"OMP_NUM_THREADS\", \"\")} ' + \
f'MKL_NUM_THREADS={os.environ.get(\"MKL_NUM_THREADS\", \"\")} ' + \
f'OMP_PROC_BIND={os.environ.get(\"OMP_PROC_BIND\", \"\")} ' + \
f'OMP_PLACES={os.environ.get(\"OMP_PLACES\", \"\")} ' + \
f'torch_threads={torch.get_num_threads()} ' + \
f'blis={\";\".join(blis) if blis else \"missing\"}', flush=True)"
fi
"${mpi_prefix[@]}" "$PYTHON_BIN" -u tools/tn_contest_runner.py contract \
--mpi \
--case "$CASE" \

View File

@@ -11,10 +11,15 @@ set -euo pipefail
#
# Common overrides:
# PYTHON_BIN=.venv/bin/python
# MPIEXEC=mpiexec
# MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2"
# MPIEXEC=mpirun
# MPI_HOSTS="node-1:2,node-2:2,node-3:2,node-0:2"
# MPI_RANKS=8
# MPI_PE=128
# MPI_MAP_BY=ppr:1:numa:PE=128
# MPI_BIND_TO=core
# MPIEXEC_FULL="mpirun --map-by ppr:1:numa:PE=128 --bind-to core -np 8 -host node-1:2,node-2:2,node-3:2,node-0:2"
# HOSTFILE=hostfile # optional; used only if the file exists
# RANKS=8
# RANKS=8 # fallback if MPI_RANKS is not set
# TORCH_THREADS=8
# CUT_RATIO=1e-12
# OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0"
@@ -28,12 +33,23 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"
PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
MPIEXEC="${MPIEXEC:-mpiexec}"
HOSTFILE="${HOSTFILE:-}"
MPIEXEC="${MPIEXEC:-mpirun}"
MPIEXEC_FULL="${MPIEXEC_FULL:-}"
MPI_HOSTS="${MPI_HOSTS:-}"
MPI_HOSTFILE="${MPI_HOSTFILE:-${HOSTFILE:-}}"
MPI_RANKS="${MPI_RANKS:-${RANKS:-}}"
RANKS="${RANKS:-4}"
TORCH_THREADS="${TORCH_THREADS:-1}"
MPI_PE="${MPI_PE:-$TORCH_THREADS}"
MPI_MAP_BY="${MPI_MAP_BY:-ppr:1:numa:PE=$MPI_PE}"
MPI_BIND_TO="${MPI_BIND_TO:-core}"
MPI_REPORT_BINDINGS="${MPI_REPORT_BINDINGS:-0}"
MPI_EXPORT_ENV="${MPI_EXPORT_ENV:-1}"
CUT_RATIO="${CUT_RATIO:-1e-12}"
OBS_FILTER="${OBS_FILTER:-}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$TORCH_THREADS}"
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-$TORCH_THREADS}"
source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
RUNNER_DIR="$ROOT_DIR/.tmp"
mkdir -p "$RUNNER_DIR"
@@ -238,15 +254,68 @@ if __name__ == "__main__":
main()
PY
if [[ -n "${MPIEXEC_FULL:-}" ]]; then
read -r -a mpi_prefix <<< "$MPIEXEC_FULL"
else
mpi_prefix=("$MPIEXEC")
if [[ -n "$HOSTFILE" && -f "$HOSTFILE" ]]; then
mpi_prefix+=("-hostfile" "$HOSTFILE")
#######################################
# Sum the slot counts of a comma-separated host list.
# Arguments: $1 - host list such as "node-1:2,node-2:2"; an entry without
#                 ":N" counts as one slot
# Outputs:   the total on stdout; a diagnostic on stderr for a bad entry
# Returns:   0 on success, 1 if a slot count is not a non-negative integer
#######################################
sum_host_slots() {
  local hosts="$1"
  local total=0
  local item slots
  # Keep the scratch array local so it cannot clobber a caller's variable.
  local -a host_items=()
  IFS=',' read -r -a host_items <<< "$hosts"
  for item in "${host_items[@]}"; do
    if [[ "$item" == *:* ]]; then
      slots="${item##*:}"
    else
      slots=1
    fi
    if [[ ! "$slots" =~ ^[0-9]+$ ]]; then
      printf 'sum_host_slots: invalid slot count in host entry "%s"\n' "$item" >&2
      return 1
    fi
    # Force base 10 so a count like "08" is not parsed as octal.
    total=$((total + 10#$slots))
  done
  echo "$total"
}
#######################################
# Append `-x NAME=value` launcher arguments for the threading-related
# environment variables to the mpi_prefix array.
# Globals:   MPI_EXPORT_ENV (read) - nothing is appended unless it is "1"
#            LD_PRELOAD, BLIS_NUM_THREADS, OMP_NUM_THREADS, MKL_NUM_THREADS,
#            OMP_PROC_BIND, OMP_PLACES (read)
#            mpi_prefix (appended to)
# Returns:   0
#######################################
append_mpi_env_args() {
  if [[ "$MPI_EXPORT_ENV" != "1" ]]; then
    return 0
  fi
  # LD_PRELOAD may be unset; it is then forwarded as an empty value.
  mpi_prefix+=(-x "LD_PRELOAD=${LD_PRELOAD:-}")
  local var_name
  for var_name in BLIS_NUM_THREADS OMP_NUM_THREADS MKL_NUM_THREADS \
    OMP_PROC_BIND OMP_PLACES; do
    mpi_prefix+=(-x "${var_name}=${!var_name}")
  done
}
build_mpi_prefix() {
if [[ -n "$MPIEXEC_FULL" ]]; then
# shellcheck disable=SC2206
mpi_prefix=($MPIEXEC_FULL)
append_mpi_env_args
return
fi
mpi_prefix+=("-n" "$RANKS")
fi
local ranks="$MPI_RANKS"
if [[ -z "$ranks" && -n "$MPI_HOSTS" ]]; then
ranks="$(sum_host_slots "$MPI_HOSTS")"
fi
if [[ -z "$ranks" ]]; then
ranks="$RANKS"
fi
mpi_prefix=(
"$MPIEXEC"
--map-by "$MPI_MAP_BY"
--bind-to "$MPI_BIND_TO"
-np "$ranks"
)
if [[ "$MPI_REPORT_BINDINGS" == "1" ]]; then
mpi_prefix+=(--report-bindings)
fi
append_mpi_env_args
if [[ -n "$MPI_HOSTS" ]]; then
mpi_prefix+=(-host "$MPI_HOSTS")
elif [[ -n "$MPI_HOSTFILE" ]]; then
mpi_prefix+=(-hostfile "$MPI_HOSTFILE")
fi
}
build_mpi_prefix
run_case() {
local label="$1"
@@ -323,7 +392,12 @@ Cases:
Common overrides:
PYTHON_BIN=.venv/bin/python
MPIEXEC=mpiexec
MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2"
MPI_HOSTS="node-1:2,node-2:2,node-3:2,node-0:2"
MPI_RANKS=8
MPI_PE=128
MPI_MAP_BY=ppr:1:numa:PE=128
MPI_BIND_TO=core
MPIEXEC_FULL="mpirun --map-by ppr:1:numa:PE=128 --bind-to core -np 8 -host node-1:2,node-2:2,node-3:2,node-0:2"
HOSTFILE=hostfile
RANKS=8
TORCH_THREADS=8

View File

@@ -47,7 +47,7 @@ CASES = {
"main1": CaseSpec(
circuit_kind="rxx_rzz_chain",
observables=("ring_xz",),
nqubits=34,
nqubits=37,
nlayers=20,
seed=31001,
target_slices=None,
@@ -205,6 +205,8 @@ def build_parallel_opts(args, tree_file=None, search_only=False):
opts["search_backend"] = args.tn_search_backend
if args.dask_address is not None:
opts["dask_address"] = args.dask_address
if args.dask_expected_workers is not None:
opts["dask_expected_workers"] = args.dask_expected_workers
if args.dask_close_workers:
opts["dask_close_workers"] = True
if args.tn_debug_trials:
@@ -378,7 +380,7 @@ def main():
parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch")
parser.add_argument("--dtype", choices=("complex128", "complex64"), default="complex64")
parser.add_argument("--tn-target-slices", type=int)
parser.add_argument("--tn-target-size", type=int, default=2**32)
parser.add_argument("--tn-target-size", type=int, default=2**34)
parser.add_argument("--tn-search-workers", type=int)
parser.add_argument("--tn-search-repeats", type=int, default=2048)
parser.add_argument("--tn-search-time", type=float, default=300.0)
@@ -392,6 +394,7 @@ def main():
),
)
parser.add_argument("--dask-address")
parser.add_argument("--dask-expected-workers", type=int)
parser.add_argument("--dask-close-workers", action="store_true")
parser.add_argument(
"--keep-dask",

Binary file not shown.

Binary file not shown.