优化 torch CPU 张量网络收缩路径

- torch CPU 收缩默认走 cotengra matmul lowering - 复用 mm/bmm/matmul 输出缓冲区，降低中间张量分配压力 - 仅回收 contiguous tensor，避免非连续 view 进入 workspace - 调整 cotengra 中间节点 index 顺序，减少 reshape 触发 clone/copy - qibotn MPI 分片收缩显式使用 backend=torch - rank 内分片结果先在 torch 中累加，最后再转 numpy 做 Reduce - 统一 quimb 后端 torch 数组转换为 CPU contiguous complex128
完善脚本功能，添加计时估计
2026-05-08 11:57:18 +08:00 · 2026-05-08 10:20:03 +08:00 · 2026-05-08 00:16:28 +08:00 · 2026-05-08 00:12:32 +08:00 · 2026-05-07 23:37:15 +08:00 · 2026-05-07 23:35:23 +08:00
19 changed files with 1064 additions and 158 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,10 +2,14 @@
 __pycache__/
 *.py[cod]
 *$py.class
-
+data/
 # C extensions
 *.so
-
+bak/
 path/
 profiles/
 vtune_expval/
 perf*
 # Distribution / packaging
 .Python
 build/
--- a/benchmark_contract_sliced.py
+++ b/benchmark_contract_sliced.py
@@ -0,0 +1,56 @@
 """MPI parallel sliced contraction using pre-sliced tree."""
 import time, pickle, os
 import numpy as np
 from mpi4py import MPI
 NQUBITS, NLAYERS, NCORES = 25, 10, 48
 comm = MPI.COMM_WORLD
 rank, size = comm.Get_rank(), comm.Get_size()
 os.environ['OMP_NUM_THREADS'] = str(NCORES)
 os.environ['MKL_NUM_THREADS'] = str(NCORES)
 import torch
 import qibo, quimb as qu
 from qibotn.observables import build_random_circuit
 torch.set_num_threads(NCORES)
 circuit = build_random_circuit(NQUBITS, NLAYERS)
 qibo.set_backend("qibotn", platform="quimb")
 backend = qibo.get_backend()
 backend.configure_tn_simulation(ansatz="tn")
 qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
 tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn')
 if rank == 0:
    with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'rb') as f:
        tree = pickle.load(f)
 else:
    tree = None
 tree = comm.bcast(tree, root=0)
 arrays = [torch.from_numpy(np.asarray(t._data)) for t in tn.tensors]
 n_slices = tree.multiplicity
 if rank == 0:
    print(f"Slices: {n_slices}, Ranks: {size}, "
          f"Peak: {tree.max_size() * 16 / 1e9:.2f} GB, "
          f"Threads/rank: {NCORES}, Backend: torch")
 t0 = time.time()
 result = None
 for i in range(rank, n_slices, size):
    val = tree.contract_slice(arrays, i, backend='torch')
    val_np = val.cpu().numpy().reshape(-1)
    result = val_np if result is None else result + val_np
 if result is None:
    result = np.zeros(1, dtype=np.complex128)
 total = np.zeros_like(result) if rank == 0 else None
 comm.Reduce(result, total, root=0)
 if rank == 0:
    print(f"Contract: {time.time() - t0:.4f}s  Expectation: {0.5 * total[0].real:.10f}")
--- a/benchmark_search.py
+++ b/benchmark_search.py
@@ -0,0 +1,34 @@
 """Search contraction path and save."""
 import time, os, pickle
 from qibotn.parallel import parallel_path_search
 from qibotn.observables import build_random_circuit
 import qibo, quimb as qu
 from mpi4py import MPI
 NQUBITS, NLAYERS, WORKERS = 20, 10, 96
 comm = MPI.COMM_WORLD
 rank, size = comm.Get_rank(), comm.Get_size()
 method = 'mpi' if size > 1 else 'processpool'
 circuit = build_random_circuit(NQUBITS, NLAYERS)
 qibo.set_backend("qibotn", platform="quimb")
 backend = qibo.get_backend()
 backend.configure_tn_simulation(ansatz="tn")
 qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
 tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn')
 if rank == 0:
    print(f"Searching {NQUBITS}q {NLAYERS}l, method={method}, ranks={size}, workers/rank={WORKERS}...")
 t0 = time.time()
 tree = parallel_path_search(tn, tn.outer_inds(), method=method,
    total_repeats=1024, max_time=300, n_workers=WORKERS,trial_timeout=60)
 t_search = time.time() - t0
 if rank == 0:
    os.makedirs('data', exist_ok=True)
    path = f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl"
    with open(path, 'wb') as f:
        pickle.dump(tree, f)
    print(f"Search: {t_search:.2f}s  Peak: {tree.max_size() * 16 / 1e9:.2f} GB  Saved: {path}")
--- a/benchmark_slice.py
+++ b/benchmark_slice.py
@@ -0,0 +1,16 @@
 """Slice saved tree and save."""
 import pickle
 NQUBITS, NLAYERS = 25, 10
 with open(f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl", 'rb') as f:
    tree = pickle.load(f)
 print(f"Original peak: {tree.max_size() * 16 / 1e9:.2f} GB")
 tree_sliced = tree.slice_and_reconfigure(target_size=2**28)
 with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'wb') as f:
    pickle.dump(tree_sliced, f)
 print(f"Sliced peak: {tree_sliced.max_size() * 16 / 1e9:.2f} GB  Slices: {tree_sliced.multiplicity}")
--- a/benchmark_tn_mpi.py
+++ b/benchmark_tn_mpi.py
@@ -0,0 +1,378 @@
 """MPI-parallel TN benchmark: path search + contraction via MPI."""
 import json
 import pickle
 import time
 import argparse
 import numpy as np
 import cotengra as ctg
 import qibo
 from qibo import Circuit, gates
 from mpi4py import MPI
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from qibotn.observables import check_observable, extract_gates_and_qubits
 def _load_observable(observable_file=None, observable_json=None):
    if observable_file:
        with open(observable_file, "r", encoding="utf8") as f:
            return json.load(f)
    if observable_json:
        return json.loads(observable_json)
    return None
 def _term_to_quimb_operator(term):
    """Convert one extracted Hamiltonian term to a quimb operator."""
    import quimb as qu
    coeff = complex(term[0][2]) if term else 1.0
    op = None
    where = []
    for qubit, gate_name, _ in term:
        qubit = int(qubit)
        gate_name = str(gate_name).upper()
        if gate_name == "I":
            continue
        where.append(qubit)
        op = qu.pauli(gate_name.lower()) if op is None else op & qu.pauli(gate_name.lower())
    return complex(coeff), op, tuple(where)
 def _run_serial_search(tn_bytes, output_inds, repeats, seed, num_slices, n_ranks, max_time):
    import pickle, cotengra as ctg, random
    random.seed(seed)
    tn = pickle.loads(tn_bytes)
    opt = ctg.HyperOptimizer(
        methods=['kahypar', 'kahypar-agglom', 'spinglass'],
        max_repeats=repeats,
        parallel=False,
        minimize='combo-256',
        max_time=max_time,
        optlib="random",
        slicing_opts={'target_size': 2**29, 'allow_outer': True},
        progbar=False,
    )
    tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
    return tree.combo_cost(factor=256), tree
 def parallel_search(tn, output_inds, total_repeats, n_workers, num_slices, n_ranks,
                    timeout):
    import pickle, os, signal
    from concurrent.futures import ProcessPoolExecutor, as_completed
    tn_bytes = pickle.dumps(tn)
    if n_workers <= 1:
        return _run_serial_search(
            tn_bytes, output_inds, total_repeats, 0, num_slices, n_ranks, timeout
        )[1]
    repeats_per = max(1, total_repeats // n_workers)
    best_cost, best_tree = float('inf'), None
    pool = ProcessPoolExecutor(max_workers=n_workers)
    futures = [
        pool.submit(_run_serial_search, tn_bytes, output_inds,
                    repeats_per, seed, num_slices, n_ranks, timeout)
        for seed in range(n_workers)
    ]
    try:
        for fut in as_completed(futures, timeout=timeout + 5):
            try:
                cost, tree = fut.result()
                if cost < best_cost:
                    best_cost, best_tree = cost, tree
            except Exception as e:
                print(f"  [worker failed] {e}")
    except TimeoutError:
        pass
    finally:
        for fut in futures:
            fut.cancel()
        for pid in list(pool._processes.keys()):
            try:
                os.kill(pid, signal.SIGKILL)
            except ProcessLookupError:
                pass
        pool.shutdown(wait=False)
    return best_tree
 def make_circuit(circuit_type, nqubits, nlayers=1):
    c = Circuit(nqubits)
    if circuit_type == "qft":
        from qibo.models import QFT
        return QFT(nqubits)
    elif circuit_type == "variational":
        for layer in range(nlayers):
            for q in range(nqubits):
                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
            offset = layer % 2
            for q in range(offset, nqubits - 1, 2):
                c.add(gates.CZ(q, q + 1))
    elif circuit_type == "ghz":
        c.add(gates.H(0))
        for q in range(nqubits - 1):
            c.add(gates.CNOT(q, q + 1))
    elif circuit_type == "brickwork":
        for q in range(nqubits):
            c.add(gates.H(q))
        for layer in range(nlayers):
            offset = layer % 2
            for q in range(offset, nqubits - 1, 2):
                c.add(gates.CNOT(q, q + 1))
                c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi)))
                c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi)))
    else:
        raise ValueError(f"Unknown circuit: {circuit_type}")
    return c
 def _contract_mpi(tree, arrays, comm, root=0):
    rank = comm.Get_rank()
    size = comm.Get_size()
    is_torch = type(arrays[0]).__module__.startswith("torch")
    result_np = None
    for i in range(rank, tree.multiplicity, size):
        x = tree.contract_slice(arrays, i)
        x_np = np.asfortranarray(x.detach().cpu().numpy() if is_torch else np.asarray(x))
        result_np = x_np if result_np is None else result_np + x_np
    if result_np is None:
        result_np = np.zeros(1, dtype=np.complex128)
    result = np.zeros_like(result_np) if rank == root else None
    comm.Reduce(result_np, result, root=root)
    if rank == root:
        import torch
        return torch.from_numpy(np.asarray(result)) if is_torch else result
    return None
 def run_mpi(circuit, nqubits, num_slices, total_repeats=1024,
            load_path=None, save_path=None):
    """Each MPI rank runs serial path search over total_repeats/size trials,
    rank 0 picks the global best, then all ranks contract in parallel."""
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="tn")
    import torch
    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
    qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex128)
    # --- path search: each rank serial, gather best to rank 0 ---
    if load_path:
        if rank == 0:
            with open(load_path, "rb") as f:
                saved = pickle.load(f)
            tree, psi, t_search = saved["tree"], saved["psi"], 0.0
            print(f"  [path loaded]  {load_path}")
        else:
            tree = psi = None
            t_search = 0.0
    else:
        rank_repeats = max(1, total_repeats // size)
        t0 = time.time()
        # get TN object first (no contraction), then run parallel search
        psi_tn = qc.to_dense(rehearse="tn")
        local_tree = parallel_search(
            psi_tn, psi_tn.outer_inds(), rank_repeats, n_workers=48,
            num_slices=num_slices, n_ranks=size, timeout=600,
        )
        t_search = time.time() - t0
        local_psi = psi_tn
        all_results = comm.gather((local_tree.combo_cost(factor=256), local_tree, local_psi), root=0)
        if rank == 0:
            _, tree, psi = min(all_results, key=lambda x: x[0])
            print(f"  [path search]  {t_search:.3f}s  "
                  f"flops~2^{tree.contraction_cost(log=2):.2f}  "
                  f"size~2^{tree.contraction_width():.2f}  "
                  f"slices={tree.multiplicity}")
            if save_path:
                with open(save_path, "wb") as f:
                    pickle.dump({"tree": tree, "psi": psi}, f)
                print(f"  [path saved]   {save_path}")
        else:
            tree = psi = None
        if save_path:
            t_search = comm.bcast(t_search, root=0)
            return None, t_search
    tree = comm.bcast(tree, root=0)
    psi = comm.bcast(psi, root=0)
    t_search = comm.bcast(t_search, root=0)
    # --- contraction: all ranks work in parallel ---
    import torch
    torch.set_num_threads(max(1, 96 // size))
    arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in psi.arrays]
    t0 = time.time()
    sv = _contract_mpi(tree, arrays, comm, root=0)
    t_contract = time.time() - t0
    if rank == 0:
        print(f"  [contraction]  {t_contract:.3f}s")
        return np.array(sv).reshape(-1), t_search + t_contract
    return None, t_search + t_contract
 def run_mpi_expval(
    circuit,
    nqubits,
    observable=None,
    total_repeats=1024,
    search_workers=1,
    search_timeout=300,
 ):
    """Compute a Hamiltonian expectation value directly from TN via MPI.
    MPI parallelizes over Hamiltonian terms; ProcessPool optionally helps
    path search for each term."""
    import torch
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="tn")
    observable = check_observable(observable, nqubits)
    ham_gate_map = extract_gates_and_qubits(observable)
    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
    my_terms = ham_gate_map[rank::size]
    torch.set_num_threads(max(1, 96 // size))
    t0 = time.time()
    my_exp = 0.0 + 0.0j
    for term in my_terms:
        coeff, op, where = _term_to_quimb_operator(term)
        if op is None:
            my_exp += coeff
            continue
        tn = qc.local_expectation_tn(op, where=where)
        if len(tn.outer_inds()) == 0:
            val = complex(tn.contract())
        else:
            tree = parallel_search(
                tn,
                tn.outer_inds(),
                total_repeats,
                n_workers=search_workers,
                num_slices=1,
                n_ranks=size,
                timeout=search_timeout,
            )
            if tree is None:
                raise RuntimeError("Failed to find a contraction tree for expectation TN.")
            arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in tn.arrays]
            acc = sum(tree.contract_slice(arrays, i) for i in range(tree.multiplicity))
            val = complex(acc.item() if hasattr(acc, 'item') else acc)
        my_exp += coeff * val
    t_total = time.time() - t0
    all_results = comm.gather(my_exp, root=0)
    if rank == 0:
        total_exp = sum(all_results)
        print(f"\n[TN expval]  time={t_total:.4f}s  expval={total_exp.real:.12f}")
        return np.real_if_close(total_exp), t_total
    return None, t_total
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=30)
    parser.add_argument("--circuit", type=str, default="qft",
                        choices=["qft", "variational", "ghz", "brickwork"])
    parser.add_argument("--nlayers", type=int, default=3)
    parser.add_argument("--num-slices", type=int, default=1)
    parser.add_argument("--total-repeats", type=int, default=1024)
    parser.add_argument("--search-workers", type=int, default=1)
    parser.add_argument("--search-timeout", type=int, default=300)
    parser.add_argument("--observable-file", type=str, default=None)
    parser.add_argument("--observable-json", type=str, default=None)
    parser.add_argument("--save-path", type=str, default=None)
    parser.add_argument("--load-path", type=str, default=None)
    parser.add_argument("--no-compare", action="store_true")
    parser.add_argument("--mode", type=str, default="sv", choices=["sv", "expval"])
    args = parser.parse_args()
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    if rank == 0:
        print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, "
              f"nlayers={args.nlayers}, ranks={comm.Get_size()}")
    np.random.seed(42)
    circuit = make_circuit(args.circuit, args.nqubits, args.nlayers)
    observable = _load_observable(args.observable_file, args.observable_json)
    if args.mode == "expval":
        try:
            expval, t_total = run_mpi_expval(
                circuit,
                args.nqubits,
                observable=observable,
                total_repeats=args.total_repeats,
                search_workers=args.search_workers,
                search_timeout=args.search_timeout,
            )
        except Exception as e:
            if rank == 0:
                print(f"[FAILED] {e}")
            raise
        if rank == 0:
            np.save(f"data/expval_tn_{args.circuit}{args.nqubits}.npy", np.asarray(expval))
            if not args.no_compare:
                print("No built-in reference comparison for arbitrary observables.")
        return
    try:
        sv, t_total = run_mpi(circuit, args.nqubits, args.num_slices,
                              total_repeats=args.total_repeats,
                              load_path=args.load_path, save_path=args.save_path)
    except Exception as e:
        if rank == 0:
            print(f"[FAILED] {e}")
        raise
    if rank == 0 and sv is not None:
        print(f"\n[quimb TN MPI]  time={t_total:.4f}s  shape={sv.shape}")
        np.save(f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy", sv)
        if not args.no_compare:
            from qibotn.bak.benchmark_tn import run_qibojit
            import gc
            np.random.seed(42)
            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
            sv_ref, t_ref = run_qibojit(circuit_ref)
            np.save(f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy", sv_ref)
            print(f"[qibojit]       time={t_ref:.4f}s")
            # free memory before loading via mmap for expval comparison
            del sv, sv_ref
            gc.collect()
            from compare_jit_tn_quimb import check_results
            ref_path = f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy"
            tn_path  = f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy"
            check_results(ref_path, tn_path, args.nqubits)
            if t_total > 0:
                print(f"Speedup  : {t_ref/t_total:.2f}x")
 if __name__ == "__main__":
    main()
--- a/check_tree.py
+++ b/check_tree.py
@@ -0,0 +1,25 @@
 """Check contraction tree statistics."""
 import pickle, sys
 path = sys.argv[1] if len(sys.argv) > 1 else "data/tree_q25_l10.pkl"
 with open(path, 'rb') as f:
    tree = pickle.load(f)
 # Intel 8558P: 96 cores, 2.1GHz, AVX-512 (16 FP64/cycle), FMA x2
 # complex128 multiply-add = 6 real FLOPs
 CORES = 96
 FREQ = 2.1e9
 AVX512_FP64 = 16
 TFLOPS = CORES * FREQ * AVX512_FP64 * 2 / 1e12  # ~6.45 TFLOPS real FP64
 COMPLEX_FLOPS = TFLOPS / 6  # complex128 effective
 flops = tree.total_flops()
 slices = tree.multiplicity
 est_seconds = flops * slices / (COMPLEX_FLOPS * 1e12) 
 print(f"File: {path}")
 print(f"Peak memory (GB): {tree.max_size() * 16 / 1e9:.2f}")
 print(f"Total FLOPs: {flops:.2e}  x{slices} slices = {flops*slices:.2e}")
 print(f"Contraction width: {tree.contraction_width()}")
 print(f"Multiplicity (slices): {slices}")
 print(f"Estimated time (96 cores): {est_seconds:.1f}s  ({est_seconds/3600:.2f}h)")
--- a/data/tree_q25_l10.pkl
+++ b/data/tree_q25_l10.pkl
--- a/data/tree_q25_l10_sliced.pkl
+++ b/data/tree_q25_l10_sliced.pkl
--- a/data/tree_q30_l10.pkl
+++ b/data/tree_q30_l10.pkl
--- a/data/tree_q30_l10_sliced.pkl
+++ b/data/tree_q30_l10_sliced.pkl
--- a/doc/make.bat
+++ b/doc/make.bat
@@ -1,35 +1,35 @@
-@ECHO OFF
+@ECHO OFF
-
+
-pushd %~dp0
+pushd %~dp0
-
+
-REM Command file for Sphinx documentation
+REM Command file for Sphinx documentation
-
+
-if "%SPHINXBUILD%" == "" (
+if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
+	set SPHINXBUILD=sphinx-build
-)
+)
-set SOURCEDIR=source
+set SOURCEDIR=source
-set BUILDDIR=build
+set BUILDDIR=build
-
+
-%SPHINXBUILD% >NUL 2>NUL
+%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
+if errorlevel 9009 (
-	echo.
+	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
+	echo.may add the Sphinx directory to PATH.
-	echo.
+	echo.
-	echo.If you don't have Sphinx installed, grab it from
+	echo.If you don't have Sphinx installed, grab it from
-	echo.https://www.sphinx-doc.org/
+	echo.https://www.sphinx-doc.org/
-	exit /b 1
+	exit /b 1
-)
+)
-
+
-if "%1" == "" goto help
+if "%1" == "" goto help
-
+
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
+goto end
-
+
-:help
+:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
+
-:end
+:end
-popd
+popd
--- a/2
+++ b/2
@@ -0,0 +1,2 @@
 10.20.6.74
 #10.20.6.102
--- a/poetry.lock
+++ b/poetry.lock
@@ -1733,14 +1733,14 @@ files = [
 [[package]]
 name = "mako"
-version = "1.3.10"
+version = "1.3.11"
 description = "A super-fast templating language that borrows the best ideas from the existing templating languages."
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59"},
+    {file = "mako-1.3.11-py3-none-any.whl", hash = "sha256:e372c6e333cf004aa736a15f425087ec977e1fcbd2966aae7f17c8dc1da27a77"},
-    {file = "mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28"},
+    {file = "mako-1.3.11.tar.gz", hash = "sha256:071eb4ab4c5010443152255d77db7faa6ce5916f35226eb02dc34479b6858069"},
 ]
 [package.dependencies]
--- a/src/qibotn/backends/quimb.py
+++ b/src/qibotn/backends/quimb.py
@@ -38,6 +38,36 @@ GATE_MAP = {
 }
 def _torch_cpu_array(data, dtype=None):
    """Convert array-like data to a contiguous CPU torch tensor."""
    import numpy as np
    import torch
    if isinstance(data, torch.Tensor):
        x = data
    else:
        array = np.asarray(data)
        if any(stride < 0 for stride in array.strides):
            array = np.ascontiguousarray(array)
        x = torch.from_numpy(array)
    if x.device.type != "cpu":
        x = x.cpu()
    if dtype is not None and x.dtype != dtype:
        x = x.to(dtype)
    if not x.is_contiguous():
        x = x.contiguous()
    return x
 def _arrays_to_backend(arrays, backend, engine):
    if backend == "torch":
        import torch
        return [_torch_cpu_array(array, dtype=torch.complex128) for array in arrays]
    return [engine.asarray(array) for array in arrays]
 def __init__(self, quimb_backend="numpy", contraction_optimizer="auto-hq"):
    super(self.__class__, self).__init__()
@@ -167,7 +197,7 @@ def execute_circuit(
        raise_error(ValueError, "Initial state not None supported only for MPS ansatz.")
    circ_quimb = self.circuit_ansatz.from_openqasm2_str(
-        circuit.to_qasm(), psi0=initial_state
+        circuit.to_qasm(), psi0=initial_state, gate_opts={"max_bond": self.max_bond_dimension, "cutoff": self.svd_cutoff}
    )
    if nshots:
@@ -186,7 +216,16 @@ def execute_circuit(
    else:
        frequencies = None
        measured_probabilities = None
-
+    '''
    if return_array:
        if self.ansatz == "mps":
            psi = circ_quimb.psi
            statevector = psi.to_dense().reshape(-1)
        else:
            statevector = circ_quimb.to_dense(backend=self.backend, optimize=self.contractions_optimizer)
    else:
        statevector = None
    '''
    statevector = (
        circ_quimb.to_dense(backend=self.backend, optimize=self.contractions_optimizer)
        if return_array
@@ -291,6 +330,15 @@ def _qibo_circuit_to_quimb(
        quimb_gate_name = GATE_MAP.get(gate_name, None)
        if quimb_gate_name == "measure":
            continue
        if gate_name == "cu1":
            theta = gate.parameters[0]
            c, t = gate.qubits
            circ.apply_gate("RZ", theta / 2, c)
            circ.apply_gate("RZ", theta / 2, t)
            circ.apply_gate("CNOT", c, t)
            circ.apply_gate("RZ", -theta / 2, t)
            circ.apply_gate("CNOT", c, t)
            continue
        if quimb_gate_name is None:
            raise_error(ValueError, f"Gate {gate_name} not supported in Quimb backend.")
@@ -334,6 +382,172 @@ def _string_to_quimb_operator(self, op_str):
    return op
 def expectation(self, circuit, observable, parallel=None, parallel_opts=None):
    """
    Compute expectation value with optional parallel acceleration.
    Parameters
    ----------
    circuit : qibo.models.Circuit
        The quantum circuit.
    observable : qibo.hamiltonians.SymbolicHamiltonian or form
        The observable to measure.
    parallel : str, optional
        Parallelization method: 'mpi', 'processpool', or None (default).
    parallel_opts : dict, optional
        Options for parallel execution:
        - max_repeats: int (default 1024)
        - max_time: int (default 300)
        - search_workers: int (default 48, processpool only)
        - mpi_contract: bool (default False, use MPI for contraction)
    Returns
    -------
    float
        The expectation value.
    """
    from qibotn.observables import check_observable, extract_gates_and_qubits
    if parallel_opts is None:
        parallel_opts = {}
    observable = check_observable(observable, circuit.nqubits)
    if parallel is None:
        # Use original implementation
        from qibotn.observables import extract_gates_and_qubits
        all_terms = extract_gates_and_qubits(observable)
        qc = self._qibo_circuit_to_quimb(
            circuit,
            quimb_circuit_type=self.circuit_ansatz,
            gate_opts={"max_bond": self.max_bond_dimension, "cutoff": self.svd_cutoff},
        )
        exp_val = 0.0
        for coeff, factors in all_terms:
            op = None
            where = []
            for qubit, gate_name in factors:
                p = qu.pauli(gate_name.lower())
                op = p if op is None else op & p
                where.append(qubit)
            val = qc.local_expectation(
                op, tuple(where),
                backend=self.backend,
                optimize=self.contractions_optimizer,
                simplify_sequence="ADCRS",
                simplify_atol=1e-12,
            )
            exp_val += coeff * val
        return self.real(exp_val)
    else:
        # Use parallel implementation
        return self._expectation_parallel(circuit, observable, parallel, parallel_opts)
 def _expectation_parallel(self, circuit, observable, method, opts):
    """Parallel expectation value computation."""
    from qibotn.observables import extract_gates_and_qubits
    from qibotn.parallel import parallel_path_search, parallel_contract
    import torch
    try:
        from mpi4py import MPI
        comm = MPI.COMM_WORLD if method == 'mpi' else None
        rank = comm.Get_rank() if comm else 0
        size = comm.Get_size() if comm else 1
    except ImportError:
        comm, rank, size = None, 0, 1
    max_repeats = opts.get('max_repeats', 1024)
    max_time = opts.get('max_time', 300)
    search_workers = opts.get('search_workers', 48)
    mpi_contract = opts.get('mpi_contract', False)
    torch_threads = opts.get('torch_threads', None)
    slicing_opts = opts.get('slicing_opts', None)
    trial_timeout = opts.get('trial_timeout', None)
    qc = self._qibo_circuit_to_quimb(
        circuit,
        quimb_circuit_type=self.circuit_ansatz,
        gate_opts={"max_bond": self.max_bond_dimension, "cutoff": self.svd_cutoff},
    )
    all_terms = extract_gates_and_qubits(observable)
    my_terms = all_terms[rank::size]
    if method == 'mpi' and comm:
        torch.set_num_threads(max(1, 96 // size))
    elif torch_threads:
        torch.set_num_threads(torch_threads)
    my_exp = 0.0
    for coeff, factors in my_terms:
        op = None
        where = []
        for qubit, gate_name in factors:
            p = qu.pauli(gate_name.lower())
            op = p if op is None else op & p
            where.append(qubit)
        tn = qc.local_expectation(op, tuple(where), rehearse='tn')
        tree = parallel_path_search(
            tn, tn.outer_inds(),
            method=method,
            total_repeats=max_repeats,
            max_time=max_time,
            n_workers=search_workers,
            slicing_opts=slicing_opts,
            trial_timeout=trial_timeout,
        )
        if tree is None:
            continue
        if mpi_contract and comm and size > 1:
            arrays = _arrays_to_backend(tn.arrays, self.backend, self.engine)
            val = parallel_contract(tree, arrays, method='mpi', comm=comm)
        else:
            if self.backend == "torch":
                for tensor in tn.tensors:
                    tensor._data = _torch_cpu_array(
                        tensor._data, dtype=torch.complex128
                    )
                val = complex(
                    tn.contract(
                        all,
                        output_inds=(),
                        optimize=tree,
                        backend="torch",
                    )
                )
            else:
                val = complex(
                    tn.contract(
                        all,
                        output_inds=(),
                        optimize=tree,
                        backend=self.backend,
                    )
                )
        my_exp += coeff * complex(val)
    if comm:
        all_exp = comm.gather(my_exp, root=0)
        if rank == 0:
            total_exp = sum(all_exp)
            return self.real(total_exp)
        return 0.0
    return self.real(my_exp)
 CLASSES_ROOTS = {"numpy": "Numpy", "torch": "PyTorch", "jax": "Jax"}
 METHODS = {
@@ -344,6 +558,8 @@ METHODS = {
    "exp_value_observable_symbolic": exp_value_observable_symbolic,
    "_qibo_circuit_to_quimb": _qibo_circuit_to_quimb,
    "_string_to_quimb_operator": _string_to_quimb_operator,
    "expectation": expectation,
    "_expectation_parallel": _expectation_parallel,
    "circuit_ansatz": circuit_ansatz,
 }
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -4,83 +4,16 @@ from cupy.cuda import nccl
 from cupy.cuda.runtime import getDeviceCount
 from cuquantum.tensornet import Network, contract
 from mpi4py import MPI
 from qibo import hamiltonians
 from qibo.symbols import I, X, Y, Z
 from qibotn.circuit_convertor import QiboCircuitToEinsum
 from qibotn.circuit_to_mps import QiboCircuitToMPS
 from qibotn.mps_contraction_helper import MPSContractionHelper
-
+from qibotn.observables import (
-
+    build_observable,
-def check_observable(observable, circuit_nqubit):
+    check_observable,
-    """Checks the type of observable and returns the appropriate Hamiltonian."""
+    create_hamiltonian_from_dict,
-    if observable is None:
+    extract_gates_and_qubits,
-        return build_observable(circuit_nqubit)
+)
    elif isinstance(observable, dict):
        return create_hamiltonian_from_dict(observable, circuit_nqubit)
    elif isinstance(observable, hamiltonians.SymbolicHamiltonian):
        # TODO: check if the observable is compatible with the circuit
        return observable
    else:
        raise TypeError("Invalid observable type.")
 def build_observable(circuit_nqubit):
    """Helper function to construct a target observable."""
    hamiltonian_form = 0
    for i in range(circuit_nqubit):
        hamiltonian_form += 0.5 * X(i % circuit_nqubit) * Z((i + 1) % circuit_nqubit)
    hamiltonian = hamiltonians.SymbolicHamiltonian(form=hamiltonian_form)
    return hamiltonian
 def create_hamiltonian_from_dict(data, circuit_nqubit):
    """Create a Qibo SymbolicHamiltonian from a dictionary representation.
    Ensures that each Hamiltonian term explicitly acts on all circuit qubits
    by adding identity (`I`) gates where needed.
    Args:
        data (dict): Dictionary containing Hamiltonian terms.
        circuit_nqubit (int): Total number of qubits in the quantum circuit.
    Returns:
        hamiltonians.SymbolicHamiltonian: The constructed Hamiltonian.
    """
    PAULI_GATES = {"X": X, "Y": Y, "Z": Z}
    terms = []
    for term in data["terms"]:
        coeff = term["coefficient"]
        operators = term["operators"]  # List of tuples like [("Z", 0), ("X", 1)]
        # Convert the operator list into a dictionary {qubit_index: gate}
        operator_dict = {q: PAULI_GATES[g] for g, q in operators}
        # Build the full term ensuring all qubits are covered
        full_term_expr = [
            operator_dict[q](q) if q in operator_dict else I(q)
            for q in range(circuit_nqubit)
        ]
        # Multiply all operators together to form a single term
        term_expr = full_term_expr[0]
        for op in full_term_expr[1:]:
            term_expr *= op
        # Scale by the coefficient
        final_term = coeff * term_expr
        terms.append(final_term)
    if not terms:
        raise ValueError("No valid Hamiltonian terms were added.")
    # Combine all terms
    hamiltonian_form = sum(terms)
    return hamiltonians.SymbolicHamiltonian(hamiltonian_form)
 def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
@@ -111,45 +44,6 @@ def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
    return gates
 def extract_gates_and_qubits(hamiltonian):
    """
    Extracts the gates and their corresponding qubits from a Qibo Hamiltonian.
    Parameters:
        hamiltonian (qibo.hamiltonians.Hamiltonian or qibo.hamiltonians.SymbolicHamiltonian):
            A Qibo Hamiltonian object.
    Returns:
        list of tuples: [(coefficient, [(gate, qubit), ...]), ...]
            - coefficient: The prefactor of the term.
            - list of (gate, qubit): Each term's gates and the qubits they act on.
    """
    extracted_terms = []
    if isinstance(hamiltonian, hamiltonians.SymbolicHamiltonian):
        for term in hamiltonian.terms:
            coeff = term.coefficient  # Extract coefficient
            gate_qubit_list = []
            # Extract gate and qubit information
            for factor in term.factors:
                gate_name = str(factor)[
                    0
                ]  # Extract the gate type (X, Y, Z) from 'X0', 'Z1'
                qubit = int(str(factor)[1:])  # Extract the qubit index
                gate_qubit_list.append((qubit, gate_name, coeff))
                coeff = 1.0
            extracted_terms.append(gate_qubit_list)
    else:
        raise ValueError(
            "Unsupported Hamiltonian type. Must be SymbolicHamiltonian or Hamiltonian."
        )
    return extracted_terms
 def initialize_mpi():
    """Initialize MPI communication and device selection."""
    comm = MPI.COMM_WORLD
--- a/src/qibotn/observables.py
+++ b/src/qibotn/observables.py
@@ -0,0 +1,86 @@
 """Observable helpers shared by tensor-network backends and benchmarks."""
 from qibo import hamiltonians
 from qibo.symbols import I, X, Y, Z
 def check_observable(observable, circuit_nqubit):
    """Checks the type of observable and returns the appropriate Hamiltonian."""
    if observable is None:
        return build_observable(circuit_nqubit)
    if isinstance(observable, dict):
        return create_hamiltonian_from_dict(observable, circuit_nqubit)
    if isinstance(observable, hamiltonians.SymbolicHamiltonian):
        return observable
    raise TypeError("Invalid observable type.")
 def build_observable(circuit_nqubit):
    """Construct the default benchmark observable used by qibotn."""
    hamiltonian_form = 0
    for i in range(circuit_nqubit):
        hamiltonian_form += 0.5 * X(i % circuit_nqubit) * Z((i + 1) % circuit_nqubit)
    return hamiltonians.SymbolicHamiltonian(form=hamiltonian_form)
 def create_hamiltonian_from_dict(data, circuit_nqubit):
    """Create a Qibo SymbolicHamiltonian from the qibotn dict representation."""
    pauli_gates = {"X": X, "Y": Y, "Z": Z}
    terms = []
    for term in data["terms"]:
        coeff = term["coefficient"]
        operators = term["operators"]
        operator_dict = {q: pauli_gates[g] for g, q in operators}
        full_term_expr = [
            operator_dict[q](q) if q in operator_dict else I(q)
            for q in range(circuit_nqubit)
        ]
        term_expr = full_term_expr[0]
        for op in full_term_expr[1:]:
            term_expr *= op
        terms.append(coeff * term_expr)
    if not terms:
        raise ValueError("No valid Hamiltonian terms were added.")
    return hamiltonians.SymbolicHamiltonian(sum(terms))
 def build_random_circuit(nqubits, nlayers, seed=42):
    """Build a random circuit with RY+RZ+CNOT layers for benchmarks."""
    import numpy as np
    from qibo import Circuit, gates
    np.random.seed(seed)
    c = Circuit(nqubits)
    for _ in range(nlayers):
        for q in range(nqubits):
            c.add(gates.RY(q, theta=np.random.uniform(0, 2*np.pi)))
            c.add(gates.RZ(q, theta=np.random.uniform(0, 2*np.pi)))
        for q in range(nqubits):
            c.add(gates.CNOT(q % nqubits, (q + 1) % nqubits))
    return c
 def extract_gates_and_qubits(hamiltonian):
    """Extract per-term Pauli factors from a Qibo SymbolicHamiltonian.
    Returns list of terms, where each term is (coefficient, [(qubit, gate_name), ...]).
    """
    extracted_terms = []
    if not isinstance(hamiltonian, hamiltonians.SymbolicHamiltonian):
        raise ValueError(
            "Unsupported Hamiltonian type. Must be SymbolicHamiltonian or Hamiltonian."
        )
    for term in hamiltonian.terms:
        coeff = term.coefficient
        factors = [(int(str(f)[1:]), str(f)[0]) for f in term.factors]
        extracted_terms.append((coeff, factors))
    return extracted_terms
--- a/src/qibotn/parallel.py
+++ b/src/qibotn/parallel.py
@@ -0,0 +1,195 @@
 """Parallel path search and contraction utilities for tensor networks."""
 import os
 import pickle
 import signal
 import numpy as np
 from concurrent.futures import ProcessPoolExecutor, as_completed
 try:
    from mpi4py import MPI
    _HAVE_MPI = True
 except ImportError:
    _HAVE_MPI = False
    MPI = None
 def _run_single_trial(tn_bytes, output_inds, seed, slicing_opts):
    import random, cotengra as ctg
    random.seed(seed)
    tn = pickle.loads(tn_bytes)
    opt = ctg.HyperOptimizer(
        methods=["kahypar", "kahypar-agglom", "spinglass"],
        max_repeats=1,
        parallel=False,
        minimize="combo-256",
        optlib="random",
        slicing_opts=slicing_opts,
        progbar=False,
    )
    tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
    return tree.combo_cost(factor=256), tree
 def _kill_pool(pool):
    for pid in list(pool._processes.keys()):
        try:
            os.kill(pid, signal.SIGKILL)
        except ProcessLookupError:
            pass
    pool.shutdown(wait=False)
 def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts=None, trial_timeout=None):
    import time
    if trial_timeout is None:
        import random, cotengra as ctg
        random.seed(seed)
        tn = pickle.loads(tn_bytes)
        opt = ctg.HyperOptimizer(
            methods=["kahypar", "kahypar-agglom", "spinglass"],
            max_repeats=repeats,
            parallel=False,
            minimize="combo-256",
            max_time=max_time,
            optlib="random",
            slicing_opts=slicing_opts,
            progbar=False,
        )
        tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
        return tree.combo_cost(factor=256), tree
    deadline = time.time() + max_time
    best_cost, best_tree = float("inf"), None
    for i in range(repeats):
        if time.time() >= deadline:
            break
        timeout = min(trial_timeout, deadline - time.time())
        pool = ProcessPoolExecutor(max_workers=1)
        fut = pool.submit(_run_single_trial, tn_bytes, output_inds, seed * 10000 + i, slicing_opts)
        try:
            cost, tree = fut.result(timeout=timeout)
            if cost < best_cost:
                best_cost, best_tree = cost, tree
        except Exception:
            pass
        finally:
            _kill_pool(pool)
    return best_cost, best_tree
 def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts=None, trial_timeout=None):
    tn_bytes = pickle.dumps(tn)
    repeats_per = max(1, total_repeats // n_workers)
    pool = ProcessPoolExecutor(max_workers=n_workers)
    futures = [
        pool.submit(_serial_search, tn_bytes, output_inds, repeats_per, seed, max_time, slicing_opts, trial_timeout)
        for seed in range(n_workers)
    ]
    best_cost, best_tree = float("inf"), None
    try:
        for fut in as_completed(futures, timeout=max_time + 5):
            try:
                cost, tree = fut.result()
                if cost < best_cost:
                    best_cost, best_tree = cost, tree
            except Exception:
                pass
    except TimeoutError:
        pass
    finally:
        for fut in futures:
            fut.cancel()
        _kill_pool(pool)
    return best_tree
 def _mpi_search(tn, output_inds, total_repeats, max_time, n_workers=None, slicing_opts=None, trial_timeout=None):
    comm = MPI.COMM_WORLD
    rank, size = comm.Get_rank(), comm.Get_size()
    tn_bytes = pickle.dumps(tn)
    repeats_per = max(1, total_repeats // size)
    if n_workers and n_workers > 1:
        local_tree = _processpool_search(
            tn, output_inds, repeats_per, n_workers, max_time, slicing_opts, trial_timeout
        )
        local_cost = local_tree.combo_cost(factor=256) if local_tree else float("inf")
    else:
        local_cost, local_tree = _serial_search(
            tn_bytes, output_inds, repeats_per, rank, max_time, slicing_opts, trial_timeout
        )
    all_results = comm.gather((local_cost, local_tree), root=0)
    best_tree = None
    if rank == 0:
        best_cost = float("inf")
        for cost, tree in all_results:
            if tree is not None and cost < best_cost:
                best_cost, best_tree = cost, tree
    return comm.bcast(best_tree, root=0)
 def parallel_path_search(tn, output_inds, method='processpool', total_repeats=1024,
                         max_time=300, n_workers=48, slicing_opts=None, trial_timeout=None):
    """Parallel contraction path search.
    Args:
        method: 'processpool' | 'mpi' | 'serial'
        total_repeats: Total optimization repeats across all workers
        max_time: Global timeout per worker (seconds)
        n_workers: Workers per MPI rank (or total for processpool)
        slicing_opts: cotengra slicing options for memory control
        trial_timeout: Per-trial timeout (seconds); kills and skips hung trials
    """
    if method == 'serial':
        tn_bytes = pickle.dumps(tn)
        _, tree = _serial_search(tn_bytes, output_inds, total_repeats, 0, max_time, slicing_opts, trial_timeout)
        return tree
    elif method == 'mpi':
        if not _HAVE_MPI:
            raise ImportError("mpi4py not available")
        return _mpi_search(tn, output_inds, total_repeats, max_time, n_workers, slicing_opts, trial_timeout)
    elif method == 'processpool':
        return _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts, trial_timeout)
    else:
        raise ValueError(f"Unknown method: {method}")
 def parallel_contract(tree, arrays, method='mpi', comm=None):
    if method == 'mpi':
        if not _HAVE_MPI or comm is None:
            raise ValueError("MPI method requires mpi4py and comm")
        return _contract_mpi(tree, arrays, comm)
    raise ValueError(f"Unknown method: {method}")
 def _contract_mpi(tree, arrays, comm, root=0):
    rank, size = comm.Get_rank(), comm.Get_size()
    is_torch = type(arrays[0]).__module__.startswith("torch")
    if is_torch:
        result_torch = None
        for i in range(rank, tree.multiplicity, size):
            x = tree.contract_slice(arrays, i, backend="torch").reshape(-1)
            result_torch = x if result_torch is None else result_torch + x
        if result_torch is None:
            result_np = np.zeros(1, dtype=np.complex128)
        else:
            result_np = result_torch.detach().cpu().numpy()
    else:
        result_np = None
        for i in range(rank, tree.multiplicity, size):
            x = tree.contract_slice(arrays, i)
            x_np = np.asarray(x).reshape(-1)
            result_np = x_np if result_np is None else result_np + x_np
        if result_np is None:
            result_np = np.zeros(1, dtype=np.complex128)
    result = np.zeros_like(result_np) if rank == root else None
    comm.Reduce(result_np, result, root=root)
    return result
--- a/src/qibotn/result.py
+++ b/src/qibotn/result.py
@@ -57,10 +57,10 @@ class TensorNetworkResult:
        return self.measures
    def state(self):
-        """Return the statevector if the number of qubits is less than 20."""
+        """Return the statevector if the number of qubits is less than 35."""
-        if self.nqubits < 20:
+        if self.nqubits < 35:
            return self.statevector
        raise_error(
            NotImplementedError,
-            f"Tensor network simulation cannot be used to reconstruct statevector for >= 20 .",
+            f"Tensor network simulation cannot be used to reconstruct statevector for >= 35 .",
        )
--- a/tests/test_expectation.py
+++ b/tests/test_expectation.py
@@ -35,7 +35,7 @@ def test_observable_expval(backend, nqubits):
    numpy_backend = construct_backend("numpy")
    ham, ham_form = build_observable(nqubits)
    circ = build_circuit(nqubits=nqubits, nlayers=1)
-
+    
    exact_expval = numpy_backend.calculate_expectation_state(
        hamiltonian=ham,
        state=circ().state(),
Author	SHA1	Message	Date
jaunatisblue	5479574502	优化 torch CPU 张量网络收缩路径 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details - torch CPU 收缩默认走 cotengra matmul lowering - 复用 mm/bmm/matmul 输出缓冲区，降低中间张量分配压力 - 仅回收 contiguous tensor，避免非连续 view 进入 workspace - 调整 cotengra 中间节点 index 顺序，减少 reshape 触发 clone/copy - qibotn MPI 分片收缩显式使用 backend=torch - rank 内分片结果先在 torch 中累加，最后再转 numpy 做 Reduce - 统一 quimb 后端 torch 数组转换为 CPU contiguous complex128	2026-05-08 11:57:18 +08:00
jaunatisblue	cec0ba272a	完善脚本功能，添加计时估计	2026-05-08 10:20:03 +08:00
jaunatisblue	8b71ff96c8	误传profiling删除 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-08 00:16:28 +08:00
jaunatisblue	49b27a5840	完善时间剪枝功能 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-08 00:12:32 +08:00
jaunatisblue	c818ac7a6e	mpi完善 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-07 23:37:15 +08:00
jaunatisblue	0a96553bd8	多机测试 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-07 23:35:23 +08:00
jaunatisblue	2f5c863952	并行化支持完善 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-07 23:26:53 +08:00
jaunatisblue	fbae48eb3d	期望值计算支持；更新后端调用 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-07 11:19:45 +08:00
jaunatisblue	f776fbb04f	修复时间剪枝功能 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-05 23:31:23 +08:00
jaunatisblue	5a692033a6	添加MPI并行TN benchmark及辅助脚本，移除旧benchmark Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-05 19:04:09 +08:00
jaunatisblue	a3f39a1d67	删除tn脚本implementation	2026-05-03 19:06:21 +08:00
jaunatisblue	dd222587b7	tn脚本更新 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-03 18:54:05 +08:00
jaunatisblue	740828872e	添加tn脚本 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-04-28 23:07:39 +08:00
jaunatisblue	80d9c1de5a	benchmark测试，发现瓶颈：路径搜索 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-04-27 18:59:54 +08:00
jaunatisblue	2c54840e7b	1.完成mps态脚本，与原始qibojit结果比对确定bond demension和cut off值；2.更新了官方库；3.新大陆 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-04-27 11:03:57 +08:00