Files
qibotn/tools/manage_tn_dask_cluster.sh
jaunatisblue 915c24dc7b
Some checks failed
Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled
Tests / check (push) Has been cancelled
Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled
赛前稳定版
2026-05-15 09:32:26 +08:00

224 lines
6.5 KiB
Bash
Executable File

#!/usr/bin/env bash
set -euo pipefail
# Manage the dask cluster used by TN path search.
#
# Defaults target two servers:
# scheduler: 10.20.1.103:8786
# workers: 10.20.1.103, 10.20.1.102
#
# Usage:
# tools/manage_tn_dask_cluster.sh start
# tools/manage_tn_dask_cluster.sh status
# tools/manage_tn_dask_cluster.sh stop
#
# Common overrides:
# SCHEDULER_HOST=10.20.1.103
# WORKER_HOSTS="10.20.1.103 10.20.1.102"
# NWORKERS=48
# NTHREADS=1
# ROOT_DIR=/home/yx/qibotn
# PYTHON_BIN=.venv/bin/python
ROOT_DIR="${ROOT_DIR:-/home/yx/qibotn}"
PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
SCHEDULER_HOST="${SCHEDULER_HOST:-10.20.1.103}"
SCHEDULER_PORT="${SCHEDULER_PORT:-8786}"
DASHBOARD_ADDRESS="${DASHBOARD_ADDRESS:-:8787}"
WORKER_HOSTS="${WORKER_HOSTS:-10.20.1.103 10.20.1.102}"
NWORKERS="${NWORKERS:-48}"
NTHREADS="${NTHREADS:-1}"
MEMORY_LIMIT="${MEMORY_LIMIT:-0}"
LOCAL_DIRECTORY="${LOCAL_DIRECTORY:-/tmp/qibotn-dask}"
LOG_DIR="${LOG_DIR:-$ROOT_DIR/logs/dask}"
SSH_BIN="${SSH_BIN:-ssh}"
DASK_WORKER_TTL="${DASK_WORKER_TTL:-24 hours}"
DASK_TICK_LIMIT="${DASK_TICK_LIMIT:-30 minutes}"
DASK_LOST_WORKER_TIMEOUT="${DASK_LOST_WORKER_TIMEOUT:-30 minutes}"
SCHEDULER_ADDR="tcp://${SCHEDULER_HOST}:${SCHEDULER_PORT}"
is_local_host() {
local host="$1"
[[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0
[[ "$host" == "$(hostname)" ]] && return 0
[[ "$host" == "$(hostname -f 2>/dev/null || true)" ]] && return 0
hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx "$host"
}
run_on_host() {
local host="$1"
shift
local cmd="$*"
if is_local_host "$host"; then
bash -lc "$cmd"
else
"$SSH_BIN" "$host" "bash -lc $(printf '%q' "$cmd")"
fi
}
start_scheduler() {
local host="$SCHEDULER_HOST"
local log="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.log"
local pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
run_on_host "$host" "
set -euo pipefail
cd '$ROOT_DIR'
mkdir -p '$LOG_DIR'
if [[ -s '$pid_file' ]]; then
pid=\$(cat '$pid_file')
if kill -0 \"\$pid\" 2>/dev/null; then
echo \"scheduler already running on $host pid=\$pid\"
exit 0
fi
fi
DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \
DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \
DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \
setsid '$PYTHON_BIN' -m distributed.cli.dask_scheduler \
--host '$SCHEDULER_HOST' \
--port '$SCHEDULER_PORT' \
--dashboard-address '$DASHBOARD_ADDRESS' \
> '$log' 2>&1 < /dev/null &
pid=\$!
echo \"\$pid\" > '$pid_file'
echo \"scheduler host=$host pid=\$pid addr=$SCHEDULER_ADDR log=$log\"
"
}
start_worker() {
local host="$1"
local log="$LOG_DIR/worker_${host}.log"
local pid_file="$LOG_DIR/worker_${host}.pid"
run_on_host "$host" "
set -euo pipefail
cd '$ROOT_DIR'
mkdir -p '$LOG_DIR' '$LOCAL_DIRECTORY'
if [[ -s '$pid_file' ]]; then
pid=\$(cat '$pid_file')
if kill -0 \"\$pid\" 2>/dev/null; then
echo \"worker already running on $host pid=\$pid\"
exit 0
fi
fi
TCM_ENABLE=1 \
DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \
DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \
DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \
setsid '$PYTHON_BIN' -m distributed.cli.dask_worker \
'$SCHEDULER_ADDR' \
--host '$host' \
--nworkers '$NWORKERS' \
--nthreads '$NTHREADS' \
--memory-limit '$MEMORY_LIMIT' \
--local-directory '$LOCAL_DIRECTORY' \
> '$log' 2>&1 < /dev/null &
pid=\$!
echo \"\$pid\" > '$pid_file'
echo \"worker host=$host pid=\$pid scheduler=$SCHEDULER_ADDR log=$log\"
"
}
stop_host() {
local host="$1"
local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
local worker_pid_file="$LOG_DIR/worker_${host}.pid"
run_on_host "$host" "
set +e
for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do
[[ -f \"\$pid_file\" ]] || continue
if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then
continue
fi
pid=\$(cat \"\$pid_file\")
kill \"\$pid\" 2>/dev/null || true
rm -f \"\$pid_file\"
done
pkill -f '[d]istributed.cli.dask_worker.*$SCHEDULER_ADDR'
pkill -f '[d]istributed.cli.dask_scheduler.*--port $SCHEDULER_PORT'
true
"
}
status_host() {
local host="$1"
local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
local worker_pid_file="$LOG_DIR/worker_${host}.pid"
echo "--------------------------------------------------------------------------------"
echo "host=$host"
run_on_host "$host" "
set +e
for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do
[[ -f \"\$pid_file\" ]] || continue
if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then
continue
fi
pid=\$(cat \"\$pid_file\")
if kill -0 \"\$pid\" 2>/dev/null; then
ps -p \"\$pid\" -o pid,ppid,stat,etime,cmd --no-headers
else
echo \"stale pid_file=\$pid_file pid=\$pid\"
fi
done
pgrep -af '[d]istributed.cli.dask' || true
"
}
case "${1:-help}" in
start)
start_scheduler
sleep 2
for host in $WORKER_HOSTS; do
start_worker "$host"
done
echo
echo "Dask scheduler: $SCHEDULER_ADDR"
echo "Dashboard: http://$SCHEDULER_HOST$DASHBOARD_ADDRESS"
;;
stop)
for host in $WORKER_HOSTS; do
stop_host "$host"
done
stop_host "$SCHEDULER_HOST"
;;
status)
status_host "$SCHEDULER_HOST"
for host in $WORKER_HOSTS; do
[[ "$host" == "$SCHEDULER_HOST" ]] && continue
status_host "$host"
done
;;
restart)
"$0" stop
sleep 2
"$0" start
;;
help|*)
cat <<EOF
Usage: tools/manage_tn_dask_cluster.sh [start|stop|restart|status]
Defaults:
SCHEDULER_HOST=$SCHEDULER_HOST
SCHEDULER_PORT=$SCHEDULER_PORT
WORKER_HOSTS="$WORKER_HOSTS"
NWORKERS=$NWORKERS
NTHREADS=$NTHREADS
ROOT_DIR=$ROOT_DIR
PYTHON_BIN=$PYTHON_BIN
DASK_WORKER_TTL="$DASK_WORKER_TTL"
DASK_TICK_LIMIT=$DASK_TICK_LIMIT
DASK_LOST_WORKER_TIMEOUT=$DASK_LOST_WORKER_TIMEOUT
Search command after start:
TCM_ENABLE=1 python -u tools/tn_contest_runner.py search \\
--case main1 \\
--dask-address $SCHEDULER_ADDR \\
--torch-threads 48 \\
--dtype complex64 \\
--tn-search-repeats 2048 \\
--tn-search-time 300
EOF
exit 2
;;
esac