Some checks failed
Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled
Tests / check (push) Has been cancelled
Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled
224 lines
6.5 KiB
Bash
Executable File
224 lines
6.5 KiB
Bash
Executable File
#!/usr/bin/env bash
# Strict mode: abort on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# manage_tn_dask_cluster.sh -- manage the dask cluster used by TN path search.
#
# Without overrides the script targets two servers:
#   scheduler: 10.20.1.103:8786
#   workers:   10.20.1.103, 10.20.1.102
#
# Usage:
#   tools/manage_tn_dask_cluster.sh start
#   tools/manage_tn_dask_cluster.sh status
#   tools/manage_tn_dask_cluster.sh stop
#   tools/manage_tn_dask_cluster.sh restart
#
# Common overrides (environment variables):
#   SCHEDULER_HOST=10.20.1.103
#   WORKER_HOSTS="10.20.1.103 10.20.1.102"
#   NWORKERS=48
#   NTHREADS=1
#   ROOT_DIR=/home/yx/qibotn
#   PYTHON_BIN=.venv/bin/python

# Configuration. Every setting can be overridden from the environment; a
# variable that is unset or empty receives the default shown here.

# Checkout that the remote commands cd into, and the interpreter used to run
# the dask CLIs (the default is relative, so it resolves inside ROOT_DIR).
: "${ROOT_DIR:=/home/yx/qibotn}"
: "${PYTHON_BIN:=.venv/bin/python}"

# Scheduler endpoint and dashboard bind address.
: "${SCHEDULER_HOST:=10.20.1.103}"
: "${SCHEDULER_PORT:=8786}"
: "${DASHBOARD_ADDRESS:=:8787}"

# Whitespace-separated list of worker hosts, and the per-host worker options
# that are passed to the dask worker CLI.
: "${WORKER_HOSTS:=10.20.1.103 10.20.1.102}"
: "${NWORKERS:=48}"
: "${NTHREADS:=1}"
: "${MEMORY_LIMIT:=0}"
: "${LOCAL_DIRECTORY:=/tmp/qibotn-dask}"

# Where log and pid files are written, and the ssh client used for remote hosts.
: "${LOG_DIR:=$ROOT_DIR/logs/dask}"
: "${SSH_BIN:=ssh}"

# Values exported to the dask processes as DASK_DISTRIBUTED__* settings.
: "${DASK_WORKER_TTL:=24 hours}"
: "${DASK_TICK_LIMIT:=30 minutes}"
: "${DASK_LOST_WORKER_TIMEOUT:=30 minutes}"

# Address handed to the workers and printed for clients.
printf -v SCHEDULER_ADDR 'tcp://%s:%s' "$SCHEDULER_HOST" "$SCHEDULER_PORT"

#######################################
# Decide whether a host name or address refers to the machine running this
# script.
# Arguments: $1 - host name or IP address to test
# Returns:   0 when the host is "localhost", 127.0.0.1, the output of
#            `hostname` or `hostname -f`, or one of the addresses printed by
#            `hostname -I`; non-zero otherwise (including an empty argument).
#######################################
is_local_host() {
  local host="${1:-}"
  local addrs
  local nl=$'\n'

  # An empty name can never be local; without this guard the blank line left
  # by the trailing space of `hostname -I` would count as a match.
  [[ -n "$host" ]] || return 1

  [[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0
  [[ "$host" == "$(hostname)" ]] && return 0
  [[ "$host" == "$(hostname -f 2>/dev/null || true)" ]] && return 0

  # `hostname -I` is not available everywhere; treat a failure as "no
  # addresses". One address per line, then compare whole lines as fixed
  # strings (-F) so the dots in an IP are not regex wildcards.
  addrs="$(hostname -I 2>/dev/null || true)"
  grep -qxF -- "$host" <<<"${addrs// /$nl}"
}

#######################################
# Run a command string on a host through a login bash (`bash -lc`).
# Globals:   SSH_BIN (read)
# Arguments: $1 - target host
#            $2.. - command words; they are joined into one string with "$*"
# Returns:   exit status of the local bash or of the ssh invocation
#######################################
run_on_host() {
  local target="$1"
  shift
  local script="$*"
  local quoted_script

  if ! is_local_host "$target"; then
    # %q turns the whole script into a single shell word so that it reaches
    # the remote `bash -lc` as one argument.
    printf -v quoted_script '%q' "$script"
    "$SSH_BIN" "$target" "bash -lc ${quoted_script}"
    return
  fi

  bash -lc "$script"
}

#######################################
# Start the dask scheduler on SCHEDULER_HOST.
# The remote script does nothing (and reports "already running") when the pid
# file names a live process; otherwise it launches the scheduler in the
# background under setsid, with output sent to the log file, and records the
# pid of the background job.
# Globals:   SCHEDULER_HOST, SCHEDULER_PORT, SCHEDULER_ADDR, DASHBOARD_ADDRESS,
#            ROOT_DIR, LOG_DIR, PYTHON_BIN, DASK_WORKER_TTL, DASK_TICK_LIMIT,
#            DASK_LOST_WORKER_TIMEOUT (all read)
# Arguments: none
#######################################
start_scheduler() {
  local target="$SCHEDULER_HOST"
  local stem="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}"
  local log_path="${stem}.log"
  local pid_path="${stem}.pid"
  local remote_script

  # Local variables are expanded here; \$ defers expansion to the remote
  # shell. `read -d ''` returns non-zero at end of input, hence `|| true`.
  IFS= read -r -d '' remote_script <<EOF || true
set -euo pipefail
cd '$ROOT_DIR'
mkdir -p '$LOG_DIR'
if [[ -s '$pid_path' ]]; then
  pid=\$(cat '$pid_path')
  if kill -0 "\$pid" 2>/dev/null; then
    echo "scheduler already running on $target pid=\$pid"
    exit 0
  fi
fi
DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \
DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \
DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \
setsid '$PYTHON_BIN' -m distributed.cli.dask_scheduler \
  --host '$SCHEDULER_HOST' \
  --port '$SCHEDULER_PORT' \
  --dashboard-address '$DASHBOARD_ADDRESS' \
  > '$log_path' 2>&1 < /dev/null &
pid=\$!
echo "\$pid" > '$pid_path'
echo "scheduler host=$target pid=\$pid addr=$SCHEDULER_ADDR log=$log_path"
EOF

  run_on_host "$target" "$remote_script"
}

#######################################
# Start a dask worker process group on one host and attach it to the
# scheduler at SCHEDULER_ADDR.
# The remote script does nothing (and reports "already running") when the pid
# file names a live process; otherwise it launches the worker CLI in the
# background under setsid, with output sent to the log file, and records the
# pid of the background job.
# Globals:   ROOT_DIR, LOG_DIR, LOCAL_DIRECTORY, PYTHON_BIN, SCHEDULER_ADDR,
#            NWORKERS, NTHREADS, MEMORY_LIMIT, DASK_WORKER_TTL,
#            DASK_TICK_LIMIT, DASK_LOST_WORKER_TIMEOUT (all read)
# Arguments: $1 - host to start the workers on
#######################################
start_worker() {
  local host="$1"
  local stem="$LOG_DIR/worker_${host}"
  local log_path="${stem}.log"
  local pid_path="${stem}.pid"
  local remote_script

  # Local variables are expanded here; \$ defers expansion to the remote
  # shell. `read -d ''` returns non-zero at end of input, hence `|| true`.
  IFS= read -r -d '' remote_script <<EOF || true
set -euo pipefail
cd '$ROOT_DIR'
mkdir -p '$LOG_DIR' '$LOCAL_DIRECTORY'
if [[ -s '$pid_path' ]]; then
  pid=\$(cat '$pid_path')
  if kill -0 "\$pid" 2>/dev/null; then
    echo "worker already running on $host pid=\$pid"
    exit 0
  fi
fi
TCM_ENABLE=1 \
DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \
DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \
DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \
setsid '$PYTHON_BIN' -m distributed.cli.dask_worker \
  '$SCHEDULER_ADDR' \
  --host '$host' \
  --nworkers '$NWORKERS' \
  --nthreads '$NTHREADS' \
  --memory-limit '$MEMORY_LIMIT' \
  --local-directory '$LOCAL_DIRECTORY' \
  > '$log_path' 2>&1 < /dev/null &
pid=\$!
echo "\$pid" > '$pid_path'
echo "worker host=$host pid=\$pid scheduler=$SCHEDULER_ADDR log=$log_path"
EOF

  run_on_host "$host" "$remote_script"
}

#######################################
# Stop the dask processes this script manages on one host.
# The remote script kills the pids recorded in the pid files, removes those
# files, and then sweeps with pkill for worker processes attached to
# SCHEDULER_ADDR. The scheduler pid file and the scheduler pkill sweep are
# applied only when the host is SCHEDULER_HOST, so a scheduler that happens to
# listen on the same port on a worker-only host is left alone.
# Globals:   LOG_DIR, SCHEDULER_HOST, SCHEDULER_PORT, SCHEDULER_ADDR (read)
# Arguments: $1 - host to clean up
# Returns:   status of run_on_host; the remote script itself always ends
#            with `true` (best-effort cleanup)
#######################################
stop_host() {
  local host="$1"
  local worker_pid_file="$LOG_DIR/worker_${host}.pid"
  local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
  local pid_files="'$worker_pid_file'"
  # No-op unless this host is the scheduler host.
  local scheduler_sweep=":"

  if [[ "$host" == "$SCHEDULER_HOST" ]]; then
    pid_files+=" '$scheduler_pid_file'"
    scheduler_sweep="pkill -f '[d]istributed.cli.dask_scheduler.*--port $SCHEDULER_PORT'"
  fi

  # `set +e`: every step is best-effort; missing files or processes are fine.
  # The [d] bracket keeps the pkill pattern from matching the shell that
  # carries this script text on its own command line.
  run_on_host "$host" "
    set +e
    for pid_file in $pid_files; do
      [[ -f \"\$pid_file\" ]] || continue
      pid=\$(cat \"\$pid_file\")
      kill \"\$pid\" 2>/dev/null || true
      rm -f \"\$pid_file\"
    done
    pkill -f '[d]istributed.cli.dask_worker.*$SCHEDULER_ADDR'
    $scheduler_sweep
    true
  "
}

#######################################
# Print the state of the dask processes on one host.
# Emits a separator and a "host=..." line locally, then runs a remote probe
# that, for each existing pid file, prints a `ps` line when the recorded pid
# is alive or a "stale" note otherwise, and finally lists every process whose
# command line matches distributed.cli.dask. The scheduler pid file is only
# considered when the host is SCHEDULER_HOST.
# Globals:   LOG_DIR, SCHEDULER_HOST, SCHEDULER_PORT (read)
# Arguments: $1 - host to inspect
#######################################
status_host() {
  local host="$1"
  local sched_pid_path="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
  local worker_pid_path="$LOG_DIR/worker_${host}.pid"
  local probe

  printf '%s\n' \
    "--------------------------------------------------------------------------------" \
    "host=$host"

  # Local variables are expanded here; \$ defers expansion to the remote
  # shell. `read -d ''` returns non-zero at end of input, hence `|| true`.
  IFS= read -r -d '' probe <<EOF || true
set +e
for pid_file in '$worker_pid_path' '$sched_pid_path'; do
  [[ -f "\$pid_file" ]] || continue
  if [[ "\$pid_file" == '$sched_pid_path' && '$host' != '$SCHEDULER_HOST' ]]; then
    continue
  fi
  pid=\$(cat "\$pid_file")
  if kill -0 "\$pid" 2>/dev/null; then
    ps -p "\$pid" -o pid,ppid,stat,etime,cmd --no-headers
  else
    echo "stale pid_file=\$pid_file pid=\$pid"
  fi
done
pgrep -af '[d]istributed.cli.dask' || true
EOF

  run_on_host "$host" "$probe"
}

# Print the usage text, the effective settings and an example search command.
usage() {
  cat <<EOF
Usage: tools/manage_tn_dask_cluster.sh [start|stop|restart|status]

Defaults:
  SCHEDULER_HOST=$SCHEDULER_HOST
  SCHEDULER_PORT=$SCHEDULER_PORT
  WORKER_HOSTS="$WORKER_HOSTS"
  NWORKERS=$NWORKERS
  NTHREADS=$NTHREADS
  ROOT_DIR=$ROOT_DIR
  PYTHON_BIN=$PYTHON_BIN
  DASK_WORKER_TTL="$DASK_WORKER_TTL"
  DASK_TICK_LIMIT="$DASK_TICK_LIMIT"
  DASK_LOST_WORKER_TIMEOUT="$DASK_LOST_WORKER_TIMEOUT"

Search command after start:
  TCM_ENABLE=1 python -u tools/tn_contest_runner.py search \\
    --case main1 \\
    --dask-address $SCHEDULER_ADDR \\
    --torch-threads 48 \\
    --dtype complex64 \\
    --tn-search-repeats 2048 \\
    --tn-search-time 300
EOF
}

# Start the scheduler, give it a moment to come up, then start every worker.
cluster_start() {
  local host
  start_scheduler
  sleep 2
  # WORKER_HOSTS is a whitespace-separated list; splitting is intentional.
  # shellcheck disable=SC2086
  for host in $WORKER_HOSTS; do
    start_worker "$host"
  done
  echo
  echo "Dask scheduler: $SCHEDULER_ADDR"
  # DASHBOARD_ADDRESS may be ":8787" or "host:8787"; only its port is
  # appended, so the URL stays valid in both forms.
  echo "Dashboard: http://$SCHEDULER_HOST:${DASHBOARD_ADDRESS##*:}"
}

# Stop worker-only hosts first and the scheduler host last. The scheduler
# host is skipped in the loop (as the status command does) because the final
# stop_host call already handles both its workers and the scheduler.
cluster_stop() {
  local host
  # shellcheck disable=SC2086
  for host in $WORKER_HOSTS; do
    if [[ "$host" == "$SCHEDULER_HOST" ]]; then
      continue
    fi
    stop_host "$host"
  done
  stop_host "$SCHEDULER_HOST"
}

# Report the scheduler host first, then each remaining worker host once.
cluster_status() {
  local host
  status_host "$SCHEDULER_HOST"
  # shellcheck disable=SC2086
  for host in $WORKER_HOSTS; do
    if [[ "$host" == "$SCHEDULER_HOST" ]]; then
      continue
    fi
    status_host "$host"
  done
}

# Command dispatch. A missing or unknown command prints the usage text and
# exits with status 2.
case "${1:-help}" in
  start)
    cluster_start
    ;;
  stop)
    cluster_stop
    ;;
  status)
    cluster_status
    ;;
  restart)
    # Runs in-process, so it does not depend on "$0" being executable.
    cluster_stop
    sleep 2
    cluster_start
    ;;
  help | *)
    usage
    exit 2
    ;;
esac