The single TP=4 server on 4x L40S (no NVLink) pays a per-layer all-reduce tax over PCIe. Since the A10B MoE fits in 2 cards at FP8, run two TP=2 replicas (GPUs 0,1 / 2,3) behind a streaming load balancer on the public port 7080 for better concurrent throughput. - 14_start_replica_122b.sh: one TP=2 replica pinned to a GPU pair - 15_start_replicas_122b.sh: launch both replicas + load balancer - 16_start_loadbalancer.sh + lb_proxy.py: least-in-flight streaming reverse proxy on 7080 -> replicas on 7091/7092 (clear of Open WebUI on 7081) - 17_stop_replicas_122b.sh: stop LB + both replicas - 11_start_server_122b.sh: add --kv-cache-dtype fp8 (~2x more 128k KV slots), --max-num-seqs 16, chunked prefill, gpu-util 0.95 Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
47 lines
1.4 KiB
Bash
Executable File
47 lines
1.4 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# ------------------------------------------------------------------
|
|
# 17_stop_replicas_122b.sh
|
|
# Stops the load balancer and both 122B replicas started by
|
|
# 15_start_replicas_122b.sh.
|
|
# ------------------------------------------------------------------
|
|
# Strict mode: abort on any unhandled error, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Resolve the directory this script was invoked from ($0) to an absolute path.
SCRIPT_DIR="$(dirname "$0")"
SCRIPT_DIR="$(cd "${SCRIPT_DIR}" && pwd)"

# Directory holding the PID files consulted below.
LOG_DIR="${SCRIPT_DIR}/logs"
|
|
|
|
#######################################
# Stop the process recorded in a PID file, then remove the PID file.
#
# Sends SIGTERM, polls once per second for up to 5 seconds, and escalates
# to SIGKILL if the process is still alive. Only the PID stored in the file
# is signalled.
#
# Arguments:
#   $1 - human-readable service name used in status messages
#   $2 - path to the PID file
# Outputs:
#   Progress messages on stdout; a warning on stderr when the PID file does
#   not contain a usable PID.
# Returns:
#   0 unless removing the PID file fails.
#######################################
stop_one () {
  local name="$1" pidfile="$2"

  if [ ! -f "$pidfile" ]; then
    echo "${name}: no PID file, skipping."
    return 0
  fi

  # Initialised so that "$pid" is defined under `set -u` even if the read
  # below cannot open the file.
  local pid=""
  # `read` trims surrounding whitespace. It returns non-zero for an empty
  # file or a missing trailing newline but still assigns what it got, so the
  # failure status is deliberately ignored and the value validated instead.
  read -r pid < "$pidfile" || true

  # Only a positive decimal integer may reach `kill`: a value of 0 or a
  # negative number addresses a process group (or every process the user may
  # signal) rather than one process, and anything else is not a PID at all.
  if [[ ! "$pid" =~ ^[1-9][0-9]*$ ]]; then
    echo "${name}: invalid PID '${pid}' in ${pidfile}; removing PID file." >&2
    rm -f -- "$pidfile"
    return
  fi

  if kill -0 "$pid" 2>/dev/null; then
    echo "Stopping ${name} (PID ${pid})..."
    # Best effort: the process may exit between the liveness probe and here.
    kill "$pid" 2>/dev/null || true

    # Give the process up to 5 seconds to exit on its own.
    for _ in 1 2 3 4 5; do
      kill -0 "$pid" 2>/dev/null || break
      sleep 1
    done

    if kill -0 "$pid" 2>/dev/null; then
      echo "  still alive, SIGKILL..."
      kill -9 "$pid" 2>/dev/null || true
    fi
    echo "  ${name} stopped."
  else
    echo "${name} (PID ${pid}) not running."
  fi

  rm -f -- "$pidfile"
}
|
|
|
|
# Shutdown order matters: the load balancer goes down first so that no new
# requests get routed to replicas that are already exiting.
# Each entry is "<display name>=<PID file basename>".
for target in \
  "load balancer=vllm_lb" \
  "replica a=vllm_replica_a" \
  "replica b=vllm_replica_b"; do
  stop_one "${target%%=*}" "${LOG_DIR}/${target#*=}.pid"
done

# Closing summary: a blank separator line followed by three status lines.
printf '%s\n' \
  "" \
  "All replicas and load balancer stopped." \
  "Note: vLLM worker subprocesses may take a few seconds to release GPU memory." \
  "Verify with: nvidia-smi"
|