LLM_Inferenz_Server_1/17_stop_replicas_122b.sh

#!/usr/bin/env bash
# ------------------------------------------------------------------
# 17_stop_replicas_122b.sh
# Stops the load balancer and both 122B replicas started by
# 15_start_replicas_122b.sh.
# ------------------------------------------------------------------
# Strict mode: abort on command failure, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Absolute path of the directory holding this script, and the logs/
# directory beneath it.
script_rel_dir="$(dirname "$0")"
SCRIPT_DIR="$(cd "${script_rel_dir}" && pwd)"
LOG_DIR="${SCRIPT_DIR}/logs"

# ------------------------------------------------------------------
# stop_one NAME PIDFILE [GRACE_SECONDS]
#
# Stops the process whose PID is stored in PIDFILE.
#
#   NAME           Label used in the progress messages.
#   PIDFILE        File whose content is the PID to stop.
#   GRACE_SECONDS  Optional. Seconds to wait after SIGTERM before
#                  escalating to SIGKILL (default: 5). A value that
#                  is not a non-negative integer falls back to 5.
#
# Sends SIGTERM, polls once per second for up to GRACE_SECONDS, then
# sends SIGKILL if the process is still alive. PIDFILE is removed
# afterwards. A missing PIDFILE is reported and skipped.
#
# Progress goes to stdout, warnings to stderr. Returns 0 in all of
# these cases so that the caller can continue with the next service
# under `set -e`.
# ------------------------------------------------------------------
stop_one () {
    local name="$1" pidfile="$2" grace="${3:-5}"

    if [ ! -f "$pidfile" ]; then
        echo "${name}: no PID file, skipping."
        return 0
    fi

    # Guard the arithmetic loop below against non-numeric input.
    [[ "$grace" =~ ^[0-9]+$ ]] || grace=5

    local pid
    pid=$(<"$pidfile")

    # Only a strictly positive integer may be handed to kill: "0"
    # addresses the caller's whole process group and "-1" every process
    # the user may signal, so a corrupt PID file must never reach kill.
    if [[ ! "$pid" =~ ^[1-9][0-9]*$ ]]; then
        echo "${name}: ${pidfile} does not contain a valid PID, removing stale file." >&2
        rm -f -- "$pidfile"
        return 0
    fi

    if ! kill -0 "$pid" 2>/dev/null; then
        echo "${name} (PID ${pid}) not running."
        rm -f -- "$pidfile"
        return 0
    fi

    echo "Stopping ${name} (PID ${pid})..."
    # The process may exit between the liveness check and this signal;
    # that race is harmless, hence the deliberate "|| true".
    kill "$pid" 2>/dev/null || true

    local waited
    for (( waited = 0; waited < grace; waited++ )); do
        kill -0 "$pid" 2>/dev/null || break
        sleep 1
    done

    if kill -0 "$pid" 2>/dev/null; then
        echo "  still alive, SIGKILL..."
        kill -9 "$pid" 2>/dev/null || true
        # Give the kernel a moment to tear the process down, then report
        # honestly instead of claiming success unconditionally.
        sleep 1
        if kill -0 "$pid" 2>/dev/null; then
            echo "  WARNING: ${name} (PID ${pid}) still appears alive after SIGKILL." >&2
            rm -f -- "$pidfile"
            return 0
        fi
    fi

    echo "  ${name} stopped."
    rm -f -- "$pidfile"
}

# Shutdown order matters: the load balancer goes down first so that it
# cannot route new requests to replicas that are being terminated.
# Each entry is "<label>:<pid file name under LOG_DIR>".
stop_targets=(
    "load balancer:vllm_lb.pid"
    "replica a:vllm_replica_a.pid"
    "replica b:vllm_replica_b.pid"
)
for stop_target in "${stop_targets[@]}"; do
    stop_one "${stop_target%%:*}" "${LOG_DIR}/${stop_target#*:}"
done

# Closing summary: a blank line followed by three status lines.
printf '%s\n' \
    "" \
    "All replicas and load balancer stopped." \
    "Note: vLLM worker subprocesses may take a few seconds to release GPU memory." \
    "Verify with:  nvidia-smi"