- Add download script (10), start script (11), and background launcher (12) for the 122B FP8 model using all 4 GPUs with TP=4 - Both models share port 7080; only one runs at a time - Update README with dual-model hardware table, switching workflow, and updated file overview - Update STUDENT_GUIDE with both model names and discovery instructions Made-with: Cursor
89 lines
2.8 KiB
Bash
Executable File
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 11_start_server_122b.sh
#
# Launches the vLLM inference server for Qwen3.5-122B-A10B-FP8
# inside the Apptainer container using all 4 GPUs.
#
# NOTE: Only one model can run on port 7080 at a time.
#       Stop the 35B model first: bash 05_stop_server.sh
#
# Usage:
#   bash 11_start_server_122b.sh
#
# Environment variables (override defaults):
#   MODEL_DIR       - Path to model weights (default: ~/models/Qwen3.5-122B-A10B-FP8)
#   PORT            - Server port (default: 7080)
#   MAX_MODEL_LEN   - Maximum context length (default: 32768)
#   GPU_MEM_UTIL    - GPU memory utilization fraction (default: 0.92)
#   API_KEY         - API key for authentication (default: none)
#   TENSOR_PARALLEL - Number of GPUs for tensor parallelism (default: 4)
# ------------------------------------------------------------------

# Fail fast: abort on any command error, unset variable, or pipeline failure.
set -euo pipefail
|
|
|
|
# Resolve the directory this script lives in; the container image is
# expected to sit next to it.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
readonly SCRIPT_DIR
SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"
readonly SIF_FILE

# Tunables — each may be overridden via the environment (see header).
MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen3.5-122B-A10B-FP8}"
PORT="${PORT:-7080}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.92}"
API_KEY="${API_KEY:-}"                     # empty => no authentication
TENSOR_PARALLEL="${TENSOR_PARALLEL:-4}"
|
|
|
|
# --- Pre-flight checks -------------------------------------------------
# Both the container image and the model weights must exist before launch;
# diagnostics go to stderr so they are not mistaken for normal output.
if [ ! -f "$SIF_FILE" ]; then
  echo "ERROR: Container image not found at ${SIF_FILE}" >&2
  echo "       Run 01_build_container.sh first." >&2
  exit 1
fi

if [ ! -d "$MODEL_DIR" ]; then
  echo "ERROR: Model directory not found at ${MODEL_DIR}" >&2
  echo "       Run 10_download_model_122b.sh first." >&2
  exit 1
fi
|
|
|
|
# The container bind-mounts the model's parent directory at /models, so
# vLLM addresses the weights as /models/<model-name>.
MODEL_PARENT="$(dirname "$MODEL_DIR")"
MODEL_NAME="$(basename "$MODEL_DIR")"

# Arguments for the vLLM OpenAI-compatible API server.
VLLM_ARGS=(
  --model "/models/${MODEL_NAME}"
  --port "$PORT"
  --host 0.0.0.0
  --tensor-parallel-size "$TENSOR_PARALLEL"
  --max-model-len "$MAX_MODEL_LEN"
  --gpu-memory-utilization "$GPU_MEM_UTIL"
  --dtype auto
  --trust-remote-code
  --reasoning-parser qwen3
  --served-model-name "qwen3.5-122b-a10b-fp8"
  --max-num-seqs 16
  --enable-prefix-caching
)

# Require client authentication only when a key was provided.
if [ -n "$API_KEY" ]; then
  VLLM_ARGS+=(--api-key "$API_KEY")
fi
|
|
|
|
# --- Startup banner ----------------------------------------------------
# Purely informational; echoes the effective configuration before launch.
echo "=============================================="
echo " vLLM Inference Server — Qwen3.5-122B-A10B-FP8"
echo "=============================================="
echo " Model:       ${MODEL_DIR}"
echo " Container:   ${SIF_FILE}"
echo " Port:        ${PORT}"
echo " Context len: ${MAX_MODEL_LEN}"
echo " GPU util:    ${GPU_MEM_UTIL}"
echo " TP size:     ${TENSOR_PARALLEL}"
echo " dtype:       auto (FP8)"
echo " API key:     ${API_KEY:-<none>}"
echo "=============================================="
echo ""
echo "Starting server... (Ctrl+C to stop)"
echo "API will be available at: http://$(hostname):${PORT}/v1"
echo ""
|
|
|
# --- Launch ------------------------------------------------------------
# --nv             : expose the host NVIDIA GPUs inside the container
# --writable-tmpfs : give the container a writable overlay (presumably for
#                    vLLM's compile/cache dirs — TODO confirm)
# --bind           : mount the model's parent dir at /models, matching the
#                    /models/<name> path passed via --model above
apptainer exec --nv \
  --writable-tmpfs \
  --bind "${MODEL_PARENT}:/models" \
  "$SIF_FILE" \
  python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}"