#!/usr/bin/env bash
# ------------------------------------------------------------------
# 11_start_server_122b.sh
# Launches the vLLM inference server for Qwen3.5-122B-A10B-FP8
# inside the Apptainer container using all 4 GPUs.
#
# NOTE: Only one model can run on port 7080 at a time.
#       Stop the 35B model first: bash 05_stop_server.sh
#
# Usage:
#   bash 11_start_server_122b.sh
#
# Environment variables (override defaults):
#   MODEL_DIR       - Path to model weights (default: ~/models/Qwen3.5-122B-A10B-FP8)
#   PORT            - Server port (default: 7080)
#   MAX_MODEL_LEN   - Maximum context length (default: 32768)
#   GPU_MEM_UTIL    - GPU memory utilization fraction (default: 0.92)
#   API_KEY         - API key for authentication (default: none)
#   TENSOR_PARALLEL - Number of GPUs for tensor parallelism (default: 4)
# ------------------------------------------------------------------

set -euo pipefail

# Resolve paths relative to this script so it can be run from any directory.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"

MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen3.5-122B-A10B-FP8}"
PORT="${PORT:-7080}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.92}"
API_KEY="${API_KEY:-}"
TENSOR_PARALLEL="${TENSOR_PARALLEL:-4}"

if [ ! -f "$SIF_FILE" ]; then
    echo "ERROR: Container image not found at ${SIF_FILE}"
    echo "       Run 01_build_container.sh first."
    exit 1
fi

if [ ! -d "$MODEL_DIR" ]; then
    echo "ERROR: Model directory not found at ${MODEL_DIR}"
    echo "       Run 10_download_model_122b.sh first."
    exit 1
fi

# The parent of MODEL_DIR is bind-mounted at /models inside the container,
# so the weights appear there as /models/<basename>.
MODEL_PARENT="$(dirname "$MODEL_DIR")"
MODEL_NAME="$(basename "$MODEL_DIR")"

VLLM_ARGS=(
    --model "/models/${MODEL_NAME}"
    --port "$PORT"
    --host 0.0.0.0
    --tensor-parallel-size "$TENSOR_PARALLEL"
    --max-model-len "$MAX_MODEL_LEN"
    --gpu-memory-utilization "$GPU_MEM_UTIL"
    --dtype auto
    --trust-remote-code
    --reasoning-parser qwen3
    --served-model-name "qwen3.5-122b-a10b-fp8"
    --max-num-seqs 16
    --enable-prefix-caching
)

# Report only whether a key is set; never echo the key itself into logs.
API_KEY_STATUS="none"
if [ -n "$API_KEY" ]; then
    VLLM_ARGS+=(--api-key "$API_KEY")
    API_KEY_STATUS="set (hidden)"
fi

echo "=============================================="
echo " vLLM Inference Server — Qwen3.5-122B-A10B-FP8"
echo "=============================================="
echo " Model:       ${MODEL_DIR}"
echo " Container:   ${SIF_FILE}"
echo " Port:        ${PORT}"
echo " Context len: ${MAX_MODEL_LEN}"
echo " GPU util:    ${GPU_MEM_UTIL}"
echo " TP size:     ${TENSOR_PARALLEL}"
echo " dtype:       auto (FP8)"
echo " API key:     ${API_KEY_STATUS}"
echo "=============================================="
echo ""
echo "Starting server... (Ctrl+C to stop)"
echo "API will be available at: http://$(hostname):${PORT}/v1"
echo ""

# --nv exposes the host NVIDIA driver and GPUs inside the container;
# --writable-tmpfs gives the container a writable overlay for scratch files.
apptainer exec --nv \
    --writable-tmpfs \
    --bind "${MODEL_PARENT}:/models" \
    "$SIF_FILE" \
    python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}"
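
# ------------------------------------------------------------------
# Quick sanity check (a sketch; run from another shell once the server
# logs that it is listening). These are the standard vLLM
# OpenAI-compatible routes; the model name matches --served-model-name
# above. Include the Authorization header only if API_KEY was set at
# launch, and adjust the port if you overrode PORT.
#
#   curl -s http://localhost:7080/v1/models
#
#   curl -s http://localhost:7080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -H "Authorization: Bearer $API_KEY" \
#     -d '{"model": "qwen3.5-122b-a10b-fp8",
#          "messages": [{"role": "user", "content": "Say hello."}]}'
# ------------------------------------------------------------------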