LLM_Inferenz_Server_1/11_start_server_122b.sh
herzogflorian eff76401ee Add Qwen3.5-122B-A10B-FP8 model support
- Add download script (10), start script (11), and background launcher (12)
  for the 122B FP8 model using all 4 GPUs with TP=4
- Both models share port 7080; only one runs at a time
- Update README with dual-model hardware table, switching workflow, and
  updated file overview
- Update STUDENT_GUIDE with both model names and discovery instructions

Made-with: Cursor
2026-03-02 19:00:32 +01:00

#!/usr/bin/env bash
# ------------------------------------------------------------------
# 11_start_server_122b.sh
# Launches the vLLM inference server for Qwen3.5-122B-A10B-FP8
# inside the Apptainer container using all 4 GPUs.
#
# NOTE: Only one model can run on port 7080 at a time.
# Stop the 35B model first: bash 05_stop_server.sh
#
# Usage:
#   bash 11_start_server_122b.sh
#
# Environment variables (override defaults):
#   MODEL_DIR        - Path to model weights (default: ~/models/Qwen3.5-122B-A10B-FP8)
#   PORT             - Server port (default: 7080)
#   MAX_MODEL_LEN    - Maximum context length (default: 32768)
#   GPU_MEM_UTIL     - GPU memory utilization fraction (default: 0.92)
#   API_KEY          - API key for authentication (default: none)
#   TENSOR_PARALLEL  - Number of GPUs for tensor parallelism (default: 4)
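#
# Example invocation with overrides (values here are illustrative; any subset
# of the variables may be set):
#   MAX_MODEL_LEN=16384 GPU_MEM_UTIL=0.90 bash 11_start_server_122b.sh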
# ------------------------------------------------------------------
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"
MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen3.5-122B-A10B-FP8}"
PORT="${PORT:-7080}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.92}"
API_KEY="${API_KEY:-}"
TENSOR_PARALLEL="${TENSOR_PARALLEL:-4}"
if [ ! -f "$SIF_FILE" ]; then
    echo "ERROR: Container image not found at ${SIF_FILE}"
    echo "       Run 01_build_container.sh first."
    exit 1
fi
if [ ! -d "$MODEL_DIR" ]; then
    echo "ERROR: Model directory not found at ${MODEL_DIR}"
    echo "       Run 10_download_model_122b.sh first."
    exit 1
fi
MODEL_PARENT="$(dirname "$MODEL_DIR")"
MODEL_NAME="$(basename "$MODEL_DIR")"
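# Flags for the OpenAI-compatible vLLM server: --tensor-parallel-size shards the
# model across the GPUs, and --max-num-seqs limits concurrent sequences, which
# bounds KV-cache pressure for a model of this size.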
VLLM_ARGS=(
    --model "/models/${MODEL_NAME}"
    --port "$PORT"
    --host 0.0.0.0
    --tensor-parallel-size "$TENSOR_PARALLEL"
    --max-model-len "$MAX_MODEL_LEN"
    --gpu-memory-utilization "$GPU_MEM_UTIL"
    --dtype auto
    --trust-remote-code
    --reasoning-parser qwen3
    --served-model-name "qwen3.5-122b-a10b-fp8"
    --max-num-seqs 16
    --enable-prefix-caching
)
if [ -n "$API_KEY" ]; then
    VLLM_ARGS+=(--api-key "$API_KEY")
fi
echo "=============================================="
echo " vLLM Inference Server — Qwen3.5-122B-A10B-FP8"
echo "=============================================="
echo " Model: ${MODEL_DIR}"
echo " Container: ${SIF_FILE}"
echo " Port: ${PORT}"
echo " Context len: ${MAX_MODEL_LEN}"
echo " GPU util: ${GPU_MEM_UTIL}"
echo " TP size: ${TENSOR_PARALLEL}"
echo " dtype: auto (FP8)"
echo " API key: ${API_KEY:-<none>}"
echo "=============================================="
echo ""
echo "Starting server... (Ctrl+C to stop)"
echo "API will be available at: http://$(hostname):${PORT}/v1"
echo ""
apptainer exec --nv \
    --writable-tmpfs \
    --bind "${MODEL_PARENT}:/models" \
    "$SIF_FILE" \
    python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}"
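
Once the server is up, the OpenAI-compatible endpoint can be smoke-tested from the same node. A minimal sketch, assuming the defaults above (port 7080, no API key) and the served model name set in VLLM_ARGS:

curl "http://localhost:7080/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
        "model": "qwen3.5-122b-a10b-fp8",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 32
      }'

If API_KEY was set when launching, add -H "Authorization: Bearer <key>" to the request.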