#!/usr/bin/env bash
# ------------------------------------------------------------------
# 03_start_server.sh
# Launches the vLLM inference server for Qwen3.5-35B-A3B
# inside the Apptainer container.
#
# Usage:
#   bash 03_start_server.sh
#
# Environment variables (override defaults):
#   MODEL_DIR       - Path to model weights (default: ~/models/Qwen3.5-35B-A3B)
#   PORT            - Server port (default: 7080)
#   MAX_MODEL_LEN   - Maximum context length (default: 32768)
#   GPU_MEM_UTIL    - GPU memory utilization fraction (default: 0.92)
#   API_KEY         - API key for authentication (default: none)
#   TENSOR_PARALLEL - Number of GPUs for tensor parallelism (default: 2)
# ------------------------------------------------------------------

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
readonly SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"

MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen3.5-35B-A3B}"
PORT="${PORT:-7080}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.92}"
API_KEY="${API_KEY:-}"
TENSOR_PARALLEL="${TENSOR_PARALLEL:-2}"

# Print each argument on its own line to stderr, then abort.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# Fail early with a clear message instead of a confusing "command not
# found" from the exec line at the bottom.
command -v apptainer >/dev/null 2>&1 \
  || die "ERROR: 'apptainer' not found in PATH."

if [[ ! -f "$SIF_FILE" ]]; then
  die "ERROR: Container image not found at ${SIF_FILE}" \
      "       Run 01_build_container.sh first."
fi

if [[ ! -d "$MODEL_DIR" ]]; then
  die "ERROR: Model directory not found at ${MODEL_DIR}" \
      "       Run 02_download_model.sh first."
fi

# The container sees the model's parent dir bind-mounted at /models,
# so the in-container model path is /models/<basename>.
MODEL_PARENT="$(dirname "$MODEL_DIR")"
MODEL_NAME="$(basename "$MODEL_DIR")"

VLLM_ARGS=(
  --model "/models/${MODEL_NAME}"
  --port "$PORT"
  --host 0.0.0.0
  --tensor-parallel-size "$TENSOR_PARALLEL"
  --max-model-len "$MAX_MODEL_LEN"
  --gpu-memory-utilization "$GPU_MEM_UTIL"
  --dtype bfloat16
  --trust-remote-code
  --reasoning-parser qwen3
  --served-model-name "qwen3.5-35b-a3b"
  --max-num-seqs 16
  --enable-prefix-caching
)

if [[ -n "$API_KEY" ]]; then
  # NOTE(review): the key is visible in `ps` output as an argv token;
  # vLLM also honors the VLLM_API_KEY env var, which avoids that —
  # consider switching if this runs on a multi-user host.
  VLLM_ARGS+=(--api-key "$API_KEY")
fi

echo "=============================================="
echo " vLLM Inference Server — Qwen3.5-35B-A3B"
echo "=============================================="
echo " Model:       ${MODEL_DIR}"
echo " Container:   ${SIF_FILE}"
echo " Port:        ${PORT}"
echo " Context len: ${MAX_MODEL_LEN}"
echo " GPU util:    ${GPU_MEM_UTIL}"
echo " TP size:     ${TENSOR_PARALLEL}"
# SECURITY: never echo the key itself — it would end up in logs,
# scrollback, and any tee'd output. Report only whether it is set.
if [[ -n "$API_KEY" ]]; then
  echo " API key:     (set)"
else
  echo " API key:     (none)"
fi
echo "=============================================="
echo ""
echo "Starting server... (Ctrl+C to stop)"
echo "API will be available at: http://$(hostname):${PORT}/v1"
echo ""

# --nv             : expose host NVIDIA GPUs/driver inside the container
# --writable-tmpfs : vLLM/torch need a writable scratch area
# --bind           : mount model parent dir read-accessible at /models
apptainer exec --nv \
  --writable-tmpfs \
  --bind "${MODEL_PARENT}:/models" \
  "$SIF_FILE" \
  python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}"