Scripts to build container, download model, and serve Qwen3.5-35B-A3B via vLLM with OpenAI-compatible API on port 7080. Configured for 2x NVIDIA L40S GPUs with tensor parallelism, supporting ~15 concurrent students. Made-with: Cursor
85 lines · 2.6 KiB · Bash · Executable File
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 03_start_server.sh
# Launches the vLLM inference server for Qwen3.5-35B-A3B
# inside the Apptainer container.
#
# Usage:
#   bash 03_start_server.sh
#
# Environment variables (override defaults):
#   MODEL_DIR       - Path to model weights (default: ~/models/Qwen3.5-35B-A3B)
#   PORT            - Server port (default: 7080)
#   MAX_MODEL_LEN   - Maximum context length (default: 32768)
#   GPU_MEM_UTIL    - GPU memory utilization fraction (default: 0.92)
#   API_KEY         - API key for authentication (default: none)
#   TENSOR_PARALLEL - Number of GPUs for tensor parallelism (default: 2)
# ------------------------------------------------------------------
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
readonly SCRIPT_DIR
SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"
readonly SIF_FILE

MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen3.5-35B-A3B}"
PORT="${PORT:-7080}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.92}"
API_KEY="${API_KEY:-}"
TENSOR_PARALLEL="${TENSOR_PARALLEL:-2}"

# Fail early with actionable messages if prerequisites are missing.
# Diagnostics go to stderr so stdout stays clean for piping/logging.
if [[ ! -f "$SIF_FILE" ]]; then
  echo "ERROR: Container image not found at ${SIF_FILE}" >&2
  echo "       Run 01_build_container.sh first." >&2
  exit 1
fi

if [[ ! -d "$MODEL_DIR" ]]; then
  echo "ERROR: Model directory not found at ${MODEL_DIR}" >&2
  echo "       Run 02_download_model.sh first." >&2
  exit 1
fi

# The model's parent directory is bind-mounted into the container at
# /models, so inside the container the weights live at /models/<name>.
MODEL_PARENT="$(dirname "$MODEL_DIR")"
MODEL_NAME="$(basename "$MODEL_DIR")"

VLLM_ARGS=(
  --model "/models/${MODEL_NAME}"
  --port "$PORT"
  --host 0.0.0.0
  --tensor-parallel-size "$TENSOR_PARALLEL"
  --max-model-len "$MAX_MODEL_LEN"
  --gpu-memory-utilization "$GPU_MEM_UTIL"
  --dtype bfloat16
  --trust-remote-code
  --reasoning-parser qwen3
  --served-model-name "qwen3.5-35b-a3b"
  --max-num-seqs 16            # cap concurrent sequences (~15 students + headroom)
  --enable-prefix-caching
)

if [[ -n "$API_KEY" ]]; then
  VLLM_ARGS+=(--api-key "$API_KEY")
fi

# Never echo the key itself — it would land in terminal scrollback and logs.
API_KEY_DISPLAY="${API_KEY:+<set>}"

echo "=============================================="
echo " vLLM Inference Server — Qwen3.5-35B-A3B"
echo "=============================================="
echo " Model:       ${MODEL_DIR}"
echo " Container:   ${SIF_FILE}"
echo " Port:        ${PORT}"
echo " Context len: ${MAX_MODEL_LEN}"
echo " GPU util:    ${GPU_MEM_UTIL}"
echo " TP size:     ${TENSOR_PARALLEL}"
echo " API key:     ${API_KEY_DISPLAY:-<none>}"
echo "=============================================="
echo ""
echo "Starting server... (Ctrl+C to stop)"
echo "API will be available at: http://$(hostname):${PORT}/v1"
echo ""

# exec replaces this wrapper shell with apptainer so signals (Ctrl+C,
# SIGTERM from a supervisor) reach the server process directly.
exec apptainer exec --nv \
  --writable-tmpfs \
  --bind "${MODEL_PARENT}:/models" \
  "$SIF_FILE" \
  python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}"