LLM_Inferenz_Server_1/03_start_server.sh
herzogflorian 076001b07f Add vLLM inference setup for Qwen3.5-35B-A3B on Apptainer
Scripts to build container, download model, and serve Qwen3.5-35B-A3B
via vLLM with OpenAI-compatible API on port 7080. Configured for 2x
NVIDIA L40S GPUs with tensor parallelism, supporting ~15 concurrent
students.

Made-with: Cursor
2026-03-02 14:43:39 +01:00

#!/usr/bin/env bash
# ------------------------------------------------------------------
# 03_start_server.sh
# Launches the vLLM inference server for Qwen3.5-35B-A3B
# inside the Apptainer container.
#
# Usage:
#   bash 03_start_server.sh
#
# Environment variables (override defaults):
#   MODEL_DIR        - Path to model weights (default: ~/models/Qwen3.5-35B-A3B)
#   PORT             - Server port (default: 7080)
#   MAX_MODEL_LEN    - Maximum context length in tokens (default: 32768)
#   GPU_MEM_UTIL     - GPU memory utilization fraction (default: 0.92)
#   API_KEY          - API key for authentication (default: none)
#   TENSOR_PARALLEL  - Number of GPUs for tensor parallelism (default: 2)
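#
# Example invocation (values here are placeholders, not project defaults):
#   MODEL_DIR=/data/models/Qwen3.5-35B-A3B PORT=8000 API_KEY=changeme \
#     bash 03_start_server.sh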
# ------------------------------------------------------------------
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"
MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen3.5-35B-A3B}"
PORT="${PORT:-7080}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.92}"
API_KEY="${API_KEY:-}"
TENSOR_PARALLEL="${TENSOR_PARALLEL:-2}"
if [ ! -f "$SIF_FILE" ]; then
    echo "ERROR: Container image not found at ${SIF_FILE}"
    echo "       Run 01_build_container.sh first."
    exit 1
fi

if [ ! -d "$MODEL_DIR" ]; then
    echo "ERROR: Model directory not found at ${MODEL_DIR}"
    echo "       Run 02_download_model.sh first."
    exit 1
fi
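
# The model's parent directory is bind-mounted into the container at /models,
# so vLLM sees the weights at /models/<model name>.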
MODEL_PARENT="$(dirname "$MODEL_DIR")"
MODEL_NAME="$(basename "$MODEL_DIR")"
VLLM_ARGS=(
    --model "/models/${MODEL_NAME}"
    --port "$PORT"
    --host 0.0.0.0
    --tensor-parallel-size "$TENSOR_PARALLEL"
    --max-model-len "$MAX_MODEL_LEN"
    --gpu-memory-utilization "$GPU_MEM_UTIL"
    --dtype bfloat16
    --trust-remote-code
    --reasoning-parser qwen3               # separate reasoning ("thinking") output from the final answer
    --served-model-name "qwen3.5-35b-a3b"  # model name clients use in API requests
    --max-num-seqs 16                      # cap on concurrently running sequences (~15 students plus headroom)
    --enable-prefix-caching                # reuse KV cache across shared prompt prefixes
)
if [ -n "$API_KEY" ]; then
    VLLM_ARGS+=(--api-key "$API_KEY")
fi
echo "=============================================="
echo " vLLM Inference Server — Qwen3.5-35B-A3B"
echo "=============================================="
echo "  Model:       ${MODEL_DIR}"
echo "  Container:   ${SIF_FILE}"
echo "  Port:        ${PORT}"
echo "  Context len: ${MAX_MODEL_LEN}"
echo "  GPU util:    ${GPU_MEM_UTIL}"
echo "  TP size:     ${TENSOR_PARALLEL}"
echo "  API key:     ${API_KEY:-<none>}"
echo "=============================================="
echo ""
echo "Starting server... (Ctrl+C to stop)"
echo "API will be available at: http://$(hostname):${PORT}/v1"
echo ""
apptainer exec --nv \
    --writable-tmpfs \
    --bind "${MODEL_PARENT}:/models" \
    "$SIF_FILE" \
    python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}"
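
# Once the server is up, the OpenAI-compatible endpoint can be exercised from
# another shell. A minimal sketch (the hostname is a placeholder; the
# Authorization header is only needed if API_KEY was set):
#   curl "http://<server-host>:${PORT}/v1/chat/completions" \
#     -H "Content-Type: application/json" \
#     -H "Authorization: Bearer ${API_KEY}" \
#     -d '{"model": "qwen3.5-35b-a3b", "messages": [{"role": "user", "content": "Hello"}]}'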