diff --git a/10_download_model_122b.sh b/10_download_model_122b.sh
new file mode 100755
index 0000000..03c130e
--- /dev/null
+++ b/10_download_model_122b.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/env bash
+# ------------------------------------------------------------------
+# 10_download_model_122b.sh
+# Downloads Qwen3.5-122B-A10B-FP8 weights from Hugging Face
+# using huggingface-cli INSIDE the Apptainer container.
+#
+# Prerequisites:
+#   - Container built via 01_build_container.sh
+#
+# Usage:
+#   bash 10_download_model_122b.sh [TARGET_DIR]
+# ------------------------------------------------------------------
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"
+
+MODEL_ID="Qwen/Qwen3.5-122B-A10B-FP8"
+TARGET_DIR="${1:-$HOME/models/Qwen3.5-122B-A10B-FP8}"
+HF_CACHE_DIR="${HOME}/.cache/huggingface"
+
+if [ ! -f "$SIF_FILE" ]; then
+  echo "ERROR: Container image not found at ${SIF_FILE}"
+  echo "       Run 01_build_container.sh first."
+  exit 1
+fi
+
+echo "=== Downloading ${MODEL_ID} to ${TARGET_DIR} ==="
+echo "    Using huggingface-cli inside the container."
+echo "    This is ~125 GB and may take a while."
+echo ""
+
+mkdir -p "$TARGET_DIR" "$HF_CACHE_DIR"
+
+apptainer exec \
+  --writable-tmpfs \
+  --bind "$(dirname "$TARGET_DIR"):$(dirname "$TARGET_DIR")" \
+  --bind "${HF_CACHE_DIR}:${HF_CACHE_DIR}" \
+  --env HF_HOME="${HF_CACHE_DIR}" \
+  --env HF_HUB_CACHE="${HF_CACHE_DIR}/hub" \
+  --env XDG_CACHE_HOME="${HOME}/.cache" \
+  "$SIF_FILE" \
+  huggingface-cli download "$MODEL_ID" \
+    --local-dir "$TARGET_DIR" \
+    --local-dir-use-symlinks False
+
+echo ""
+echo "=== Download complete ==="
+echo "Model stored at: ${TARGET_DIR}"
+echo "Total size:"
+du -sh "$TARGET_DIR"
diff --git a/11_start_server_122b.sh b/11_start_server_122b.sh
new file mode 100755
index 0000000..c922fc2
--- /dev/null
+++ b/11_start_server_122b.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+# ------------------------------------------------------------------
+# 11_start_server_122b.sh
+# Launches the vLLM inference server for Qwen3.5-122B-A10B-FP8
+# inside the Apptainer container using all 4 GPUs.
+#
+# NOTE: Only one model can run on port 7080 at a time.
+#       Stop the 35B model first: bash 05_stop_server.sh
+#
+# Usage:
+#   bash 11_start_server_122b.sh
+#
+# Environment variables (override defaults):
+#   MODEL_DIR       - Path to model weights (default: ~/models/Qwen3.5-122B-A10B-FP8)
+#   PORT            - Server port (default: 7080)
+#   MAX_MODEL_LEN   - Maximum context length (default: 32768)
+#   GPU_MEM_UTIL    - GPU memory utilization fraction (default: 0.92)
+#   API_KEY         - API key for authentication (default: none)
+#   TENSOR_PARALLEL - Number of GPUs for tensor parallelism (default: 4)
+# ------------------------------------------------------------------
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"
+
+MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen3.5-122B-A10B-FP8}"
+PORT="${PORT:-7080}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
+GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.92}"
+API_KEY="${API_KEY:-}"
+TENSOR_PARALLEL="${TENSOR_PARALLEL:-4}"
+
+if [ ! -f "$SIF_FILE" ]; then
+  echo "ERROR: Container image not found at ${SIF_FILE}"
+  echo "       Run 01_build_container.sh first."
+  exit 1
+fi
+
+if [ ! -d "$MODEL_DIR" ]; then
+  echo "ERROR: Model directory not found at ${MODEL_DIR}"
+  echo "       Run 10_download_model_122b.sh first."
+  exit 1
+fi
+
+MODEL_PARENT="$(dirname "$MODEL_DIR")"
+MODEL_NAME="$(basename "$MODEL_DIR")"
+
+VLLM_ARGS=(
+  --model "/models/${MODEL_NAME}"
+  --port "$PORT"
+  --host 0.0.0.0
+  --tensor-parallel-size "$TENSOR_PARALLEL"
+  --max-model-len "$MAX_MODEL_LEN"
+  --gpu-memory-utilization "$GPU_MEM_UTIL"
+  --dtype auto
+  --trust-remote-code
+  --reasoning-parser qwen3
+  --served-model-name "qwen3.5-122b-a10b-fp8"
+  --max-num-seqs 16
+  --enable-prefix-caching
+)
+
+if [ -n "$API_KEY" ]; then
+  VLLM_ARGS+=(--api-key "$API_KEY")
+fi
+
+echo "=============================================="
+echo " vLLM Inference Server — Qwen3.5-122B-A10B-FP8"
+echo "=============================================="
+echo " Model:       ${MODEL_DIR}"
+echo " Container:   ${SIF_FILE}"
+echo " Port:        ${PORT}"
+echo " Context len: ${MAX_MODEL_LEN}"
+echo " GPU util:    ${GPU_MEM_UTIL}"
+echo " TP size:     ${TENSOR_PARALLEL}"
+echo " dtype:       auto (FP8)"
+echo " API key:     ${API_KEY:+(set, hidden)}"
+echo "=============================================="
+echo ""
+echo "Starting server... (Ctrl+C to stop)"
+echo "API will be available at: http://$(hostname):${PORT}/v1"
+echo ""
+
+apptainer exec --nv \
+  --writable-tmpfs \
+  --bind "${MODEL_PARENT}:/models" \
+  "$SIF_FILE" \
+  python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}"
diff --git a/12_start_server_122b_background.sh b/12_start_server_122b_background.sh
new file mode 100755
index 0000000..e94ceca
--- /dev/null
+++ b/12_start_server_122b_background.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# ------------------------------------------------------------------
+# 12_start_server_122b_background.sh
+# Launches the vLLM server for Qwen3.5-122B-A10B-FP8 in background.
+#
+# NOTE: Only one model can run on port 7080 at a time.
+#       Stop the current model first: bash 05_stop_server.sh
+#
+# Usage:
+#   bash 12_start_server_122b_background.sh
+#
+# Logs are written to: ./logs/vllm_server_122b_<TIMESTAMP>.log
+# PID is written to:   ./logs/vllm_server.pid
+# ------------------------------------------------------------------
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+LOG_DIR="${SCRIPT_DIR}/logs"
+mkdir -p "$LOG_DIR"
+
+TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
+LOG_FILE="${LOG_DIR}/vllm_server_122b_${TIMESTAMP}.log"
+PID_FILE="${LOG_DIR}/vllm_server.pid"
+
+if [ -f "$PID_FILE" ]; then
+  OLD_PID=$(cat "$PID_FILE")
+  if kill -0 "$OLD_PID" 2>/dev/null; then
+    echo "A vLLM server is already running with PID ${OLD_PID}"
+    echo "Stop it first: bash 05_stop_server.sh"
+    exit 1
+  fi
+fi
+
+echo "Starting vLLM server (122B) in background..."
+echo "Log file: ${LOG_FILE}"
+
+nohup bash "${SCRIPT_DIR}/11_start_server_122b.sh" > "$LOG_FILE" 2>&1 &
+SERVER_PID=$!
+echo "$SERVER_PID" > "$PID_FILE"
+
+echo "Server PID: ${SERVER_PID}"
+echo ""
+echo "Monitor logs: tail -f ${LOG_FILE}"
+echo "Stop server: bash 05_stop_server.sh"
+echo ""
+
+sleep 3
+if kill -0 "$SERVER_PID" 2>/dev/null; then
+  echo "Server process is running. Waiting for model to load..."
+  echo "(This can take 5-10 minutes for Qwen3.5-122B-A10B-FP8)"
+else
+  echo "ERROR: Server process exited. Check logs:"
+  tail -20 "$LOG_FILE"
+  exit 1
+fi
diff --git a/README.md b/README.md
index 04414dd..6dc7cce 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,16 @@
-# LLM Inferenz Server — Qwen3.5-35B-A3B
+# LLM Inferenz Server — Qwen3.5
 
-Self-hosted LLM inference for ~15 concurrent students using **Qwen3.5-35B-A3B**
-(MoE, 35B total / 3B active per token), served via **vLLM** inside an
-**Apptainer** container on a GPU server. Two front-ends are provided:
-**Open WebUI** (server-hosted ChatGPT-like UI) and a **Streamlit app**
-(local chat + file editor with code execution).
+Self-hosted LLM inference for ~15 concurrent students, served via **vLLM** +inside an **Apptainer** container on a GPU server. Two models are available +(one at a time): + +| Model | Params | Active | Weights | GPUs | +|-------|--------|--------|---------|------| +| **Qwen3.5-35B-A3B** | 35B MoE | 3B | ~67 GB BF16 | 2× L40S (TP=2) | +| **Qwen3.5-122B-A10B-FP8** | 122B MoE | 10B | ~125 GB FP8 | 4× L40S (TP=4) | + +Two front-ends are provided: **Open WebUI** (server-hosted ChatGPT-like UI) +and a **Streamlit app** (local chat + file editor with code execution). ## Architecture @@ -25,33 +31,33 @@ Students │ vLLM Server (nightly) │ │ Apptainer container (.sif) │ ├──────────────────────────────┤ - │ Qwen3.5-35B-A3B weights │ + │ Model weights │ │ (bind-mounted from host) │ ├──────────────────────────────┤ - │ 2× NVIDIA L40S (46 GB ea.) │ - │ Tensor Parallel = 2 │ + │ 4× NVIDIA L40S (46 GB ea.) │ + │ 184 GB total VRAM │ └──────────────────────────────┘ ``` ## Hardware -The server `silicon.fhgr.ch` has **4× NVIDIA L40S** GPUs (46 GB VRAM each). -The inference server uses **2 GPUs** with tensor parallelism, leaving 2 GPUs free. +The server `silicon.fhgr.ch` has **4× NVIDIA L40S** GPUs (46 GB VRAM each, +184 GB total). Only one model runs at a time on port 7080. 
-| Component | Value | -|-----------|-------| -| GPUs used | 2× NVIDIA L40S | -| VRAM used | ~92 GB total | -| Model size (BF16) | ~67 GB | -| Active params/token | 3B (MoE) | -| Context length | 32,768 tokens | -| Port | 7080 | +| | Qwen3.5-35B-A3B | Qwen3.5-122B-A10B-FP8 | +|---|---|---| +| GPUs used | 2× L40S (TP=2) | 4× L40S (TP=4) | +| VRAM used | ~92 GB | ~184 GB | +| Weight size | ~67 GB (BF16) | ~125 GB (FP8) | +| Active params/token | 3B (MoE) | 10B (MoE) | +| Context length | 32,768 tokens | 32,768 tokens | +| Port | 7080 | 7080 | ## Prerequisites - **Apptainer** (formerly Singularity) installed on the server - **NVIDIA drivers** with GPU passthrough support (`--nv` flag) -- **~80 GB disk** for model weights + ~8 GB for the container image +- **~200 GB disk** for model weights (both models) + ~8 GB for the container image - **Network access** to Hugging Face (for model download) and Docker Hub (for container build) > **Note**: No `pip` or `python` is needed on the host — everything runs inside @@ -97,32 +103,45 @@ Pulls the `vllm/vllm-openai:nightly` Docker image (required for Qwen3.5 support), installs latest `transformers` from source, and packages everything into `vllm_qwen.sif` (~8 GB). Takes 15-20 minutes. -### Step 4: Download the Model (~67 GB) +### Step 4: Download Model Weights +**35B model (~67 GB):** ```bash bash 02_download_model.sh ``` -Downloads Qwen3.5-35B-A3B weights using `huggingface-cli` **inside the -container**. Stored at `~/models/Qwen3.5-35B-A3B`. Takes 5-30 minutes -depending on bandwidth. +**122B model (~125 GB):** +```bash +bash 10_download_model_122b.sh +``` + +Both use `huggingface-cli` **inside the container**. Stored at +`~/models/Qwen3.5-35B-A3B` and `~/models/Qwen3.5-122B-A10B-FP8` respectively. ### Step 5: Start the Server -**Interactive (foreground) — recommended with tmux:** +Only one model can run at a time on port 7080. 
Choose one: + +**35B model (2 GPUs, faster per-token, smaller):** ```bash -tmux new -s llm -bash 03_start_server.sh -# Ctrl+B, then D to detach +bash 03_start_server.sh # foreground +bash 04_start_server_background.sh # background ``` -**Background with logging:** +**122B model (4 GPUs, more capable, FP8):** ```bash -bash 04_start_server_background.sh -tail -f logs/vllm_server_*.log +bash 11_start_server_122b.sh # foreground +bash 12_start_server_122b_background.sh # background ``` -The model takes 2-5 minutes to load into GPU memory. It's ready when you see: +**To switch models:** +```bash +bash 05_stop_server.sh # stop whichever is running +bash 11_start_server_122b.sh # start the other one +``` + +The model takes 2-5 minutes (35B) or 5-10 minutes (122B) to load. It's ready +when you see: ``` INFO: Uvicorn running on http://0.0.0.0:7080 ``` @@ -175,7 +194,7 @@ Access it at `http://silicon.fhgr.ch:7081`. Distribute `STUDENT_GUIDE.md` with connection details: - **Open WebUI**: `http://silicon.fhgr.ch:7081` (recommended for most students) - **API Base URL**: `http://silicon.fhgr.ch:7080/v1` (for SDK / programmatic use) -- **Model name**: `qwen3.5-35b-a3b` +- **Model name**: `qwen3.5-35b-a3b` or `qwen3.5-122b-a10b-fp8` (depending on which is running) --- @@ -271,28 +290,28 @@ Opens at `http://localhost:8501` with two tabs: ## Server Configuration -All configuration is via environment variables passed to `03_start_server.sh`: +Both start scripts accept the same environment variables: -| Variable | Default | Description | -|-------------------|----------------------------------|--------------------------------| -| `MODEL_DIR` | `~/models/Qwen3.5-35B-A3B` | Path to model weights | -| `PORT` | `7080` | HTTP port | -| `MAX_MODEL_LEN` | `32768` | Max context length (tokens) | -| `GPU_MEM_UTIL` | `0.92` | Fraction of GPU memory to use | -| `API_KEY` | *(empty = no auth)* | API key for authentication | -| `TENSOR_PARALLEL` | `2` | Number of GPUs | +| Variable | 35B default | 
122B default | Description | +|----------|-------------|--------------|-------------| +| `MODEL_DIR` | `~/models/Qwen3.5-35B-A3B` | `~/models/Qwen3.5-122B-A10B-FP8` | Model weights path | +| `PORT` | `7080` | `7080` | HTTP port | +| `MAX_MODEL_LEN` | `32768` | `32768` | Max context length | +| `GPU_MEM_UTIL` | `0.92` | `0.92` | GPU memory fraction | +| `API_KEY` | *(none)* | *(none)* | API key for auth | +| `TENSOR_PARALLEL` | `2` | `4` | Number of GPUs | ### Examples ```bash -# Increase context length +# Increase context length (35B) MAX_MODEL_LEN=65536 bash 03_start_server.sh -# Add API key authentication -API_KEY="your-secret-key" bash 03_start_server.sh +# Increase context length (122B — has room with FP8) +MAX_MODEL_LEN=65536 bash 11_start_server_122b.sh -# Use all 4 GPUs (more KV cache headroom) -TENSOR_PARALLEL=4 bash 03_start_server.sh +# Add API key authentication (works for either model) +API_KEY="your-secret-key" bash 11_start_server_122b.sh ``` --- @@ -327,14 +346,17 @@ tmux attach -t llm |------------------------------------|------------------------------------------------------| | `vllm_qwen.def` | Apptainer container definition (vLLM nightly + deps) | | `01_build_container.sh` | Builds the Apptainer `.sif` image | -| `02_download_model.sh` | Downloads model weights (runs inside container) | -| `03_start_server.sh` | Starts vLLM server (foreground) | -| `04_start_server_background.sh` | Starts vLLM server in background with logging | -| `05_stop_server.sh` | Stops the background vLLM server | +| `02_download_model.sh` | Downloads 35B model weights | +| `03_start_server.sh` | Starts 35B vLLM server (foreground, TP=2) | +| `04_start_server_background.sh` | Starts 35B server in background with logging | +| `05_stop_server.sh` | Stops whichever background vLLM server is running | | `06_setup_openwebui.sh` | Pulls the Open WebUI container image | | `07_start_openwebui.sh` | Starts Open WebUI (foreground) | | `08_start_openwebui_background.sh` | Starts Open 
WebUI in background with logging | | `09_stop_openwebui.sh` | Stops the background Open WebUI | +| `10_download_model_122b.sh` | Downloads 122B FP8 model weights | +| `11_start_server_122b.sh` | Starts 122B vLLM server (foreground, TP=4) | +| `12_start_server_122b_background.sh` | Starts 122B server in background with logging | | `app.py` | Streamlit chat & file editor web app | | `requirements.txt` | Python dependencies for the Streamlit app | | `test_server.py` | Tests the running server via CLI | diff --git a/STUDENT_GUIDE.md b/STUDENT_GUIDE.md index 7009a09..33f2696 100644 --- a/STUDENT_GUIDE.md +++ b/STUDENT_GUIDE.md @@ -1,10 +1,15 @@ -# Student Guide — Qwen3.5-35B-A3B Inference Server +# Student Guide — Qwen3.5 Inference Server ## Overview -A **Qwen3.5-35B-A3B** language model is running on our GPU server. It's a -Mixture-of-Experts model (35B total parameters, 3B active per token), providing -fast and high-quality responses. +A **Qwen3.5** large language model is running on our GPU server. Two models +may be available at different times (your instructor will let you know which +one is active): + +| Model | Params | Best for | +|-------|--------|----------| +| `qwen3.5-35b-a3b` | 35B (3B active) | Fast responses, everyday tasks | +| `qwen3.5-122b-a10b-fp8` | 122B (10B active) | Complex reasoning, coding, research | There are **three ways** to interact with the model: @@ -12,7 +17,7 @@ There are **three ways** to interact with the model: 2. **Streamlit App** — Local app with chat, file editor, and code execution 3. **Python SDK / curl** — Programmatic access via the OpenAI-compatible API -> **Note**: You must be on the university network or VPN to reach the server. +> **Note**: You must be on the fhgr network or VPN to reach the server. 
 ## Connection Details
 
@@ -20,9 +25,13 @@ There are **three ways** to interact with the model:
 |------------------|---------------------------------------------|
 | **Open WebUI**   | `http://silicon.fhgr.ch:7081`               |
 | **API Base URL** | `http://silicon.fhgr.ch:7080/v1`            |
-| **Model**        | `qwen3.5-35b-a3b`                           |
+| **Model**        | *(check Open WebUI model selector or ask your instructor)* |
 | **API Key**      | *(ask your instructor — may be `EMPTY`)*    |
 
+> **Tip**: In Open WebUI, the model dropdown at the top automatically shows
+> whichever model is currently running. For the API, use
+> `curl http://silicon.fhgr.ch:7080/v1/models` to check.
+
 ---
 
 ## Option 1: Open WebUI (Recommended)