commit 076001b07f20c8123d92ce9237dad23ac5e40e7c Author: herzogflorian Date: Mon Mar 2 14:43:39 2026 +0100 Add vLLM inference setup for Qwen3.5-35B-A3B on Apptainer Scripts to build container, download model, and serve Qwen3.5-35B-A3B via vLLM with OpenAI-compatible API on port 7080. Configured for 2x NVIDIA L40S GPUs with tensor parallelism, supporting ~15 concurrent students. Made-with: Cursor diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0868882 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +# Apptainer container image (large binary) +*.sif + +# Logs +logs/ + +# Model weights (downloaded separately) +models/ + +# HuggingFace cache +.cache/ + +# macOS +.DS_Store diff --git a/01_build_container.sh b/01_build_container.sh new file mode 100755 index 0000000..5f83367 --- /dev/null +++ b/01_build_container.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# ------------------------------------------------------------------ +# 01_build_container.sh +# Builds the Apptainer SIF image for vLLM inference. +# This must be run FIRST — everything else runs inside the container. +# +# Usage: +# bash 01_build_container.sh +# ------------------------------------------------------------------ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +DEF_FILE="${SCRIPT_DIR}/vllm_qwen.def" +SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif" + +if [ -f "$SIF_FILE" ]; then + echo "WARNING: ${SIF_FILE} already exists." + read -p "Rebuild? [y/N] " confirm + [[ "$confirm" =~ ^[Yy]$ ]] || exit 0 +fi + +echo "=== Building Apptainer image from ${DEF_FILE} ===" +echo " This will pull the vLLM Docker image and convert it." +echo " Estimated time: 10-20 minutes depending on network speed." 
+echo "" + +apptainer build --nv "$SIF_FILE" "$DEF_FILE" + +echo "" +echo "=== Build complete ===" +echo "Image: ${SIF_FILE}" +ls -lh "$SIF_FILE" diff --git a/02_download_model.sh b/02_download_model.sh new file mode 100755 index 0000000..0defe41 --- /dev/null +++ b/02_download_model.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# ------------------------------------------------------------------ +# 02_download_model.sh +# Downloads Qwen3.5-35B-A3B weights from Hugging Face +# using huggingface-cli INSIDE the Apptainer container. +# +# Prerequisites: +# - Container built via 01_build_container.sh +# +# Usage: +# bash 02_download_model.sh [TARGET_DIR] +# ------------------------------------------------------------------ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif" + +MODEL_ID="Qwen/Qwen3.5-35B-A3B" +TARGET_DIR="${1:-$HOME/models/Qwen3.5-35B-A3B}" +HF_CACHE_DIR="${HOME}/.cache/huggingface" + +if [ ! -f "$SIF_FILE" ]; then + echo "ERROR: Container image not found at ${SIF_FILE}" + echo " Run 01_build_container.sh first." + exit 1 +fi + +echo "=== Downloading ${MODEL_ID} to ${TARGET_DIR} ===" +echo " Using huggingface-cli inside the container." 
+echo "" + +mkdir -p "$TARGET_DIR" "$HF_CACHE_DIR" + +apptainer exec \ + --writable-tmpfs \ + --bind "$(dirname "$TARGET_DIR"):$(dirname "$TARGET_DIR")" \ + --bind "${HF_CACHE_DIR}:${HF_CACHE_DIR}" \ + --env HF_HOME="${HF_CACHE_DIR}" \ + --env HF_HUB_CACHE="${HF_CACHE_DIR}/hub" \ + --env XDG_CACHE_HOME="${HOME}/.cache" \ + "$SIF_FILE" \ + huggingface-cli download "$MODEL_ID" \ + --local-dir "$TARGET_DIR" \ + --local-dir-use-symlinks False + +echo "" +echo "=== Download complete ===" +echo "Model stored at: ${TARGET_DIR}" +echo "Total size:" +du -sh "$TARGET_DIR" diff --git a/03_start_server.sh b/03_start_server.sh new file mode 100755 index 0000000..c0e0a16 --- /dev/null +++ b/03_start_server.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +# ------------------------------------------------------------------ +# 03_start_server.sh +# Launches the vLLM inference server for Qwen3.5-35B-A3B +# inside the Apptainer container. +# +# Usage: +# bash 03_start_server.sh +# +# Environment variables (override defaults): +# MODEL_DIR - Path to model weights (default: ~/models/Qwen3.5-35B-A3B) +# PORT - Server port (default: 7080) +# MAX_MODEL_LEN - Maximum context length (default: 32768) +# GPU_MEM_UTIL - GPU memory utilization fraction (default: 0.92) +# API_KEY - API key for authentication (default: none) +# TENSOR_PARALLEL - Number of GPUs for tensor parallelism (default: 2) +# ------------------------------------------------------------------ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif" + +MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen3.5-35B-A3B}" +PORT="${PORT:-7080}" +MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}" +GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.92}" +API_KEY="${API_KEY:-}" +TENSOR_PARALLEL="${TENSOR_PARALLEL:-2}" + +if [ ! -f "$SIF_FILE" ]; then + echo "ERROR: Container image not found at ${SIF_FILE}" + echo " Run 01_build_container.sh first." + exit 1 +fi + +if [ ! 
-d "$MODEL_DIR" ]; then + echo "ERROR: Model directory not found at ${MODEL_DIR}" + echo " Run 02_download_model.sh first." + exit 1 +fi + +MODEL_PARENT="$(dirname "$MODEL_DIR")" +MODEL_NAME="$(basename "$MODEL_DIR")" + +VLLM_ARGS=( + --model "/models/${MODEL_NAME}" + --port "$PORT" + --host 0.0.0.0 + --tensor-parallel-size "$TENSOR_PARALLEL" + --max-model-len "$MAX_MODEL_LEN" + --gpu-memory-utilization "$GPU_MEM_UTIL" + --dtype bfloat16 + --trust-remote-code + --reasoning-parser qwen3 + --served-model-name "qwen3.5-35b-a3b" + --max-num-seqs 16 + --enable-prefix-caching +) + +if [ -n "$API_KEY" ]; then + VLLM_ARGS+=(--api-key "$API_KEY") +fi + +echo "==============================================" +echo " vLLM Inference Server — Qwen3.5-35B-A3B" +echo "==============================================" +echo " Model: ${MODEL_DIR}" +echo " Container: ${SIF_FILE}" +echo " Port: ${PORT}" +echo " Context len: ${MAX_MODEL_LEN}" +echo " GPU util: ${GPU_MEM_UTIL}" +echo " TP size: ${TENSOR_PARALLEL}" +echo " API key: ${API_KEY:-}" +echo "==============================================" +echo "" +echo "Starting server... (Ctrl+C to stop)" +echo "API will be available at: http://$(hostname):${PORT}/v1" +echo "" + +apptainer exec --nv \ + --writable-tmpfs \ + --bind "${MODEL_PARENT}:/models" \ + "$SIF_FILE" \ + python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}" diff --git a/04_start_server_background.sh b/04_start_server_background.sh new file mode 100755 index 0000000..06a9c96 --- /dev/null +++ b/04_start_server_background.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# ------------------------------------------------------------------ +# 04_start_server_background.sh +# Launches the vLLM server in the background with logging. +# Useful for long-running deployments or running inside tmux/screen. 
+# +# Usage: +# bash 04_start_server_background.sh +# +# Logs are written to: ./logs/vllm_server_YYYYmmdd_HHMMSS.log +# PID is written to: ./logs/vllm_server.pid +# ------------------------------------------------------------------ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +LOG_DIR="${SCRIPT_DIR}/logs" +mkdir -p "$LOG_DIR" + +TIMESTAMP="$(date +%Y%m%d_%H%M%S)" +LOG_FILE="${LOG_DIR}/vllm_server_${TIMESTAMP}.log" +PID_FILE="${LOG_DIR}/vllm_server.pid" + +if [ -f "$PID_FILE" ]; then + OLD_PID=$(cat "$PID_FILE") + if kill -0 "$OLD_PID" 2>/dev/null; then + echo "Server already running with PID ${OLD_PID}" + echo "Stop it first: bash 05_stop_server.sh" + exit 1 + fi +fi + +echo "Starting vLLM server in background..." +echo "Log file: ${LOG_FILE}" + +nohup bash "${SCRIPT_DIR}/03_start_server.sh" > "$LOG_FILE" 2>&1 & +SERVER_PID=$! +echo "$SERVER_PID" > "$PID_FILE" + +echo "Server PID: ${SERVER_PID}" +echo "" +echo "Monitor logs: tail -f ${LOG_FILE}" +echo "Stop server: bash 05_stop_server.sh" +echo "" + +sleep 3 +if kill -0 "$SERVER_PID" 2>/dev/null; then + echo "Server process is running. Waiting for model to load..." + echo "(This can take several minutes for Qwen3.5-35B-A3B)" +else + echo "ERROR: Server process exited. Check logs:" + tail -20 "$LOG_FILE" + exit 1 +fi diff --git a/05_stop_server.sh b/05_stop_server.sh new file mode 100755 index 0000000..fd0ab2b --- /dev/null +++ b/05_stop_server.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +# ------------------------------------------------------------------ +# 05_stop_server.sh +# Gracefully stops the background vLLM server. +# ------------------------------------------------------------------ +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PID_FILE="${SCRIPT_DIR}/logs/vllm_server.pid" + +if [ ! -f "$PID_FILE" ]; then + echo "No PID file found. Server may not be running." 
+ exit 0 +fi + +SERVER_PID=$(cat "$PID_FILE") + +if kill -0 "$SERVER_PID" 2>/dev/null; then + echo "Stopping server (PID: ${SERVER_PID})..." + kill "$SERVER_PID" + sleep 2 + if kill -0 "$SERVER_PID" 2>/dev/null; then + echo "Process still alive, sending SIGKILL..." + kill -9 "$SERVER_PID" + fi + echo "Server stopped." +else + echo "Server process (PID: ${SERVER_PID}) is not running." +fi + +rm -f "$PID_FILE" diff --git a/README.md b/README.md new file mode 100644 index 0000000..b835a07 --- /dev/null +++ b/README.md @@ -0,0 +1,265 @@ +# LLM Local — Qwen3.5-27B Inference Server + +Self-hosted LLM inference for ~15 concurrent students using **Qwen3.5-27B**, +served via **vLLM** inside an **Apptainer** container on a GPU server. + +## Architecture + +``` +Students (OpenAI SDK / curl) + │ + ▼ + ┌─────────────────────────┐ + │ silicon.fhgr.ch:7080 │ + │ OpenAI-compatible API │ + ├─────────────────────────┤ + │ vLLM Server │ + │ (Apptainer container) │ + ├─────────────────────────┤ + │ Qwen3.5-27B weights │ + │ (bind-mounted) │ + ├─────────────────────────┤ + │ NVIDIA GPU │ + └─────────────────────────┘ +``` + +## Prerequisites + +- **GPU**: NVIDIA GPU with >=80 GB VRAM (A100-80GB or H100 recommended). + Qwen3.5-27B in BF16 requires ~56 GB VRAM plus KV cache overhead. +- **Apptainer** (formerly Singularity) installed on the server. +- **NVIDIA drivers** + **nvidia-container-cli** for GPU passthrough. +- **~60 GB disk space** for model weights + ~15 GB for the container image. +- **Network**: Students must be on the university network or VPN. + +## Hardware Sizing + +| Component | Minimum | Recommended | +|-----------|----------------|-----------------| +| GPU VRAM | 80 GB (1× A100)| 80 GB (1× H100) | +| RAM | 64 GB | 128 GB | +| Disk | 100 GB free | 200 GB free | + +> **If your GPU has less than 80 GB VRAM**, you have two options: +> 1. Use a **quantized** version (e.g., AWQ/GPTQ 4-bit — ~16 GB VRAM) +> 2. 
Use **tensor parallelism** across multiple GPUs (set `TENSOR_PARALLEL=2`) + +--- + +## Step-by-Step Setup + +### Step 0: SSH into the Server + +```bash +ssh herzogfloria@silicon.fhgr.ch +``` + +### Step 1: Clone This Repository + +```bash +# Or copy the files to the server +git clone REPO_URL ~/LLM_local +cd ~/LLM_local +chmod +x *.sh +``` + +### Step 2: Check GPU and Environment + +```bash +# Verify GPU is visible +nvidia-smi + +# Verify Apptainer is installed +apptainer --version + +# Check available disk space +df -h ~ +``` + +### Step 3: Download the Model (~60 GB) + +```bash +# Install huggingface-cli if not available +pip install --user huggingface_hub[cli] + +# Download Qwen3.5-35B-A3B +bash 02_download_model.sh +# Default target: ~/models/Qwen3.5-35B-A3B +``` + +This downloads the full BF16 weights. Takes 20-60 minutes depending on bandwidth. The script runs `huggingface-cli` inside the container and exits with an error if `vllm_qwen.sif` is missing, so build the container (Step 4) before running it. + +### Step 4: Build the Apptainer Container + +```bash +bash 01_build_container.sh +``` + +This pulls the `vllm/vllm-openai:latest` Docker image and converts it to a `.sif` file. +Takes 10-20 minutes. The resulting `vllm_qwen.sif` is ~12-15 GB. + +> **Tip**: If building fails due to network/proxy issues, you can pull the Docker image +> first and convert manually: +> ```bash +> apptainer pull docker://vllm/vllm-openai:latest +> ``` + +### Step 5: Start the Server + +**Interactive (foreground):** +```bash +bash 03_start_server.sh +``` + +**Background (recommended for production):** +```bash +bash 04_start_server_background.sh +``` + +The server takes 2-5 minutes to load the model into GPU memory. 
Monitor with: +```bash +tail -f logs/vllm_server_*.log +``` + +Look for the line (the port matches `PORT`, 7080 by default): +``` +INFO: Uvicorn running on http://0.0.0.0:7080 +``` + +### Step 6: Test the Server + +```bash +# Quick health check +curl http://localhost:7080/v1/models + +# Full test +pip install openai +python test_server.py +``` + +### Step 7: Share with Students + +Distribute the `STUDENT_GUIDE.md` file or share the connection details: +- **27B Base URL**: `http://silicon.fhgr.ch:7080/v1` — model name: `qwen3.5-27b` +- **35B Base URL**: `http://silicon.fhgr.ch:7081/v1` — model name: `qwen3.5-35b-a3b` + +--- + +## Configuration + +All configuration is via environment variables in `03_start_server.sh`: + +| Variable | Default | Description | +|-------------------|------------------------------|-------------------------------------| +| `MODEL_DIR` | `~/models/Qwen3.5-35B-A3B` | Path to model weights | +| `PORT` | `7080` | HTTP port | +| `MAX_MODEL_LEN` | `32768` | Max context length (tokens) | +| `GPU_MEM_UTIL` | `0.92` | Fraction of GPU memory to use | +| `API_KEY` | *(empty = no auth)* | API key for authentication | +| `TENSOR_PARALLEL` | `2` | Number of GPUs | + +### Context Length Tuning + +The default `MAX_MODEL_LEN=32768` is conservative and ensures stable operation for 15 +concurrent users. If you have plenty of VRAM headroom: + +```bash +MAX_MODEL_LEN=65536 bash 03_start_server.sh +``` + +Qwen3.5-27B natively supports up to 262,144 tokens, but longer contexts require +significantly more GPU memory for KV cache. + +### Adding Authentication + +```bash +API_KEY="your-secret-key-here" bash 03_start_server.sh +``` + +Students then use this key in their `api_key` parameter. 
+ +### Multi-GPU Setup + +The script defaults to two GPUs; set `TENSOR_PARALLEL` to match the number of GPUs you want to use: + +```bash +TENSOR_PARALLEL=2 bash 03_start_server.sh +``` + +--- + +## Server Management + +```bash +# Start in background +bash 04_start_server_background.sh + +# Check if running +curl -s http://localhost:7080/v1/models | python -m json.tool + +# View logs +tail -f logs/vllm_server_*.log + +# Stop +bash 05_stop_server.sh + +# Monitor GPU usage +watch -n 2 nvidia-smi +``` + +### Running Persistently with tmux + +For a robust setup that survives SSH disconnects: + +```bash +ssh herzogfloria@silicon.fhgr.ch +tmux new -s llm_server +bash 03_start_server.sh +# Press Ctrl+B, then D to detach + +# Reconnect later: +tmux attach -t llm_server +``` + +--- + +## Files Overview + +| File | Purpose | +|------------------------------|------------------------------------------- | +| `vllm_qwen.def` | Apptainer container definition | +| `02_download_model.sh` | Downloads model weights from Hugging Face | +| `01_build_container.sh` | Builds the Apptainer .sif image | +| `03_start_server.sh` | Starts vLLM server (foreground) | +| `04_start_server_background.sh` | Starts server in background with logging| +| `05_stop_server.sh` | Stops the background server | +| `test_server.py` | Tests the running server | +| `STUDENT_GUIDE.md` | Instructions for students | + +--- + +## Troubleshooting + +### "CUDA out of memory" +- Reduce `MAX_MODEL_LEN` (e.g., 16384) +- Reduce `GPU_MEM_UTIL` (e.g., 0.85) +- Use a quantized model variant + +### Container build fails +- Ensure you have internet access and sufficient disk space (~20 GB for build cache) +- Try: `apptainer pull docker://vllm/vllm-openai:latest` first + +### "No NVIDIA GPU detected" +- Check that `nvidia-smi` works outside the container +- Ensure `--nv` flag is passed (already in scripts) +- Verify nvidia-container-cli: `apptainer exec --nv vllm_qwen.sif nvidia-smi` + +### Server starts but students can't connect +- Check firewall: `sudo ufw allow 7080:7090/tcp` or equivalent +- 
Verify the server binds to `0.0.0.0` (not just localhost) +- Students must use the server's hostname/IP, not `localhost` + +### Slow generation with many users +- This is expected — vLLM batches requests but throughput is finite +- Consider reducing `max_tokens` in student requests +- Monitor with: `curl http://localhost:7080/metrics` diff --git a/STUDENT_GUIDE.md b/STUDENT_GUIDE.md new file mode 100644 index 0000000..2f333e4 --- /dev/null +++ b/STUDENT_GUIDE.md @@ -0,0 +1,118 @@ +# Student Guide — Qwen3.5-35B-A3B Inference Server + +## Overview + +A **Qwen3.5-35B-A3B** language model is running on our GPU server. It's a +Mixture-of-Experts model (35B total parameters, 3B active per token), providing +fast and high-quality responses. You can interact with it using the +**OpenAI-compatible API**. + +## Connection Details + +| Parameter | Value | +|------------- |---------------------------------------------| +| **Base URL** | `http://silicon.fhgr.ch:7080/v1` | +| **Model** | `qwen3.5-35b-a3b` | +| **API Key** | *(ask your instructor — may be `EMPTY`)* | + +> **Note**: You must be on the university network or VPN to reach the server. + +--- + +## Quick Start with Python + +### 1. Install the OpenAI SDK + +```bash +pip install openai +``` + +### 2. Simple Chat + +```python +from openai import OpenAI + +client = OpenAI( + base_url="http://silicon.fhgr.ch:7080/v1", + api_key="EMPTY", # replace if your instructor set a key +) + +response = client.chat.completions.create( + model="qwen3.5-35b-a3b", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Explain gradient descent in simple terms."}, + ], + max_tokens=1024, + temperature=0.7, +) + +print(response.choices[0].message.content) +``` + +### 3. 
Streaming Responses + +```python +stream = client.chat.completions.create( + model="qwen3.5-35b-a3b", + messages=[ + {"role": "user", "content": "Write a haiku about machine learning."}, + ], + max_tokens=256, + stream=True, +) + +for chunk in stream: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) +print() +``` + +--- + +## Quick Start with curl + +```bash +curl http://silicon.fhgr.ch:7080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "qwen3.5-35b-a3b", + "messages": [ + {"role": "user", "content": "What is the capital of Switzerland?"} + ], + "max_tokens": 256, + "temperature": 0.7 + }' +``` + +--- + +## Recommended Parameters + +| Parameter | Recommended | Notes | +|-----------------|-------------|----------------------------------------------| +| `temperature` | 0.7 | Lower = more deterministic, higher = creative | +| `max_tokens` | 1024–4096 | Increase for long-form output | +| `top_p` | 0.95 | Nucleus sampling | +| `stream` | `true` | Better UX for interactive use | + +--- + +## Tips & Etiquette + +- **Be mindful of context length**: Avoid excessively long prompts (>8K tokens) unless necessary. +- **Use streaming**: Makes responses feel faster and reduces perceived latency. +- **Don't spam requests**: The server is shared among ~15 students. +- **Check the model name**: Always use `qwen3.5-35b-a3b` as the model parameter. 
+ +--- + +## Troubleshooting + +| Issue | Solution | +|-----------------------------|-----------------------------------------------------| +| Connection refused | Check you're on the university network / VPN | +| Model not found | Use model name `qwen3.5-35b-a3b` exactly | +| Slow responses | The model is shared — peak times may be slower | +| `401 Unauthorized` | Ask your instructor for the API key | +| Response cut off | Increase `max_tokens` in your request | diff --git a/test_server.py b/test_server.py new file mode 100644 index 0000000..8429080 --- /dev/null +++ b/test_server.py @@ -0,0 +1,70 @@ +""" +Quick test script to verify the vLLM server is running and responding. + +Usage: + pip install openai + python test_server.py [--host HOST] [--port PORT] [--api-key KEY] +""" + +import argparse +import sys + +from openai import OpenAI + + +def main(): + parser = argparse.ArgumentParser(description="Test vLLM inference server") + parser.add_argument("--host", default="localhost", help="Server hostname") + parser.add_argument("--port", default=7080, type=int, help="Server port") + parser.add_argument("--api-key", default="EMPTY", help="API key") + args = parser.parse_args() + + base_url = f"http://{args.host}:{args.port}/v1" + model = "qwen3.5-35b-a3b" + client = OpenAI(base_url=base_url, api_key=args.api_key) + + print(f"Connecting to {base_url} ...") + + print("\n--- Available Models ---") + try: + models = client.models.list() + for m in models.data: + print(f" {m.id}") + except Exception as e: + print(f"ERROR: Cannot connect to server: {e}") + sys.exit(1) + + print("\n--- Test Chat Completion ---") + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "user", "content": "What is 2 + 2? 
Answer in one sentence."} + ], + max_tokens=256, + temperature=0.7, + ) + print(f" Response: {response.choices[0].message.content}") + print(f" Tokens: prompt={response.usage.prompt_tokens}, " + f"completion={response.usage.completion_tokens}") + + print("\n--- Test Streaming ---") + stream = client.chat.completions.create( + model=model, + messages=[ + {"role": "user", "content": "Count from 1 to 5."} + ], + max_tokens=128, + temperature=0.7, + stream=True, + ) + print(" Response: ", end="") + for chunk in stream: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) + print("\n") + + print("All tests passed!") + + +if __name__ == "__main__": + main() diff --git a/vllm_qwen.def b/vllm_qwen.def new file mode 100644 index 0000000..92f9777 --- /dev/null +++ b/vllm_qwen.def @@ -0,0 +1,23 @@ +Bootstrap: docker +From: vllm/vllm-openai:latest + +%labels + Author herzogfloria + Description vLLM nightly inference server for Qwen3.5-35B-A3B + Version 2.0 + +%environment + export HF_HOME=/tmp/hf_cache + export VLLM_USAGE_SOURCE=production + +%post + apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/* + pip install --no-cache-dir vllm --extra-index-url https://wheels.vllm.ai/nightly + pip install --no-cache-dir "transformers @ git+https://github.com/huggingface/transformers.git@main" + pip install --no-cache-dir huggingface_hub[cli] + +%runscript + exec python3 -m vllm.entrypoints.openai.api_server "$@" + +%help + Apptainer container for serving Qwen3.5-35B-A3B via vLLM (nightly).