Add vLLM inference setup for Qwen3.5-35B-A3B on Apptainer

Scripts to build container, download model, and serve Qwen3.5-35B-A3B via vLLM with OpenAI-compatible API on port 7080. Configured for 2x NVIDIA L40S GPUs with tensor parallelism, supporting ~15 concurrent students. Made-with: Cursor
2026-03-02 14:43:39 +01:00 · 2026-03-02 14:43:39 +01:00 · 076001b07f
commit 076001b07f
10 changed files with 740 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,14 @@
+# Apptainer container image (large binary)
+*.sif
+
+# Logs
+logs/
+
+# Model weights (downloaded separately)
+models/
+
+# HuggingFace cache
+.cache/
+
+# macOS
+.DS_Store
--- a/01_build_container.sh
+++ b/01_build_container.sh
@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# ------------------------------------------------------------------
+# 01_build_container.sh
+# Builds the Apptainer SIF image for vLLM inference.
+# This must be run FIRST — everything else runs inside the container.
+#
+# Usage:
+#   bash 01_build_container.sh
+# ------------------------------------------------------------------
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+DEF_FILE="${SCRIPT_DIR}/vllm_qwen.def"
+SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"
+
+if [ -f "$SIF_FILE" ]; then
+    echo "WARNING: ${SIF_FILE} already exists."
+    read -p "Rebuild? [y/N] " confirm
+    [[ "$confirm" =~ ^[Yy]$ ]] || exit 0
+fi
+
+echo "=== Building Apptainer image from ${DEF_FILE} ==="
+echo "    This will pull the vLLM Docker image and convert it."
+echo "    Estimated time: 10-20 minutes depending on network speed."
+echo ""
+
+apptainer build --nv "$SIF_FILE" "$DEF_FILE"
+
+echo ""
+echo "=== Build complete ==="
+echo "Image: ${SIF_FILE}"
+ls -lh "$SIF_FILE"
--- a/02_download_model.sh
+++ b/02_download_model.sh
@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+# ------------------------------------------------------------------
+# 02_download_model.sh
+# Downloads Qwen3.5-35B-A3B weights from Hugging Face
+# using huggingface-cli INSIDE the Apptainer container.
+#
+# Prerequisites:
+#   - Container built via 01_build_container.sh
+#
+# Usage:
+#   bash 02_download_model.sh [TARGET_DIR]
+# ------------------------------------------------------------------
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"
+
+MODEL_ID="Qwen/Qwen3.5-35B-A3B"
+TARGET_DIR="${1:-$HOME/models/Qwen3.5-35B-A3B}"
+HF_CACHE_DIR="${HOME}/.cache/huggingface"
+
+if [ ! -f "$SIF_FILE" ]; then
+    echo "ERROR: Container image not found at ${SIF_FILE}"
+    echo "       Run 01_build_container.sh first."
+    exit 1
+fi
+
+echo "=== Downloading ${MODEL_ID} to ${TARGET_DIR} ==="
+echo "    Using huggingface-cli inside the container."
+echo ""
+
+mkdir -p "$TARGET_DIR" "$HF_CACHE_DIR"
+
+apptainer exec \
+    --writable-tmpfs \
+    --bind "$(dirname "$TARGET_DIR"):$(dirname "$TARGET_DIR")" \
+    --bind "${HF_CACHE_DIR}:${HF_CACHE_DIR}" \
+    --env HF_HOME="${HF_CACHE_DIR}" \
+    --env HF_HUB_CACHE="${HF_CACHE_DIR}/hub" \
+    --env XDG_CACHE_HOME="${HOME}/.cache" \
+    "$SIF_FILE" \
+    huggingface-cli download "$MODEL_ID" \
+        --local-dir "$TARGET_DIR" \
+        --local-dir-use-symlinks False
+
+echo ""
+echo "=== Download complete ==="
+echo "Model stored at: ${TARGET_DIR}"
+echo "Total size:"
+du -sh "$TARGET_DIR"
--- a/03_start_server.sh
+++ b/03_start_server.sh
@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# ------------------------------------------------------------------
+# 03_start_server.sh
+# Launches the vLLM inference server for Qwen3.5-35B-A3B
+# inside the Apptainer container.
+#
+# Usage:
+#   bash 03_start_server.sh
+#
+# Environment variables (override defaults):
+#   MODEL_DIR       - Path to model weights (default: ~/models/Qwen3.5-35B-A3B)
+#   PORT            - Server port (default: 7080)
+#   MAX_MODEL_LEN   - Maximum context length (default: 32768)
+#   GPU_MEM_UTIL    - GPU memory utilization fraction (default: 0.92)
+#   API_KEY         - API key for authentication (default: none)
+#   TENSOR_PARALLEL - Number of GPUs for tensor parallelism (default: 2)
+# ------------------------------------------------------------------
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"
+
+MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen3.5-35B-A3B}"
+PORT="${PORT:-7080}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
+GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.92}"
+API_KEY="${API_KEY:-}"
+TENSOR_PARALLEL="${TENSOR_PARALLEL:-2}"
+
+if [ ! -f "$SIF_FILE" ]; then
+    echo "ERROR: Container image not found at ${SIF_FILE}"
+    echo "       Run 01_build_container.sh first."
+    exit 1
+fi
+
+if [ ! -d "$MODEL_DIR" ]; then
+    echo "ERROR: Model directory not found at ${MODEL_DIR}"
+    echo "       Run 02_download_model.sh first."
+    exit 1
+fi
+
+MODEL_PARENT="$(dirname "$MODEL_DIR")"
+MODEL_NAME="$(basename "$MODEL_DIR")"
+
+VLLM_ARGS=(
+    --model "/models/${MODEL_NAME}"
+    --port "$PORT"
+    --host 0.0.0.0
+    --tensor-parallel-size "$TENSOR_PARALLEL"
+    --max-model-len "$MAX_MODEL_LEN"
+    --gpu-memory-utilization "$GPU_MEM_UTIL"
+    --dtype bfloat16
+    --trust-remote-code
+    --reasoning-parser qwen3
+    --served-model-name "qwen3.5-35b-a3b"
+    --max-num-seqs 16
+    --enable-prefix-caching
+)
+
+if [ -n "$API_KEY" ]; then
+    VLLM_ARGS+=(--api-key "$API_KEY")
+fi
+
+echo "=============================================="
+echo " vLLM Inference Server — Qwen3.5-35B-A3B"
+echo "=============================================="
+echo " Model:       ${MODEL_DIR}"
+echo " Container:   ${SIF_FILE}"
+echo " Port:        ${PORT}"
+echo " Context len: ${MAX_MODEL_LEN}"
+echo " GPU util:    ${GPU_MEM_UTIL}"
+echo " TP size:     ${TENSOR_PARALLEL}"
+echo " API key:     ${API_KEY:-<none>}"
+echo "=============================================="
+echo ""
+echo "Starting server... (Ctrl+C to stop)"
+echo "API will be available at: http://$(hostname):${PORT}/v1"
+echo ""
+
+apptainer exec --nv \
+    --writable-tmpfs \
+    --bind "${MODEL_PARENT}:/models" \
+    "$SIF_FILE" \
+    python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}"
--- a/04_start_server_background.sh
+++ b/04_start_server_background.sh
@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# ------------------------------------------------------------------
+# 04_start_server_background.sh
+# Launches the vLLM server in the background with logging.
+# Useful for long-running deployments or running inside tmux/screen.
+#
+# Usage:
+#   bash 04_start_server_background.sh
+#
+# Logs are written to: ./logs/vllm_server_<timestamp>.log
+# PID is written to:   ./logs/vllm_server.pid
+# ------------------------------------------------------------------
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+LOG_DIR="${SCRIPT_DIR}/logs"
+mkdir -p "$LOG_DIR"
+
+TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
+LOG_FILE="${LOG_DIR}/vllm_server_${TIMESTAMP}.log"
+PID_FILE="${LOG_DIR}/vllm_server.pid"
+
+if [ -f "$PID_FILE" ]; then
+    OLD_PID=$(cat "$PID_FILE")
+    if kill -0 "$OLD_PID" 2>/dev/null; then
+        echo "Server already running with PID ${OLD_PID}"
+        echo "Stop it first:  bash 05_stop_server.sh"
+        exit 1
+    fi
+fi
+
+echo "Starting vLLM server in background..."
+echo "Log file: ${LOG_FILE}"
+
+nohup bash "${SCRIPT_DIR}/03_start_server.sh" > "$LOG_FILE" 2>&1 &
+SERVER_PID=$!
+echo "$SERVER_PID" > "$PID_FILE"
+
+echo "Server PID: ${SERVER_PID}"
+echo ""
+echo "Monitor logs:    tail -f ${LOG_FILE}"
+echo "Stop server:     bash 05_stop_server.sh"
+echo ""
+
+sleep 3
+if kill -0 "$SERVER_PID" 2>/dev/null; then
+    echo "Server process is running. Waiting for model to load..."
+    echo "(This can take several minutes for Qwen3.5-35B-A3B)"
+else
+    echo "ERROR: Server process exited. Check logs:"
+    tail -20 "$LOG_FILE"
+    exit 1
+fi
--- a/05_stop_server.sh
+++ b/05_stop_server.sh
@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# ------------------------------------------------------------------
+# 05_stop_server.sh
+# Gracefully stops the background vLLM server.
+# ------------------------------------------------------------------
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PID_FILE="${SCRIPT_DIR}/logs/vllm_server.pid"
+
+if [ ! -f "$PID_FILE" ]; then
+    echo "No PID file found. Server may not be running."
+    exit 0
+fi
+
+SERVER_PID=$(cat "$PID_FILE")
+
+if kill -0 "$SERVER_PID" 2>/dev/null; then
+    echo "Stopping server (PID: ${SERVER_PID})..."
+    kill "$SERVER_PID"
+    sleep 2
+    if kill -0 "$SERVER_PID" 2>/dev/null; then
+        echo "Process still alive, sending SIGKILL..."
+        kill -9 "$SERVER_PID"
+    fi
+    echo "Server stopped."
+else
+    echo "Server process (PID: ${SERVER_PID}) is not running."
+fi
+
+rm -f "$PID_FILE"
--- a/README.md
+++ b/README.md
@ -0,0 +1,265 @@
+# LLM Local — Qwen3.5-27B Inference Server
+
+Self-hosted LLM inference for ~15 concurrent students using **Qwen3.5-27B**,
+served via **vLLM** inside an **Apptainer** container on a GPU server.
+
+## Architecture
+
+```
+Students (OpenAI SDK / curl)
+        │
+        ▼
+  ┌─────────────────────────┐
+  │  silicon.fhgr.ch:7080   │
+  │  OpenAI-compatible API  │
+  ├─────────────────────────┤
+  │  vLLM Server            │
+  │  (Apptainer container)  │
+  ├─────────────────────────┤
+  │  Qwen3.5-27B weights    │
+  │  (bind-mounted)         │
+  ├─────────────────────────┤
+  │  NVIDIA GPU             │
+  └─────────────────────────┘
+```
+
+## Prerequisites
+
+- **GPU**: NVIDIA GPU with >=80 GB VRAM (A100-80GB or H100 recommended).
+  Qwen3.5-27B in BF16 requires ~56 GB VRAM plus KV cache overhead.
+- **Apptainer** (formerly Singularity) installed on the server.
+- **NVIDIA drivers** + **nvidia-container-cli** for GPU passthrough.
+- **~60 GB disk space** for model weights + ~15 GB for the container image.
+- **Network**: Students must be on the university network or VPN.
+
+## Hardware Sizing
+
+| Component | Minimum        | Recommended     |
+|-----------|----------------|-----------------|
+| GPU VRAM  | 80 GB (1× A100)| 80 GB (1× H100) |
+| RAM       | 64 GB          | 128 GB          |
+| Disk      | 100 GB free    | 200 GB free     |
+
+> **If your GPU has less than 80 GB VRAM**, you have two options:
+> 1. Use a **quantized** version (e.g., AWQ/GPTQ 4-bit — ~16 GB VRAM)
+> 2. Use **tensor parallelism** across multiple GPUs (set `TENSOR_PARALLEL=2`)
+
+---
+
+## Step-by-Step Setup
+
+### Step 0: SSH into the Server
+
+```bash
+ssh herzogfloria@silicon.fhgr.ch
+```
+
+### Step 1: Clone This Repository
+
+```bash
+# Or copy the files to the server
+git clone <your-repo-url> ~/LLM_local
+cd ~/LLM_local
+chmod +x *.sh
+```
+
+### Step 2: Check GPU and Environment
+
+```bash
+# Verify GPU is visible
+nvidia-smi
+
+# Verify Apptainer is installed
+apptainer --version
+
+# Check available disk space
+df -h ~
+```
+
+### Step 3: Download the Model (~60 GB)
+
+```bash
+# Install huggingface-cli if not available
+pip install --user huggingface_hub[cli]
+
+# Download Qwen3.5-27B
+bash 01_download_model.sh
+# Default target: ~/models/Qwen3.5-27B
+```
+
+This downloads the full BF16 weights. Takes 20-60 minutes depending on bandwidth.
+
+### Step 4: Build the Apptainer Container
+
+```bash
+bash 02_build_container.sh
+```
+
+This pulls the `vllm/vllm-openai:latest` Docker image and converts it to a `.sif` file.
+Takes 10-20 minutes. The resulting `vllm_qwen.sif` is ~12-15 GB.
+
+> **Tip**: If building fails due to network/proxy issues, you can pull the Docker image
+> first and convert manually:
+> ```bash
+> apptainer pull docker://vllm/vllm-openai:latest
+> ```
+
+### Step 5: Start the Server
+
+**Interactive (foreground):**
+```bash
+bash 03_start_server.sh
+```
+
+**Background (recommended for production):**
+```bash
+bash 04_start_server_background.sh
+```
+
+The server takes 2-5 minutes to load the model into GPU memory. Monitor with:
+```bash
+tail -f logs/vllm_server_*.log
+```
+
+Look for the line:
+```
+INFO:     Uvicorn running on http://0.0.0.0:8000
+```
+
+### Step 6: Test the Server
+
+```bash
+# Quick health check
+curl http://localhost:7080/v1/models
+
+# Full test
+pip install openai
+python test_server.py
+```
+
+### Step 7: Share with Students
+
+Distribute the `STUDENT_GUIDE.md` file or share the connection details:
+- **27B Base URL**: `http://silicon.fhgr.ch:7080/v1` — model name: `qwen3.5-27b`
+- **35B Base URL**: `http://silicon.fhgr.ch:7081/v1` — model name: `qwen3.5-35b-a3b`
+
+---
+
+## Configuration
+
+All configuration is via environment variables in `03_start_server.sh`:
+
+| Variable          | Default                      | Description                         |
+|-------------------|------------------------------|-------------------------------------|
+| `MODEL_DIR`       | `~/models/Qwen3.5-27B`      | Path to model weights               |
+| `PORT`            | `7080`                       | HTTP port                           |
+| `MAX_MODEL_LEN`   | `32768`                      | Max context length (tokens)         |
+| `GPU_MEM_UTIL`    | `0.92`                       | Fraction of GPU memory to use       |
+| `API_KEY`         | *(empty = no auth)*          | API key for authentication          |
+| `TENSOR_PARALLEL` | `1`                          | Number of GPUs                      |
+
+### Context Length Tuning
+
+The default `MAX_MODEL_LEN=32768` is conservative and ensures stable operation for 15
+concurrent users. If you have plenty of VRAM headroom:
+
+```bash
+MAX_MODEL_LEN=65536 bash 03_start_server.sh
+```
+
+Qwen3.5-27B natively supports up to 262,144 tokens, but longer contexts require
+significantly more GPU memory for KV cache.
+
+### Adding Authentication
+
+```bash
+API_KEY="your-secret-key-here" bash 03_start_server.sh
+```
+
+Students then use this key in their `api_key` parameter.
+
+### Multi-GPU Setup
+
+If you have multiple GPUs:
+
+```bash
+TENSOR_PARALLEL=2 bash 03_start_server.sh
+```
+
+---
+
+## Server Management
+
+```bash
+# Start in background
+bash 04_start_server_background.sh
+
+# Check if running
+curl -s http://localhost:7080/v1/models | python -m json.tool
+
+# View logs
+tail -f logs/vllm_server_*.log
+
+# Stop
+bash 05_stop_server.sh
+
+# Monitor GPU usage
+watch -n 2 nvidia-smi
+```
+
+### Running Persistently with tmux
+
+For a robust setup that survives SSH disconnects:
+
+```bash
+ssh herzogfloria@silicon.fhgr.ch
+tmux new -s llm_server
+bash 03_start_server.sh
+# Press Ctrl+B, then D to detach
+
+# Reconnect later:
+tmux attach -t llm_server
+```
+
+---
+
+## Files Overview
+
+| File                         | Purpose                                    |
+|------------------------------|------------------------------------------- |
+| `vllm_qwen.def`             | Apptainer container definition             |
+| `01_download_model.sh`       | Downloads model weights from Hugging Face  |
+| `02_build_container.sh`      | Builds the Apptainer .sif image            |
+| `03_start_server.sh`         | Starts vLLM server (foreground)            |
+| `04_start_server_background.sh` | Starts server in background with logging|
+| `05_stop_server.sh`          | Stops the background server                |
+| `test_server.py`             | Tests the running server                   |
+| `STUDENT_GUIDE.md`           | Instructions for students                  |
+
+---
+
+## Troubleshooting
+
+### "CUDA out of memory"
+- Reduce `MAX_MODEL_LEN` (e.g., 16384)
+- Reduce `GPU_MEM_UTIL` (e.g., 0.85)
+- Use a quantized model variant
+
+### Container build fails
+- Ensure you have internet access and sufficient disk space (~20 GB for build cache)
+- Try: `apptainer pull docker://vllm/vllm-openai:latest` first
+
+### "No NVIDIA GPU detected"
+- Check that `nvidia-smi` works outside the container
+- Ensure `--nv` flag is passed (already in scripts)
+- Verify nvidia-container-cli: `apptainer exec --nv vllm_qwen.sif nvidia-smi`
+
+### Server starts but students can't connect
+- Check firewall: `sudo ufw allow 7080:7090/tcp` or equivalent
+- Verify the server binds to `0.0.0.0` (not just localhost)
+- Students must use the server's hostname/IP, not `localhost`
+
+### Slow generation with many users
+- This is expected — vLLM batches requests but throughput is finite
+- Consider reducing `max_tokens` in student requests
+- Monitor with: `curl http://localhost:7080/metrics`
--- a/STUDENT_GUIDE.md
+++ b/STUDENT_GUIDE.md
@ -0,0 +1,118 @@
+# Student Guide — Qwen3.5-35B-A3B Inference Server
+
+## Overview
+
+A **Qwen3.5-35B-A3B** language model is running on our GPU server. It's a
+Mixture-of-Experts model (35B total parameters, 3B active per token), providing
+fast and high-quality responses. You can interact with it using the
+**OpenAI-compatible API**.
+
+## Connection Details
+
+| Parameter    | Value                                       |
+|------------- |---------------------------------------------|
+| **Base URL** | `http://silicon.fhgr.ch:7080/v1`            |
+| **Model**    | `qwen3.5-35b-a3b`                           |
+| **API Key**  | *(ask your instructor — may be `EMPTY`)*    |
+
+> **Note**: You must be on the university network or VPN to reach the server.
+
+---
+
+## Quick Start with Python
+
+### 1. Install the OpenAI SDK
+
+```bash
+pip install openai
+```
+
+### 2. Simple Chat
+
+```python
+from openai import OpenAI
+
+client = OpenAI(
+    base_url="http://silicon.fhgr.ch:7080/v1",
+    api_key="EMPTY",  # replace if your instructor set a key
+)
+
+response = client.chat.completions.create(
+    model="qwen3.5-35b-a3b",
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Explain gradient descent in simple terms."},
+    ],
+    max_tokens=1024,
+    temperature=0.7,
+)
+
+print(response.choices[0].message.content)
+```
+
+### 3. Streaming Responses
+
+```python
+stream = client.chat.completions.create(
+    model="qwen3.5-35b-a3b",
+    messages=[
+        {"role": "user", "content": "Write a haiku about machine learning."},
+    ],
+    max_tokens=256,
+    stream=True,
+)
+
+for chunk in stream:
+    if chunk.choices[0].delta.content:
+        print(chunk.choices[0].delta.content, end="", flush=True)
+print()
+```
+
+---
+
+## Quick Start with curl
+
+```bash
+curl http://silicon.fhgr.ch:7080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "qwen3.5-35b-a3b",
+    "messages": [
+      {"role": "user", "content": "What is the capital of Switzerland?"}
+    ],
+    "max_tokens": 256,
+    "temperature": 0.7
+  }'
+```
+
+---
+
+## Recommended Parameters
+
+| Parameter       | Recommended | Notes                                        |
+|-----------------|-------------|----------------------------------------------|
+| `temperature`   | 0.7         | Lower = more deterministic, higher = creative |
+| `max_tokens`    | 1024–4096   | Increase for long-form output                |
+| `top_p`         | 0.95        | Nucleus sampling                             |
+| `stream`        | `true`      | Better UX for interactive use                |
+
+---
+
+## Tips & Etiquette
+
+- **Be mindful of context length**: Avoid excessively long prompts (>8K tokens) unless necessary.
+- **Use streaming**: Makes responses feel faster and reduces perceived latency.
+- **Don't spam requests**: The server is shared among ~15 students.
+- **Check the model name**: Always use `qwen3.5-35b-a3b` as the model parameter.
+
+---
+
+## Troubleshooting
+
+| Issue                       | Solution                                            |
+|-----------------------------|-----------------------------------------------------|
+| Connection refused          | Check you're on the university network / VPN        |
+| Model not found             | Use model name `qwen3.5-35b-a3b` exactly            |
+| Slow responses              | The model is shared — peak times may be slower      |
+| `401 Unauthorized`          | Ask your instructor for the API key                 |
+| Response cut off            | Increase `max_tokens` in your request               |
--- a/test_server.py
+++ b/test_server.py
@ -0,0 +1,70 @@
+"""
+Quick test script to verify the vLLM server is running and responding.
+
+Usage:
+    pip install openai
+    python test_server.py [--host HOST] [--port PORT] [--api-key KEY]
+"""
+
+import argparse
+import sys
+
+from openai import OpenAI
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Test vLLM inference server")
+    parser.add_argument("--host", default="localhost", help="Server hostname")
+    parser.add_argument("--port", default=7080, type=int, help="Server port")
+    parser.add_argument("--api-key", default="EMPTY", help="API key")
+    args = parser.parse_args()
+
+    base_url = f"http://{args.host}:{args.port}/v1"
+    model = "qwen3.5-35b-a3b"
+    client = OpenAI(base_url=base_url, api_key=args.api_key)
+
+    print(f"Connecting to {base_url} ...")
+
+    print("\n--- Available Models ---")
+    try:
+        models = client.models.list()
+        for m in models.data:
+            print(f"  {m.id}")
+    except Exception as e:
+        print(f"ERROR: Cannot connect to server: {e}")
+        sys.exit(1)
+
+    print("\n--- Test Chat Completion ---")
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {"role": "user", "content": "What is 2 + 2? Answer in one sentence."}
+        ],
+        max_tokens=256,
+        temperature=0.7,
+    )
+    print(f"  Response: {response.choices[0].message.content}")
+    print(f"  Tokens:   prompt={response.usage.prompt_tokens}, "
+          f"completion={response.usage.completion_tokens}")
+
+    print("\n--- Test Streaming ---")
+    stream = client.chat.completions.create(
+        model=model,
+        messages=[
+            {"role": "user", "content": "Count from 1 to 5."}
+        ],
+        max_tokens=128,
+        temperature=0.7,
+        stream=True,
+    )
+    print("  Response: ", end="")
+    for chunk in stream:
+        if chunk.choices[0].delta.content:
+            print(chunk.choices[0].delta.content, end="", flush=True)
+    print("\n")
+
+    print("All tests passed!")
+
+
+if __name__ == "__main__":
+    main()
--- a/vllm_qwen.def
+++ b/vllm_qwen.def
@ -0,0 +1,23 @@
+Bootstrap: docker
+From: vllm/vllm-openai:latest
+
+%labels
+    Author herzogfloria
+    Description vLLM nightly inference server for Qwen3.5-35B-A3B
+    Version 2.0
+
+%environment
+    export HF_HOME=/tmp/hf_cache
+    export VLLM_USAGE_SOURCE=production
+
+%post
+    apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*
+    pip install --no-cache-dir vllm --extra-index-url https://wheels.vllm.ai/nightly
+    pip install --no-cache-dir "transformers @ git+https://github.com/huggingface/transformers.git@main"
+    pip install --no-cache-dir huggingface_hub[cli]
+
+%runscript
+    exec python3 -m vllm.entrypoints.openai.api_server "$@"
+
+%help
+    Apptainer container for serving Qwen3.5-35B-A3B via vLLM (nightly).