Add vLLM inference setup for Qwen3.5-35B-A3B on Apptainer

Scripts to build container, download model, and serve Qwen3.5-35B-A3B
via vLLM with OpenAI-compatible API on port 7080. Configured for 2x
NVIDIA L40S GPUs with tensor parallelism, supporting ~15 concurrent
students.

Made-with: Cursor
herzogflorian 2026-03-02 14:43:39 +01:00
commit 076001b07f
10 changed files with 740 additions and 0 deletions

14
.gitignore vendored Normal file

@@ -0,0 +1,14 @@
# Apptainer container image (large binary)
*.sif
# Logs
logs/
# Model weights (downloaded separately)
models/
# HuggingFace cache
.cache/
# macOS
.DS_Store

32
01_build_container.sh Executable file

@@ -0,0 +1,32 @@
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 01_build_container.sh
# Builds the Apptainer SIF image for vLLM inference.
# This must be run FIRST — everything else runs inside the container.
#
# Usage:
# bash 01_build_container.sh
# ------------------------------------------------------------------
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
DEF_FILE="${SCRIPT_DIR}/vllm_qwen.def"
SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"
if [ -f "$SIF_FILE" ]; then
echo "WARNING: ${SIF_FILE} already exists."
read -p "Rebuild? [y/N] " confirm
[[ "$confirm" =~ ^[Yy]$ ]] || exit 0
fi
echo "=== Building Apptainer image from ${DEF_FILE} ==="
echo " This will pull the vLLM Docker image and convert it."
echo " Estimated time: 10-20 minutes depending on network speed."
echo ""
apptainer build --nv "$SIF_FILE" "$DEF_FILE"
echo ""
echo "=== Build complete ==="
echo "Image: ${SIF_FILE}"
ls -lh "$SIF_FILE"

50
02_download_model.sh Executable file

@@ -0,0 +1,50 @@
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 02_download_model.sh
# Downloads Qwen3.5-35B-A3B weights from Hugging Face
# using huggingface-cli INSIDE the Apptainer container.
#
# Prerequisites:
# - Container built via 01_build_container.sh
#
# Usage:
# bash 02_download_model.sh [TARGET_DIR]
# ------------------------------------------------------------------
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"
MODEL_ID="Qwen/Qwen3.5-35B-A3B"
TARGET_DIR="${1:-$HOME/models/Qwen3.5-35B-A3B}"
HF_CACHE_DIR="${HOME}/.cache/huggingface"
if [ ! -f "$SIF_FILE" ]; then
echo "ERROR: Container image not found at ${SIF_FILE}"
echo " Run 01_build_container.sh first."
exit 1
fi
echo "=== Downloading ${MODEL_ID} to ${TARGET_DIR} ==="
echo " Using huggingface-cli inside the container."
echo ""
mkdir -p "$TARGET_DIR" "$HF_CACHE_DIR"
apptainer exec \
--writable-tmpfs \
--bind "$(dirname "$TARGET_DIR"):$(dirname "$TARGET_DIR")" \
--bind "${HF_CACHE_DIR}:${HF_CACHE_DIR}" \
--env HF_HOME="${HF_CACHE_DIR}" \
--env HF_HUB_CACHE="${HF_CACHE_DIR}/hub" \
--env XDG_CACHE_HOME="${HOME}/.cache" \
"$SIF_FILE" \
huggingface-cli download "$MODEL_ID" \
--local-dir "$TARGET_DIR" \
--local-dir-use-symlinks False
echo ""
echo "=== Download complete ==="
echo "Model stored at: ${TARGET_DIR}"
echo "Total size:"
du -sh "$TARGET_DIR"

84
03_start_server.sh Executable file

@@ -0,0 +1,84 @@
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 03_start_server.sh
# Launches the vLLM inference server for Qwen3.5-35B-A3B
# inside the Apptainer container.
#
# Usage:
# bash 03_start_server.sh
#
# Environment variables (override defaults):
# MODEL_DIR - Path to model weights (default: ~/models/Qwen3.5-35B-A3B)
# PORT - Server port (default: 7080)
# MAX_MODEL_LEN - Maximum context length (default: 32768)
# GPU_MEM_UTIL - GPU memory utilization fraction (default: 0.92)
# API_KEY - API key for authentication (default: none)
# TENSOR_PARALLEL - Number of GPUs for tensor parallelism (default: 2)
# ------------------------------------------------------------------
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"
MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen3.5-35B-A3B}"
PORT="${PORT:-7080}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.92}"
API_KEY="${API_KEY:-}"
TENSOR_PARALLEL="${TENSOR_PARALLEL:-2}"
if [ ! -f "$SIF_FILE" ]; then
echo "ERROR: Container image not found at ${SIF_FILE}"
echo " Run 01_build_container.sh first."
exit 1
fi
if [ ! -d "$MODEL_DIR" ]; then
echo "ERROR: Model directory not found at ${MODEL_DIR}"
echo " Run 02_download_model.sh first."
exit 1
fi
MODEL_PARENT="$(dirname "$MODEL_DIR")"
MODEL_NAME="$(basename "$MODEL_DIR")"
VLLM_ARGS=(
--model "/models/${MODEL_NAME}"
--port "$PORT"
--host 0.0.0.0
--tensor-parallel-size "$TENSOR_PARALLEL"
--max-model-len "$MAX_MODEL_LEN"
--gpu-memory-utilization "$GPU_MEM_UTIL"
--dtype bfloat16
--trust-remote-code
--reasoning-parser qwen3
--served-model-name "qwen3.5-35b-a3b"
--max-num-seqs 16
--enable-prefix-caching
)
if [ -n "$API_KEY" ]; then
VLLM_ARGS+=(--api-key "$API_KEY")
fi
echo "=============================================="
echo " vLLM Inference Server — Qwen3.5-35B-A3B"
echo "=============================================="
echo " Model: ${MODEL_DIR}"
echo " Container: ${SIF_FILE}"
echo " Port: ${PORT}"
echo " Context len: ${MAX_MODEL_LEN}"
echo " GPU util: ${GPU_MEM_UTIL}"
echo " TP size: ${TENSOR_PARALLEL}"
echo " API key: ${API_KEY:-<none>}"
echo "=============================================="
echo ""
echo "Starting server... (Ctrl+C to stop)"
echo "API will be available at: http://$(hostname):${PORT}/v1"
echo ""
apptainer exec --nv \
--writable-tmpfs \
--bind "${MODEL_PARENT}:/models" \
"$SIF_FILE" \
python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}"

53
04_start_server_background.sh Executable file

@@ -0,0 +1,53 @@
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 04_start_server_background.sh
# Launches the vLLM server in the background with logging.
# Useful for long-running deployments or running inside tmux/screen.
#
# Usage:
# bash 04_start_server_background.sh
#
# Logs are written to: ./logs/vllm_server_<timestamp>.log
# PID is written to: ./logs/vllm_server.pid
# ------------------------------------------------------------------
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
LOG_DIR="${SCRIPT_DIR}/logs"
mkdir -p "$LOG_DIR"
TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
LOG_FILE="${LOG_DIR}/vllm_server_${TIMESTAMP}.log"
PID_FILE="${LOG_DIR}/vllm_server.pid"
if [ -f "$PID_FILE" ]; then
OLD_PID=$(cat "$PID_FILE")
if kill -0 "$OLD_PID" 2>/dev/null; then
echo "Server already running with PID ${OLD_PID}"
echo "Stop it first: bash 05_stop_server.sh"
exit 1
fi
fi
echo "Starting vLLM server in background..."
echo "Log file: ${LOG_FILE}"
nohup bash "${SCRIPT_DIR}/03_start_server.sh" > "$LOG_FILE" 2>&1 &
SERVER_PID=$!
echo "$SERVER_PID" > "$PID_FILE"
echo "Server PID: ${SERVER_PID}"
echo ""
echo "Monitor logs: tail -f ${LOG_FILE}"
echo "Stop server: bash 05_stop_server.sh"
echo ""
sleep 3
if kill -0 "$SERVER_PID" 2>/dev/null; then
echo "Server process is running. Waiting for model to load..."
echo "(This can take several minutes for Qwen3.5-35B-A3B)"
else
echo "ERROR: Server process exited. Check logs:"
tail -20 "$LOG_FILE"
exit 1
fi

31
05_stop_server.sh Executable file

@@ -0,0 +1,31 @@
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 05_stop_server.sh
# Gracefully stops the background vLLM server.
# ------------------------------------------------------------------
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PID_FILE="${SCRIPT_DIR}/logs/vllm_server.pid"
if [ ! -f "$PID_FILE" ]; then
echo "No PID file found. Server may not be running."
exit 0
fi
SERVER_PID=$(cat "$PID_FILE")
if kill -0 "$SERVER_PID" 2>/dev/null; then
echo "Stopping server (PID: ${SERVER_PID})..."
kill "$SERVER_PID"
sleep 2
if kill -0 "$SERVER_PID" 2>/dev/null; then
echo "Process still alive, sending SIGKILL..."
kill -9 "$SERVER_PID"
fi
echo "Server stopped."
else
echo "Server process (PID: ${SERVER_PID}) is not running."
fi
rm -f "$PID_FILE"

265
README.md Normal file

@@ -0,0 +1,265 @@
# LLM Local — Qwen3.5-35B-A3B Inference Server
Self-hosted LLM inference for ~15 concurrent students using **Qwen3.5-35B-A3B**,
served via **vLLM** inside an **Apptainer** container on a GPU server.
## Architecture
```
Students (OpenAI SDK / curl)
             │
             ▼
┌─────────────────────────┐
│  silicon.fhgr.ch:7080   │
│  OpenAI-compatible API  │
├─────────────────────────┤
│       vLLM Server       │
│  (Apptainer container)  │
├─────────────────────────┤
│ Qwen3.5-35B-A3B weights │
│     (bind-mounted)      │
├─────────────────────────┤
│   2× NVIDIA L40S GPUs   │
└─────────────────────────┘
```
## Prerequisites
- **GPU(s)**: roughly 80 GB of total VRAM. Qwen3.5-35B-A3B in BF16 needs ~70 GB for the
  weights plus KV cache overhead. This setup targets 2× NVIDIA L40S (48 GB each) with
  tensor parallelism; a single A100-80GB or H100 also works.
- **Apptainer** (formerly Singularity) installed on the server.
- **NVIDIA drivers** + **nvidia-container-cli** for GPU passthrough.
- **~70 GB disk space** for model weights + ~15 GB for the container image.
- **Network**: Students must be on the university network or VPN.
## Hardware Sizing
| Component | Minimum     | Recommended              |
|-----------|-------------|--------------------------|
| GPU VRAM  | 80 GB total | 96 GB (2× 48 GB L40S)    |
| RAM       | 64 GB       | 128 GB                   |
| Disk      | 100 GB free | 200 GB free              |
> **If you have less total VRAM than this**, you have two options:
> 1. Use a **quantized** variant (e.g., AWQ/GPTQ 4-bit — roughly 20 GB VRAM) — see the sketch below
> 2. Spread the model across more GPUs with **tensor parallelism** (raise `TENSOR_PARALLEL`)
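A minimal sketch of serving a quantized variant, assuming an AWQ checkpoint of the model exists and has already been downloaded — the path below is hypothetical, and `03_start_server.sh` does not currently expose a quantization option, so vLLM is launched directly inside the container:
```bash
# Sketch only — assumes an AWQ checkpoint has been downloaded to ~/models/Qwen3.5-35B-A3B-AWQ
# (hypothetical path; point 02_download_model.sh's MODEL_ID at the corresponding Hub repo).
apptainer exec --nv --bind "$HOME/models:/models" vllm_qwen.sif \
  python3 -m vllm.entrypoints.openai.api_server \
    --model /models/Qwen3.5-35B-A3B-AWQ \
    --quantization awq \
    --host 0.0.0.0 --port 7080 \
    --max-model-len 32768 \
    --served-model-name qwen3.5-35b-a3b
```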
---
## Step-by-Step Setup
### Step 0: SSH into the Server
```bash
ssh herzogfloria@silicon.fhgr.ch
```
### Step 1: Clone This Repository
```bash
# Or copy the files to the server
git clone <your-repo-url> ~/LLM_local
cd ~/LLM_local
chmod +x *.sh
```
### Step 2: Check GPU and Environment
```bash
# Verify GPU is visible
nvidia-smi
# Verify Apptainer is installed
apptainer --version
# Check available disk space
df -h ~
```
### Step 3: Build the Apptainer Container
```bash
bash 01_build_container.sh
```
This pulls the `vllm/vllm-openai:latest` Docker image and converts it to a `.sif` file.
Takes 10-20 minutes. The resulting `vllm_qwen.sif` is ~12-15 GB.
> **Tip**: If building fails due to network/proxy issues, you can pull the Docker image
> first and convert manually:
> ```bash
> apptainer pull docker://vllm/vllm-openai:latest
> ```
### Step 4: Download the Model (~70 GB)
```bash
# Runs huggingface-cli inside the container — no local pip install needed
bash 02_download_model.sh
# Default target: ~/models/Qwen3.5-35B-A3B
```
This downloads the full BF16 weights. Takes 20-60 minutes depending on bandwidth.
### Step 5: Start the Server
**Interactive (foreground):**
```bash
bash 03_start_server.sh
```
**Background (recommended for production):**
```bash
bash 04_start_server_background.sh
```
The server takes 2-5 minutes to load the model into GPU memory. Monitor with:
```bash
tail -f logs/vllm_server_*.log
```
Look for the line:
```
INFO: Uvicorn running on http://0.0.0.0:7080
```
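If you prefer to script the wait instead of watching the log, a simple readiness poll against the OpenAI-compatible endpoint works (assumes the default port 7080):
```bash
# Poll until the server answers; gives up after ~15 minutes
for _ in $(seq 1 90); do
  if curl -sf http://localhost:7080/v1/models > /dev/null; then
    echo "Server is ready."
    break
  fi
  sleep 10
done
```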
### Step 6: Test the Server
```bash
# Quick health check
curl http://localhost:7080/v1/models
# Full test
pip install openai
python test_server.py
```
### Step 7: Share with Students
Distribute the `STUDENT_GUIDE.md` file or share the connection details:
- **Base URL**: `http://silicon.fhgr.ch:7080/v1` — model name: `qwen3.5-35b-a3b`
---
## Configuration
All configuration is via environment variables in `03_start_server.sh`:
| Variable          | Default                    | Description                           |
|-------------------|----------------------------|---------------------------------------|
| `MODEL_DIR`       | `~/models/Qwen3.5-35B-A3B` | Path to model weights                 |
| `PORT`            | `7080`                     | HTTP port                             |
| `MAX_MODEL_LEN`   | `32768`                    | Max context length (tokens)           |
| `GPU_MEM_UTIL`    | `0.92`                     | Fraction of GPU memory to use         |
| `API_KEY`         | *(empty = no auth)*        | API key for authentication           |
| `TENSOR_PARALLEL` | `2`                        | Number of GPUs for tensor parallelism |
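Several overrides can be combined on one launch line, for example:
```bash
# Different port, shorter context, and an API key in a single launch
PORT=7081 MAX_MODEL_LEN=16384 API_KEY="my-secret-key" bash 03_start_server.sh
```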
### Context Length Tuning
The default `MAX_MODEL_LEN=32768` is conservative and ensures stable operation for 15
concurrent users. If you have plenty of VRAM headroom:
```bash
MAX_MODEL_LEN=65536 bash 03_start_server.sh
```
Qwen3.5-35B-A3B natively supports up to 262,144 tokens, but longer contexts require
significantly more GPU memory for KV cache.
### Adding Authentication
```bash
API_KEY="your-secret-key-here" bash 03_start_server.sh
```
Students then pass this key in the `api_key` parameter of the OpenAI SDK, or as a Bearer token with curl (see below).
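A quick check with curl once the key is set (the key value is whatever you chose above):
```bash
# Should list the served model when the correct key is supplied
curl http://localhost:7080/v1/models \
  -H "Authorization: Bearer your-secret-key-here"
```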
### Multi-GPU Setup
Tensor parallelism defaults to `2` to match the 2× L40S setup. On a single large GPU, run:
```bash
TENSOR_PARALLEL=1 bash 03_start_server.sh
```
---
## Server Management
```bash
# Start in background
bash 04_start_server_background.sh
# Check if running
curl -s http://localhost:7080/v1/models | python -m json.tool
# View logs
tail -f logs/vllm_server_*.log
# Stop
bash 05_stop_server.sh
# Monitor GPU usage
watch -n 2 nvidia-smi
```
### Running Persistently with tmux
For a robust setup that survives SSH disconnects:
```bash
ssh herzogfloria@silicon.fhgr.ch
tmux new -s llm_server
bash 03_start_server.sh
# Press Ctrl+B, then D to detach
# Reconnect later:
tmux attach -t llm_server
```
---
## Files Overview
| File                            | Purpose                                    |
|---------------------------------|--------------------------------------------|
| `vllm_qwen.def`                 | Apptainer container definition             |
| `01_build_container.sh`         | Builds the Apptainer .sif image            |
| `02_download_model.sh`          | Downloads model weights from Hugging Face  |
| `03_start_server.sh`            | Starts vLLM server (foreground)            |
| `04_start_server_background.sh` | Starts server in background with logging   |
| `05_stop_server.sh`             | Stops the background server                |
| `test_server.py`                | Tests the running server                   |
| `STUDENT_GUIDE.md`              | Instructions for students                  |
---
## Troubleshooting
### "CUDA out of memory"
- Reduce `MAX_MODEL_LEN` (e.g., 16384) and/or `GPU_MEM_UTIL` (e.g., 0.85) — see the example below
- Use a quantized model variant (see Hardware Sizing above)
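Both values are ordinary environment variables of `03_start_server.sh`, so a lower-memory launch is simply:
```bash
MAX_MODEL_LEN=16384 GPU_MEM_UTIL=0.85 bash 03_start_server.sh
```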
### Container build fails
- Ensure you have internet access and sufficient disk space (~20 GB for build cache)
- Try: `apptainer pull docker://vllm/vllm-openai:latest` first
### "No NVIDIA GPU detected"
- Check that `nvidia-smi` works outside the container
- Ensure `--nv` flag is passed (already in scripts)
- Verify nvidia-container-cli: `apptainer exec --nv vllm_qwen.sif nvidia-smi`
### Server starts but students can't connect
- Check firewall: `sudo ufw allow 7080:7090/tcp` or equivalent
- Verify the server binds to `0.0.0.0` (not just localhost) — a quick check is shown below
- Students must use the server's hostname/IP, not `localhost`
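A quick check of the listening socket (assumes `ss` is available; `netstat -tlnp` is an alternative):
```bash
ss -tlnp | grep 7080   # expect 0.0.0.0:7080, not 127.0.0.1:7080
```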
### Slow generation with many users
- This is expected — vLLM batches requests but throughput is finite
- Consider reducing `max_tokens` in student requests
- Monitor with: `curl http://localhost:7080/metrics`

118
STUDENT_GUIDE.md Normal file

@@ -0,0 +1,118 @@
# Student Guide — Qwen3.5-35B-A3B Inference Server
## Overview
A **Qwen3.5-35B-A3B** language model is running on our GPU server. It's a
Mixture-of-Experts model (35B total parameters, 3B active per token), providing
fast and high-quality responses. You can interact with it using the
**OpenAI-compatible API**.
## Connection Details
| Parameter | Value |
|------------- |---------------------------------------------|
| **Base URL** | `http://silicon.fhgr.ch:7080/v1` |
| **Model** | `qwen3.5-35b-a3b` |
| **API Key** | *(ask your instructor — may be `EMPTY`)* |
> **Note**: You must be on the university network or VPN to reach the server.
---
## Quick Start with Python
### 1. Install the OpenAI SDK
```bash
pip install openai
```
### 2. Simple Chat
```python
from openai import OpenAI
client = OpenAI(
base_url="http://silicon.fhgr.ch:7080/v1",
api_key="EMPTY", # replace if your instructor set a key
)
response = client.chat.completions.create(
model="qwen3.5-35b-a3b",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain gradient descent in simple terms."},
],
max_tokens=1024,
temperature=0.7,
)
print(response.choices[0].message.content)
```
### 3. Streaming Responses
```python
stream = client.chat.completions.create(
model="qwen3.5-35b-a3b",
messages=[
{"role": "user", "content": "Write a haiku about machine learning."},
],
max_tokens=256,
stream=True,
)
for chunk in stream:
if chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="", flush=True)
print()
```
---
## Quick Start with curl
```bash
curl http://silicon.fhgr.ch:7080/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "qwen3.5-35b-a3b",
"messages": [
{"role": "user", "content": "What is the capital of Switzerland?"}
],
"max_tokens": 256,
"temperature": 0.7
}'
```
---
## Recommended Parameters
| Parameter | Recommended | Notes |
|-----------------|-------------|----------------------------------------------|
| `temperature` | 0.7 | Lower = more deterministic, higher = creative |
| `max_tokens` | 1024–4096 | Increase for long-form output |
| `top_p` | 0.95 | Nucleus sampling |
| `stream` | `true` | Better UX for interactive use |
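All of these together in a single request (streaming output arrives incrementally; `-N` keeps curl from buffering it):
```bash
curl -N http://silicon.fhgr.ch:7080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen3.5-35b-a3b",
    "messages": [{"role": "user", "content": "Explain overfitting in three sentences."}],
    "max_tokens": 1024,
    "temperature": 0.7,
    "top_p": 0.95,
    "stream": true
  }'
```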
---
## Tips & Etiquette
- **Be mindful of context length**: Avoid excessively long prompts (>8K tokens) unless necessary.
- **Use streaming**: Makes responses feel faster and reduces perceived latency.
- **Don't spam requests**: The server is shared among ~15 students.
- **Check the model name**: Always use `qwen3.5-35b-a3b` as the model parameter.
---
## Troubleshooting
| Issue | Solution |
|-----------------------------|-----------------------------------------------------|
| Connection refused | Check you're on the university network / VPN |
| Model not found | Use model name `qwen3.5-35b-a3b` exactly |
| Slow responses | The model is shared — peak times may be slower |
| `401 Unauthorized` | Ask your instructor for the API key |
| Response cut off | Increase `max_tokens` in your request |

70
test_server.py Normal file

@@ -0,0 +1,70 @@
"""
Quick test script to verify the vLLM server is running and responding.
Usage:
pip install openai
python test_server.py [--host HOST] [--port PORT] [--api-key KEY]
"""
import argparse
import sys
from openai import OpenAI
def main():
parser = argparse.ArgumentParser(description="Test vLLM inference server")
parser.add_argument("--host", default="localhost", help="Server hostname")
parser.add_argument("--port", default=7080, type=int, help="Server port")
parser.add_argument("--api-key", default="EMPTY", help="API key")
args = parser.parse_args()
base_url = f"http://{args.host}:{args.port}/v1"
model = "qwen3.5-35b-a3b"
client = OpenAI(base_url=base_url, api_key=args.api_key)
print(f"Connecting to {base_url} ...")
print("\n--- Available Models ---")
try:
models = client.models.list()
for m in models.data:
print(f" {m.id}")
except Exception as e:
print(f"ERROR: Cannot connect to server: {e}")
sys.exit(1)
print("\n--- Test Chat Completion ---")
response = client.chat.completions.create(
model=model,
messages=[
{"role": "user", "content": "What is 2 + 2? Answer in one sentence."}
],
max_tokens=256,
temperature=0.7,
)
print(f" Response: {response.choices[0].message.content}")
print(f" Tokens: prompt={response.usage.prompt_tokens}, "
f"completion={response.usage.completion_tokens}")
print("\n--- Test Streaming ---")
stream = client.chat.completions.create(
model=model,
messages=[
{"role": "user", "content": "Count from 1 to 5."}
],
max_tokens=128,
temperature=0.7,
stream=True,
)
print(" Response: ", end="")
for chunk in stream:
if chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="", flush=True)
print("\n")
print("All tests passed!")
if __name__ == "__main__":
main()

23
vllm_qwen.def Normal file

@@ -0,0 +1,23 @@
Bootstrap: docker
From: vllm/vllm-openai:latest
%labels
Author herzogfloria
Description vLLM nightly inference server for Qwen3.5-35B-A3B
Version 2.0
%environment
export HF_HOME=/tmp/hf_cache
export VLLM_USAGE_SOURCE=production
%post
apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*
pip install --no-cache-dir vllm --extra-index-url https://wheels.vllm.ai/nightly
pip install --no-cache-dir "transformers @ git+https://github.com/huggingface/transformers.git@main"
pip install --no-cache-dir huggingface_hub[cli]
%runscript
exec python3 -m vllm.entrypoints.openai.api_server "$@"
%help
Apptainer container for serving Qwen3.5-35B-A3B via vLLM (nightly).