Add vLLM inference setup for Qwen3.5-35B-A3B on Apptainer
Scripts to build container, download model, and serve Qwen3.5-35B-A3B via vLLM with OpenAI-compatible API on port 7080. Configured for 2x NVIDIA L40S GPUs with tensor parallelism, supporting ~15 concurrent students. Made-with: Cursor
This commit is contained in:
commit
076001b07f
14
.gitignore
vendored
Normal file
14
.gitignore
vendored
Normal file
@ -0,0 +1,14 @@
|
||||
# Apptainer container image (large binary)
|
||||
*.sif
|
||||
|
||||
# Logs
|
||||
logs/
|
||||
|
||||
# Model weights (downloaded separately)
|
||||
models/
|
||||
|
||||
# HuggingFace cache
|
||||
.cache/
|
||||
|
||||
# macOS
|
||||
.DS_Store
|
||||
32
01_build_container.sh
Executable file
32
01_build_container.sh
Executable file
@ -0,0 +1,32 @@
|
||||
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 01_build_container.sh
# Builds the Apptainer SIF image for vLLM inference.
# This must be run FIRST — everything else runs inside the container.
#
# Usage:
#   bash 01_build_container.sh
#
# If the image already exists the script asks before rebuilding; any
# answer other than y/Y (including no input at all) leaves it untouched.
# ------------------------------------------------------------------
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
DEF_FILE="${SCRIPT_DIR}/vllm_qwen.def"
SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"

# Fail early with a clear message rather than deep inside the build.
if [ ! -f "$DEF_FILE" ]; then
  echo "ERROR: Definition file not found at ${DEF_FILE}" >&2
  exit 1
fi

BUILD_ARGS=(--nv)

if [ -f "$SIF_FILE" ]; then
  echo "WARNING: ${SIF_FILE} already exists."
  # -r keeps backslashes literal; on EOF (non-interactive stdin) read
  # fails, which we treat as "no" instead of aborting under `set -e`.
  read -r -p "Rebuild? [y/N] " confirm || confirm=""
  [[ "$confirm" =~ ^[Yy]$ ]] || exit 0
  # The user already agreed to replace the image, so let the build
  # overwrite it instead of stopping at the existing target.
  BUILD_ARGS+=(--force)
fi

echo "=== Building Apptainer image from ${DEF_FILE} ==="
echo " This will pull the vLLM Docker image and convert it."
echo " Estimated time: 10-20 minutes depending on network speed."
echo ""

apptainer build "${BUILD_ARGS[@]}" "$SIF_FILE" "$DEF_FILE"

echo ""
echo "=== Build complete ==="
echo "Image: ${SIF_FILE}"
ls -lh "$SIF_FILE"
|
||||
50
02_download_model.sh
Executable file
50
02_download_model.sh
Executable file
@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 02_download_model.sh
# Downloads Qwen3.5-35B-A3B weights from Hugging Face
# using huggingface-cli INSIDE the Apptainer container.
#
# Prerequisites:
#   - Container built via 01_build_container.sh
#
# Usage:
#   bash 02_download_model.sh [TARGET_DIR]
#
# TARGET_DIR may be relative; it is resolved to an absolute path
# before being bind-mounted into the container.
# ------------------------------------------------------------------
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"

MODEL_ID="Qwen/Qwen3.5-35B-A3B"
TARGET_DIR="${1:-$HOME/models/Qwen3.5-35B-A3B}"
HF_CACHE_DIR="${HOME}/.cache/huggingface"

if [ ! -f "$SIF_FILE" ]; then
  echo "ERROR: Container image not found at ${SIF_FILE}" >&2
  echo " Run 01_build_container.sh first." >&2
  exit 1
fi

mkdir -p "$TARGET_DIR" "$HF_CACHE_DIR"

# Resolve to an absolute path: the same string is used as the bind
# destination inside the container and as --local-dir, so a relative
# value would depend on the container's working directory.
TARGET_DIR="$(cd "$TARGET_DIR" && pwd)"
TARGET_PARENT="$(dirname "$TARGET_DIR")"

echo "=== Downloading ${MODEL_ID} to ${TARGET_DIR} ==="
echo " Using huggingface-cli inside the container."
echo ""

apptainer exec \
  --writable-tmpfs \
  --bind "${TARGET_PARENT}:${TARGET_PARENT}" \
  --bind "${HF_CACHE_DIR}:${HF_CACHE_DIR}" \
  --env HF_HOME="${HF_CACHE_DIR}" \
  --env HF_HUB_CACHE="${HF_CACHE_DIR}/hub" \
  --env XDG_CACHE_HOME="${HOME}/.cache" \
  "$SIF_FILE" \
  huggingface-cli download "$MODEL_ID" \
  --local-dir "$TARGET_DIR" \
  --local-dir-use-symlinks False

echo ""
echo "=== Download complete ==="
echo "Model stored at: ${TARGET_DIR}"
echo "Total size:"
du -sh "$TARGET_DIR"
|
||||
84
03_start_server.sh
Executable file
84
03_start_server.sh
Executable file
@ -0,0 +1,84 @@
|
||||
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 03_start_server.sh
# Launches the vLLM inference server for Qwen3.5-35B-A3B
# inside the Apptainer container.
#
# Usage:
#   bash 03_start_server.sh
#
# Environment variables (override defaults):
#   MODEL_DIR       - Path to model weights (default: ~/models/Qwen3.5-35B-A3B)
#   PORT            - Server port (default: 7080)
#   MAX_MODEL_LEN   - Maximum context length (default: 32768)
#   GPU_MEM_UTIL    - GPU memory utilization fraction (default: 0.92)
#   API_KEY         - API key for authentication (default: none)
#   TENSOR_PARALLEL - Number of GPUs for tensor parallelism (default: 2)
# ------------------------------------------------------------------
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"

MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen3.5-35B-A3B}"
PORT="${PORT:-7080}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.92}"
API_KEY="${API_KEY:-}"
TENSOR_PARALLEL="${TENSOR_PARALLEL:-2}"

if [ ! -f "$SIF_FILE" ]; then
  echo "ERROR: Container image not found at ${SIF_FILE}" >&2
  echo " Run 01_build_container.sh first." >&2
  exit 1
fi

if [ ! -d "$MODEL_DIR" ]; then
  echo "ERROR: Model directory not found at ${MODEL_DIR}" >&2
  echo " Run 02_download_model.sh first." >&2
  exit 1
fi

# The parent directory is mounted at /models, so the model is addressed
# inside the container as /models/<basename of MODEL_DIR>.
MODEL_PARENT="$(dirname "$MODEL_DIR")"
MODEL_NAME="$(basename "$MODEL_DIR")"

VLLM_ARGS=(
  --model "/models/${MODEL_NAME}"
  --port "$PORT"
  --host 0.0.0.0
  --tensor-parallel-size "$TENSOR_PARALLEL"
  --max-model-len "$MAX_MODEL_LEN"
  --gpu-memory-utilization "$GPU_MEM_UTIL"
  --dtype bfloat16
  --trust-remote-code
  --reasoning-parser qwen3
  --served-model-name "qwen3.5-35b-a3b"
  --max-num-seqs 16
  --enable-prefix-caching
)

# Authentication is only enabled when a key was supplied.
if [ -n "$API_KEY" ]; then
  VLLM_ARGS+=(--api-key "$API_KEY")
  API_KEY_DISPLAY="<set>"
else
  API_KEY_DISPLAY="<none>"
fi

echo "=============================================="
echo " vLLM Inference Server — Qwen3.5-35B-A3B"
echo "=============================================="
echo " Model: ${MODEL_DIR}"
echo " Container: ${SIF_FILE}"
echo " Port: ${PORT}"
echo " Context len: ${MAX_MODEL_LEN}"
echo " GPU util: ${GPU_MEM_UTIL}"
echo " TP size: ${TENSOR_PARALLEL}"
# Never print the secret itself: this banner is redirected into a log
# file when the server is started via 04_start_server_background.sh.
echo " API key: ${API_KEY_DISPLAY}"
echo "=============================================="
echo ""
echo "Starting server... (Ctrl+C to stop)"
echo "API will be available at: http://$(hostname):${PORT}/v1"
echo ""

# `exec` replaces this shell with apptainer, so the PID a launcher
# recorded for this script is the server's PID and a later `kill`
# reaches the server instead of a wrapper shell.
exec apptainer exec --nv \
  --writable-tmpfs \
  --bind "${MODEL_PARENT}:/models" \
  "$SIF_FILE" \
  python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}"
|
||||
53
04_start_server_background.sh
Executable file
53
04_start_server_background.sh
Executable file
@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 04_start_server_background.sh
# Launches the vLLM server in the background with logging.
# Useful for long-running deployments or running inside tmux/screen.
#
# Usage:
#   bash 04_start_server_background.sh
#
# Logs are written to: ./logs/vllm_server_<timestamp>.log
# PID is written to:   ./logs/vllm_server.pid
#
# Exits with status 1 if a server is already running or if the new
# process dies within the first few seconds.
# ------------------------------------------------------------------
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
LOG_DIR="${SCRIPT_DIR}/logs"
mkdir -p "$LOG_DIR"

TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
LOG_FILE="${LOG_DIR}/vllm_server_${TIMESTAMP}.log"
PID_FILE="${LOG_DIR}/vllm_server.pid"

# Refuse to start a second instance while the recorded PID is alive.
# A PID file whose process is gone is simply overwritten below.
if [ -f "$PID_FILE" ]; then
  OLD_PID=$(cat "$PID_FILE")
  if kill -0 "$OLD_PID" 2>/dev/null; then
    echo "Server already running with PID ${OLD_PID}" >&2
    echo "Stop it first: bash 05_stop_server.sh" >&2
    exit 1
  fi
fi

echo "Starting vLLM server in background..."
echo "Log file: ${LOG_FILE}"

# nohup keeps the server alive after the launching terminal closes.
nohup bash "${SCRIPT_DIR}/03_start_server.sh" > "$LOG_FILE" 2>&1 &
SERVER_PID=$!
echo "$SERVER_PID" > "$PID_FILE"

echo "Server PID: ${SERVER_PID}"
echo ""
echo "Monitor logs: tail -f ${LOG_FILE}"
echo "Stop server: bash 05_stop_server.sh"
echo ""

# Short grace period to catch immediate failures (missing image,
# missing model directory, bad arguments).
sleep 3

if kill -0 "$SERVER_PID" 2>/dev/null; then
  echo "Server process is running. Waiting for model to load..."
  echo "(This can take several minutes for Qwen3.5-35B-A3B)"
else
  echo "ERROR: Server process exited. Check logs:" >&2
  tail -20 "$LOG_FILE" >&2
  # Do not leave a PID file pointing at a dead process: the PID could
  # be reused by an unrelated process later.
  rm -f "$PID_FILE"
  exit 1
fi
|
||||
31
05_stop_server.sh
Executable file
31
05_stop_server.sh
Executable file
@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 05_stop_server.sh
# Gracefully stops the background vLLM server.
#
# Usage:
#   bash 05_stop_server.sh
#
# Environment variables (override defaults):
#   STOP_TIMEOUT - Seconds to wait after SIGTERM before sending
#                  SIGKILL (default: 30)
# ------------------------------------------------------------------
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PID_FILE="${SCRIPT_DIR}/logs/vllm_server.pid"
STOP_TIMEOUT="${STOP_TIMEOUT:-30}"

if [ ! -f "$PID_FILE" ]; then
  echo "No PID file found. Server may not be running."
  exit 0
fi

SERVER_PID=$(cat "$PID_FILE")

# Only a positive integer is a usable PID. In particular `kill 0`
# would signal this script's own process group.
if ! [[ "$SERVER_PID" =~ ^[1-9][0-9]*$ ]]; then
  echo "PID file ${PID_FILE} does not contain a valid PID; removing it." >&2
  rm -f "$PID_FILE"
  exit 1
fi

if kill -0 "$SERVER_PID" 2>/dev/null; then
  echo "Stopping server (PID: ${SERVER_PID})..."
  # `|| true`: the process may exit between the check and the signal;
  # that must not abort the script before the PID file is removed.
  kill "$SERVER_PID" 2>/dev/null || true

  # Poll instead of a fixed sleep so a clean shutdown returns quickly
  # while a slow one still gets the full grace period.
  waited=0
  while kill -0 "$SERVER_PID" 2>/dev/null && [ "$waited" -lt "$STOP_TIMEOUT" ]; do
    sleep 1
    waited=$((waited + 1))
  done

  if kill -0 "$SERVER_PID" 2>/dev/null; then
    echo "Process still alive, sending SIGKILL..."
    kill -9 "$SERVER_PID" 2>/dev/null || true
  fi
  echo "Server stopped."
else
  echo "Server process (PID: ${SERVER_PID}) is not running."
fi

rm -f "$PID_FILE"
|
||||
265
README.md
Normal file
265
README.md
Normal file
@ -0,0 +1,265 @@
|
||||
# LLM Local — Qwen3.5-27B Inference Server
|
||||
|
||||
Self-hosted LLM inference for ~15 concurrent students using **Qwen3.5-27B**,
|
||||
served via **vLLM** inside an **Apptainer** container on a GPU server.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
Students (OpenAI SDK / curl)
|
||||
│
|
||||
▼
|
||||
┌─────────────────────────┐
|
||||
│ silicon.fhgr.ch:7080 │
|
||||
│ OpenAI-compatible API │
|
||||
├─────────────────────────┤
|
||||
│ vLLM Server │
|
||||
│ (Apptainer container) │
|
||||
├─────────────────────────┤
|
||||
│ Qwen3.5-27B weights │
|
||||
│ (bind-mounted) │
|
||||
├─────────────────────────┤
|
||||
│ NVIDIA GPU │
|
||||
└─────────────────────────┘
|
||||
```
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- **GPU**: NVIDIA GPU with >=80 GB VRAM (A100-80GB or H100 recommended).
|
||||
Qwen3.5-27B in BF16 requires ~56 GB VRAM plus KV cache overhead.
|
||||
- **Apptainer** (formerly Singularity) installed on the server.
|
||||
- **NVIDIA drivers** + **nvidia-container-cli** for GPU passthrough.
|
||||
- **~60 GB disk space** for model weights + ~15 GB for the container image.
|
||||
- **Network**: Students must be on the university network or VPN.
|
||||
|
||||
## Hardware Sizing
|
||||
|
||||
| Component | Minimum | Recommended |
|
||||
|-----------|----------------|-----------------|
|
||||
| GPU VRAM | 80 GB (1× A100)| 80 GB (1× H100) |
|
||||
| RAM | 64 GB | 128 GB |
|
||||
| Disk | 100 GB free | 200 GB free |
|
||||
|
||||
> **If your GPU has less than 80 GB VRAM**, you have two options:
|
||||
> 1. Use a **quantized** version (e.g., AWQ/GPTQ 4-bit — ~16 GB VRAM)
|
||||
> 2. Use **tensor parallelism** across multiple GPUs (set `TENSOR_PARALLEL=2`)
|
||||
|
||||
---
|
||||
|
||||
## Step-by-Step Setup
|
||||
|
||||
### Step 0: SSH into the Server
|
||||
|
||||
```bash
|
||||
ssh herzogfloria@silicon.fhgr.ch
|
||||
```
|
||||
|
||||
### Step 1: Clone This Repository
|
||||
|
||||
```bash
|
||||
# Or copy the files to the server
|
||||
git clone <your-repo-url> ~/LLM_local
|
||||
cd ~/LLM_local
|
||||
chmod +x *.sh
|
||||
```
|
||||
|
||||
### Step 2: Check GPU and Environment
|
||||
|
||||
```bash
|
||||
# Verify GPU is visible
|
||||
nvidia-smi
|
||||
|
||||
# Verify Apptainer is installed
|
||||
apptainer --version
|
||||
|
||||
# Check available disk space
|
||||
df -h ~
|
||||
```
|
||||
|
||||
### Step 3: Download the Model (~60 GB)
|
||||
|
||||
```bash
|
||||
# Install huggingface-cli if not available
|
||||
pip install --user huggingface_hub[cli]
|
||||
|
||||
# Download Qwen3.5-27B
|
||||
bash 02_download_model.sh   # needs vllm_qwen.sif — run 01_build_container.sh (Step 4) first
|
||||
# Default target: ~/models/Qwen3.5-35B-A3B
|
||||
```
|
||||
|
||||
This downloads the full BF16 weights. Takes 20-60 minutes depending on bandwidth.
|
||||
|
||||
### Step 4: Build the Apptainer Container
|
||||
|
||||
```bash
|
||||
bash 01_build_container.sh
|
||||
```
|
||||
|
||||
This pulls the `vllm/vllm-openai:latest` Docker image and converts it to a `.sif` file.
|
||||
Takes 10-20 minutes. The resulting `vllm_qwen.sif` is ~12-15 GB.
|
||||
|
||||
> **Tip**: If building fails due to network/proxy issues, you can pull the Docker image
|
||||
> first and convert manually:
|
||||
> ```bash
|
||||
> apptainer pull docker://vllm/vllm-openai:latest
|
||||
> ```
|
||||
|
||||
### Step 5: Start the Server
|
||||
|
||||
**Interactive (foreground):**
|
||||
```bash
|
||||
bash 03_start_server.sh
|
||||
```
|
||||
|
||||
**Background (recommended for production):**
|
||||
```bash
|
||||
bash 04_start_server_background.sh
|
||||
```
|
||||
|
||||
The server takes 2-5 minutes to load the model into GPU memory. Monitor with:
|
||||
```bash
|
||||
tail -f logs/vllm_server_*.log
|
||||
```
|
||||
|
||||
Look for the line:
|
||||
```
|
||||
INFO: Uvicorn running on http://0.0.0.0:7080
|
||||
```
|
||||
|
||||
### Step 6: Test the Server
|
||||
|
||||
```bash
|
||||
# Quick health check
|
||||
curl http://localhost:7080/v1/models
|
||||
|
||||
# Full test
|
||||
pip install openai
|
||||
python test_server.py
|
||||
```
|
||||
|
||||
### Step 7: Share with Students
|
||||
|
||||
Distribute the `STUDENT_GUIDE.md` file or share the connection details:
|
||||
- **27B Base URL**: `http://silicon.fhgr.ch:7080/v1` — model name: `qwen3.5-27b`
|
||||
- **35B Base URL**: `http://silicon.fhgr.ch:7081/v1` — model name: `qwen3.5-35b-a3b`
|
||||
|
||||
---
|
||||
|
||||
## Configuration
|
||||
|
||||
All configuration is via environment variables in `03_start_server.sh`:
|
||||
|
||||
| Variable | Default | Description |
|
||||
|-------------------|------------------------------|-------------------------------------|
|
||||
| `MODEL_DIR`       | `~/models/Qwen3.5-35B-A3B`   | Path to model weights               |
|
||||
| `PORT` | `7080` | HTTP port |
|
||||
| `MAX_MODEL_LEN` | `32768` | Max context length (tokens) |
|
||||
| `GPU_MEM_UTIL` | `0.92` | Fraction of GPU memory to use |
|
||||
| `API_KEY` | *(empty = no auth)* | API key for authentication |
|
||||
| `TENSOR_PARALLEL` | `2`                          | Number of GPUs                      |
|
||||
|
||||
### Context Length Tuning
|
||||
|
||||
The default `MAX_MODEL_LEN=32768` is conservative and ensures stable operation for 15
|
||||
concurrent users. If you have plenty of VRAM headroom:
|
||||
|
||||
```bash
|
||||
MAX_MODEL_LEN=65536 bash 03_start_server.sh
|
||||
```
|
||||
|
||||
Qwen3.5-27B natively supports up to 262,144 tokens, but longer contexts require
|
||||
significantly more GPU memory for KV cache.
|
||||
|
||||
### Adding Authentication
|
||||
|
||||
```bash
|
||||
API_KEY="your-secret-key-here" bash 03_start_server.sh
|
||||
```
|
||||
|
||||
Students then use this key in their `api_key` parameter.
|
||||
|
||||
### Multi-GPU Setup
|
||||
|
||||
If you have multiple GPUs:
|
||||
|
||||
```bash
|
||||
TENSOR_PARALLEL=2 bash 03_start_server.sh
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Server Management
|
||||
|
||||
```bash
|
||||
# Start in background
|
||||
bash 04_start_server_background.sh
|
||||
|
||||
# Check if running
|
||||
curl -s http://localhost:7080/v1/models | python -m json.tool
|
||||
|
||||
# View logs
|
||||
tail -f logs/vllm_server_*.log
|
||||
|
||||
# Stop
|
||||
bash 05_stop_server.sh
|
||||
|
||||
# Monitor GPU usage
|
||||
watch -n 2 nvidia-smi
|
||||
```
|
||||
|
||||
### Running Persistently with tmux
|
||||
|
||||
For a robust setup that survives SSH disconnects:
|
||||
|
||||
```bash
|
||||
ssh herzogfloria@silicon.fhgr.ch
|
||||
tmux new -s llm_server
|
||||
bash 03_start_server.sh
|
||||
# Press Ctrl+B, then D to detach
|
||||
|
||||
# Reconnect later:
|
||||
tmux attach -t llm_server
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Files Overview
|
||||
|
||||
| File | Purpose |
|
||||
|------------------------------|------------------------------------------- |
|
||||
| `vllm_qwen.def` | Apptainer container definition |
|
||||
| `02_download_model.sh`       | Downloads model weights from Hugging Face  |
|
||||
| `01_build_container.sh`      | Builds the Apptainer .sif image            |
|
||||
| `03_start_server.sh` | Starts vLLM server (foreground) |
|
||||
| `04_start_server_background.sh` | Starts server in background with logging|
|
||||
| `05_stop_server.sh` | Stops the background server |
|
||||
| `test_server.py` | Tests the running server |
|
||||
| `STUDENT_GUIDE.md` | Instructions for students |
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "CUDA out of memory"
|
||||
- Reduce `MAX_MODEL_LEN` (e.g., 16384)
|
||||
- Reduce `GPU_MEM_UTIL` (e.g., 0.85)
|
||||
- Use a quantized model variant
|
||||
|
||||
### Container build fails
|
||||
- Ensure you have internet access and sufficient disk space (~20 GB for build cache)
|
||||
- Try: `apptainer pull docker://vllm/vllm-openai:latest` first
|
||||
|
||||
### "No NVIDIA GPU detected"
|
||||
- Check that `nvidia-smi` works outside the container
|
||||
- Ensure `--nv` flag is passed (already in scripts)
|
||||
- Verify nvidia-container-cli: `apptainer exec --nv vllm_qwen.sif nvidia-smi`
|
||||
|
||||
### Server starts but students can't connect
|
||||
- Check firewall: `sudo ufw allow 7080:7090/tcp` or equivalent
|
||||
- Verify the server binds to `0.0.0.0` (not just localhost)
|
||||
- Students must use the server's hostname/IP, not `localhost`
|
||||
|
||||
### Slow generation with many users
|
||||
- This is expected — vLLM batches requests but throughput is finite
|
||||
- Consider reducing `max_tokens` in student requests
|
||||
- Monitor with: `curl http://localhost:7080/metrics`
|
||||
118
STUDENT_GUIDE.md
Normal file
118
STUDENT_GUIDE.md
Normal file
@ -0,0 +1,118 @@
|
||||
# Student Guide — Qwen3.5-35B-A3B Inference Server
|
||||
|
||||
## Overview
|
||||
|
||||
A **Qwen3.5-35B-A3B** language model is running on our GPU server. It's a
|
||||
Mixture-of-Experts model (35B total parameters, 3B active per token), providing
|
||||
fast and high-quality responses. You can interact with it using the
|
||||
**OpenAI-compatible API**.
|
||||
|
||||
## Connection Details
|
||||
|
||||
| Parameter | Value |
|
||||
|------------- |---------------------------------------------|
|
||||
| **Base URL** | `http://silicon.fhgr.ch:7080/v1` |
|
||||
| **Model** | `qwen3.5-35b-a3b` |
|
||||
| **API Key** | *(ask your instructor — may be `EMPTY`)* |
|
||||
|
||||
> **Note**: You must be on the university network or VPN to reach the server.
|
||||
|
||||
---
|
||||
|
||||
## Quick Start with Python
|
||||
|
||||
### 1. Install the OpenAI SDK
|
||||
|
||||
```bash
|
||||
pip install openai
|
||||
```
|
||||
|
||||
### 2. Simple Chat
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(
|
||||
base_url="http://silicon.fhgr.ch:7080/v1",
|
||||
api_key="EMPTY", # replace if your instructor set a key
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="qwen3.5-35b-a3b",
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": "Explain gradient descent in simple terms."},
|
||||
],
|
||||
max_tokens=1024,
|
||||
temperature=0.7,
|
||||
)
|
||||
|
||||
print(response.choices[0].message.content)
|
||||
```
|
||||
|
||||
### 3. Streaming Responses
|
||||
|
||||
```python
|
||||
stream = client.chat.completions.create(
|
||||
model="qwen3.5-35b-a3b",
|
||||
messages=[
|
||||
{"role": "user", "content": "Write a haiku about machine learning."},
|
||||
],
|
||||
max_tokens=256,
|
||||
stream=True,
|
||||
)
|
||||
|
||||
for chunk in stream:
|
||||
if chunk.choices[0].delta.content:
|
||||
print(chunk.choices[0].delta.content, end="", flush=True)
|
||||
print()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Quick Start with curl
|
||||
|
||||
```bash
|
||||
curl http://silicon.fhgr.ch:7080/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "qwen3.5-35b-a3b",
|
||||
"messages": [
|
||||
{"role": "user", "content": "What is the capital of Switzerland?"}
|
||||
],
|
||||
"max_tokens": 256,
|
||||
"temperature": 0.7
|
||||
}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Recommended Parameters
|
||||
|
||||
| Parameter | Recommended | Notes |
|
||||
|-----------------|-------------|----------------------------------------------|
|
||||
| `temperature` | 0.7 | Lower = more deterministic, higher = creative |
|
||||
| `max_tokens` | 1024–4096 | Increase for long-form output |
|
||||
| `top_p` | 0.95 | Nucleus sampling |
|
||||
| `stream` | `true` | Better UX for interactive use |
|
||||
|
||||
---
|
||||
|
||||
## Tips & Etiquette
|
||||
|
||||
- **Be mindful of context length**: Avoid excessively long prompts (>8K tokens) unless necessary.
|
||||
- **Use streaming**: Makes responses feel faster and reduces perceived latency.
|
||||
- **Don't spam requests**: The server is shared among ~15 students.
|
||||
- **Check the model name**: Always use `qwen3.5-35b-a3b` as the model parameter.
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
| Issue | Solution |
|
||||
|-----------------------------|-----------------------------------------------------|
|
||||
| Connection refused | Check you're on the university network / VPN |
|
||||
| Model not found | Use model name `qwen3.5-35b-a3b` exactly |
|
||||
| Slow responses | The model is shared — peak times may be slower |
|
||||
| `401 Unauthorized` | Ask your instructor for the API key |
|
||||
| Response cut off | Increase `max_tokens` in your request |
|
||||
70
test_server.py
Normal file
70
test_server.py
Normal file
@ -0,0 +1,70 @@
|
||||
"""
|
||||
Quick test script to verify the vLLM server is running and responding.
|
||||
|
||||
Usage:
|
||||
pip install openai
|
||||
python test_server.py [--host HOST] [--port PORT] [--api-key KEY]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
|
||||
def main():
    """Smoke-test a running vLLM server through its OpenAI-compatible API.

    Lists the served models, then runs one blocking and one streaming chat
    completion against the model name ``qwen3.5-35b-a3b``.

    Command-line options: ``--host`` (default ``localhost``), ``--port``
    (default 7080) and ``--api-key`` (default ``EMPTY``).

    Exits with status 1 when the model listing fails. Errors raised by the
    chat completion calls are not caught and propagate to the caller.
    """
    parser = argparse.ArgumentParser(description="Test vLLM inference server")
    parser.add_argument("--host", default="localhost", help="Server hostname")
    parser.add_argument("--port", default=7080, type=int, help="Server port")
    parser.add_argument("--api-key", default="EMPTY", help="API key")
    args = parser.parse_args()

    base_url = f"http://{args.host}:{args.port}/v1"
    model = "qwen3.5-35b-a3b"
    client = OpenAI(base_url=base_url, api_key=args.api_key)

    print(f"Connecting to {base_url} ...")

    print("\n--- Available Models ---")
    try:
        models = client.models.list()
        for m in models.data:
            print(f"  {m.id}")
    except Exception as e:
        print(f"ERROR: Cannot connect to server: {e}")
        sys.exit(1)

    print("\n--- Test Chat Completion ---")
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": "What is 2 + 2? Answer in one sentence."}
        ],
        max_tokens=256,
        temperature=0.7,
    )
    content = response.choices[0].message.content
    if content is None:
        # NOTE(review): the server is started with a reasoning parser, so the
        # token budget may be spent before any final answer is produced —
        # confirm against the vLLM reasoning-output documentation.
        print("  Response: <no content returned — try a larger max_tokens>")
    else:
        print(f"  Response: {content}")
    print(f"  Tokens: prompt={response.usage.prompt_tokens}, "
          f"completion={response.usage.completion_tokens}")

    print("\n--- Test Streaming ---")
    stream = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": "Count from 1 to 5."}
        ],
        max_tokens=128,
        temperature=0.7,
        stream=True,
    )
    print("  Response: ", end="")
    for chunk in stream:
        # A chunk may carry no choices at all; indexing [0] would raise.
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            print(piece, end="", flush=True)
    print("\n")

    print("All tests passed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
23
vllm_qwen.def
Normal file
23
vllm_qwen.def
Normal file
@ -0,0 +1,23 @@
|
||||
# Apptainer definition for the vLLM inference image used by the
# 01-05 helper scripts. Built with: bash 01_build_container.sh
Bootstrap: docker
From: vllm/vllm-openai:latest

%labels
    Author herzogfloria
    Description vLLM nightly inference server for Qwen3.5-35B-A3B
    Version 2.0

%environment
    export HF_HOME=/tmp/hf_cache
    export VLLM_USAGE_SOURCE=production

%post
    # git is required for the pip install from a git URL below.
    apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*
    pip install --no-cache-dir vllm --extra-index-url https://wheels.vllm.ai/nightly
    pip install --no-cache-dir "transformers @ git+https://github.com/huggingface/transformers.git@main"
    # Quoted so the shell never treats [cli] as a glob character class.
    pip install --no-cache-dir "huggingface_hub[cli]"

%runscript
    exec python3 -m vllm.entrypoints.openai.api_server "$@"

%help
    Apptainer container for serving Qwen3.5-35B-A3B via vLLM (nightly).
|
||||
Loading…
x
Reference in New Issue
Block a user