Compare commits
3 Commits
f4fdaab732
...
a5657c3c1f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a5657c3c1f | ||
|
|
a9ed1060cc | ||
|
|
eff76401ee |
56
10_download_model_122b.sh
Executable file
56
10_download_model_122b.sh
Executable file
@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 10_download_model_122b.sh
# Downloads Qwen3.5-122B-A10B-FP8 weights from Hugging Face
# using huggingface-cli INSIDE the Apptainer container.
#
# Prerequisites:
#   - Container built via 01_build_container.sh
#
# Usage:
#   bash 10_download_model_122b.sh [TARGET_DIR]
# ------------------------------------------------------------------
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"

MODEL_ID="Qwen/Qwen3.5-122B-A10B-FP8"
TARGET_DIR="${1:-$HOME/models/Qwen3.5-122B-A10B-FP8}"
HF_CACHE_DIR="${HOME}/.cache/huggingface"

if [ ! -f "$SIF_FILE" ]; then
    echo "ERROR: Container image not found at ${SIF_FILE}"
    echo "       Run 01_build_container.sh first."
    exit 1
fi

echo "=== Downloading ${MODEL_ID} to ${TARGET_DIR} ==="
echo "    Using huggingface-cli inside the container."
echo "    This is ~125 GB and may take a while."
echo ""

mkdir -p "$TARGET_DIR" "$HF_CACHE_DIR"

# Pass MODEL_ID/TARGET_DIR via environment variables instead of
# interpolating them into the Python source: a path containing a
# quote character would otherwise break (or inject into) the
# generated Python code.
apptainer exec \
    --writable-tmpfs \
    --bind "$(dirname "$TARGET_DIR"):$(dirname "$TARGET_DIR")" \
    --bind "${HF_CACHE_DIR}:${HF_CACHE_DIR}" \
    --env HF_HOME="${HF_CACHE_DIR}" \
    --env HF_HUB_CACHE="${HF_CACHE_DIR}/hub" \
    --env XDG_CACHE_HOME="${HOME}/.cache" \
    --env DL_MODEL_ID="${MODEL_ID}" \
    --env DL_TARGET_DIR="${TARGET_DIR}" \
    "$SIF_FILE" \
    python3 -c "
import os
from huggingface_hub import snapshot_download
snapshot_download(
    os.environ['DL_MODEL_ID'],
    local_dir=os.environ['DL_TARGET_DIR'],
    # NOTE: deprecated (ignored by recent huggingface_hub); kept for
    # compatibility with older hub versions inside the container.
    local_dir_use_symlinks=False,
)
"

echo ""
echo "=== Download complete ==="
echo "Model stored at: ${TARGET_DIR}"
echo "Total size:"
du -sh "$TARGET_DIR"
|
||||
88
11_start_server_122b.sh
Executable file
88
11_start_server_122b.sh
Executable file
@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 11_start_server_122b.sh
# Launches the vLLM inference server for Qwen3.5-122B-A10B-FP8
# inside the Apptainer container using all 4 GPUs.
#
# NOTE: Only one model can run on port 7080 at a time.
#       Stop the 35B model first: bash 05_stop_server.sh
#
# Usage:
#   bash 11_start_server_122b.sh
#
# Environment variables (override defaults):
#   MODEL_DIR       - Path to model weights (default: ~/models/Qwen3.5-122B-A10B-FP8)
#   PORT            - Server port (default: 7080)
#   MAX_MODEL_LEN   - Maximum context length (default: 32768)
#   GPU_MEM_UTIL    - GPU memory utilization fraction (default: 0.92)
#   API_KEY         - API key for authentication (default: none)
#   TENSOR_PARALLEL - Number of GPUs for tensor parallelism (default: 4)
# ------------------------------------------------------------------
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
SIF_FILE="${SCRIPT_DIR}/vllm_qwen.sif"

MODEL_DIR="${MODEL_DIR:-$HOME/models/Qwen3.5-122B-A10B-FP8}"
PORT="${PORT:-7080}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-32768}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.92}"
API_KEY="${API_KEY:-}"
TENSOR_PARALLEL="${TENSOR_PARALLEL:-4}"

if [ ! -f "$SIF_FILE" ]; then
    echo "ERROR: Container image not found at ${SIF_FILE}"
    echo "       Run 01_build_container.sh first."
    exit 1
fi

if [ ! -d "$MODEL_DIR" ]; then
    echo "ERROR: Model directory not found at ${MODEL_DIR}"
    echo "       Run 10_download_model_122b.sh first."
    exit 1
fi

# The model's parent directory is bind-mounted as /models inside the
# container, so vLLM sees the weights at /models/<basename>.
MODEL_PARENT="$(dirname "$MODEL_DIR")"
MODEL_NAME="$(basename "$MODEL_DIR")"

VLLM_ARGS=(
    --model "/models/${MODEL_NAME}"
    --port "$PORT"
    --host 0.0.0.0
    --tensor-parallel-size "$TENSOR_PARALLEL"
    --max-model-len "$MAX_MODEL_LEN"
    --gpu-memory-utilization "$GPU_MEM_UTIL"
    --dtype auto
    --trust-remote-code
    --reasoning-parser qwen3
    --served-model-name "qwen3.5-122b-a10b-fp8"
    --max-num-seqs 16
    --enable-prefix-caching
)

# SECURITY FIX: never echo the raw API key — it would end up in
# terminal scrollback and in the background-runner's log file.
if [ -n "$API_KEY" ]; then
    VLLM_ARGS+=(--api-key "$API_KEY")
    API_KEY_STATUS="<set>"
else
    API_KEY_STATUS="<none>"
fi

echo "=============================================="
echo " vLLM Inference Server — Qwen3.5-122B-A10B-FP8"
echo "=============================================="
echo " Model:       ${MODEL_DIR}"
echo " Container:   ${SIF_FILE}"
echo " Port:        ${PORT}"
echo " Context len: ${MAX_MODEL_LEN}"
echo " GPU util:    ${GPU_MEM_UTIL}"
echo " TP size:     ${TENSOR_PARALLEL}"
echo " dtype:       auto (FP8)"
echo " API key:     ${API_KEY_STATUS}"
echo "=============================================="
echo ""
echo "Starting server... (Ctrl+C to stop)"
echo "API will be available at: http://$(hostname):${PORT}/v1"
echo ""

apptainer exec --nv \
    --writable-tmpfs \
    --bind "${MODEL_PARENT}:/models" \
    "$SIF_FILE" \
    python3 -m vllm.entrypoints.openai.api_server "${VLLM_ARGS[@]}"
|
||||
55
12_start_server_122b_background.sh
Executable file
55
12_start_server_122b_background.sh
Executable file
@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env bash
# ------------------------------------------------------------------
# 12_start_server_122b_background.sh
# Launches the vLLM server for Qwen3.5-122B-A10B-FP8 in background.
#
# NOTE: Only one model can run on port 7080 at a time.
#       Stop the current model first: bash 05_stop_server.sh
#
# Usage:
#   bash 12_start_server_122b_background.sh
#
# Logs are written to: ./logs/vllm_server_122b_<timestamp>.log
# PID is written to:   ./logs/vllm_server.pid
# ------------------------------------------------------------------
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
LOG_DIR="${SCRIPT_DIR}/logs"
mkdir -p "$LOG_DIR"

TIMESTAMP="$(date +%Y%m%d_%H%M%S)"
LOG_FILE="${LOG_DIR}/vllm_server_122b_${TIMESTAMP}.log"
PID_FILE="${LOG_DIR}/vllm_server.pid"

if [ -f "$PID_FILE" ]; then
    OLD_PID=$(cat "$PID_FILE")
    if kill -0 "$OLD_PID" 2>/dev/null; then
        echo "A vLLM server is already running with PID ${OLD_PID}"
        echo "Stop it first: bash 05_stop_server.sh"
        exit 1
    fi
    # FIX: the recorded process is gone — remove the stale PID file
    # instead of leaving it around for the next check to trip over.
    rm -f "$PID_FILE"
fi

echo "Starting vLLM server (122B) in background..."
echo "Log file: ${LOG_FILE}"

nohup bash "${SCRIPT_DIR}/11_start_server_122b.sh" > "$LOG_FILE" 2>&1 &
SERVER_PID=$!
echo "$SERVER_PID" > "$PID_FILE"

echo "Server PID: ${SERVER_PID}"
echo ""
echo "Monitor logs: tail -f ${LOG_FILE}"
echo "Stop server:  bash 05_stop_server.sh"
echo ""

# Give the process a moment to fail fast (bad args, missing files).
sleep 3
if kill -0 "$SERVER_PID" 2>/dev/null; then
    echo "Server process is running. Waiting for model to load..."
    echo "(This can take 5-10 minutes for Qwen3.5-122B-A10B-FP8)"
else
    echo "ERROR: Server process exited. Check logs:"
    tail -20 "$LOG_FILE"
    # FIX: don't leave a PID file pointing at a dead process.
    rm -f "$PID_FILE"
    exit 1
fi
|
||||
124
README.md
124
README.md
@ -1,10 +1,16 @@
|
||||
# LLM Inferenz Server — Qwen3.5-35B-A3B
|
||||
# LLM Inferenz Server — Qwen3.5
|
||||
|
||||
Self-hosted LLM inference for ~15 concurrent students using **Qwen3.5-35B-A3B**
|
||||
(MoE, 35B total / 3B active per token), served via **vLLM** inside an
|
||||
**Apptainer** container on a GPU server. Two front-ends are provided:
|
||||
**Open WebUI** (server-hosted ChatGPT-like UI) and a **Streamlit app**
|
||||
(local chat + file editor with code execution).
|
||||
Self-hosted LLM inference for ~15 concurrent students, served via **vLLM**
|
||||
inside an **Apptainer** container on a GPU server. Two models are available
|
||||
(one at a time):
|
||||
|
||||
| Model | Params | Active | Weights | GPUs |
|
||||
|-------|--------|--------|---------|------|
|
||||
| **Qwen3.5-35B-A3B** | 35B MoE | 3B | ~67 GB BF16 | 2× L40S (TP=2) |
|
||||
| **Qwen3.5-122B-A10B-FP8** | 122B MoE | 10B | ~125 GB FP8 | 4× L40S (TP=4) |
|
||||
|
||||
Two front-ends are provided: **Open WebUI** (server-hosted ChatGPT-like UI)
|
||||
and a **Streamlit app** (local chat + file editor with code execution).
|
||||
|
||||
## Architecture
|
||||
|
||||
@ -25,33 +31,33 @@ Students
|
||||
│ vLLM Server (nightly) │
|
||||
│ Apptainer container (.sif) │
|
||||
├──────────────────────────────┤
|
||||
│ Qwen3.5-35B-A3B weights │
|
||||
│ Model weights │
|
||||
│ (bind-mounted from host) │
|
||||
├──────────────────────────────┤
|
||||
│ 2× NVIDIA L40S (46 GB ea.) │
|
||||
│ Tensor Parallel = 2 │
|
||||
│ 4× NVIDIA L40S (46 GB ea.) │
|
||||
│ 184 GB total VRAM │
|
||||
└──────────────────────────────┘
|
||||
```
|
||||
|
||||
## Hardware
|
||||
|
||||
The server `silicon.fhgr.ch` has **4× NVIDIA L40S** GPUs (46 GB VRAM each).
|
||||
The inference server uses **2 GPUs** with tensor parallelism, leaving 2 GPUs free.
|
||||
The server `silicon.fhgr.ch` has **4× NVIDIA L40S** GPUs (46 GB VRAM each,
|
||||
184 GB total). Only one model runs at a time on port 7080.
|
||||
|
||||
| Component | Value |
|
||||
|-----------|-------|
|
||||
| GPUs used | 2× NVIDIA L40S |
|
||||
| VRAM used | ~92 GB total |
|
||||
| Model size (BF16) | ~67 GB |
|
||||
| Active params/token | 3B (MoE) |
|
||||
| Context length | 32,768 tokens |
|
||||
| Port | 7080 |
|
||||
| | Qwen3.5-35B-A3B | Qwen3.5-122B-A10B-FP8 |
|
||||
|---|---|---|
|
||||
| GPUs used | 2× L40S (TP=2) | 4× L40S (TP=4) |
|
||||
| VRAM used | ~92 GB | ~184 GB |
|
||||
| Weight size | ~67 GB (BF16) | ~125 GB (FP8) |
|
||||
| Active params/token | 3B (MoE) | 10B (MoE) |
|
||||
| Context length | 32,768 tokens | 32,768 tokens |
|
||||
| Port | 7080 | 7080 |
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- **Apptainer** (formerly Singularity) installed on the server
|
||||
- **NVIDIA drivers** with GPU passthrough support (`--nv` flag)
|
||||
- **~80 GB disk** for model weights + ~8 GB for the container image
|
||||
- **~200 GB disk** for model weights (both models) + ~8 GB for the container image
|
||||
- **Network access** to Hugging Face (for model download) and Docker Hub (for container build)
|
||||
|
||||
> **Note**: No `pip` or `python` is needed on the host — everything runs inside
|
||||
@ -97,32 +103,45 @@ Pulls the `vllm/vllm-openai:nightly` Docker image (required for Qwen3.5
|
||||
support), installs latest `transformers` from source, and packages everything
|
||||
into `vllm_qwen.sif` (~8 GB). Takes 15-20 minutes.
|
||||
|
||||
### Step 4: Download the Model (~67 GB)
|
||||
### Step 4: Download Model Weights
|
||||
|
||||
**35B model (~67 GB):**
|
||||
```bash
|
||||
bash 02_download_model.sh
|
||||
```
|
||||
|
||||
Downloads Qwen3.5-35B-A3B weights using `huggingface-cli` **inside the
|
||||
container**. Stored at `~/models/Qwen3.5-35B-A3B`. Takes 5-30 minutes
|
||||
depending on bandwidth.
|
||||
**122B model (~125 GB):**
|
||||
```bash
|
||||
bash 10_download_model_122b.sh
|
||||
```
|
||||
|
||||
Both use `huggingface-cli` **inside the container**. Stored at
|
||||
`~/models/Qwen3.5-35B-A3B` and `~/models/Qwen3.5-122B-A10B-FP8` respectively.
|
||||
|
||||
### Step 5: Start the Server
|
||||
|
||||
**Interactive (foreground) — recommended with tmux:**
|
||||
Only one model can run at a time on port 7080. Choose one:
|
||||
|
||||
**35B model (2 GPUs, faster per-token, smaller):**
|
||||
```bash
|
||||
tmux new -s llm
|
||||
bash 03_start_server.sh
|
||||
# Ctrl+B, then D to detach
|
||||
bash 03_start_server.sh # foreground
|
||||
bash 04_start_server_background.sh # background
|
||||
```
|
||||
|
||||
**Background with logging:**
|
||||
**122B model (4 GPUs, more capable, FP8):**
|
||||
```bash
|
||||
bash 04_start_server_background.sh
|
||||
tail -f logs/vllm_server_*.log
|
||||
bash 11_start_server_122b.sh # foreground
|
||||
bash 12_start_server_122b_background.sh # background
|
||||
```
|
||||
|
||||
The model takes 2-5 minutes to load into GPU memory. It's ready when you see:
|
||||
**To switch models:**
|
||||
```bash
|
||||
bash 05_stop_server.sh # stop whichever is running
|
||||
bash 11_start_server_122b.sh # start the other one
|
||||
```
|
||||
|
||||
The model takes 2-5 minutes (35B) or 5-10 minutes (122B) to load. It's ready
|
||||
when you see:
|
||||
```
|
||||
INFO: Uvicorn running on http://0.0.0.0:7080
|
||||
```
|
||||
@ -175,7 +194,7 @@ Access it at `http://silicon.fhgr.ch:7081`.
|
||||
Distribute `STUDENT_GUIDE.md` with connection details:
|
||||
- **Open WebUI**: `http://silicon.fhgr.ch:7081` (recommended for most students)
|
||||
- **API Base URL**: `http://silicon.fhgr.ch:7080/v1` (for SDK / programmatic use)
|
||||
- **Model name**: `qwen3.5-35b-a3b`
|
||||
- **Model name**: `qwen3.5-35b-a3b` or `qwen3.5-122b-a10b-fp8` (depending on which is running)
|
||||
|
||||
---
|
||||
|
||||
@ -271,28 +290,28 @@ Opens at `http://localhost:8501` with two tabs:
|
||||
|
||||
## Server Configuration
|
||||
|
||||
All configuration is via environment variables passed to `03_start_server.sh`:
|
||||
Both start scripts accept the same environment variables:
|
||||
|
||||
| Variable | Default | Description |
|
||||
|-------------------|----------------------------------|--------------------------------|
|
||||
| `MODEL_DIR` | `~/models/Qwen3.5-35B-A3B` | Path to model weights |
|
||||
| `PORT` | `7080` | HTTP port |
|
||||
| `MAX_MODEL_LEN` | `32768` | Max context length (tokens) |
|
||||
| `GPU_MEM_UTIL` | `0.92` | Fraction of GPU memory to use |
|
||||
| `API_KEY` | *(empty = no auth)* | API key for authentication |
|
||||
| `TENSOR_PARALLEL` | `2` | Number of GPUs |
|
||||
| Variable | 35B default | 122B default | Description |
|
||||
|----------|-------------|--------------|-------------|
|
||||
| `MODEL_DIR` | `~/models/Qwen3.5-35B-A3B` | `~/models/Qwen3.5-122B-A10B-FP8` | Model weights path |
|
||||
| `PORT` | `7080` | `7080` | HTTP port |
|
||||
| `MAX_MODEL_LEN` | `32768` | `32768` | Max context length |
|
||||
| `GPU_MEM_UTIL` | `0.92` | `0.92` | GPU memory fraction |
|
||||
| `API_KEY` | *(none)* | *(none)* | API key for auth |
|
||||
| `TENSOR_PARALLEL` | `2` | `4` | Number of GPUs |
|
||||
|
||||
### Examples
|
||||
|
||||
```bash
|
||||
# Increase context length
|
||||
# Increase context length (35B)
|
||||
MAX_MODEL_LEN=65536 bash 03_start_server.sh
|
||||
|
||||
# Add API key authentication
|
||||
API_KEY="your-secret-key" bash 03_start_server.sh
|
||||
# Increase context length (122B — has room with FP8)
|
||||
MAX_MODEL_LEN=65536 bash 11_start_server_122b.sh
|
||||
|
||||
# Use all 4 GPUs (more KV cache headroom)
|
||||
TENSOR_PARALLEL=4 bash 03_start_server.sh
|
||||
# Add API key authentication (works for either model)
|
||||
API_KEY="your-secret-key" bash 11_start_server_122b.sh
|
||||
```
|
||||
|
||||
---
|
||||
@ -327,14 +346,17 @@ tmux attach -t llm
|
||||
|------------------------------------|------------------------------------------------------|
|
||||
| `vllm_qwen.def` | Apptainer container definition (vLLM nightly + deps) |
|
||||
| `01_build_container.sh` | Builds the Apptainer `.sif` image |
|
||||
| `02_download_model.sh` | Downloads model weights (runs inside container) |
|
||||
| `03_start_server.sh` | Starts vLLM server (foreground) |
|
||||
| `04_start_server_background.sh` | Starts vLLM server in background with logging |
|
||||
| `05_stop_server.sh` | Stops the background vLLM server |
|
||||
| `02_download_model.sh` | Downloads 35B model weights |
|
||||
| `03_start_server.sh` | Starts 35B vLLM server (foreground, TP=2) |
|
||||
| `04_start_server_background.sh` | Starts 35B server in background with logging |
|
||||
| `05_stop_server.sh` | Stops whichever background vLLM server is running |
|
||||
| `06_setup_openwebui.sh` | Pulls the Open WebUI container image |
|
||||
| `07_start_openwebui.sh` | Starts Open WebUI (foreground) |
|
||||
| `08_start_openwebui_background.sh` | Starts Open WebUI in background with logging |
|
||||
| `09_stop_openwebui.sh` | Stops the background Open WebUI |
|
||||
| `10_download_model_122b.sh` | Downloads 122B FP8 model weights |
|
||||
| `11_start_server_122b.sh` | Starts 122B vLLM server (foreground, TP=4) |
|
||||
| `12_start_server_122b_background.sh` | Starts 122B server in background with logging |
|
||||
| `app.py` | Streamlit chat & file editor web app |
|
||||
| `requirements.txt` | Python dependencies for the Streamlit app |
|
||||
| `test_server.py` | Tests the running server via CLI |
|
||||
|
||||
@ -1,10 +1,15 @@
|
||||
# Student Guide — Qwen3.5-35B-A3B Inference Server
|
||||
# Student Guide — Qwen3.5 Inference Server
|
||||
|
||||
## Overview
|
||||
|
||||
A **Qwen3.5-35B-A3B** language model is running on our GPU server. It's a
|
||||
Mixture-of-Experts model (35B total parameters, 3B active per token), providing
|
||||
fast and high-quality responses.
|
||||
A **Qwen3.5** large language model is running on our GPU server. Two models
|
||||
may be available at different times (your instructor will let you know which
|
||||
one is active):
|
||||
|
||||
| Model | Params | Best for |
|
||||
|-------|--------|----------|
|
||||
| `qwen3.5-35b-a3b` | 35B (3B active) | Fast responses, everyday tasks |
|
||||
| `qwen3.5-122b-a10b-fp8` | 122B (10B active) | Complex reasoning, coding, research |
|
||||
|
||||
There are **three ways** to interact with the model:
|
||||
|
||||
@ -12,7 +17,7 @@ There are **three ways** to interact with the model:
|
||||
2. **Streamlit App** — Local app with chat, file editor, and code execution
|
||||
3. **Python SDK / curl** — Programmatic access via the OpenAI-compatible API
|
||||
|
||||
> **Note**: You must be on the university network or VPN to reach the server.
|
||||
> **Note**: You must be on the FHGR network or VPN to reach the server.
|
||||
|
||||
## Connection Details
|
||||
|
||||
@ -20,9 +25,13 @@ There are **three ways** to interact with the model:
|
||||
|------------------|---------------------------------------------|
|
||||
| **Open WebUI** | `http://silicon.fhgr.ch:7081` |
|
||||
| **API Base URL** | `http://silicon.fhgr.ch:7080/v1` |
|
||||
| **Model** | `qwen3.5-35b-a3b` |
|
||||
| **Model** | *(check Open WebUI model selector or ask your instructor)* |
|
||||
| **API Key** | *(ask your instructor — may be `EMPTY`)* |
|
||||
|
||||
> **Tip**: In Open WebUI, the model dropdown at the top automatically shows
|
||||
> whichever model is currently running. For the API, use
|
||||
> `curl http://silicon.fhgr.ch:7080/v1/models` to check.
|
||||
|
||||
---
|
||||
|
||||
## Option 1: Open WebUI (Recommended)
|
||||
|
||||
48
app.py
48
app.py
@ -1,5 +1,5 @@
|
||||
"""
|
||||
Streamlit Chat & File Editor for Qwen3.5-35B-A3B
|
||||
Streamlit Chat & File Editor for Qwen3.5
|
||||
|
||||
A minimal interface to:
|
||||
1. Chat with the local LLM (OpenAI-compatible API)
|
||||
@ -22,12 +22,29 @@ from pathlib import Path
|
||||
st.sidebar.header("Connection")
|
||||
API_BASE = st.sidebar.text_input("API Base URL", "http://silicon.fhgr.ch:7080/v1")
|
||||
API_KEY = st.sidebar.text_input("API Key", "EMPTY", type="password")
|
||||
MODEL = "qwen3.5-35b-a3b"
|
||||
WORKSPACE = Path("workspace")
|
||||
WORKSPACE.mkdir(exist_ok=True)
|
||||
|
||||
client = OpenAI(base_url=API_BASE, api_key=API_KEY)
|
||||
|
||||
|
||||
@st.cache_data(ttl=30)
def fetch_models(base_url: str, api_key: str) -> list[str]:
    """Return the model IDs currently advertised by the vLLM server.

    Results are cached for 30 seconds per (base_url, api_key) pair.
    Any connection or API failure yields an empty list so the caller
    can fall back to manual model entry.
    """
    try:
        api = OpenAI(base_url=base_url, api_key=api_key)
        ids: list[str] = []
        for entry in api.models.list().data:
            ids.append(entry.id)
        return ids
    except Exception:
        return []
|
||||
|
||||
|
||||
available_models = fetch_models(API_BASE, API_KEY)
|
||||
if available_models:
|
||||
MODEL = st.sidebar.selectbox("Model", available_models)
|
||||
else:
|
||||
MODEL = st.sidebar.text_input("Model (server unreachable)", "qwen3.5-35b-a3b")
|
||||
st.sidebar.warning("Could not fetch models from server.")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Sidebar — LLM Parameters
|
||||
# ---------------------------------------------------------------------------
|
||||
@ -56,14 +73,27 @@ MAX_CONTEXT = 32768
|
||||
|
||||
|
||||
def extract_code(text: str, lang: str = "") -> str:
    """Extract the best fenced code block from markdown text.

    Strategy:
    1. Prefer blocks tagged with the target language (e.g. ```python)
    2. Among those candidates, pick the longest block
    3. Otherwise fall back to the longest block of any language
    4. Fall back to the full text if no fenced block is found
    """
    # Capture the fence's language tag (may be empty) and the body.
    tagged_pattern = r"```(\w*)\n(.*?)```"
    matches = re.findall(tagged_pattern, text, re.DOTALL)
    if not matches:
        return text.strip()

    # Case-insensitive tag comparison ("Python" == "python").
    lang_lower = lang.lower()
    lang_matches = [code for tag, code in matches if tag.lower() == lang_lower]
    if lang_matches:
        return max(lang_matches, key=len).strip()

    all_blocks = [code for _, code in matches]
    return max(all_blocks, key=len).strip()
|
||||
|
||||
|
||||
def estimate_tokens(messages: list[dict]) -> int:
|
||||
"""Rough token estimate: ~4 characters per token."""
|
||||
@ -140,7 +170,7 @@ tab_chat, tab_editor = st.tabs(["Chat", "File Editor"])
|
||||
# Tab 1: Chat
|
||||
# ---------------------------------------------------------------------------
|
||||
with tab_chat:
|
||||
st.header("Chat with Qwen3.5")
|
||||
st.header(f"Chat with {MODEL}")
|
||||
|
||||
if "messages" not in st.session_state:
|
||||
st.session_state.messages = []
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user