216 lines
6.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
server_utils.py — Shared utilities for AISE501 Prompting Exercises
======================================================================
Connects to the vLLM inference server at silicon.fhgr.ch via the
OpenAI-compatible API.
This file is complete — no TODOs here.
"""
from openai import OpenAI
# ── Server configuration ──────────────────────────────────────────────────────
# Connection settings for the vLLM inference server. get_client() combines
# HOST and PORT into the base URL "http://{HOST}:{PORT}/v1" and hands API_KEY
# to the OpenAI client.
# Hostname of the inference server.
HOST = "silicon.fhgr.ch"
# TCP port used in the base URL.
PORT = 7080
# Placeholder key passed to the client; presumably the server does not
# validate API keys — TODO confirm.
API_KEY = "EMPTY"
# Default model for chat() and chat_json().
MODEL = "qwen3.5-35b-a3b" # model ID served on silicon.fhgr.ch
def get_client() -> OpenAI:
    """
    Build an OpenAI-compatible client for the configured inference server.

    The base URL is assembled from the module-level HOST and PORT constants
    and the module-level API_KEY is passed as the client's API key.

    Returns
    -------
    An ``OpenAI`` client whose requests go to ``http://HOST:PORT/v1``.
    """
    return OpenAI(api_key=API_KEY, base_url=f"http://{HOST}:{PORT}/v1")
def list_models(client: OpenAI) -> list[str]:
    """
    List the IDs of the models the server reports.

    Parameters
    ----------
    client : OpenAI client returned by get_client()

    Returns
    -------
    The ``id`` of every entry in ``client.models.list().data``, in the
    order they are listed.
    """
    listing = client.models.list()
    return [entry.id for entry in listing.data]
def chat(
    client: OpenAI,
    messages: list[dict],
    model: str = MODEL,
    temperature: float = 0.2,
    max_tokens: int = 2048,
) -> str:
    """
    Run one chat completion and return the text of the first choice.

    The request carries ``chat_template_kwargs={"enable_thinking": False}``
    in ``extra_body`` to switch off Qwen3's chain-of-thought "think" mode,
    so replies are direct rather than wrapped in <think>…</think> blocks.

    Parameters
    ----------
    client : OpenAI client returned by get_client()
    messages : List of {"role": ..., "content": ...} dicts
    model : Model ID (default: module-level MODEL constant)
    temperature : Sampling temperature (0 = deterministic, 1 = creative)
    max_tokens : Maximum number of tokens in the response

    Returns
    -------
    The ``message.content`` of the first choice in the response.
    """
    request = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        # Server-specific option that is not a named argument of create().
        "extra_body": {"chat_template_kwargs": {"enable_thinking": False}},
    }
    completion = client.chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content
def chat_json(
    client: OpenAI,
    messages: list[dict],
    model: str = MODEL,
    temperature: float = 0.2,
    max_tokens: int = 2048,
) -> str:
    """
    Run one chat completion in JSON-object mode and return the reply text.

    Identical to a plain chat request except that
    ``response_format={"type": "json_object"}`` is sent, asking the server
    to emit a JSON object. Qwen3's "think" mode is disabled through
    ``extra_body`` just as for ordinary chat requests. Intended for the
    exercises that need structured JSON output (Exercises 3 and 4).

    Parameters
    ----------
    client : OpenAI client returned by get_client()
    messages : List of {"role": ..., "content": ...} dicts
    model : Model ID (default: module-level MODEL constant)
    temperature : Sampling temperature; the low default of 0.2 favours
        stable output for structured data
    max_tokens : Maximum number of tokens in the response

    Returns
    -------
    The ``message.content`` of the first choice in the response.
    """
    json_mode = {"type": "json_object"}
    no_thinking = {"chat_template_kwargs": {"enable_thinking": False}}
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        response_format=json_mode,
        extra_body=no_thinking,
    )
    reply = completion.choices[0].message
    return reply.content
def _repair_json_strings(text: str) -> str:
    """
    Escape raw control characters that appear inside JSON string values.

    JSON does not allow unescaped characters below U+0020 inside strings,
    yet LLMs frequently emit literal newlines or tabs in long string values.
    Every such character found inside a string is replaced by its escape
    sequence: the short form for backspace, tab, newline, form feed and
    carriage return, and a six-character unicode escape for any other
    control character. Structural whitespace outside strings and existing
    backslash escapes are copied through unchanged.

    Parameters
    ----------
    text : JSON-like text, possibly containing raw control characters

    Returns
    -------
    The text with control characters inside string values escaped.
    """
    short_escapes = {
        '\b': '\\b',
        '\t': '\\t',
        '\n': '\\n',
        '\f': '\\f',
        '\r': '\\r',
    }
    result: list[str] = []
    in_string = False
    escape = False
    for ch in text:
        if escape:
            # Character right after a backslash belongs to an existing escape.
            result.append(ch)
            escape = False
        elif ch == '\\' and in_string:
            result.append(ch)
            escape = True
        elif ch == '"':
            in_string = not in_string
            result.append(ch)
        elif in_string and ch < ' ':
            # Any character below U+0020 is illegal inside a JSON string.
            result.append(short_escapes.get(ch) or f'\\u{ord(ch):04x}')
        else:
            result.append(ch)
    return ''.join(result)


def extract_json(text: str) -> str:
    """
    Extract and repair a JSON object or array from an LLM response that may
    contain extra prose, markdown code fences, or unescaped control characters.

    Strategy:
    1. Strip markdown ```json ... ``` or ``` ... ``` fences, keeping their
       content.
    2. Find whichever of '{' or '[' occurs first and extract up to its
       matching closing bracket, ignoring brackets inside string values.
       This step also runs on unfenced content, so prose surrounding a
       fenced block is dropped.
    3. Repair unescaped control characters inside string values.

    Parameters
    ----------
    text : Raw LLM response

    Returns
    -------
    The cleaned JSON string. If no balanced object or array is found, the
    (unfenced) text is returned with only the string repair applied, so
    that json.loads can raise a meaningful error with context.
    """
    import re

    # 1. Strip markdown fences
    stripped = text.strip()
    unfenced = re.sub(r"```(?:json)?\s*([\s\S]*?)\s*```", r"\1", stripped)
    candidate = unfenced.strip() if unfenced != stripped else text

    # 2. Find the first JSON container and extract to its matching close.
    #    The opener is the earliest '{' or '[', so a top-level array of
    #    objects is not mistaken for its first element.
    closers = {'{': '}', '[': ']'}
    start = next((i for i, ch in enumerate(candidate) if ch in closers), -1)
    extracted = candidate
    if start != -1:
        opener = candidate[start]
        closer = closers[opener]
        depth = 0
        in_string = False
        escape = False
        for i in range(start, len(candidate)):
            ch = candidate[i]
            if escape:
                escape = False
            elif ch == '\\' and in_string:
                escape = True
            elif ch == '"':
                in_string = not in_string
            elif in_string:
                continue
            elif ch == opener:
                depth += 1
            elif ch == closer:
                depth -= 1
                if depth == 0:
                    extracted = candidate[start:i + 1]
                    break

    # 3. Repair unescaped control characters inside string values
    return _repair_json_strings(extracted)
def strip_code_fences(text: str) -> str:
    """Strip a wrapping markdown code fence from LLM-generated code.

    LLMs often wrap code in fences (```python ... ```) even when told not
    to. After trimming surrounding whitespace, an opening fence at the very
    start (with an optional language tag) and a closing fence at the very
    end are removed, and the result is trimmed again. Call this before
    writing LLM-generated code to a .py file so it is directly executable.
    """
    import re

    fence_patterns = (
        r"^```\w*\n?",   # opening fence, optional language tag
        r"\n?```\s*$",   # closing fence, optional trailing whitespace
    )
    cleaned = text.strip()
    for pattern in fence_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()
def print_messages(messages: list[dict]) -> None:
    """Print the full messages list before sending it to the LLM.

    Call this before chat() or chat_json() to inspect the exact prompt
    hierarchy (system + user + assistant turns) that the model receives.
    This is the primary debugging and learning tool for prompt engineering.

    The output is a banner framed by two 64-character rules, then one
    section per message (a header line carrying the upper-cased role,
    padded with a rule to 64 characters, followed by the message content),
    and a closing rule.

    Parameters
    ----------
    messages : List of {"role": ..., "content": ...} dicts

    Raises
    ------
    KeyError : if a message lacks the "role" or "content" key
    """
    width = 64
    # The rules need a visible character: an empty string repeated `width`
    # times prints nothing.
    banner_rule = "═" * width
    print("\n" + banner_rule)
    print(" PROMPT SENT TO LLM")
    print(banner_rule)
    for msg in messages:
        role = msg["role"].upper()
        # The fixed parts of the header ("── [", "] ") take 6 characters;
        # the rest of the line is filled so the header is `width` wide.
        print(f"\n── [{role}] " + "─" * max(0, width - len(role) - 6))
        print(msg["content"])
    print("\n" + banner_rule)
def print_separator(title: str = "") -> None:
    """Print a visual separator with an optional title.

    Prints a blank line, a 64-character horizontal rule, the title on its
    own line (only when non-empty), and a second rule.

    Parameters
    ----------
    title : Optional text shown between the two rules
    """
    width = 64
    # The rule needs a visible character: an empty string repeated `width`
    # times prints nothing.
    rule = "─" * width
    print("\n" + rule)
    if title:
        print(f" {title}")
    print(rule)