""" server_utils.py – Shared utilities for AISE501 Prompting Exercises ====================================================================== Connects to the vLLM inference server at silicon.fhgr.ch via the OpenAI-compatible API. This file is complete — no TODOs here. """ from openai import OpenAI # ── Server configuration ────────────────────────────────────────────────────── HOST = "silicon.fhgr.ch" PORT = 7080 API_KEY = "EMPTY" MODEL = "qwen3.5-35b-a3b" # model ID served on silicon.fhgr.ch def get_client() -> OpenAI: """Return an OpenAI-compatible client pointing at the vLLM server.""" base_url = f"http://{HOST}:{PORT}/v1" return OpenAI(base_url=base_url, api_key=API_KEY) def list_models(client: OpenAI) -> list[str]: """Return all model IDs available on the server.""" return [m.id for m in client.models.list().data] def chat( client: OpenAI, messages: list[dict], model: str = MODEL, temperature: float = 0.2, max_tokens: int = 2048, ) -> str: """ Send a list of chat messages to the LLM and return the response text. Qwen3's built-in chain-of-thought "think" mode is disabled via ``extra_body`` so that replies are direct and not wrapped in blocks. Parameters ---------- client : OpenAI client returned by get_client() messages : List of {"role": ..., "content": ...} dicts model : Model ID (default: module-level MODEL constant) temperature : Sampling temperature (0 = deterministic, 1 = creative) max_tokens : Maximum number of tokens in the response """ response = client.chat.completions.create( model=model, messages=messages, max_tokens=max_tokens, temperature=temperature, extra_body={"chat_template_kwargs": {"enable_thinking": False}}, ) return response.choices[0].message.content def chat_json( client: OpenAI, messages: list[dict], model: str = MODEL, temperature: float = 0.2, max_tokens: int = 2048, ) -> str: """ Like chat(), but forces the model to emit syntactically valid JSON via response_format={"type": "json_object"}. The server constrains token sampling so the output is always parseable by json.loads() — no post-processing needed. Use this whenever you need structured JSON output (Exercises 3 and 4). Parameters are the same as chat(); temperature defaults to 0.2 because deterministic output is usually preferable for structured data. """ response = client.chat.completions.create( model=model, messages=messages, max_tokens=max_tokens, temperature=temperature, response_format={"type": "json_object"}, extra_body={"chat_template_kwargs": {"enable_thinking": False}}, ) return response.choices[0].message.content def _repair_json_strings(text: str) -> str: """ Replace unescaped control characters (newline, tab, carriage return) inside JSON string values with their proper escape sequences. LLMs frequently emit literal newlines inside long string values, which is invalid JSON. This function fixes that without touching structural whitespace outside strings. """ result: list[str] = [] in_string = False escape = False _escapes = {'\n': '\\n', '\r': '\\r', '\t': '\\t'} for ch in text: if escape: result.append(ch) escape = False continue if ch == '\\' and in_string: result.append(ch) escape = True continue if ch == '"': in_string = not in_string result.append(ch) continue if in_string and ch in _escapes: result.append(_escapes[ch]) continue result.append(ch) return ''.join(result) def extract_json(text: str) -> str: """ Extract and repair a JSON object or array from an LLM response that may contain extra prose, markdown code fences, or unescaped control characters. Strategy: 1. Strip markdown ```json ... ``` or ``` ... ``` fences. 2. Find the first '{' or '[' and extract to the matching closing bracket. 3. Repair unescaped newlines/tabs inside string values. Returns the cleaned JSON string, or the original text as a fallback (so json.loads can raise a meaningful error with context). """ import re # 1. Strip markdown fences fenced = re.sub(r"```(?:json)?\s*([\s\S]*?)\s*```", r"\1", text.strip()) if fenced != text.strip(): return _repair_json_strings(fenced.strip()) # 2. Find first JSON container and extract to matching close extracted = text for start_char, end_char in [('{', '}'), ('[', ']')]: idx = text.find(start_char) if idx == -1: continue depth = 0 in_string = False escape = False for i, ch in enumerate(text[idx:], start=idx): if escape: escape = False continue if ch == '\\' and in_string: escape = True continue if ch == '"': in_string = not in_string continue if in_string: continue if ch == start_char: depth += 1 elif ch == end_char: depth -= 1 if depth == 0: extracted = text[idx: i + 1] break break # 3. Repair unescaped control characters inside string values return _repair_json_strings(extracted) def strip_code_fences(text: str) -> str: """Remove markdown code fences (```python ... ```) from LLM output. LLMs often wrap code in fences even when told not to. Call this before writing LLM-generated code to a .py file so it is directly executable. """ import re text = text.strip() text = re.sub(r"^```\w*\n?", "", text) text = re.sub(r"\n?```\s*$", "", text) return text.strip() def print_messages(messages: list[dict]) -> None: """Print the full messages list before sending it to the LLM. Call this before chat() or chat_json() to inspect the exact prompt hierarchy (system + user + assistant turns) that the model receives. This is the primary debugging and learning tool for prompt engineering. """ width = 64 print("\n" + "═" * width) print(" PROMPT SENT TO LLM") print("═" * width) for msg in messages: role = msg["role"].upper() print(f"\n── [{role}] " + "─" * max(0, width - len(role) - 6)) print(msg["content"]) print("\n" + "═" * width) def print_separator(title: str = "") -> None: """Print a visual separator with an optional title.""" width = 64 print("\n" + "─" * width) if title: print(f" {title}") print("─" * width)