diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..aab4afb --- /dev/null +++ b/.gitignore @@ -0,0 +1,172 @@ +# ---> Python +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. 
+#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +**/.env +**/.venv + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..ab1f416 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,10 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Ignored default folder with query files +/queries/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/aise-501_aise_in_se_i.iml b/.idea/aise-501_aise_in_se_i.iml new file mode 100644 index 0000000..4b71d81 --- /dev/null +++ b/.idea/aise-501_aise_in_se_i.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml new file mode 100644 index 0000000..28d6d4c --- /dev/null +++ b/.idea/dataSources.xml @@ -0,0 +1,12 @@ + + + + + postgresql + true + org.postgresql.Driver + jdbc:postgresql://localhost:5432/postgres + $ProjectFileDir$ + + + \ No newline at end of file diff --git a/.idea/data_source_mapping.xml b/.idea/data_source_mapping.xml new file mode 100644 index 0000000..744fe16 --- /dev/null +++ b/.idea/data_source_mapping.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..dd4c951 --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,7 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..27c5270 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..5977a4a --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file 
diff --git a/AISE501 LLM Zugang/STUDENT_GUIDE.md b/AISE501 LLM Zugang/STUDENT_GUIDE.md new file mode 100644 index 0000000..33f2696 --- /dev/null +++ b/AISE501 LLM Zugang/STUDENT_GUIDE.md @@ -0,0 +1,272 @@ +# Student Guide — Qwen3.5 Inference Server + +## Overview + +A **Qwen3.5** large language model is running on our GPU server. Two models +may be available at different times (your instructor will let you know which +one is active): + +| Model | Params | Best for | +|-------|--------|----------| +| `qwen3.5-35b-a3b` | 35B (3B active) | Fast responses, everyday tasks | +| `qwen3.5-122b-a10b-fp8` | 122B (10B active) | Complex reasoning, coding, research | + +There are **three ways** to interact with the model: + +1. **Open WebUI** — ChatGPT-like interface in your browser (easiest) +2. **Streamlit App** — Local app with chat, file editor, and code execution +3. **Python SDK / curl** — Programmatic access via the OpenAI-compatible API + +> **Note**: You must be on the fhgr network or VPN to reach the server. + +## Connection Details + +| Parameter | Value | +|------------------|---------------------------------------------| +| **Open WebUI** | `http://silicon.fhgr.ch:7081` | +| **API Base URL** | `http://silicon.fhgr.ch:7080/v1` | +| **Model** | *(check Open WebUI model selector or ask your instructor)* | +| **API Key** | *(ask your instructor — may be `EMPTY`)* | + +> **Tip**: In Open WebUI, the model dropdown at the top automatically shows +> whichever model is currently running. For the API, use +> `curl http://silicon.fhgr.ch:7080/v1/models` to check. + +--- + +## Option 1: Open WebUI (Recommended) + +The easiest way to chat with the model — no installation required. + +### Getting Started + +1. Make sure you are connected to the **university network** (or VPN). +2. Open your browser and go to **http://silicon.fhgr.ch:7081** +3. Click **"Sign Up"** to create a new account: + - Enter your **name** (e.g. 
your first and last name) + - Enter your **email** (use your university email) + - Choose a **password** + - Click **"Create Account"** +4. After signing up you are logged in automatically. +5. Select the currently active model (e.g. **qwen3.5-35b-a3b**) from the model dropdown at the top. +6. Type a message and press Enter — you're chatting with the LLM. + +### Returning Later + +- Go to **http://silicon.fhgr.ch:7081** and click **"Sign In"**. +- Enter the email and password you used during sign-up. +- All your previous chats are still there. + +### Features + +- **Chat history** — all conversations are saved on the server and persist across sessions +- **Markdown rendering** with syntax-highlighted code blocks +- **Model selector** — auto-discovers available models from the server +- **Conversation branching** — edit previous messages and explore alternative responses +- **File upload** — attach files to your messages for the model to analyze +- **Search** — search across all your past conversations + +### Tips + +- Your account and chat history are stored on the server. You can log in + from any device on the university network. +- If you forget your password, ask your instructor to reset it via the + Admin Panel. +- The model works best when you provide clear, specific instructions. +- For code tasks, mention the programming language explicitly (e.g. + "Write a Python function that..."). +- Long conversations use more context. Start a **New Chat** (top-left + button) when switching topics to get faster, more focused responses. + +--- + +## Option 2: Streamlit App (Chat + File Editor) + +A local app with chat, file editing, and Python/LaTeX execution. +See the [Streamlit section below](#streamlit-chat--file-editor-app) for setup. + +--- + +## Option 3: Python SDK / curl + +For programmatic access and scripting. + +### Quick Start with Python + +#### 1. Install the OpenAI SDK + +```bash +pip install openai +``` + +#### 2. 
Simple Chat + +```python +from openai import OpenAI + +client = OpenAI( + base_url="http://silicon.fhgr.ch:7080/v1", + api_key="EMPTY", # replace if your instructor set a key +) + +response = client.chat.completions.create( + model="qwen3.5-35b-a3b", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Explain gradient descent in simple terms."}, + ], + max_tokens=1024, + temperature=0.7, +) + +print(response.choices[0].message.content) +``` + +#### 3. Streaming Responses + +```python +stream = client.chat.completions.create( + model="qwen3.5-35b-a3b", + messages=[ + {"role": "user", "content": "Write a haiku about machine learning."}, + ], + max_tokens=256, + stream=True, +) + +for chunk in stream: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) +print() +``` + +--- + +### Quick Start with curl + +```bash +curl http://silicon.fhgr.ch:7080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "qwen3.5-35b-a3b", + "messages": [ + {"role": "user", "content": "What is the capital of Switzerland?"} + ], + "max_tokens": 256, + "temperature": 0.7 + }' +``` + +--- + +## Recommended Parameters + +| Parameter | Recommended | Notes | +|-----------------|-------------|----------------------------------------------| +| `temperature` | 0.7 | Lower = more deterministic, higher = creative | +| `max_tokens` | 1024–4096 | Increase for long-form output | +| `top_p` | 0.95 | Nucleus sampling | +| `stream` | `true` | Better UX for interactive use | + +--- + +## Tips & Etiquette + +- **Be mindful of context length**: Avoid excessively long prompts (>8K tokens) unless necessary. +- **Use streaming**: Makes responses feel faster and reduces perceived latency. +- **Don't spam requests**: The server is shared among ~15 students. +- **Check the model name**: Use the ID of the model that is currently active (e.g. `qwen3.5-35b-a3b`) as the model parameter; `curl http://silicon.fhgr.ch:7080/v1/models` lists it. 
+ +--- + +## Streamlit Chat & File Editor App + +A web UI is included for chatting with the model and editing files. It runs +on your own machine and connects to the GPU server. + +### Setup + +```bash +# Clone the repository +git clone https://gitea.fhgr.ch/herzogfloria/LLM_Inferenz_Server_1.git +cd LLM_Inferenz_Server_1 + +# Create a virtual environment and install dependencies +python3 -m venv .venv +source .venv/bin/activate # macOS / Linux +# .venv\Scripts\activate # Windows +pip install -r requirements.txt +``` + +### Run + +```bash +streamlit run app.py +``` + +Opens at `http://localhost:8501` in your browser. + +### Features + +**Chat Tab** +- Conversational interface with streaming responses +- "Save code" button extracts code from the LLM response and saves it to a + workspace file (strips markdown formatting automatically) + +**File Editor Tab** +- Create and edit `.py`, `.tex`, `.html`, or any text file +- Syntax-highlighted preview of file content +- "Generate with LLM" button: describe a change in natural language and the + model rewrites the file (e.g. "add error handling", "fix the LaTeX formatting", + "translate comments to German") + +**Sidebar Controls** +- **Connection**: API Base URL and API Key +- **LLM Parameters**: Adjustable for each request + +| Parameter | Default | What it does | +|-----------|---------|--------------| +| Thinking Mode | Off | Toggle chain-of-thought reasoning (better for complex tasks, slower) | +| Temperature | 0.7 | Lower = predictable, higher = creative | +| Max Tokens | 4096 | Maximum response length | +| Top P | 0.95 | Nucleus sampling threshold | +| Presence Penalty | 0.0 | Encourage diverse topics | + +- **File Manager**: Create new files and switch between them + +All generated files are stored in a `workspace/` folder next to `app.py`. + +> **Tip**: The app runs entirely on your local machine. Only the LLM requests +> go to the server — your files stay local. 
+ +--- + +## Thinking Mode + +By default, the model "thinks" before answering (internal chain-of-thought). +This is great for complex reasoning but adds latency for simple questions. + +To disable thinking and get faster direct responses, add this to your API call: + +```python +response = client.chat.completions.create( + model="qwen3.5-35b-a3b", + messages=[...], + max_tokens=1024, + extra_body={"chat_template_kwargs": {"enable_thinking": False}}, +) +``` + +--- + +## Troubleshooting + +| Issue | Solution | +|-----------------------------|-----------------------------------------------------| +| Connection refused | Check you're on the university network / VPN | +| Model not found | Use model name `qwen3.5-35b-a3b` exactly | +| Slow responses | The model is shared — peak times may be slower | +| `401 Unauthorized` | Ask your instructor for the API key | +| Response cut off | Increase `max_tokens` in your request | +| Open WebUI login fails | Make sure you created an account first (Sign Up) | +| Open WebUI shows no models | The vLLM server may still be loading — wait a few minutes | diff --git a/AISE501 LLM Zugang/app.py b/AISE501 LLM Zugang/app.py new file mode 100644 index 0000000..7a6dc39 --- /dev/null +++ b/AISE501 LLM Zugang/app.py @@ -0,0 +1,346 @@ +""" +Streamlit Chat & File Editor for Qwen3.5 + +A minimal interface to: + 1. Chat with the local LLM (OpenAI-compatible API) + 2. 
Edit, save, and generate code / LaTeX files + +Usage: + pip install streamlit openai + streamlit run app.py +""" + +import re +import subprocess +import streamlit as st +from openai import OpenAI +from pathlib import Path + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- +st.sidebar.header("Connection") +API_BASE = st.sidebar.text_input("API Base URL", "http://silicon.fhgr.ch:7080/v1") +API_KEY = st.sidebar.text_input("API Key", "EMPTY", type="password") +WORKSPACE = Path("workspace") +WORKSPACE.mkdir(exist_ok=True) + +client = OpenAI(base_url=API_BASE, api_key=API_KEY) + + +@st.cache_data(ttl=30) +def fetch_models(base_url: str, api_key: str) -> list[str]: + """Fetch available model IDs from the vLLM server.""" + try: + c = OpenAI(base_url=base_url, api_key=api_key) + return [m.id for m in c.models.list().data] + except Exception: + return [] + + +available_models = fetch_models(API_BASE, API_KEY) +if available_models: + MODEL = st.sidebar.selectbox("Model", available_models) +else: + MODEL = st.sidebar.text_input("Model (server unreachable)", "qwen3.5-35b-a3b") + st.sidebar.warning("Could not fetch models from server.") + +# --------------------------------------------------------------------------- +# Sidebar — LLM Parameters +# --------------------------------------------------------------------------- +st.sidebar.markdown("---") +st.sidebar.header("LLM Parameters") + +thinking_mode = st.sidebar.toggle("Thinking Mode", value=False, + help="Enable chain-of-thought reasoning. 
Better for complex tasks, slower for simple ones.") +temperature = st.sidebar.slider("Temperature", 0.0, 2.0, 0.7, 0.05, + help="Lower = deterministic, higher = creative.") +max_tokens = st.sidebar.slider("Max Tokens", 256, 16384, 4096, 256, + help="Maximum length of the response.") +top_p = st.sidebar.slider("Top P", 0.0, 1.0, 0.95, 0.05, + help="Nucleus sampling: only consider tokens within this cumulative probability.") +presence_penalty = st.sidebar.slider("Presence Penalty", 0.0, 2.0, 0.0, 0.1, + help="Penalize repeated topics. Higher values encourage the model to talk about new topics.") + +LANG_MAP = { + ".py": "python", ".tex": "latex", ".js": "javascript", + ".html": "html", ".css": "css", ".sh": "bash", + ".json": "json", ".yaml": "yaml", ".yml": "yaml", +} + + +MAX_CONTEXT = 32768 + + +def extract_code(text: str, lang: str = "") -> str: + """Extract the best code block from markdown text. + + Strategy: + 1. Prefer blocks tagged with the target language (e.g. ```python) + 2. Among candidates, pick the longest block (skip trivial one-liners) + 3. Fall back to the longest block of any language + 4. Fall back to the full text if no fenced block is found + """ + tagged_pattern = r"```(\w*)\n(.*?)```" + matches = re.findall(tagged_pattern, text, re.DOTALL) + if not matches: + return text.strip() + + lang_lower = lang.lower() + lang_matches = [code for tag, code in matches if tag.lower() == lang_lower] + if lang_matches: + return max(lang_matches, key=len).strip() + + all_blocks = [code for _, code in matches] + return max(all_blocks, key=len).strip() + + +def estimate_tokens(messages: list[dict]) -> int: + """Rough token estimate: ~4 characters per token.""" + return sum(len(m["content"]) for m in messages) // 4 + + +def trim_history(messages: list[dict], reserved: int) -> list[dict]: + """Drop oldest message pairs to fit within context budget. 
+ Always keeps the latest user message.""" + budget = MAX_CONTEXT - reserved + while len(messages) > 1 and estimate_tokens(messages) > budget: + messages.pop(0) + return messages + + +RUNNABLE_EXTENSIONS = {".py", ".tex"} +RUN_TIMEOUT = 30 + + +def run_file(file_path: Path) -> dict: + """Execute a .py or .tex file and return stdout, stderr, and return code.""" + suffix = file_path.suffix + cwd = file_path.parent.resolve() + + if suffix == ".py": + cmd = ["python3", file_path.name] + elif suffix == ".tex": + cmd = [ + "pdflatex", + "-interaction=nonstopmode", + f"-output-directory={cwd}", + file_path.name, + ] + else: + return {"stdout": "", "stderr": f"Unsupported file type: {suffix}", "rc": 1} + + try: + proc = subprocess.run( + cmd, + cwd=cwd, + capture_output=True, + text=True, + timeout=RUN_TIMEOUT, + ) + return {"stdout": proc.stdout, "stderr": proc.stderr, "rc": proc.returncode} + except subprocess.TimeoutExpired: + return {"stdout": "", "stderr": f"Timed out after {RUN_TIMEOUT}s", "rc": -1} + except FileNotFoundError as e: + return {"stdout": "", "stderr": str(e), "rc": -1} + + +# --------------------------------------------------------------------------- +# Sidebar — File Manager +# --------------------------------------------------------------------------- +st.sidebar.markdown("---") +st.sidebar.header("File Manager") + +new_filename = st.sidebar.text_input("New file name", placeholder="main.tex") +if st.sidebar.button("Create File") and new_filename: + (WORKSPACE / new_filename).touch() + st.sidebar.success(f"Created {new_filename}") + st.rerun() + +files = sorted(WORKSPACE.iterdir()) if WORKSPACE.exists() else [] +file_names = [f.name for f in files if f.is_file()] +selected_file = st.sidebar.selectbox("Open file", file_names if file_names else ["(no files)"]) + +# --------------------------------------------------------------------------- +# Main Layout — Two Tabs +# --------------------------------------------------------------------------- +tab_chat, 
tab_editor = st.tabs(["Chat", "File Editor"]) + +# --------------------------------------------------------------------------- +# Tab 1: Chat +# --------------------------------------------------------------------------- +with tab_chat: + st.header(f"Chat with {MODEL}") + + if "messages" not in st.session_state: + st.session_state.messages = [] + + for msg in st.session_state.messages: + with st.chat_message(msg["role"]): + st.markdown(msg["content"]) + + if prompt := st.chat_input("Ask anything..."): + st.session_state.messages.append({"role": "user", "content": prompt}) + with st.chat_message("user"): + st.markdown(prompt) + + st.session_state.messages = trim_history( + st.session_state.messages, reserved=max_tokens + ) + + with st.chat_message("assistant"): + placeholder = st.empty() + full_response = "" + + stream = client.chat.completions.create( + model=MODEL, + messages=st.session_state.messages, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + presence_penalty=presence_penalty, + stream=True, + extra_body={"chat_template_kwargs": {"enable_thinking": thinking_mode}}, + ) + for chunk in stream: + delta = chunk.choices[0].delta.content or "" + full_response += delta + placeholder.markdown(full_response + "▌") + placeholder.markdown(full_response) + + st.session_state.messages.append({"role": "assistant", "content": full_response}) + + if st.session_state.messages: + used = estimate_tokens(st.session_state.messages) + pct = min(used / MAX_CONTEXT, 1.0) + label = f"Context: ~{used:,} / {MAX_CONTEXT:,} tokens" + if pct > 0.8: + label += " ⚠️ nearing limit — older messages will be trimmed" + st.progress(pct, text=label) + + col_clear, col_save = st.columns([1, 3]) + with col_clear: + if st.button("Clear Chat"): + st.session_state.messages = [] + st.rerun() + with col_save: + if selected_file and selected_file != "(no files)": + if st.button(f"Save code → {selected_file}"): + last = st.session_state.messages[-1]["content"] + suffix = 
Path(selected_file).suffix + lang = LANG_MAP.get(suffix, "") + code = extract_code(last, lang) + (WORKSPACE / selected_file).write_text(code) + st.success(f"Extracted code saved to workspace/{selected_file}") + +# --------------------------------------------------------------------------- +# Tab 2: File Editor +# --------------------------------------------------------------------------- +with tab_editor: + st.header("File Editor") + + if selected_file and selected_file != "(no files)": + file_path = WORKSPACE / selected_file + content = file_path.read_text() if file_path.exists() else "" + suffix = file_path.suffix + lang = LANG_MAP.get(suffix, "text") + runnable = suffix in RUNNABLE_EXTENSIONS + + if runnable: + col_edit, col_term = st.columns([3, 2]) + else: + col_edit = st.container() + + with col_edit: + st.code(content, language=lang if lang != "text" else None, line_numbers=True) + + edited = st.text_area( + "Edit below:", + value=content, + height=400, + key=f"editor_{selected_file}_{hash(content)}", + ) + + col_save, col_gen = st.columns(2) + + with col_save: + if st.button("Save File"): + file_path.write_text(edited) + st.success(f"Saved {selected_file}") + st.rerun() + + with col_gen: + gen_prompt = st.text_input( + "Generation instruction", + placeholder="e.g. Add error handling / Fix the LaTeX formatting", + key="gen_prompt", + ) + if st.button("Generate with LLM") and gen_prompt: + with st.spinner("Generating..."): + response = client.chat.completions.create( + model=MODEL, + messages=[ + {"role": "system", "content": ( + f"You are a coding assistant. The user has a {lang} file. " + "Return ONLY the raw file content inside a single code block. " + "No explanations, no comments about changes." 
+ )}, + {"role": "user", "content": ( + f"Here is my {lang} file:\n\n```\n{edited}\n```\n\n" + f"Instruction: {gen_prompt}" + )}, + ], + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + extra_body={"chat_template_kwargs": {"enable_thinking": thinking_mode}}, + ) + result = response.choices[0].message.content + code = extract_code(result, lang) + file_path.write_text(code) + st.success("File updated by LLM") + st.rerun() + + if runnable: + with col_term: + run_label = "Compile LaTeX" if suffix == ".tex" else "Run Python" + st.subheader("Terminal Output") + + if st.button(run_label, type="primary"): + file_path.write_text(edited) + with st.spinner(f"{'Compiling' if suffix == '.tex' else 'Running'}..."): + result = run_file(file_path) + st.session_state["last_run"] = result + + result = st.session_state.get("last_run") + if result: + if result["rc"] == 0: + st.success(f"Exit code: {result['rc']}") + else: + st.error(f"Exit code: {result['rc']}") + + if result["stdout"]: + st.text_area( + "stdout", + value=result["stdout"], + height=300, + disabled=True, + key="run_stdout", + ) + if result["stderr"]: + st.text_area( + "stderr", + value=result["stderr"], + height=200, + disabled=True, + key="run_stderr", + ) + if not result["stdout"] and not result["stderr"]: + st.info("No output produced.") + else: + st.caption( + f"Click **{run_label}** to execute the file " + f"(timeout: {RUN_TIMEOUT}s)." + ) + else: + st.info("Create a file in the sidebar to start editing.") diff --git a/AISE501 LLM Zugang/requirements.txt b/AISE501 LLM Zugang/requirements.txt new file mode 100644 index 0000000..d218a70 --- /dev/null +++ b/AISE501 LLM Zugang/requirements.txt @@ -0,0 +1,2 @@ +streamlit +openai diff --git a/AISE501 LLM Zugang/test_server.py b/AISE501 LLM Zugang/test_server.py new file mode 100644 index 0000000..ff88635 --- /dev/null +++ b/AISE501 LLM Zugang/test_server.py @@ -0,0 +1,70 @@ +""" +Quick test script to verify the vLLM server is running and responding. 
+ +Usage: + pip install openai + python test_server.py [--host HOST] [--port PORT] [--api-key KEY] +""" + +import argparse +import sys + +from openai import OpenAI + + +def main(): + parser = argparse.ArgumentParser(description="Test vLLM inference server") + parser.add_argument("--host", default="localhost", help="Server hostname") + parser.add_argument("--port", default=7080, type=int, help="Server port") + parser.add_argument("--api-key", default="EMPTY", help="API key") + args = parser.parse_args() + + base_url = f"http://{args.host}:{args.port}/v1" + model = "qwen3.5-35b-a3b" + client = OpenAI(base_url=base_url, api_key=args.api_key) + + print(f"Connecting to {base_url} ...") + + print("\n--- Available Models ---") + try: + models = client.models.list() + for m in models.data: + print(f" {m.id}") + except Exception as e: + print(f"ERROR: Cannot connect to server: {e}") + sys.exit(1) + + print("\n--- Test Chat Completion ---") + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "user", "content": "Create a latex document that derives and explains the principle component analysis (pca). Make a self contain document with introduction, derivation, examples of applications. 
This is for computer science undergraduate class."} + ], + max_tokens=16384, + temperature=0.7, + ) + print(f" Response: {response.choices[0].message.content}") + print(f" Tokens: prompt={response.usage.prompt_tokens}, " + f"completion={response.usage.completion_tokens}") + + print("\n--- Test Streaming ---") + stream = client.chat.completions.create( + model=model, + messages=[ + {"role": "user", "content": "Count from 1 to 5."} + ], + max_tokens=16384, + temperature=0.7, + stream=True, + ) + print(" Response: ", end="") + for chunk in stream: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) + print("\n") + + print("All tests passed!") + + +if __name__ == "__main__": + main() diff --git a/Clean Code exercise/example1_calculator/calculator_analysis.aux b/Clean Code exercise/example1_calculator/calculator_analysis.aux new file mode 100644 index 0000000..ec105ec --- /dev/null +++ b/Clean Code exercise/example1_calculator/calculator_analysis.aux @@ -0,0 +1,18 @@ +\relax +\providecommand \babel@aux [2]{\global \let \babel@toc \@gobbletwo } +\@nameuse{bbl@beforestart} +\providecommand\hyper@newdestlabel[2]{} +\providecommand\HyField@AuxAddToFields[1]{} +\providecommand\HyField@AuxAddToCoFields[2]{} +\babel@aux{english}{} +\@writefile{toc}{\contentsline {section}{\numberline {1}Overview}{2}{section.1}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {2}Violation 1: Unused and Poorly Formatted Imports}{2}{section.2}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {3}Violation 2: No Module Docstring or Documentation}{2}{section.3}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {4}Violation 3: Poor Naming Conventions}{3}{section.4}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {5}Violation 4: Formatting and Whitespace}{4}{section.5}\protected@file@percent } +\@writefile{toc}{\contentsline 
{section}{\numberline {6}Violation 5: Error Handling}{5}{section.6}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {7}Violation 6: Function Structure and Single Responsibility}{6}{section.7}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {8}Violation 7: Missing \texttt {\_\_main\_\_} Guard}{7}{section.8}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {9}Violation 8: String Concatenation Instead of f-Strings}{7}{section.9}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {10}Summary of Violations}{8}{section.10}\protected@file@percent } +\gdef \@abspage@last{8} diff --git a/Clean Code exercise/example1_calculator/calculator_analysis.out b/Clean Code exercise/example1_calculator/calculator_analysis.out new file mode 100644 index 0000000..11c7918 --- /dev/null +++ b/Clean Code exercise/example1_calculator/calculator_analysis.out @@ -0,0 +1,10 @@ +\BOOKMARK [1][-]{section.1}{\376\377\000O\000v\000e\000r\000v\000i\000e\000w}{}% 1 +\BOOKMARK [1][-]{section.2}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0001\000:\000\040\000U\000n\000u\000s\000e\000d\000\040\000a\000n\000d\000\040\000P\000o\000o\000r\000l\000y\000\040\000F\000o\000r\000m\000a\000t\000t\000e\000d\000\040\000I\000m\000p\000o\000r\000t\000s}{}% 2 +\BOOKMARK [1][-]{section.3}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0002\000:\000\040\000N\000o\000\040\000M\000o\000d\000u\000l\000e\000\040\000D\000o\000c\000s\000t\000r\000i\000n\000g\000\040\000o\000r\000\040\000D\000o\000c\000u\000m\000e\000n\000t\000a\000t\000i\000o\000n}{}% 3 +\BOOKMARK [1][-]{section.4}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0003\000:\000\040\000P\000o\000o\000r\000\040\000N\000a\000m\000i\000n\000g\000\040\000C\000o\000n\000v\000e\000n\000t\000i\000o\000n\000s}{}% 4 +\BOOKMARK 
[1][-]{section.5}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0004\000:\000\040\000F\000o\000r\000m\000a\000t\000t\000i\000n\000g\000\040\000a\000n\000d\000\040\000W\000h\000i\000t\000e\000s\000p\000a\000c\000e}{}% 5 +\BOOKMARK [1][-]{section.6}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0005\000:\000\040\000E\000r\000r\000o\000r\000\040\000H\000a\000n\000d\000l\000i\000n\000g}{}% 6 +\BOOKMARK [1][-]{section.7}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0006\000:\000\040\000F\000u\000n\000c\000t\000i\000o\000n\000\040\000S\000t\000r\000u\000c\000t\000u\000r\000e\000\040\000a\000n\000d\000\040\000S\000i\000n\000g\000l\000e\000\040\000R\000e\000s\000p\000o\000n\000s\000i\000b\000i\000l\000i\000t\000y}{}% 7 +\BOOKMARK [1][-]{section.8}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0007\000:\000\040\000M\000i\000s\000s\000i\000n\000g\000\040\000\137\000\137\000m\000a\000i\000n\000\137\000\137\000\040\000G\000u\000a\000r\000d}{}% 8 +\BOOKMARK [1][-]{section.9}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0008\000:\000\040\000S\000t\000r\000i\000n\000g\000\040\000C\000o\000n\000c\000a\000t\000e\000n\000a\000t\000i\000o\000n\000\040\000I\000n\000s\000t\000e\000a\000d\000\040\000o\000f\000\040\000f\000-\000S\000t\000r\000i\000n\000g\000s}{}% 9 +\BOOKMARK [1][-]{section.10}{\376\377\000S\000u\000m\000m\000a\000r\000y\000\040\000o\000f\000\040\000V\000i\000o\000l\000a\000t\000i\000o\000n\000s}{}% 10 diff --git a/Clean Code exercise/example1_calculator/calculator_analysis.pdf b/Clean Code exercise/example1_calculator/calculator_analysis.pdf new file mode 100644 index 0000000..9b7d765 Binary files /dev/null and b/Clean Code exercise/example1_calculator/calculator_analysis.pdf differ diff --git a/Clean Code exercise/example1_calculator/calculator_analysis.tex b/Clean Code exercise/example1_calculator/calculator_analysis.tex new file mode 100644 index 0000000..954adeb --- /dev/null +++ 
b/Clean Code exercise/example1_calculator/calculator_analysis.tex @@ -0,0 +1,415 @@ +\documentclass[12pt,a4paper]{article} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage[english]{babel} +\usepackage{geometry} +\geometry{margin=2.5cm} +\usepackage{xcolor} +\usepackage{tcolorbox} +\usepackage{booktabs} +\usepackage{hyperref} +\usepackage{listings} +\usepackage{enumitem} + +\definecolor{seblue}{rgb}{0.0,0.28,0.67} +\definecolor{segreen}{rgb}{0.13,0.55,0.13} +\definecolor{sered}{rgb}{0.7,0.13,0.13} +\definecolor{backcolour}{rgb}{0.95,0.95,0.92} +\definecolor{codegreen}{rgb}{0,0.6,0} +\definecolor{codepurple}{rgb}{0.58,0,0.82} + +\lstdefinestyle{pystyle}{ + backgroundcolor=\color{backcolour}, + commentstyle=\color{codegreen}, + keywordstyle=\color{blue}, + stringstyle=\color{codepurple}, + basicstyle=\ttfamily\footnotesize, + breaklines=true, + keepspaces=true, + showstringspaces=false, + tabsize=4, + language=Python +} +\lstset{style=pystyle} + +\newtcolorbox{badbox}{ + colback=red!5!white, + colframe=sered, + title=Bad Code, + fonttitle=\bfseries\small, + boxrule=0.8pt, arc=2pt, + top=2pt, bottom=2pt, left=4pt, right=4pt +} + +\newtcolorbox{goodbox}{ + colback=green!5!white, + colframe=segreen, + title=Clean Code, + fonttitle=\bfseries\small, + boxrule=0.8pt, arc=2pt, + top=2pt, bottom=2pt, left=4pt, right=4pt +} + +\newtcolorbox{principlebox}[1][]{ + colback=blue!5!white, + colframe=seblue, + title=#1, + fonttitle=\bfseries\small, + boxrule=0.8pt, arc=2pt, + top=2pt, bottom=2pt, left=4pt, right=4pt +} + +\title{\textcolor{seblue}{Code Analysis: Arithmetic Expression Calculator}\\[0.3em] +\large What Makes Code Bad and How to Fix It\\[0.3em] +\normalsize AISE501 -- AI in Software Engineering I} +\author{Dr.\ Florian Herzog} +\date{Spring Semester 2026} + +\begin{document} +\maketitle +\tableofcontents +\newpage + +% ============================================ +\section{Overview} +% ============================================ + +This document 
analyses two implementations of the same program --- an arithmetic expression calculator that parses and evaluates strings like \texttt{"3 + 5 * 2"} without using Python's \texttt{eval()}. +Both produce correct results for valid expressions, but the first version (\texttt{calculator\_bad.py}) violates numerous PEP\,8 and clean code principles, while the second (\texttt{calculator\_good.py}) follows them consistently. + +The analysis is structured by violation category, with side-by-side comparisons of the bad and good code and references to the specific PEP\,8 rules or clean code principles that apply. + +% ============================================ +\section{Violation 1: Unused and Poorly Formatted Imports} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +import sys,os,re;from typing import * +\end{lstlisting} +\end{badbox} + +\textbf{What is wrong:} +\begin{itemize} + \item \texttt{sys}, \texttt{os}, and \texttt{re} are imported but \textbf{never used} anywhere in the code. + \item Multiple imports are crammed onto \textbf{one line separated by commas}, violating PEP\,8's rule that imports should be on separate lines. + \item A \textbf{semicolon} joins two import statements on one line. + \item \texttt{from typing import *} is a \textbf{wildcard import} that pollutes the namespace. +\end{itemize} + +\begin{goodbox} +The good version has \textbf{no imports at all} --- the calculator uses only built-in Python features. +\end{goodbox} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{PEP\,8 -- Imports}: ``Imports should usually be on separate lines.'' Wildcard imports (\texttt{from X import *}) should be avoided. + \item \textbf{KISS}: Unused imports add unnecessary complexity. + \item \textbf{Clean Code}: Dead code (unused imports) confuses readers about dependencies. 
+\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 2: No Module Docstring or Documentation} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +# calculator program +def scicalc(s): +\end{lstlisting} +The only ``documentation'' is a single vague comment. No module docstring, no function docstrings. +\end{badbox} + +\begin{goodbox} +\begin{lstlisting} +"""Simple arithmetic expression calculator with a recursive-descent parser. + +Supported operations: +, -, *, / and parentheses. +Does NOT use Python's eval(). + +Grammar: + expression = term (('+' | '-') term)* + term = factor (('*' | '/') factor)* + factor = NUMBER | '(' expression ')' +""" +\end{lstlisting} +The good version opens with a module docstring that explains the purpose, supported operations, and even the formal grammar. Every function also has a docstring. +\end{goodbox} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{PEP\,257}: All public modules, functions, classes, and methods should have docstrings. + \item \textbf{Clean Code -- Documentation}: Good documentation helps current and future developers understand the intent. +\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 3: Poor Naming Conventions} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +def scicalc(s): # What does "scicalc" mean? +def doPlusMinus(s,a,b):# camelCase, not snake_case +def doMulDiv(s,a,b): # "do" is vague +def getNum(s, a,b): # inconsistent spacing + t=s[a:b] # "t" for what? + c=t[i] # "c" for what? + L=doPlusMinus(...) # uppercase "L" for a local variable + R=doMulDiv(...) # uppercase "R" for a local variable + r=doPlusMinus(...) # "r" for result? 
+\end{lstlisting} +\end{badbox} + +\begin{goodbox} +\begin{lstlisting} +def tokenize(expression_text): +def parse_expression(tokens, position): +def parse_term(tokens, position): +def parse_factor(tokens, position): +def calculate(expression_text): + character = expression_text[position] + operator = tokens[position] + right_value, position = parse_term(tokens, position) + result, final_position = parse_expression(tokens, 0) +\end{lstlisting} +\end{goodbox} + +\textbf{What is wrong in the bad version:} +\begin{itemize} + \item Function names use \textbf{camelCase} (\texttt{doPlusMinus}) instead of \textbf{snake\_case}. + \item Variable names are \textbf{single letters} (\texttt{s}, \texttt{a}, \texttt{b}, \texttt{t}, \texttt{c}, \texttt{r}) --- impossible to understand without reading every line. + \item \texttt{L} and \texttt{R} use \textbf{uppercase} for local variables, which PEP\,8 reserves for constants. + \item Names like \texttt{scicalc} are \textbf{abbreviations} that are not pronounceable or self-explanatory. + \item The list of test data is called \texttt{Data} (capitalised like a class) and results \texttt{Res}. +\end{itemize} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{PEP\,8 -- Naming}: Functions and variables use \texttt{lower\_case\_with\_underscores}. Constants use \texttt{UPPER\_CASE}. + \item \textbf{Clean Code -- Descriptive Names}: Names should reveal intent. A reader should know what a variable holds without tracing its assignment. + \item \textbf{Clean Code -- Pronounceable Names}: \texttt{scicalc} is not a word anyone would say in a conversation. + \item \textbf{Clean Code -- No Abbreviations}: \texttt{doPlusMinus} is better than \texttt{dPM}, but \texttt{parse\_expression} communicates the actual operation. 
+\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 4: Formatting and Whitespace} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +def scicalc(s): + s=s.replace(' ','') # 2-space indent + if s=='':return 0 # no spaces around == + r=doPlusMinus(s,0,len(s)) + return r + +def doPlusMinus(s,a,b): + t=s[a:b]; level=0; i=len(t)-1 # 4-space indent, semicolons + while i>=0: # no space around >= + if level==0 and(c=='*' or c=='/'): # missing space before ( + L = doMulDiv(s,a,a+i); R = getNum(s,a+i+1,b) +\end{lstlisting} +\end{badbox} + +\begin{goodbox} +\begin{lstlisting} +def parse_expression(tokens, position): + result, position = parse_term(tokens, position) + + while position < len(tokens) and tokens[position] in ("+", "-"): + operator = tokens[position] + position += 1 + right_value, position = parse_term(tokens, position) +\end{lstlisting} +\end{goodbox} + +\textbf{What is wrong:} +\begin{itemize} + \item \textbf{Inconsistent indentation}: \texttt{scicalc} uses 2 spaces, other functions use 4 spaces. PEP\,8 requires 4 spaces consistently. + \item \textbf{Semicolons} to put multiple statements on one line (\texttt{t=s[a:b]; level=0; i=len(t)-1}). + \item \textbf{Missing whitespace} around operators: \texttt{s=s.replace}, \texttt{i>=0}, \texttt{level==0 and(c==...}. + \item \textbf{Too few blank lines}: none between logical sections within functions and only one between function definitions. PEP\,8 requires two blank lines before and after top-level functions. + \item Multiple \texttt{return} or assignment statements \textbf{on the same line} as \texttt{if}: \texttt{if s=='':return 0}. +\end{itemize} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{PEP\,8 -- Indentation}: Use 4 spaces per indentation level. + \item \textbf{PEP\,8 -- Whitespace}: Surround binary operators with single spaces. Avoid compound statements on one line. 
+ \item \textbf{PEP\,8 -- Blank Lines}: Two blank lines around top-level definitions. + \item \textbf{Zen of Python}: ``Sparse is better than dense.'' +\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 5: Error Handling} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +if R==0:print("ERROR division by zero!!!") ;return 0 +\end{lstlisting} +\begin{lstlisting} +try: + x = float(t) +except: + print("bad number: "+t);x=0 +return x +\end{lstlisting} +\end{badbox} + +\begin{goodbox} +\begin{lstlisting} +if right_value == 0: + raise ZeroDivisionError("Division by zero") +\end{lstlisting} +\begin{lstlisting} +try: + tokens = tokenize(expression_text) + result, final_position = parse_expression(tokens, 0) + ... +except (ValueError, ZeroDivisionError) as error: + return f"Error: {error}" +\end{lstlisting} +\end{goodbox} + +\textbf{What is wrong in the bad version:} +\begin{itemize} + \item \textbf{Bare \texttt{except}} catches every exception including \texttt{KeyboardInterrupt} and \texttt{SystemExit} --- masking real bugs. + \item Errors are handled by \textbf{printing and returning a dummy value} (0), which silently produces wrong results. The caller has no way to know an error occurred. + \item The error message style is inconsistent: \texttt{"ERROR division by zero!!!"} vs.\ \texttt{"bad number: ..."}. +\end{itemize} + +\textbf{What the good version does:} +\begin{itemize} + \item Errors \textbf{raise specific exceptions} (\texttt{ValueError}, \texttt{ZeroDivisionError}) at the point of detection. + \item The top-level \texttt{calculate()} function catches \textbf{only expected exceptions} and returns a formatted error string. + \item Errors \textbf{propagate} rather than being silently swallowed. +\end{itemize} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{PEP\,8 -- Exceptions}: Catch specific exceptions, never use bare \texttt{except}. 
+ \item \textbf{Zen of Python}: ``Errors should never pass silently. Unless explicitly silenced.'' + \item \textbf{Clean Code -- Error Handling}: Anticipate errors and handle them gracefully. Returning magic values (0 for an error) is an anti-pattern. +\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 6: Function Structure and Single Responsibility} +% ============================================ + +\begin{badbox} +The bad version has three intertwined functions (\texttt{doPlusMinus}, \texttt{doMulDiv}, \texttt{getNum}) that each take the \textbf{entire string plus two index parameters} and internally slice the string. Parsing, tokenisation, and evaluation are all mixed together. +\begin{lstlisting} +def doPlusMinus(s,a,b): + t=s[a:b]; level=0; i=len(t)-1 + while i>=0: + ... + L=doPlusMinus(s,a,a+i);R=doMulDiv(s,a+i+1,b) + ... + return doMulDiv(s,a,b) +\end{lstlisting} +\end{badbox} + +\begin{goodbox} +The good version separates \textbf{tokenisation} from \textbf{parsing}: +\begin{lstlisting} +tokens = tokenize(expression_text) # Step 1: tokenise +result, position = parse_expression(tokens, 0) # Step 2: parse +\end{lstlisting} +Each parser function has a single, clear responsibility: +\begin{itemize}[nosep] + \item \texttt{tokenize()} -- converts text to tokens + \item \texttt{parse\_expression()} -- handles \texttt{+} and \texttt{-} + \item \texttt{parse\_term()} -- handles \texttt{*} and \texttt{/} + \item \texttt{parse\_factor()} -- handles numbers and parentheses + \item \texttt{calculate()} -- orchestrates the pipeline and error handling +\end{itemize} +\end{goodbox} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{SRP (Single Responsibility Principle)}: Each function should do one thing. + \item \textbf{SoC (Separation of Concerns)}: Tokenisation and parsing are different concerns. 
+ \item \textbf{Clean Code -- Short Functions}: If a function takes more than a few minutes to comprehend, it should be refactored. +\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 7: Missing \texttt{\_\_main\_\_} Guard} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +main() +\end{lstlisting} +The bad version calls \texttt{main()} at the module level. If another script imports this file, the calculator runs immediately as a side effect. +\end{badbox} + +\begin{goodbox} +\begin{lstlisting} +if __name__ == "__main__": + main() +\end{lstlisting} +The good version uses the standard \texttt{\_\_main\_\_} guard, so the module can be safely imported without executing the calculator. +\end{goodbox} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{Clean Code -- Avoid Side Effects}: Importing a module should not trigger execution. + \item \textbf{Python Best Practice}: The \texttt{if \_\_name\_\_ == "\_\_main\_\_"} guard is standard for all runnable scripts. +\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 8: String Concatenation Instead of f-Strings} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +print(d+" = "+str(Res)) +\end{lstlisting} +\end{badbox} + +\begin{goodbox} +\begin{lstlisting} +print(f"{display_expr} = {result}") +\end{lstlisting} +\end{goodbox} + +String concatenation with \texttt{+} and manual \texttt{str()} calls is harder to read than f-strings, which are the idiomatic Python 3.6+ way to format output. + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{Pythonic Code}: Use f-strings for string formatting (readable, efficient). + \item \textbf{Clean Code -- Readability}: f-strings make the output format immediately visible. 
+\end{itemize} +\end{principlebox} + +% ============================================ +\section{Summary of Violations} +% ============================================ + +\begin{center} +\small +\begin{tabular}{@{}rp{5cm}p{5.5cm}@{}} +\toprule +\textbf{\#} & \textbf{Violation} & \textbf{Principle / PEP\,8 Rule} \\ +\midrule +1 & Unused imports, wildcard import, one-line imports & PEP\,8 Imports, KISS \\ +2 & No docstrings or documentation & PEP\,257, Clean Code Documentation \\ +3 & camelCase names, single-letter variables, abbreviations & PEP\,8 Naming, Descriptive Names \\ +4 & Inconsistent indent, semicolons, missing whitespace & PEP\,8 Indentation \& Whitespace \\ +5 & Bare except, silent error swallowing & PEP\,8 Exceptions, Zen of Python \\ +6 & Mixed concerns, long tangled functions & SRP, SoC, Short Functions \\ +7 & No \texttt{\_\_main\_\_} guard & Avoid Side Effects \\ +8 & String concatenation instead of f-strings & Pythonic Code, Readability \\ +\bottomrule +\end{tabular} +\end{center} + +\end{document} diff --git a/Clean Code exercise/example1_calculator/calculator_analysis.toc b/Clean Code exercise/example1_calculator/calculator_analysis.toc new file mode 100644 index 0000000..9d748aa --- /dev/null +++ b/Clean Code exercise/example1_calculator/calculator_analysis.toc @@ -0,0 +1,11 @@ +\babel@toc {english}{}\relax +\contentsline {section}{\numberline {1}Overview}{2}{section.1}% +\contentsline {section}{\numberline {2}Violation 1: Unused and Poorly Formatted Imports}{2}{section.2}% +\contentsline {section}{\numberline {3}Violation 2: No Module Docstring or Documentation}{2}{section.3}% +\contentsline {section}{\numberline {4}Violation 3: Poor Naming Conventions}{3}{section.4}% +\contentsline {section}{\numberline {5}Violation 4: Formatting and Whitespace}{4}{section.5}% +\contentsline {section}{\numberline {6}Violation 5: Error Handling}{5}{section.6}% +\contentsline {section}{\numberline {7}Violation 6: Function Structure and Single 
Responsibility}{6}{section.7}% +\contentsline {section}{\numberline {8}Violation 7: Missing \texttt {\_\_main\_\_} Guard}{7}{section.8}% +\contentsline {section}{\numberline {9}Violation 8: String Concatenation Instead of f-Strings}{7}{section.9}% +\contentsline {section}{\numberline {10}Summary of Violations}{8}{section.10}% diff --git a/Clean Code exercise/example1_calculator/calculator_bad.py b/Clean Code exercise/example1_calculator/calculator_bad.py new file mode 100644 index 0000000..a076cc9 --- /dev/null +++ b/Clean Code exercise/example1_calculator/calculator_bad.py @@ -0,0 +1,64 @@ +import sys,os,re;from typing import * + +# calculator program +def scicalc(s): + s=s.replace(' ','') + if s=='':return 0 + r=doPlusMinus(s,0,len(s)) + return r + +def doPlusMinus(s,a,b): + t=s[a:b]; level=0; i=len(t)-1 + while i>=0: + c=t[i] + if c==')':level=level+1 + if c=='(':level=level-1 + if level==0 and (c=='+' or c=='-'): + L=doPlusMinus(s,a,a+i);R=doMulDiv(s,a+i+1,b) + if c=='+': return L+R + else: return L-R + i=i-1 + return doMulDiv(s,a,b) + +def doMulDiv(s,a,b): + t=s[a:b];level=0;i=len(t)-1 + while i >= 0: + c=t[i] + if c==')':level+=1 + if c=='(':level-=1 + if level==0 and(c=='*' or c=='/'): + L = doMulDiv(s,a,a+i); R = getNum(s,a+i+1,b) + if c=='*':return L*R + else: + if R==0:print("ERROR division by zero!!!") ;return 0 + return L/R + i -= 1 + return getNum(s,a,b) + +def getNum(s, a,b): + t = s[a:b] + if t[0]=='(' and t[-1]==')': + return doPlusMinus(s,a+1,b-1) + try: + x = float(t) + except: + print("bad number: "+t);x=0 + return x + +def main(): + Data = [ + "3 + 5", + "10 - 2 * 3", + "( 4 + 6 ) * 2", + "100 / ( 5 * 2 )", + "3.5 + 2.5 * 4", + "( 1 + 2 ) * ( 3 + 4 )", + "", + "10 / 0", + "abc + 1", + ] + for d in Data: + Res=scicalc(d) + print(d+" = "+str(Res)) + +main() diff --git a/Clean Code exercise/example1_calculator/calculator_good.py b/Clean Code exercise/example1_calculator/calculator_good.py new file mode 100644 index 0000000..c5fb638 --- /dev/null 
+++ b/Clean Code exercise/example1_calculator/calculator_good.py @@ -0,0 +1,153 @@ +"""Simple arithmetic expression calculator with a recursive-descent parser. + +Supported operations: +, -, *, / and parentheses. +Does NOT use Python's eval(). + +Grammar: + expression = term (('+' | '-') term)* + term = factor (('*' | '/') factor)* + factor = NUMBER | '(' expression ')' +""" + + +def tokenize(expression_text): + """Convert an expression string into a list of tokens. + + Tokens are either numbers (float) or single-character operators / parentheses. + Raises ValueError for characters that are not part of a valid expression. + """ + tokens = [] + position = 0 + + while position < len(expression_text): + character = expression_text[position] + + if character.isspace(): + position += 1 + continue + + if character in "+-*/()": + tokens.append(character) + position += 1 + continue + + if character.isdigit() or character == ".": + start = position + while position < len(expression_text) and ( + expression_text[position].isdigit() + or expression_text[position] == "." 
+ ): + position += 1 + number_text = expression_text[start:position] + tokens.append(float(number_text)) + continue + + raise ValueError( + f"Unexpected character '{character}' at position {position}" + ) + + return tokens + + +def parse_expression(tokens, position): + """Parse an expression: term (('+' | '-') term)*.""" + result, position = parse_term(tokens, position) + + while position < len(tokens) and tokens[position] in ("+", "-"): + operator = tokens[position] + position += 1 + right_value, position = parse_term(tokens, position) + + if operator == "+": + result += right_value + else: + result -= right_value + + return result, position + + +def parse_term(tokens, position): + """Parse a term: factor (('*' | '/') factor)*.""" + result, position = parse_factor(tokens, position) + + while position < len(tokens) and tokens[position] in ("*", "/"): + operator = tokens[position] + position += 1 + right_value, position = parse_factor(tokens, position) + + if operator == "*": + result *= right_value + else: + if right_value == 0: + raise ZeroDivisionError("Division by zero") + result /= right_value + + return result, position + + +def parse_factor(tokens, position): + """Parse a factor: NUMBER | '(' expression ')'.""" + if position >= len(tokens): + raise ValueError("Unexpected end of expression") + + token = tokens[position] + + if token == "(": + position += 1 + result, position = parse_expression(tokens, position) + if position >= len(tokens) or tokens[position] != ")": + raise ValueError("Missing closing parenthesis") + position += 1 + return result, position + + if isinstance(token, float): + return token, position + 1 + + raise ValueError(f"Unexpected token: {token}") + + +def calculate(expression_text): + """Evaluate an arithmetic expression string and return the result. + + Returns the numeric result or an error message string. 
+ """ + if not expression_text.strip(): + return "Error: empty expression" + + try: + tokens = tokenize(expression_text) + result, final_position = parse_expression(tokens, 0) + + if final_position != len(tokens): + return f"Error: unexpected token '{tokens[final_position]}'" + + if result == int(result): + return int(result) + return round(result, 10) + + except (ValueError, ZeroDivisionError) as error: + return f"Error: {error}" + + +def main(): + """Run the calculator on a set of test expressions.""" + test_expressions = [ + "3 + 5", + "10 - 2 * 3", + "(4 + 6) * 2", + "100 / (5 * 2)", + "3.5 + 2.5 * 4", + "(1 + 2) * (3 + 4)", + "", + "10 / 0", + "abc + 1", + ] + + for expression in test_expressions: + result = calculate(expression) + display_expr = expression if expression else "(empty)" + print(f"{display_expr} = {result}") + + +if __name__ == "__main__": + main() diff --git a/Clean Code exercise/example1_calculator/calculator_usecase.aux b/Clean Code exercise/example1_calculator/calculator_usecase.aux new file mode 100644 index 0000000..ee32f96 --- /dev/null +++ b/Clean Code exercise/example1_calculator/calculator_usecase.aux @@ -0,0 +1,11 @@ +\relax +\providecommand \babel@aux [2]{\global \let \babel@toc \@gobbletwo } +\@nameuse{bbl@beforestart} +\providecommand\hyper@newdestlabel[2]{} +\providecommand\HyField@AuxAddToFields[1]{} +\providecommand\HyField@AuxAddToCoFields[2]{} +\babel@aux{english}{} +\@writefile{toc}{\contentsline {section}{\numberline {1}Use Case}{1}{section.1}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {2}Example Input / Output}{1}{section.2}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {3}Exercise}{1}{section.3}\protected@file@percent } +\gdef \@abspage@last{2} diff --git a/Clean Code exercise/example1_calculator/calculator_usecase.out b/Clean Code exercise/example1_calculator/calculator_usecase.out new file mode 100644 index 0000000..4b8f6e6 --- /dev/null +++ b/Clean 
Code exercise/example1_calculator/calculator_usecase.out @@ -0,0 +1,3 @@ +\BOOKMARK [1][-]{section.1}{\376\377\000U\000s\000e\000\040\000C\000a\000s\000e}{}% 1 +\BOOKMARK [1][-]{section.2}{\376\377\000E\000x\000a\000m\000p\000l\000e\000\040\000I\000n\000p\000u\000t\000\040\000/\000\040\000O\000u\000t\000p\000u\000t}{}% 2 +\BOOKMARK [1][-]{section.3}{\376\377\000E\000x\000e\000r\000c\000i\000s\000e}{}% 3 diff --git a/Clean Code exercise/example1_calculator/calculator_usecase.pdf b/Clean Code exercise/example1_calculator/calculator_usecase.pdf new file mode 100644 index 0000000..fda7975 Binary files /dev/null and b/Clean Code exercise/example1_calculator/calculator_usecase.pdf differ diff --git a/Clean Code exercise/example1_calculator/calculator_usecase.tex b/Clean Code exercise/example1_calculator/calculator_usecase.tex new file mode 100644 index 0000000..9be9f61 --- /dev/null +++ b/Clean Code exercise/example1_calculator/calculator_usecase.tex @@ -0,0 +1,90 @@ +\documentclass[12pt,a4paper]{article} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage[english]{babel} +\usepackage{geometry} +\geometry{margin=2.5cm} +\usepackage{xcolor} +\usepackage{tcolorbox} +\usepackage{booktabs} +\usepackage{hyperref} + +\definecolor{seblue}{rgb}{0.0,0.28,0.67} + +\title{\textcolor{seblue}{Exercise 1: Arithmetic Expression Calculator}\\[0.3em] +\large AISE501 -- AI in Software Engineering I} +\author{Dr.\ Florian Herzog} +\date{Spring Semester 2026} + +\begin{document} +\maketitle + +\section{Use Case} + +A user enters an arithmetic expression as a text string, for example \texttt{"3 + 5 * 2"}. +The program evaluates the expression and prints the result. 
+ +The calculator must: +\begin{itemize} + \item Support the four basic operations: \texttt{+}, \texttt{-}, \texttt{*}, \texttt{/} + \item Respect standard operator precedence (\texttt{*} and \texttt{/} bind more tightly than \texttt{+} and \texttt{-}) + \item Support parentheses for grouping, e.g.\ \texttt{"(4 + 6) * 2"} + \item Support decimal numbers, e.g.\ \texttt{"3.5 + 2.5"} + \item Handle errors gracefully (division by zero, invalid characters, empty input) + \item \textbf{Not} use Python's built-in \texttt{eval()} function +\end{itemize} + +\section{Example Input / Output} + +\begin{center} +\begin{tabular}{ll} +\toprule +\textbf{Input Expression} & \textbf{Expected Output} \\ +\midrule +\texttt{3 + 5} & \texttt{8} \\ +\texttt{10 - 2 * 3} & \texttt{4} \\ +\texttt{(4 + 6) * 2} & \texttt{20} \\ +\texttt{100 / (5 * 2)} & \texttt{10} \\ +\texttt{3.5 + 2.5 * 4} & \texttt{13.5} \\ +\texttt{(1 + 2) * (3 + 4)} & \texttt{21} \\ +\texttt{(empty)} & Error message \\ +\texttt{10 / 0} & Error message \\ +\texttt{abc + 1} & Error message \\ +\bottomrule +\end{tabular} +\end{center} + +\section{Exercise} + +Two implementations are provided: + +\begin{enumerate} + \item \textbf{\texttt{calculator\_bad.py}} -- A working but poorly written version that violates many clean code and PEP\,8 principles. + \item \textbf{\texttt{calculator\_good.py}} -- A clean, well-structured version following PEP\,8 and clean code best practices. +\end{enumerate} + +\subsection*{Tasks} + +\begin{enumerate} + \item Run both programs and verify they compute the same values for the valid expressions; note how differently they report the error cases. + \item Read the bad version and list all clean code / PEP\,8 violations you can find. + \item For each violation, explain which principle is broken and why it makes the code harder to read or maintain. + \item Compare your list with the good version to see how each issue was resolved. 
+\end{enumerate} + +\subsection*{Violations to Look For} + +\begin{itemize} + \item Unused imports + \item Missing or misleading comments and docstrings + \item Poor variable and function names (abbreviations, single letters) + \item Inconsistent indentation and spacing + \item Multiple statements on one line (semicolons) + \item Missing whitespace around operators + \item No proper error handling (bare \texttt{except}, printing instead of raising) + \item Magic numbers and unclear logic flow + \item Missing \texttt{if \_\_name\_\_ == "\_\_main\_\_"} guard + \item No type clarity in function signatures +\end{itemize} + +\end{document} diff --git a/Clean Code exercise/example2_bank/accounts.json b/Clean Code exercise/example2_bank/accounts.json new file mode 100644 index 0000000..1ee054b --- /dev/null +++ b/Clean Code exercise/example2_bank/accounts.json @@ -0,0 +1,25 @@ +{ + "accounts": [ + { + "account_id": "ACC-001", + "holder": "Alice Mueller", + "balance": 5000.00, + "currency": "CHF", + "status": "active" + }, + { + "account_id": "ACC-002", + "holder": "Bob Schneider", + "balance": 1200.50, + "currency": "CHF", + "status": "active" + }, + { + "account_id": "ACC-003", + "holder": "Clara Brunner", + "balance": 300.00, + "currency": "CHF", + "status": "frozen" + } + ] +} diff --git a/Clean Code exercise/example2_bank/accounts_updated_bad.json b/Clean Code exercise/example2_bank/accounts_updated_bad.json new file mode 100644 index 0000000..ecf31b5 --- /dev/null +++ b/Clean Code exercise/example2_bank/accounts_updated_bad.json @@ -0,0 +1,25 @@ +{ + "accounts": [ + { + "account_id": "ACC-001", + "holder": "Alice Mueller", + "balance": 4550.0, + "currency": "CHF", + "status": "active" + }, + { + "account_id": "ACC-002", + "holder": "Bob Schneider", + "balance": 1950.5, + "currency": "CHF", + "status": "active" + }, + { + "account_id": "ACC-003", + "holder": "Clara Brunner", + "balance": 300.0, + "currency": "CHF", + "status": "frozen" + } + ] +} \ No newline at end 
of file diff --git a/Clean Code exercise/example2_bank/accounts_updated_good.json b/Clean Code exercise/example2_bank/accounts_updated_good.json new file mode 100644 index 0000000..ecf31b5 --- /dev/null +++ b/Clean Code exercise/example2_bank/accounts_updated_good.json @@ -0,0 +1,25 @@ +{ + "accounts": [ + { + "account_id": "ACC-001", + "holder": "Alice Mueller", + "balance": 4550.0, + "currency": "CHF", + "status": "active" + }, + { + "account_id": "ACC-002", + "holder": "Bob Schneider", + "balance": 1950.5, + "currency": "CHF", + "status": "active" + }, + { + "account_id": "ACC-003", + "holder": "Clara Brunner", + "balance": 300.0, + "currency": "CHF", + "status": "frozen" + } + ] +} \ No newline at end of file diff --git a/Clean Code exercise/example2_bank/bank_analysis.aux b/Clean Code exercise/example2_bank/bank_analysis.aux new file mode 100644 index 0000000..9d052fe --- /dev/null +++ b/Clean Code exercise/example2_bank/bank_analysis.aux @@ -0,0 +1,20 @@ +\relax +\providecommand \babel@aux [2]{\global \let \babel@toc \@gobbletwo } +\@nameuse{bbl@beforestart} +\providecommand\hyper@newdestlabel[2]{} +\providecommand\HyField@AuxAddToFields[1]{} +\providecommand\HyField@AuxAddToCoFields[2]{} +\babel@aux{english}{} +\@writefile{toc}{\contentsline {section}{\numberline {1}Overview}{2}{section.1}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {2}Violation 1: Unused Imports and Import Formatting}{2}{section.2}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {3}Violation 2: No Documentation or Docstrings}{2}{section.3}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {4}Violation 3: Implicit Data Model}{3}{section.4}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {5}Violation 4: Poor Naming}{4}{section.5}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {6}Violation 5: Formatting -- Semicolons and Dense 
Lines}{5}{section.6}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {7}Violation 6: No Context Managers for File I/O}{6}{section.7}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {8}Violation 7: God Function -- Single Responsibility Violation}{7}{section.8}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {9}Violation 8: Magic Strings Instead of Constants}{8}{section.9}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {10}Violation 9: Comparison with \texttt {None}}{8}{section.10}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {11}Violation 10: Missing \texttt {\_\_main\_\_} Guard and String Formatting}{9}{section.11}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {12}Summary of Violations}{10}{section.12}\protected@file@percent } +\gdef \@abspage@last{10} diff --git a/Clean Code exercise/example2_bank/bank_analysis.out b/Clean Code exercise/example2_bank/bank_analysis.out new file mode 100644 index 0000000..e6ce316 --- /dev/null +++ b/Clean Code exercise/example2_bank/bank_analysis.out @@ -0,0 +1,12 @@ +\BOOKMARK [1][-]{section.1}{\376\377\000O\000v\000e\000r\000v\000i\000e\000w}{}% 1 +\BOOKMARK [1][-]{section.2}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0001\000:\000\040\000U\000n\000u\000s\000e\000d\000\040\000I\000m\000p\000o\000r\000t\000s\000\040\000a\000n\000d\000\040\000I\000m\000p\000o\000r\000t\000\040\000F\000o\000r\000m\000a\000t\000t\000i\000n\000g}{}% 2 +\BOOKMARK [1][-]{section.3}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0002\000:\000\040\000N\000o\000\040\000D\000o\000c\000u\000m\000e\000n\000t\000a\000t\000i\000o\000n\000\040\000o\000r\000\040\000D\000o\000c\000s\000t\000r\000i\000n\000g\000s}{}% 3 +\BOOKMARK 
[1][-]{section.4}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0003\000:\000\040\000I\000m\000p\000l\000i\000c\000i\000t\000\040\000D\000a\000t\000a\000\040\000M\000o\000d\000e\000l}{}% 4 +\BOOKMARK [1][-]{section.5}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0004\000:\000\040\000P\000o\000o\000r\000\040\000N\000a\000m\000i\000n\000g}{}% 5 +\BOOKMARK [1][-]{section.6}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0005\000:\000\040\000F\000o\000r\000m\000a\000t\000t\000i\000n\000g\000\040\040\023\000\040\000S\000e\000m\000i\000c\000o\000l\000o\000n\000s\000\040\000a\000n\000d\000\040\000D\000e\000n\000s\000e\000\040\000L\000i\000n\000e\000s}{}% 6 +\BOOKMARK [1][-]{section.7}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0006\000:\000\040\000N\000o\000\040\000C\000o\000n\000t\000e\000x\000t\000\040\000M\000a\000n\000a\000g\000e\000r\000s\000\040\000f\000o\000r\000\040\000F\000i\000l\000e\000\040\000I\000/\000O}{}% 7 +\BOOKMARK [1][-]{section.8}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0007\000:\000\040\000G\000o\000d\000\040\000F\000u\000n\000c\000t\000i\000o\000n\000\040\040\023\000\040\000S\000i\000n\000g\000l\000e\000\040\000R\000e\000s\000p\000o\000n\000s\000i\000b\000i\000l\000i\000t\000y\000\040\000V\000i\000o\000l\000a\000t\000i\000o\000n}{}% 8 +\BOOKMARK [1][-]{section.9}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0008\000:\000\040\000M\000a\000g\000i\000c\000\040\000S\000t\000r\000i\000n\000g\000s\000\040\000I\000n\000s\000t\000e\000a\000d\000\040\000o\000f\000\040\000C\000o\000n\000s\000t\000a\000n\000t\000s}{}% 9 +\BOOKMARK [1][-]{section.10}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0009\000:\000\040\000C\000o\000m\000p\000a\000r\000i\000s\000o\000n\000\040\000w\000i\000t\000h\000\040\000N\000o\000n\000e}{}% 10 +\BOOKMARK 
[1][-]{section.11}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0001\0000\000:\000\040\000M\000i\000s\000s\000i\000n\000g\000\040\000\137\000\137\000m\000a\000i\000n\000\137\000\137\000\040\000G\000u\000a\000r\000d\000\040\000a\000n\000d\000\040\000S\000t\000r\000i\000n\000g\000\040\000F\000o\000r\000m\000a\000t\000t\000i\000n\000g}{}% 11 +\BOOKMARK [1][-]{section.12}{\376\377\000S\000u\000m\000m\000a\000r\000y\000\040\000o\000f\000\040\000V\000i\000o\000l\000a\000t\000i\000o\000n\000s}{}% 12 diff --git a/Clean Code exercise/example2_bank/bank_analysis.pdf b/Clean Code exercise/example2_bank/bank_analysis.pdf new file mode 100644 index 0000000..579d750 Binary files /dev/null and b/Clean Code exercise/example2_bank/bank_analysis.pdf differ diff --git a/Clean Code exercise/example2_bank/bank_analysis.tex b/Clean Code exercise/example2_bank/bank_analysis.tex new file mode 100644 index 0000000..1f91ad2 --- /dev/null +++ b/Clean Code exercise/example2_bank/bank_analysis.tex @@ -0,0 +1,526 @@ +\documentclass[12pt,a4paper]{article} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage[english]{babel} +\usepackage{geometry} +\geometry{margin=2.5cm} +\usepackage{xcolor} +\usepackage{tcolorbox} +\usepackage{booktabs} +\usepackage{hyperref} +\usepackage{listings} +\usepackage{enumitem} + +\definecolor{seblue}{rgb}{0.0,0.28,0.67} +\definecolor{segreen}{rgb}{0.13,0.55,0.13} +\definecolor{sered}{rgb}{0.7,0.13,0.13} +\definecolor{backcolour}{rgb}{0.95,0.95,0.92} +\definecolor{codegreen}{rgb}{0,0.6,0} +\definecolor{codepurple}{rgb}{0.58,0,0.82} + +\lstdefinestyle{pystyle}{ + backgroundcolor=\color{backcolour}, + commentstyle=\color{codegreen}, + keywordstyle=\color{blue}, + stringstyle=\color{codepurple}, + basicstyle=\ttfamily\footnotesize, + breaklines=true, + keepspaces=true, + showstringspaces=false, + tabsize=4, + language=Python +} +\lstset{style=pystyle} + +\newtcolorbox{badbox}{ + colback=red!5!white, + colframe=sered, + title=Bad Code, + 
fonttitle=\bfseries\small, + boxrule=0.8pt, arc=2pt, + top=2pt, bottom=2pt, left=4pt, right=4pt +} + +\newtcolorbox{goodbox}{ + colback=green!5!white, + colframe=segreen, + title=Clean Code, + fonttitle=\bfseries\small, + boxrule=0.8pt, arc=2pt, + top=2pt, bottom=2pt, left=4pt, right=4pt +} + +\newtcolorbox{principlebox}[1][]{ + colback=blue!5!white, + colframe=seblue, + title=#1, + fonttitle=\bfseries\small, + boxrule=0.8pt, arc=2pt, + top=2pt, bottom=2pt, left=4pt, right=4pt +} + +\title{\textcolor{seblue}{Code Analysis: Bank Account Transaction Processor}\\[0.3em] +\large What Makes Code Bad and How to Fix It\\[0.3em] +\normalsize AISE501 -- AI in Software Engineering I} +\author{Dr.\ Florian Herzog} +\date{Spring Semester 2026} + +\begin{document} +\maketitle +\tableofcontents +\newpage + +% ============================================ +\section{Overview} +% ============================================ + +This document analyses two implementations of a bank account transaction processor. +Both read account state and transactions from JSON files, validate each transaction, apply valid ones, reject invalid ones, and write results. +Both produce identical output, but \texttt{bank\_bad.py} violates many PEP\,8 and clean code principles, while \texttt{bank\_good.py} follows them consistently. + +% ============================================ +\section{Violation 1: Unused Imports and Import Formatting} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +import json,sys,os,copy;from datetime import datetime +\end{lstlisting} +\end{badbox} + +\begin{goodbox} +\begin{lstlisting} +import json +from typing import TypedDict, Optional +\end{lstlisting} +\end{goodbox} + +\textbf{What is wrong:} +\begin{itemize} + \item \texttt{sys}, \texttt{os}, \texttt{copy}, and \texttt{datetime} are imported but \textbf{never used}. + \item All imports are \textbf{on a single line} separated by commas, with a semicolon joining two import statements. 
+ \item PEP\,8 requires each import on its own line and groups separated by blank lines (standard library, third-party, local). +\end{itemize} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{PEP\,8 -- Imports}: Imports should be on separate lines. Remove unused imports. + \item \textbf{KISS}: Unused imports add noise and suggest false dependencies. +\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 2: No Documentation or Docstrings} +% ============================================ + +\begin{badbox} +The file has \textbf{no module docstring} and \textbf{no function docstrings}. The only comments in the entire file are: +\begin{lstlisting} + # find account + ... + # print results +\end{lstlisting} +These comments describe \textit{what} the next line does (which is already obvious from the code), not \textit{why}. +\end{badbox} + +\begin{goodbox} +\begin{lstlisting} +"""Bank account transaction processor. + +Reads account state and a list of transactions from JSON files, +validates and applies each transaction, then writes updated account +state and a transaction log (accepted / rejected) to output files. +""" +\end{lstlisting} +Every function has a docstring: +\begin{lstlisting} +def validate_common( + account: Optional[Account], + amount: float, +) -> Optional[str]: + """Run validations shared by all transaction types. + + Returns an error message string, or None if valid. + """ +\end{lstlisting} +\end{goodbox} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{PEP\,257}: All public modules and functions should have docstrings. + \item \textbf{Clean Code -- Comments}: Don't add noise comments that just restate the code. Comments should explain \textit{why}, not \textit{what}. 
+\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 3: Implicit Data Model} +% ============================================ + +\begin{badbox} +The bad version operates on raw dictionaries with no type declarations. +A reader must trace through the JSON file and every dictionary access to understand the data shape: +\begin{lstlisting} +def proc(accs,txns): + for t in txns: + tp=t['type'];aid=t['account_id'];amt=t['amount'];tid=t['id'] + a=None + for x in accs: + if x['account_id']==aid:a=x +\end{lstlisting} +What fields does \texttt{t} have? What fields does \texttt{a} have? There is no way to know without reading the JSON file. +\end{badbox} + +\begin{goodbox} +The good version defines explicit data types: +\begin{lstlisting} +class Account(TypedDict): + """A bank account with its current state.""" + account_id: str + holder: str + balance: float + currency: str + status: str # "active" or "frozen" + +class Transaction(TypedDict, total=False): + """A financial transaction to be processed.""" + id: str + type: str # "deposit", "withdrawal", or "transfer" + account_id: str + amount: float + description: str + to_account_id: str # only for transfers + status: str # added after processing + reason: str # added on rejection +\end{lstlisting} +All function signatures carry type annotations: +\begin{lstlisting} +def find_account(accounts: list[Account], account_id: str) -> Optional[Account]: +\end{lstlisting} +\end{goodbox} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{Zen of Python}: ``Explicit is better than implicit.'' + \item \textbf{Clean Code -- Readability}: A reader should understand the data contract without tracing through runtime data. + \item \textbf{PEP\,484 / PEP\,589}: Use type hints and \texttt{TypedDict} to document the structure of dictionary-based data. 
+\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 4: Poor Naming} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +def loadJ(p): # "J" for JSON? "p" for path? +def saveJ(p,d): # "d" for data? +def proc(accs,txns): # "proc" does what exactly? + ok=[];bad=[] # acceptable vs. rejected + tp=t['type'] # "tp" is unpronounceable + aid=t['account_id'] # "aid" looks like "aid" (help) + amt=t['amount'] # "amt" -- abbreviation + tid=t['id'] # "tid" -- never used again! + a=None # "a" for account + ta=None # "ta" for target account + for x in accs: # "x" for what? + D=loadJ(...) # capital "D" for a local variable + T=loadJ(...) # capital "T" for a local variable +\end{lstlisting} +\end{badbox} + +\begin{goodbox} +\begin{lstlisting} +def load_json(file_path): +def save_json(file_path, data): +def find_account(accounts, account_id): +def validate_common(account, amount): +def process_deposit(accounts, transaction): +def process_withdrawal(accounts, transaction): +def process_transfer(accounts, transaction): +def process_all_transactions(accounts, transactions): +def print_results(accounts, accepted, rejected): +\end{lstlisting} +\end{goodbox} + +\textbf{What is wrong:} +\begin{itemize} + \item Function names use \textbf{abbreviations} (\texttt{loadJ}, \texttt{saveJ}, \texttt{proc}) instead of descriptive snake\_case names. + \item Variable names are \textbf{single letters or short abbreviations} (\texttt{a}, \texttt{t}, \texttt{x}, \texttt{tp}, \texttt{aid}, \texttt{amt}, \texttt{ta}). + \item \texttt{tid} is assigned but \textbf{never used} --- dead code. + \item \texttt{D} and \texttt{T} use \textbf{uppercase}, suggesting constants, but they are local variables. + \item The names \texttt{ok} for accepted transactions and \texttt{bad} for rejected ones are \textbf{imprecise}. 
+\end{itemize} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{PEP\,8 -- Naming}: Functions and variables use \texttt{lower\_case\_with\_underscores}. Constants use \texttt{UPPER\_CASE}. + \item \textbf{Clean Code -- Descriptive Names}: ``Other developers should figure out what a variable stores just by reading its name.'' + \item \textbf{Clean Code -- Consistent Vocabulary}: Don't mix \texttt{ok}/\texttt{bad} with \texttt{accepted}/\texttt{rejected}. + \item \textbf{Clean Code -- No Abbreviations}: \texttt{amt}, \texttt{tp}, \texttt{tid} are not words. +\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 5: Formatting -- Semicolons and Dense Lines} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +f=open(p,'r');d=json.load(f);f.close();return d +\end{lstlisting} +\begin{lstlisting} +tp=t['type'];aid=t['account_id'];amt=t['amount'];tid=t['id'] +\end{lstlisting} +\begin{lstlisting} +a['balance']=a['balance']+amt;t['status']='accepted';ok.append(t) +\end{lstlisting} +\begin{lstlisting} +if a==None: + t['reason']='account not found';bad.append(t);continue +\end{lstlisting} +\end{badbox} + +\begin{goodbox} +Every statement is on its own line with proper whitespace: +\begin{lstlisting} +account = find_account(accounts, transaction["account_id"]) +error = validate_common(account, transaction["amount"]) +if error: + return False, error + +account["balance"] += transaction["amount"] +return True, "accepted" +\end{lstlisting} +\end{goodbox} + +\textbf{What is wrong:} +\begin{itemize} + \item \textbf{Semicolons} pack 3--4 statements onto one line, making it nearly impossible to follow the logic. + \item \textbf{No whitespace} around \texttt{=} and after commas. + \item Control flow (\texttt{continue}) is \textbf{hidden at the end of a dense line}. 
+ \item PEP\,8 explicitly states: ``Compound statements (multiple statements on the same line) are generally discouraged.'' +\end{itemize} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{PEP\,8 -- Compound Statements}: Generally discouraged. Each statement on its own line. + \item \textbf{PEP\,8 -- Whitespace}: Surround operators with spaces. Space after commas. + \item \textbf{Zen of Python}: ``Readability counts.'' ``Sparse is better than dense.'' +\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 6: No Context Managers for File I/O} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +def loadJ(p): + f=open(p,'r');d=json.load(f);f.close();return d + +def saveJ(p,d): + f=open(p,'w');json.dump(d,f,indent=2);f.close() +\end{lstlisting} +If \texttt{json.load(f)} raises an exception, the file is \textbf{never closed} because \texttt{f.close()} is skipped. This is a resource leak. +\end{badbox} + +\begin{goodbox} +\begin{lstlisting} +def load_json(file_path: str) -> dict: + """Read and parse a JSON file, returning the parsed data.""" + with open(file_path, "r", encoding="utf-8") as file_handle: + return json.load(file_handle) +\end{lstlisting} +The \texttt{with} statement guarantees the file is closed even if an exception occurs. +\end{goodbox} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{Pythonic Code}: Always use context managers (\texttt{with}) for resource management. + \item \textbf{Clean Code -- Error Handling}: Code should be robust against exceptions. Manual \texttt{open}/\texttt{close} is error-prone. 
+\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 7: God Function -- Single Responsibility Violation} +% ============================================ + +\begin{badbox} +The function \texttt{proc()} is 38 lines long and handles \textbf{all of the following} in a single function: +\begin{itemize}[nosep] + \item Finding accounts by ID + \item Validating account status + \item Validating amounts + \item Processing deposits + \item Processing withdrawals + \item Processing transfers (including finding the target account) + \item Handling unknown transaction types + \item Building accepted and rejected lists +\end{itemize} +\begin{lstlisting} +def proc(accs,txns): + ok=[];bad=[] + for t in txns: + ... # 35 lines of nested if/elif/else with continue + return accs,ok,bad +\end{lstlisting} +\end{badbox} + +\begin{goodbox} +The good version splits this into \textbf{seven focused functions}: +\begin{lstlisting} +def find_account(accounts, account_id): # lookup +def validate_common(account, amount): # shared validation +def process_deposit(accounts, transaction): # deposit logic +def process_withdrawal(accounts, transaction):# withdrawal logic +def process_transfer(accounts, transaction): # transfer logic +def process_all_transactions(accounts, transactions): # orchestration +def print_results(accounts, accepted, rejected): # output +\end{lstlisting} +A dispatch dictionary replaces the \texttt{if/elif} chain: +\begin{lstlisting} +TRANSACTION_HANDLERS = { + "deposit": process_deposit, + "withdrawal": process_withdrawal, + "transfer": process_transfer, +} +\end{lstlisting} +\end{goodbox} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{SRP (Single Responsibility Principle)}: Each function should have one reason to change. 
+ \item \textbf{DRY (Don't Repeat Yourself)}: The amount validation (\texttt{amt<=0}) is duplicated for deposits and transfers in the bad version; \texttt{validate\_common()} eliminates this. + \item \textbf{Clean Code -- Short Functions}: Functions should be comprehensible in a few minutes. + \item \textbf{Open-Closed Principle}: Adding a new transaction type in the bad version requires modifying the \texttt{proc()} function. In the good version, you add a new handler function and register it in the dictionary. +\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 8: Magic Strings Instead of Constants} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +if a['status']!='active': # magic string + ... +if tp=='deposit': # magic string + ... +\end{lstlisting} +The strings \texttt{'active'}, \texttt{'deposit'}, \texttt{'withdrawal'}, and \texttt{'transfer'} appear throughout the code as \textbf{literals}. If the status name ever changed, every occurrence would need to be found and updated. +\end{badbox} + +\begin{goodbox} +\begin{lstlisting} +ACTIVE_STATUS = "active" +... +if account["status"] != ACTIVE_STATUS: +\end{lstlisting} +Transaction types are handled via the \texttt{TRANSACTION\_HANDLERS} dictionary, so the string literals appear only \textbf{once} in the handler registration. +\end{goodbox} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{Clean Code -- No Magic Numbers/Strings}: Use named constants for values that carry domain meaning. + \item \textbf{DRY}: The same literal repeated in multiple places is a maintenance risk. +\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 9: Comparison with \texttt{None}} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +if a==None: + ... +if ta==None: + ... 
+\end{lstlisting} +\end{badbox} + +\begin{goodbox} +\begin{lstlisting} +if account is None: + ... +if target is None: + ... +\end{lstlisting} +\end{goodbox} + +PEP\,8 explicitly states: ``Comparisons to singletons like \texttt{None} should always be done with \texttt{is} or \texttt{is not}, never the equality operators.'' +The \texttt{is} operator checks \textbf{identity} (the correct test for \texttt{None}), while \texttt{==} checks \textbf{equality} and can be overridden by custom \texttt{\_\_eq\_\_} methods. + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{PEP\,8 -- Programming Recommendations}: Use \texttt{is None}, not \texttt{== None}. +\end{itemize} +\end{principlebox} + +% ============================================ +\section{Violation 10: Missing \texttt{\_\_main\_\_} Guard and String Formatting} +% ============================================ + +\begin{badbox} +\begin{lstlisting} +main() +\end{lstlisting} +\begin{lstlisting} +print(" "+a['account_id']+" "+a['holder']+": "+str(a['balance']) + +" "+a['currency']+" ("+a['status']+")") +\end{lstlisting} +\end{badbox} + +\begin{goodbox} +\begin{lstlisting} +if __name__ == "__main__": + main() +\end{lstlisting} +\begin{lstlisting} +print( + f" {account['account_id']} {account['holder']}: " + f"{account['balance']:.2f} {account['currency']} " + f"({account['status']})" +) +\end{lstlisting} +\end{goodbox} + +\textbf{What is wrong:} +\begin{itemize} + \item No \texttt{\_\_main\_\_} guard means importing the module triggers execution. + \item String concatenation with \texttt{+} and \texttt{str()} is harder to read than f-strings. + \item The bad version does not format numbers (\texttt{str(5000.0)} vs.\ \texttt{5000.00}). +\end{itemize} + +\begin{principlebox}[Principles Violated] +\begin{itemize}[nosep] + \item \textbf{Clean Code -- Avoid Side Effects}: Importing should not trigger execution. + \item \textbf{Pythonic Code}: Use f-strings for string formatting. 
+\end{itemize} +\end{principlebox} + +% ============================================ +\section{Summary of Violations} +% ============================================ + +\begin{center} +\small +\begin{tabular}{@{}rp{4.5cm}p{5.5cm}@{}} +\toprule +\textbf{\#} & \textbf{Violation} & \textbf{Principle / PEP\,8 Rule} \\ +\midrule +1 & Unused imports, one-line format & PEP\,8 Imports, KISS \\ +2 & No docstrings, noise comments & PEP\,257, Clean Code Documentation \\ +3 & Implicit data model (raw dicts) & Explicit $>$ Implicit, PEP\,484/589 \\ +4 & Abbreviations, single-letter names & PEP\,8 Naming, Descriptive Names \\ +5 & Semicolons, dense lines, no whitespace & PEP\,8 Whitespace, Zen of Python \\ +6 & Manual file open/close & Pythonic Code, Context Managers \\ +7 & God function (38-line \texttt{proc}) & SRP, DRY, Open-Closed Principle \\ +8 & Magic strings & No Magic Numbers, DRY \\ +9 & \texttt{== None} instead of \texttt{is None} & PEP\,8 Programming Recommendations \\ +10 & No \texttt{\_\_main\_\_} guard, string concat & Side Effects, Pythonic Code \\ +\bottomrule +\end{tabular} +\end{center} + +\end{document} diff --git a/Clean Code exercise/example2_bank/bank_analysis.toc b/Clean Code exercise/example2_bank/bank_analysis.toc new file mode 100644 index 0000000..ae12702 --- /dev/null +++ b/Clean Code exercise/example2_bank/bank_analysis.toc @@ -0,0 +1,13 @@ +\babel@toc {english}{}\relax +\contentsline {section}{\numberline {1}Overview}{2}{section.1}% +\contentsline {section}{\numberline {2}Violation 1: Unused Imports and Import Formatting}{2}{section.2}% +\contentsline {section}{\numberline {3}Violation 2: No Documentation or Docstrings}{2}{section.3}% +\contentsline {section}{\numberline {4}Violation 3: Implicit Data Model}{3}{section.4}% +\contentsline {section}{\numberline {5}Violation 4: Poor Naming}{4}{section.5}% +\contentsline {section}{\numberline {6}Violation 5: Formatting -- Semicolons and Dense Lines}{5}{section.6}% +\contentsline {section}{\numberline 
{7}Violation 6: No Context Managers for File I/O}{6}{section.7}% +\contentsline {section}{\numberline {8}Violation 7: God Function -- Single Responsibility Violation}{7}{section.8}% +\contentsline {section}{\numberline {9}Violation 8: Magic Strings Instead of Constants}{8}{section.9}% +\contentsline {section}{\numberline {10}Violation 9: Comparison with \texttt {None}}{8}{section.10}% +\contentsline {section}{\numberline {11}Violation 10: Missing \texttt {\_\_main\_\_} Guard and String Formatting}{9}{section.11}% +\contentsline {section}{\numberline {12}Summary of Violations}{10}{section.12}% diff --git a/Clean Code exercise/example2_bank/bank_bad.py b/Clean Code exercise/example2_bank/bank_bad.py new file mode 100644 index 0000000..5489cb7 --- /dev/null +++ b/Clean Code exercise/example2_bank/bank_bad.py @@ -0,0 +1,62 @@ +import json,sys,os,copy;from datetime import datetime + +def loadJ(p): + f=open(p,'r');d=json.load(f);f.close();return d + +def saveJ(p,d): + f=open(p,'w');json.dump(d,f,indent=2);f.close() + +def proc(accs,txns): + ok=[];bad=[] + for t in txns: + tp=t['type'];aid=t['account_id'];amt=t['amount'];tid=t['id'] + # find account + a=None + for x in accs: + if x['account_id']==aid:a=x + if a==None: + t['reason']='account not found';bad.append(t);continue + if a['status']!='active': + t['reason']='account not active';bad.append(t);continue + if amt<=0 and tp!='withdrawal': + if tp=='deposit':t['reason']='invalid amount';bad.append(t);continue + if tp=='transfer':t['reason']='invalid amount';bad.append(t);continue + if amt<=0 and tp=='withdrawal': + t['reason']='invalid amount';bad.append(t);continue + if tp=='deposit': + a['balance']=a['balance']+amt;t['status']='accepted';ok.append(t) + elif tp=='withdrawal': + if a['balance']>=amt: + a['balance']=a['balance']-amt;t['status']='accepted';ok.append(t) + else: + t['reason']='insufficient funds';t['status']='rejected';bad.append(t) + elif tp=='transfer': + ta=None + for x in accs: + if 
x['account_id']==t.get('to_account_id',''):ta=x + if ta==None:t['reason']='target account not found';bad.append(t);continue + if ta['status']!='active':t['reason']='target account not active';bad.append(t);continue + if a['balance']>=amt: + a['balance']=a['balance']-amt;ta['balance']=ta['balance']+amt + t['status']='accepted';ok.append(t) + else: + t['reason']='insufficient funds';t['status']='rejected';bad.append(t) + else: + t['reason']='unknown type';bad.append(t) + return accs,ok,bad + +def main(): + D=loadJ('accounts.json');T=loadJ('transactions.json') + accs=D['accounts'];txns=T['transactions'] + accs,ok,bad=proc(accs,txns) + # print results + print("=== UPDATED ACCOUNTS ===") + for a in accs:print(" "+a['account_id']+" "+a['holder']+": "+str(a['balance'])+" "+a['currency']+" ("+a['status']+")") + print("\n=== ACCEPTED ("+str(len(ok))+") ===") + for t in ok:print(" "+t['id']+" "+t['type']+" "+str(t['amount'])+" -> "+t.get('description','')) + print("\n=== REJECTED ("+str(len(bad))+") ===") + for t in bad:print(" "+t['id']+" "+t['type']+" "+str(t['amount'])+" -> "+t.get('reason','unknown')) + saveJ('accounts_updated_bad.json',{"accounts":accs}) + saveJ('transaction_log_bad.json',{"accepted":ok,"rejected":bad}) + +main() diff --git a/Clean Code exercise/example2_bank/bank_good.py b/Clean Code exercise/example2_bank/bank_good.py new file mode 100644 index 0000000..0c87662 --- /dev/null +++ b/Clean Code exercise/example2_bank/bank_good.py @@ -0,0 +1,280 @@ +"""Bank account transaction processor. + +Reads account state and a list of transactions from JSON files, +validates and applies each transaction, then writes updated account +state and a transaction log (accepted / rejected) to output files. 
+""" + +import json +from typing import TypedDict, Optional + + +# --------------------------------------------------------------------------- +# Explicit data model -- defines the exact shape of every data structure +# --------------------------------------------------------------------------- + +class Account(TypedDict): + """A bank account with its current state.""" + account_id: str + holder: str + balance: float + currency: str + status: str # "active" or "frozen" + + +class Transaction(TypedDict, total=False): + """A financial transaction to be processed. + + Fields marked total=False are optional (e.g. to_account_id only + exists for transfers; status/reason are added during processing). + """ + id: str + type: str # "deposit", "withdrawal", or "transfer" + account_id: str + amount: float + description: str + to_account_id: str # only for transfers + status: str # added after processing: "accepted" / "rejected" + reason: str # added on rejection + + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +ACCOUNTS_INPUT = "accounts.json" +TRANSACTIONS_INPUT = "transactions.json" +ACCOUNTS_OUTPUT = "accounts_updated_good.json" +TRANSACTION_LOG_OUTPUT = "transaction_log_good.json" + +ACTIVE_STATUS = "active" + + +# --------------------------------------------------------------------------- +# File I/O +# --------------------------------------------------------------------------- + +def load_json(file_path: str) -> dict: + """Read and parse a JSON file, returning the parsed data.""" + with open(file_path, "r", encoding="utf-8") as file_handle: + return json.load(file_handle) + + +def save_json(file_path: str, data: dict) -> None: + """Write data to a JSON file with readable indentation.""" + with open(file_path, "w", encoding="utf-8") as file_handle: + json.dump(data, file_handle, indent=2, ensure_ascii=False) + + +def load_accounts(file_path: 
str) -> list[Account]: + """Load and return the list of accounts from a JSON file.""" + data = load_json(file_path) + return data["accounts"] + + +def load_transactions(file_path: str) -> list[Transaction]: + """Load and return the list of transactions from a JSON file.""" + data = load_json(file_path) + return data["transactions"] + + +# --------------------------------------------------------------------------- +# Account lookup +# --------------------------------------------------------------------------- + +def find_account(accounts: list[Account], account_id: str) -> Optional[Account]: + """Find an account by its ID. Returns the account dict or None.""" + for account in accounts: + if account["account_id"] == account_id: + return account + return None + + +# --------------------------------------------------------------------------- +# Validation +# --------------------------------------------------------------------------- + +def validate_common( + account: Optional[Account], + amount: float, +) -> Optional[str]: + """Run validations shared by all transaction types. + + Returns an error message string, or None if valid. + """ + if account is None: + return "account not found" + + if account["status"] != ACTIVE_STATUS: + return f"account is {account['status']}" + + if amount is None or amount <= 0: + return "amount must be positive" + + return None + + +# --------------------------------------------------------------------------- +# Transaction handlers -- one function per transaction type +# --------------------------------------------------------------------------- + +def process_deposit( + accounts: list[Account], + transaction: Transaction, +) -> tuple[bool, str]: + """Apply a deposit transaction. 
Returns (success, reason).""" + account = find_account(accounts, transaction["account_id"]) + error = validate_common(account, transaction["amount"]) + if error: + return False, error + + account["balance"] += transaction["amount"] + return True, "accepted" + + +def process_withdrawal( + accounts: list[Account], + transaction: Transaction, +) -> tuple[bool, str]: + """Apply a withdrawal transaction. Returns (success, reason).""" + account = find_account(accounts, transaction["account_id"]) + error = validate_common(account, transaction["amount"]) + if error: + return False, error + + if account["balance"] < transaction["amount"]: + return False, "insufficient funds" + + account["balance"] -= transaction["amount"] + return True, "accepted" + + +def process_transfer( + accounts: list[Account], + transaction: Transaction, +) -> tuple[bool, str]: + """Apply a transfer between two accounts. Returns (success, reason).""" + source = find_account(accounts, transaction["account_id"]) + error = validate_common(source, transaction["amount"]) + if error: + return False, f"source: {error}" + + target_id = transaction.get("to_account_id", "") + target = find_account(accounts, target_id) + + if target is None: + return False, "target account not found" + if target["status"] != ACTIVE_STATUS: + return False, f"target account is {target['status']}" + + if source["balance"] < transaction["amount"]: + return False, "insufficient funds" + + source["balance"] -= transaction["amount"] + target["balance"] += transaction["amount"] + return True, "accepted" + + +TRANSACTION_HANDLERS = { + "deposit": process_deposit, + "withdrawal": process_withdrawal, + "transfer": process_transfer, +} + + +# --------------------------------------------------------------------------- +# Processing +# --------------------------------------------------------------------------- + +def process_all_transactions( + accounts: list[Account], + transactions: list[Transaction], +) -> tuple[list[Transaction], 
list[Transaction]]: + """Process a list of transactions against the account state. + + Returns two lists: (accepted_transactions, rejected_transactions). + Each transaction is augmented with 'status' and optionally 'reason'. + """ + accepted: list[Transaction] = [] + rejected: list[Transaction] = [] + + for transaction in transactions: + transaction_type = transaction.get("type", "") + handler = TRANSACTION_HANDLERS.get(transaction_type) + + if handler is None: + transaction["status"] = "rejected" + transaction["reason"] = f"unknown transaction type '{transaction_type}'" + rejected.append(transaction) + continue + + success, reason = handler(accounts, transaction) + + if success: + transaction["status"] = "accepted" + accepted.append(transaction) + else: + transaction["status"] = "rejected" + transaction["reason"] = reason + rejected.append(transaction) + + return accepted, rejected + + +# --------------------------------------------------------------------------- +# Output +# --------------------------------------------------------------------------- + +def print_results( + accounts: list[Account], + accepted: list[Transaction], + rejected: list[Transaction], +) -> None: + """Print a human-readable summary to the console.""" + print("=== UPDATED ACCOUNTS ===") + for account in accounts: + print( + f" {account['account_id']} {account['holder']}: " + f"{account['balance']:.2f} {account['currency']} " + f"({account['status']})" + ) + + print(f"\n=== ACCEPTED TRANSACTIONS ({len(accepted)}) ===") + for txn in accepted: + print( + f" {txn['id']} {txn['type']:12s} {txn['amount']:>10.2f} " + f"{txn.get('description', '')}" + ) + + print(f"\n=== REJECTED TRANSACTIONS ({len(rejected)}) ===") + for txn in rejected: + print( + f" {txn['id']} {txn['type']:12s} {txn['amount']:>10.2f} " + f"Reason: {txn.get('reason', 'unknown')}" + ) + + +# --------------------------------------------------------------------------- +# Main +# 
--------------------------------------------------------------------------- + +def main() -> None: + """Load data, process transactions, print and save results.""" + accounts: list[Account] = load_accounts(ACCOUNTS_INPUT) + transactions: list[Transaction] = load_transactions(TRANSACTIONS_INPUT) + + accepted, rejected = process_all_transactions(accounts, transactions) + + print_results(accounts, accepted, rejected) + + save_json(ACCOUNTS_OUTPUT, {"accounts": accounts}) + save_json(TRANSACTION_LOG_OUTPUT, { + "accepted": accepted, + "rejected": rejected, + }) + + print(f"\nOutput written to {ACCOUNTS_OUTPUT} and {TRANSACTION_LOG_OUTPUT}") + + +if __name__ == "__main__": + main() diff --git a/Clean Code exercise/example2_bank/bank_usecase.aux b/Clean Code exercise/example2_bank/bank_usecase.aux new file mode 100644 index 0000000..dcc0324 --- /dev/null +++ b/Clean Code exercise/example2_bank/bank_usecase.aux @@ -0,0 +1,16 @@ +\relax +\providecommand \babel@aux [2]{\global \let \babel@toc \@gobbletwo } +\@nameuse{bbl@beforestart} +\providecommand\hyper@newdestlabel[2]{} +\providecommand\HyField@AuxAddToFields[1]{} +\providecommand\HyField@AuxAddToCoFields[2]{} +\babel@aux{english}{} +\@writefile{toc}{\contentsline {section}{\numberline {1}Use Case}{1}{section.1}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {2}Input Files}{1}{section.2}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Account State (\texttt {accounts.json})}{1}{subsection.2.1}\protected@file@percent } +\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Transactions (\texttt {transactions.json})}{1}{subsection.2.2}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {3}Validation Rules}{1}{section.3}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {4}Output}{2}{section.4}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {5}Expected 
Results}{2}{section.5}\protected@file@percent } +\@writefile{toc}{\contentsline {section}{\numberline {6}Exercise}{2}{section.6}\protected@file@percent } +\gdef \@abspage@last{3} diff --git a/Clean Code exercise/example2_bank/bank_usecase.out b/Clean Code exercise/example2_bank/bank_usecase.out new file mode 100644 index 0000000..e253279 --- /dev/null +++ b/Clean Code exercise/example2_bank/bank_usecase.out @@ -0,0 +1,8 @@ +\BOOKMARK [1][-]{section.1}{\376\377\000U\000s\000e\000\040\000C\000a\000s\000e}{}% 1 +\BOOKMARK [1][-]{section.2}{\376\377\000I\000n\000p\000u\000t\000\040\000F\000i\000l\000e\000s}{}% 2 +\BOOKMARK [2][-]{subsection.2.1}{\376\377\000A\000c\000c\000o\000u\000n\000t\000\040\000S\000t\000a\000t\000e\000\040\000\050\000a\000c\000c\000o\000u\000n\000t\000s\000.\000j\000s\000o\000n\000\051}{section.2}% 3 +\BOOKMARK [2][-]{subsection.2.2}{\376\377\000T\000r\000a\000n\000s\000a\000c\000t\000i\000o\000n\000s\000\040\000\050\000t\000r\000a\000n\000s\000a\000c\000t\000i\000o\000n\000s\000.\000j\000s\000o\000n\000\051}{section.2}% 4 +\BOOKMARK [1][-]{section.3}{\376\377\000V\000a\000l\000i\000d\000a\000t\000i\000o\000n\000\040\000R\000u\000l\000e\000s}{}% 5 +\BOOKMARK [1][-]{section.4}{\376\377\000O\000u\000t\000p\000u\000t}{}% 6 +\BOOKMARK [1][-]{section.5}{\376\377\000E\000x\000p\000e\000c\000t\000e\000d\000\040\000R\000e\000s\000u\000l\000t\000s}{}% 7 +\BOOKMARK [1][-]{section.6}{\376\377\000E\000x\000e\000r\000c\000i\000s\000e}{}% 8 diff --git a/Clean Code exercise/example2_bank/bank_usecase.pdf b/Clean Code exercise/example2_bank/bank_usecase.pdf new file mode 100644 index 0000000..31bad72 Binary files /dev/null and b/Clean Code exercise/example2_bank/bank_usecase.pdf differ diff --git a/Clean Code exercise/example2_bank/bank_usecase.tex b/Clean Code exercise/example2_bank/bank_usecase.tex new file mode 100644 index 0000000..9131585 --- /dev/null +++ b/Clean Code exercise/example2_bank/bank_usecase.tex @@ -0,0 +1,152 @@ 
+\documentclass[12pt,a4paper]{article} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage[english]{babel} +\usepackage{geometry} +\geometry{margin=2.5cm} +\usepackage{xcolor} +\usepackage{tcolorbox} +\usepackage{booktabs} +\usepackage{hyperref} +\usepackage{listings} + +\definecolor{seblue}{rgb}{0.0,0.28,0.67} +\definecolor{backcolour}{rgb}{0.95,0.95,0.92} + +\lstdefinestyle{json}{ + backgroundcolor=\color{backcolour}, + basicstyle=\ttfamily\small, + breaklines=true, + showstringspaces=false, + tabsize=2 +} + +\title{\textcolor{seblue}{Exercise 2: Bank Account Transaction Processor}\\[0.3em] +\large AISE501 -- AI in Software Engineering I} +\author{Dr.\ Florian Herzog} +\date{Spring Semester 2026} + +\begin{document} +\maketitle + +\section{Use Case} + +A simple bank system maintains a set of customer accounts, each with a balance, currency, and status (\texttt{active} or \texttt{frozen}). +A series of transactions is submitted for processing. +The program must validate each transaction, apply valid ones, reject invalid ones, and produce output files recording the results. + +\section{Input Files} + +\subsection{Account State (\texttt{accounts.json})} + +A JSON file containing an array of account objects: + +\begin{lstlisting}[style=json] +{ + "accounts": [ + { + "account_id": "ACC-001", + "holder": "Alice Mueller", + "balance": 5000.00, + "currency": "CHF", + "status": "active" + }, + ... + ] +} +\end{lstlisting} + +\subsection{Transactions (\texttt{transactions.json})} + +A JSON file containing an array of transaction objects. +Each transaction has a \texttt{type} (\texttt{deposit}, \texttt{withdrawal}, or \texttt{transfer}), an \texttt{account\_id}, an \texttt{amount}, and a \texttt{description}. +Transfers additionally have a \texttt{to\_account\_id}. 
+ +\section{Validation Rules} + +A transaction is \textbf{rejected} if any of these conditions apply: + +\begin{center} +\begin{tabular}{ll} +\toprule +\textbf{Condition} & \textbf{Applies to} \\ +\midrule +Account ID does not exist & All types \\ +Account status is not \texttt{active} & All types \\ +Amount is zero or negative & All types \\ +Balance is less than withdrawal amount & Withdrawal, Transfer \\ +Target account does not exist & Transfer \\ +Target account is not \texttt{active} & Transfer \\ +Unknown transaction type & -- \\ +\bottomrule +\end{tabular} +\end{center} + +\section{Output} + +The program produces: + +\begin{enumerate} + \item \textbf{Console output} -- A summary of updated account balances, accepted transactions, and rejected transactions with reasons. + \item \textbf{Updated account state} (\texttt{accounts\_updated.json}) -- The accounts JSON with balances modified by accepted transactions. + \item \textbf{Transaction log} (\texttt{transaction\_log.json}) -- Two arrays: \texttt{accepted} and \texttt{rejected}, each transaction annotated with its \texttt{status} and (for rejections) a \texttt{reason}. 
+\end{enumerate} + +\section{Expected Results} + +Given the provided input files, the expected outcome is: + +\begin{center} +\small +\begin{tabular}{lllp{5cm}} +\toprule +\textbf{TXN ID} & \textbf{Type} & \textbf{Result} & \textbf{Reason (if rejected)} \\ +\midrule +TXN-001 & deposit & Accepted & -- \\ +TXN-002 & withdrawal & Accepted & -- \\ +TXN-003 & withdrawal & Rejected & Insufficient funds \\ +TXN-004 & deposit & Rejected & Negative amount \\ +TXN-005 & deposit & Rejected & Account is frozen \\ +TXN-006 & transfer & Accepted & -- \\ +TXN-007 & withdrawal & Rejected & Account not found \\ +TXN-008 & deposit & Rejected & Zero amount \\ +\bottomrule +\end{tabular} +\end{center} + +\section{Exercise} + +Two implementations are provided: + +\begin{enumerate} + \item \textbf{\texttt{bank\_bad.py}} -- A working but poorly written version that violates many clean code and PEP\,8 principles. + \item \textbf{\texttt{bank\_good.py}} -- A clean, well-structured version following PEP\,8 and clean code best practices. +\end{enumerate} + +\subsection*{Tasks} + +\begin{enumerate} + \item Run both programs and verify they produce the same results. + \item Read the bad version and list all clean code / PEP\,8 violations you can find. + \item For each violation, explain which principle is broken and why it makes the code harder to read or maintain. + \item Compare your list with the good version to see how each issue was resolved. 
+\end{enumerate} + +\subsection*{Violations to Look For} + +\begin{itemize} + \item Unused imports (\texttt{sys}, \texttt{os}, \texttt{copy}, \texttt{datetime}) + \item No docstrings or module documentation + \item Single-letter and abbreviated variable names (\texttt{a}, \texttt{t}, \texttt{d}, \texttt{tp}, \texttt{tid}) + \item Multiple statements per line (semicolons) + \item No whitespace around operators and after commas + \item Manual file open/close instead of context managers (\texttt{with}) + \item One giant function doing all validation (violates Single Responsibility) + \item Duplicated validation logic for deposit/transfer amount checks + \item No constants for file paths + \item Missing \texttt{if \_\_name\_\_ == "\_\_main\_\_"} guard + \item Inconsistent error handling and status assignment + \item Hard-to-follow control flow with nested \texttt{if}/\texttt{elif}/\texttt{continue} +\end{itemize} + +\end{document} diff --git a/Clean Code exercise/example2_bank/transaction_log_bad.json b/Clean Code exercise/example2_bank/transaction_log_bad.json new file mode 100644 index 0000000..295e094 --- /dev/null +++ b/Clean Code exercise/example2_bank/transaction_log_bad.json @@ -0,0 +1,72 @@ +{ + "accepted": [ + { + "id": "TXN-001", + "type": "deposit", + "account_id": "ACC-001", + "amount": 500.0, + "description": "Salary payment", + "status": "accepted" + }, + { + "id": "TXN-002", + "type": "withdrawal", + "account_id": "ACC-001", + "amount": 200.0, + "description": "ATM withdrawal", + "status": "accepted" + }, + { + "id": "TXN-006", + "type": "transfer", + "account_id": "ACC-001", + "to_account_id": "ACC-002", + "amount": 750.0, + "description": "Transfer to Bob", + "status": "accepted" + } + ], + "rejected": [ + { + "id": "TXN-003", + "type": "withdrawal", + "account_id": "ACC-002", + "amount": 1500.0, + "description": "Rent payment - exceeds balance", + "reason": "insufficient funds", + "status": "rejected" + }, + { + "id": "TXN-004", + "type": "deposit", + 
"account_id": "ACC-002", + "amount": -100.0, + "description": "Invalid negative deposit", + "reason": "invalid amount" + }, + { + "id": "TXN-005", + "type": "deposit", + "account_id": "ACC-003", + "amount": 1000.0, + "description": "Deposit to frozen account", + "reason": "account not active" + }, + { + "id": "TXN-007", + "type": "withdrawal", + "account_id": "ACC-999", + "amount": 50.0, + "description": "Unknown account", + "reason": "account not found" + }, + { + "id": "TXN-008", + "type": "deposit", + "account_id": "ACC-001", + "amount": 0, + "description": "Zero-amount deposit", + "reason": "invalid amount" + } + ] +} \ No newline at end of file diff --git a/Clean Code exercise/example2_bank/transaction_log_good.json b/Clean Code exercise/example2_bank/transaction_log_good.json new file mode 100644 index 0000000..edca35a --- /dev/null +++ b/Clean Code exercise/example2_bank/transaction_log_good.json @@ -0,0 +1,76 @@ +{ + "accepted": [ + { + "id": "TXN-001", + "type": "deposit", + "account_id": "ACC-001", + "amount": 500.0, + "description": "Salary payment", + "status": "accepted" + }, + { + "id": "TXN-002", + "type": "withdrawal", + "account_id": "ACC-001", + "amount": 200.0, + "description": "ATM withdrawal", + "status": "accepted" + }, + { + "id": "TXN-006", + "type": "transfer", + "account_id": "ACC-001", + "to_account_id": "ACC-002", + "amount": 750.0, + "description": "Transfer to Bob", + "status": "accepted" + } + ], + "rejected": [ + { + "id": "TXN-003", + "type": "withdrawal", + "account_id": "ACC-002", + "amount": 1500.0, + "description": "Rent payment - exceeds balance", + "status": "rejected", + "reason": "insufficient funds" + }, + { + "id": "TXN-004", + "type": "deposit", + "account_id": "ACC-002", + "amount": -100.0, + "description": "Invalid negative deposit", + "status": "rejected", + "reason": "amount must be positive" + }, + { + "id": "TXN-005", + "type": "deposit", + "account_id": "ACC-003", + "amount": 1000.0, + "description": "Deposit to 
frozen account", + "status": "rejected", + "reason": "account is frozen" + }, + { + "id": "TXN-007", + "type": "withdrawal", + "account_id": "ACC-999", + "amount": 50.0, + "description": "Unknown account", + "status": "rejected", + "reason": "account not found" + }, + { + "id": "TXN-008", + "type": "deposit", + "account_id": "ACC-001", + "amount": 0, + "description": "Zero-amount deposit", + "status": "rejected", + "reason": "amount must be positive" + } + ] +} \ No newline at end of file diff --git a/Clean Code exercise/example2_bank/transactions.json b/Clean Code exercise/example2_bank/transactions.json new file mode 100644 index 0000000..340e3fa --- /dev/null +++ b/Clean Code exercise/example2_bank/transactions.json @@ -0,0 +1,61 @@ +{ + "transactions": [ + { + "id": "TXN-001", + "type": "deposit", + "account_id": "ACC-001", + "amount": 500.00, + "description": "Salary payment" + }, + { + "id": "TXN-002", + "type": "withdrawal", + "account_id": "ACC-001", + "amount": 200.00, + "description": "ATM withdrawal" + }, + { + "id": "TXN-003", + "type": "withdrawal", + "account_id": "ACC-002", + "amount": 1500.00, + "description": "Rent payment - exceeds balance" + }, + { + "id": "TXN-004", + "type": "deposit", + "account_id": "ACC-002", + "amount": -100.00, + "description": "Invalid negative deposit" + }, + { + "id": "TXN-005", + "type": "deposit", + "account_id": "ACC-003", + "amount": 1000.00, + "description": "Deposit to frozen account" + }, + { + "id": "TXN-006", + "type": "transfer", + "account_id": "ACC-001", + "to_account_id": "ACC-002", + "amount": 750.00, + "description": "Transfer to Bob" + }, + { + "id": "TXN-007", + "type": "withdrawal", + "account_id": "ACC-999", + "amount": 50.00, + "description": "Unknown account" + }, + { + "id": "TXN-008", + "type": "deposit", + "account_id": "ACC-001", + "amount": 0, + "description": "Zero-amount deposit" + } + ] +} diff --git a/Code embeddings/00_tokens_and_embeddings_intro.py b/Code 
embeddings/00_tokens_and_embeddings_intro.py new file mode 100644 index 0000000..825b4e6 --- /dev/null +++ b/Code embeddings/00_tokens_and_embeddings_intro.py @@ -0,0 +1,486 @@ +""" +============================================================================ +Example 0: Tokens, Embeddings, and Language Similarity — An Introduction +============================================================================ +AISE501 – AI in Software Engineering I +Fachhochschule Graubünden + +GOAL: + Before we look at CODE embeddings, we need to understand the + foundational concepts: tokenization and text embeddings. This script + walks through the full pipeline step by step, using German words + and phrases so you can build intuition in your native language. + + The pipeline is: Text → Tokens → Token IDs → Embedding Vectors + +WHAT YOU WILL LEARN: + 1. How text is split into TOKENS (sub-word units) + 2. How tokens are mapped to integer IDs (the model's vocabulary) + 3. How token IDs become dense EMBEDDING VECTORS (768 dimensions) + 4. How cosine similarity measures meaning — similar phrases are + close in vector space, different phrases are far apart + 5. How to VISUALIZE the embedding space in 2D using PCA + +LANGUAGE: + All examples use German words and phrases to make the concepts + tangible. The model (multilingual) handles German natively. + +HARDWARE: + Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac). 
+============================================================================ +""" + +import torch +import numpy as np +from transformers import AutoTokenizer, AutoModel, BertTokenizer +import torch.nn.functional as F +from sklearn.decomposition import PCA +import matplotlib +import matplotlib.pyplot as plt + +matplotlib.use("Agg") + +# ── Device selection ────────────────────────────────────────────────────── +def get_device(): + if torch.cuda.is_available(): + return torch.device("cuda") + elif torch.backends.mps.is_available(): + return torch.device("mps") + return torch.device("cpu") + +DEVICE = get_device() +print(f"Using device: {DEVICE}\n") + +# ── Load a MULTILINGUAL EMBEDDING model ─────────────────────────────────── +# We use paraphrase-multilingual-mpnet-base-v2: a sentence embedding model +# fine-tuned for semantic similarity across 50+ languages including German. +# It uses an XLM-RoBERTa backbone and produces 768-dimensional embeddings +# where cosine similarity directly reflects semantic similarity. +MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" + +print(f"Loading model: {MODEL_NAME} ...") +tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) +model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE) +model.eval() +print("Model loaded.\n") + +# ── Load a German-only tokenizer for comparison ────────────────────────── +# gbert-base uses WordPiece trained exclusively on German text (~31k vocab). +# We only load its tokenizer — no model weights needed. 
+GERMAN_TOKENIZER_NAME = "deepset/gbert-base" +print(f"Loading German tokenizer: {GERMAN_TOKENIZER_NAME} ...") +german_tokenizer = BertTokenizer.from_pretrained(GERMAN_TOKENIZER_NAME) +print("German tokenizer loaded.\n") + + +# ══════════════════════════════════════════════════════════════════════════ +# PART 1: TOKENIZATION — How text becomes numbers +# ══════════════════════════════════════════════════════════════════════════ +print("=" * 70) +print("PART 1: TOKENIZATION") +print("=" * 70) +print(""" +Neural networks cannot read text — they only understand numbers. +TOKENIZATION is the first step: splitting text into sub-word pieces +called TOKENS, then mapping each token to an integer ID. + +We compare two tokenizers: + • gbert (German-only, ~31k vocab) — trained exclusively on German text + • mpnet (multilingual, ~250k vocab) — trained on 100+ languages +""") + +german_words = [ + "Fachhochschule", + "Softwareentwicklung", + "Künstliche Intelligenz", + "Programmiersprache", + "Datenbank", + "Maschinelles Lernen", + "Graubünden", + "unhappiness", # English comparison +] + +# ── 1a: German-only tokenizer (gbert / WordPiece) ──────────────────────── +print("─── 1a: German-Only Tokenizer (gbert, WordPiece, 31k vocab) ───\n") +print(f"{'Word/Phrase':<28s} {'#':>3s} {'Tokens'}") +print("-" * 90) + +for word in german_words: + ids = german_tokenizer.encode(word, add_special_tokens=False) + toks = german_tokenizer.convert_ids_to_tokens(ids) + print(f"{word:<28s} {len(toks):3d} {' | '.join(toks)}") + +# ── 1b: Multilingual tokenizer (mpnet / SentencePiece) ─────────────────── +print(f"\n─── 1b: Multilingual Tokenizer (mpnet, SentencePiece, 250k vocab) ───\n") +print(f"{'Word/Phrase':<28s} {'#':>3s} {'Tokens'}") +print("-" * 90) + +for word in german_words: + ids = tokenizer.encode(word, add_special_tokens=False) + toks = tokenizer.convert_ids_to_tokens(ids) + print(f"{word:<28s} {len(toks):3d} {' | '.join(toks)}") + +print(""" +KEY OBSERVATIONS: + • The GERMAN tokenizer 
keeps common words intact: "Fachhochschule" is + a SINGLE token, "Programmiersprache" splits at the natural compound + boundary "Programmier" + "sprache". + • The MULTILINGUAL tokenizer fragments German more aggressively: + "Fachhochschule" → 4 tokens ("Fach", "ho", "ch", "schule"), because + its 250k vocabulary is shared across 100+ languages — German gets + a smaller budget per word. + • Both tokenizers use STATISTICAL sub-word splitting (not morphological + analysis). The German tokenizer simply has more German-specific + entries because its entire vocabulary is dedicated to one language. + • Trade-off: the multilingual tokenizer needs more tokens per German + word, but it enables CROSS-LINGUAL capabilities (comparing German + and English in the same embedding space — see Part 3b). + • The rest of this script uses the multilingual model for embeddings. +""") + + +# ══════════════════════════════════════════════════════════════════════════ +# PART 2: FROM TOKENS TO EMBEDDING VECTORS +# ══════════════════════════════════════════════════════════════════════════ +print("=" * 70) +print("PART 2: FROM TOKENS TO EMBEDDING VECTORS") +print("=" * 70) +print(""" +Each token ID is looked up in an EMBEDDING TABLE — a large matrix where +each row is a dense vector (768 dimensions in this model, up to 4096 in +large LLMs). The transformer then refines these vectors through 12 layers +of self-attention, producing contextual embeddings where each token's +vector depends on ALL surrounding tokens. 
+""") + +example_sentence = "Der Student lernt Programmieren an der Fachhochschule" + +inputs = tokenizer(example_sentence, return_tensors="pt").to(DEVICE) +token_ids = inputs["input_ids"].squeeze().tolist() +tokens = tokenizer.convert_ids_to_tokens(token_ids) + +with torch.no_grad(): + outputs = model(**inputs) + +# outputs.last_hidden_state: shape [1, num_tokens, 768] +hidden_states = outputs.last_hidden_state.squeeze(0) + +print(f'Sentence: "{example_sentence}"\n') +print(f"{'Pos':>4s} {'Token':<20s} {'ID':>7s} {'Vector (first 8 of 768 dims)...'}") +print("-" * 80) + +for i, (tok, tid) in enumerate(zip(tokens, token_ids)): + vec = hidden_states[i].cpu().numpy() + vec_preview = " ".join(f"{v:+.3f}" for v in vec[:8]) + print(f"{i:4d} {tok:<20s} {tid:7d} [{vec_preview} ...]") + +print(f""" +KEY OBSERVATIONS: + • Each token becomes a vector of {hidden_states.shape[1]} numbers. + • These numbers are NOT random — they encode the token's meaning + IN CONTEXT. The vector for "Fachhochschule" here is different from + the vector for "Fachhochschule" in a different sentence. + • The full sentence has {len(tokens)} tokens, producing a matrix of + shape [{len(tokens)} × {hidden_states.shape[1]}]. + • To get a single vector for the whole sentence, we average all + token vectors (mean pooling). +""") + + +# ══════════════════════════════════════════════════════════════════════════ +# PART 3: MEASURING SIMILARITY BETWEEN WORDS +# ══════════════════════════════════════════════════════════════════════════ +print("=" * 70) +print("PART 3: WORD AND PHRASE SIMILARITY") +print("=" * 70) +print(""" +If embeddings capture meaning, then SIMILAR words should have SIMILAR +vectors (high cosine similarity) and DIFFERENT words should have +DIFFERENT vectors (low cosine similarity). Let's test this with German. 
+""") + + +def embed_text(text: str) -> torch.Tensor: + """Embed a word or phrase into a single normalized vector.""" + inputs = tokenizer(text, return_tensors="pt", truncation=True, + max_length=128, padding=True).to(DEVICE) + with torch.no_grad(): + outputs = model(**inputs) + mask = inputs["attention_mask"].unsqueeze(-1) + embedding = (outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1) + return F.normalize(embedding, p=2, dim=1).squeeze(0) + + +# ── 3a: Single word similarities ───────────────────────────────────────── +print("─── 3a: Single Word Similarities ───\n") + +word_pairs = [ + # Semantically SIMILAR pairs (synonyms or near-synonyms) + ("Auto", "Fahrzeug"), # car / vehicle — near-synonyms + ("Arzt", "Doktor"), # physician / doctor — synonyms + ("Programmierer", "Entwickler"), # programmer / developer + ("schnell", "rasch"), # fast / swift — synonyms + ("Haus", "Gebäude"), # house / building — closely related + + # SAME CATEGORY but different concepts + ("Hund", "Katze"), # dog / cat — both pets, but different! + ("Montag", "Freitag"), # Monday / Friday — both weekdays + + # Semantically UNRELATED pairs + ("Hund", "Mathematik"), # dog vs math + ("Auto", "Philosophie"), # car vs philosophy + ("schnell", "Datenbank"), # fast vs database +] + +print(f"{'Word A':<20s} {'Word B':<20s} {'Cosine Sim':>10s} {'Relationship'}") +print("-" * 75) + +for w1, w2 in word_pairs: + v1, v2 = embed_text(w1), embed_text(w2) + sim = torch.dot(v1.cpu(), v2.cpu()).item() + if sim > 0.6: + rel = "synonyms/close" + elif sim > 0.3: + rel = "related" + else: + rel = "unrelated" + bar = "█" * int(max(0, sim) * 30) + print(f"{w1:<20s} {w2:<20s} {sim:10.3f} {bar} ({rel})") + +print(""" +KEY OBSERVATIONS: + → Synonyms (Auto/Fahrzeug, Arzt/Doktor) have HIGHEST similarity. + → Same-category but different concepts (Hund/Katze) have MODERATE + similarity — they share context (both are pets) but a dog is NOT + a cat. The model captures this nuance! 
+ → Completely unrelated words (Hund/Mathematik) have LOW similarity. + → Embedding similarity reflects MEANING OVERLAP, not just category. +""") + +# ── 3b: Phrase/sentence similarities ───────────────────────────────────── +print("─── 3b: Phrase and Sentence Similarities ───\n") + +phrases = { + "ML_de": "Maschinelles Lernen ist ein Teilgebiet der Informatik", + "ML_en": "Machine learning is a subfield of computer science", + "DL_de": "Deep Learning verwendet neuronale Netze mit vielen Schichten", + "Koch": "Der Koch bereitet das Abendessen in der Küche vor", + "Wetter": "Morgen wird es regnen und kalt sein", + "Prog": "Python ist eine beliebte Programmiersprache", +} + +phrase_embeddings = {name: embed_text(text) for name, text in phrases.items()} + +names = list(phrases.keys()) +print(f"{'':>10s}", end="") +for n in names: + print(f"{n:>10s}", end="") +print() + +for n1 in names: + print(f"{n1:>10s}", end="") + for n2 in names: + sim = torch.dot(phrase_embeddings[n1].cpu(), + phrase_embeddings[n2].cpu()).item() + print(f"{sim:10.3f}", end="") + print() + +print(""" +KEY OBSERVATIONS: + • "Maschinelles Lernen..." (German) and "Machine learning..." (English) + should have HIGH similarity — the model understands both languages + and maps equivalent meanings to nearby vectors. + • ML and Deep Learning sentences should be moderately similar (related + topics in computer science). + • The cooking sentence and weather sentence should be DISSIMILAR to + the tech sentences — completely different topics. + • This CROSS-LINGUAL capability is what makes multilingual embeddings + so powerful. +""") + + +# ══════════════════════════════════════════════════════════════════════════ +# PART 4: VISUALIZING THE EMBEDDING SPACE +# ══════════════════════════════════════════════════════════════════════════ +print("=" * 70) +print("PART 4: VISUALIZING THE EMBEDDING SPACE") +print("=" * 70) +print(""" +768 dimensions are impossible to visualize. 
We use PCA to project the +vectors down to 2D while preserving as much structure as possible. +If the embeddings truly capture meaning, we should see CLUSTERS of +related words in the 2D plot. +""") + +# Groups of German words organized by semantic category +word_groups = { + "Tiere": ["Hund", "Katze", "Pferd", "Vogel", "Fisch", "Kuh"], + "Technik": ["Computer", "Software", "Programmieren", "Datenbank", + "Algorithmus", "Internet"], + "Essen": ["Brot", "Käse", "Apfel", "Suppe", "Kuchen", "Wurst"], + "Natur": ["Berg", "Fluss", "Wald", "See", "Wiese", "Schnee"], + "Berufe": ["Arzt", "Lehrer", "Ingenieur", "Koch", "Pilot", "Anwalt"], +} + +all_words = [] +all_categories = [] +all_vectors = [] + +print("Computing embeddings for word groups...") +for category, words in word_groups.items(): + for word in words: + vec = embed_text(word).cpu().numpy() + all_words.append(word) + all_categories.append(category) + all_vectors.append(vec) + print(f" {category}: {', '.join(words)}") + +X = np.stack(all_vectors) +print(f"\nEmbedding matrix: {X.shape[0]} words × {X.shape[1]} dimensions") + +# ── PCA to 2D ──────────────────────────────────────────────────────────── +pca = PCA(n_components=2) +X_2d = pca.fit_transform(X) + +# ── Plot ────────────────────────────────────────────────────────────────── +category_names = list(word_groups.keys()) +cmap = plt.cm.Set1 +colors = {cat: cmap(i / len(category_names)) for i, cat in enumerate(category_names)} + +fig, ax = plt.subplots(figsize=(12, 9)) + +for i, (word, cat) in enumerate(zip(all_words, all_categories)): + x, y = X_2d[i] + ax.scatter(x, y, c=[colors[cat]], s=120, edgecolors="black", + linewidth=0.5, zorder=3) + ax.annotate(word, (x, y), fontsize=9, ha="center", va="bottom", + xytext=(0, 7), textcoords="offset points", + fontweight="bold") + +for cat in category_names: + ax.scatter([], [], c=[colors[cat]], s=100, label=cat, + edgecolors="black", linewidth=0.5) + +ax.legend(loc="best", fontsize=11, title="Kategorie", 
title_fontsize=12, + framealpha=0.9) + +var = pca.explained_variance_ratio_ +ax.set_title( + "Deutsche Wörter im Embedding-Raum (768D → 2D via PCA)\n" + f"PC1: {var[0]:.1%} Varianz, PC2: {var[1]:.1%} Varianz", + fontsize=14, fontweight="bold" +) +ax.set_xlabel("Hauptkomponente 1 (PC1)", fontsize=12) +ax.set_ylabel("Hauptkomponente 2 (PC2)", fontsize=12) +ax.grid(True, alpha=0.3) +fig.tight_layout() +fig.savefig("embedding_space_german.png", dpi=150) +print(f"\nSaved: embedding_space_german.png") + +# ── Second plot: Phrases including cross-lingual ────────────────────────── +print("\nComputing phrase embeddings for visualization...") + +viz_phrases = { + # German CS phrases + "Maschinelles Lernen": "Technik (DE)", + "Neuronale Netze": "Technik (DE)", + "Softwareentwicklung": "Technik (DE)", + "Künstliche Intelligenz": "Technik (DE)", + # English equivalents + "Machine Learning": "Technik (EN)", + "Neural Networks": "Technik (EN)", + "Software Development": "Technik (EN)", + "Artificial Intelligence": "Technik (EN)", + # German everyday phrases + "Guten Morgen": "Alltag (DE)", + "Wie geht es Ihnen": "Alltag (DE)", + "Das Wetter ist schön": "Alltag (DE)", + "Ich gehe einkaufen": "Alltag (DE)", + # English everyday phrases + "Good morning": "Alltag (EN)", + "How are you": "Alltag (EN)", + "The weather is nice": "Alltag (EN)", + "I am going shopping": "Alltag (EN)", +} + +phrase_labels = list(viz_phrases.keys()) +phrase_cats = list(viz_phrases.values()) +phrase_vecs = np.stack([embed_text(p).cpu().numpy() for p in phrase_labels]) + +pca2 = PCA(n_components=2) +P_2d = pca2.fit_transform(phrase_vecs) + +cat_colors = { + "Technik (DE)": "#1f77b4", + "Technik (EN)": "#aec7e8", + "Alltag (DE)": "#d62728", + "Alltag (EN)": "#ff9896", +} + +fig2, ax2 = plt.subplots(figsize=(12, 9)) + +for i, (label, cat) in enumerate(zip(phrase_labels, phrase_cats)): + x, y = P_2d[i] + marker = "o" if "(DE)" in cat else "s" # circle=German, square=English + ax2.scatter(x, y, 
c=cat_colors[cat], s=140, marker=marker, + edgecolors="black", linewidth=0.5, zorder=3) + ax2.annotate(label, (x, y), fontsize=8, ha="center", va="bottom", + xytext=(0, 8), textcoords="offset points") + +for cat, color in cat_colors.items(): + marker = "o" if "(DE)" in cat else "s" + ax2.scatter([], [], c=color, s=100, marker=marker, label=cat, + edgecolors="black", linewidth=0.5) + +ax2.legend(loc="best", fontsize=10, title="Kategorie & Sprache", + title_fontsize=11, framealpha=0.9) + +var2 = pca2.explained_variance_ratio_ +ax2.set_title( + "Cross-lingual Embeddings: Deutsche & Englische Phrasen\n" + f"PC1: {var2[0]:.1%} Varianz, PC2: {var2[1]:.1%} Varianz", + fontsize=14, fontweight="bold" +) +ax2.set_xlabel("Hauptkomponente 1 (PC1)", fontsize=12) +ax2.set_ylabel("Hauptkomponente 2 (PC2)", fontsize=12) +ax2.grid(True, alpha=0.3) +fig2.tight_layout() +fig2.savefig("embedding_space_crosslingual.png", dpi=150) +print(f"Saved: embedding_space_crosslingual.png") + +print(f""" +{'=' * 70} +SUMMARY: THE FULL PIPELINE +{'=' * 70} + + Text → Tokens → Token IDs → Embeddings + "Fachhochschule" [▁Fach, ho, [28356, 497, [0.012, -0.34, + ch, schule] 206, 72460] 0.88, ...] + (768 dimensions) + + 1. TOKENIZATION splits text into statistical sub-word pieces. + → Splits are based on frequency, not German morphology. + → Each token maps to an integer ID from the vocabulary. + + 2. EMBEDDING VECTORS are 768-dimensional representations of meaning. + → Computed by the transformer's 12 layers of self-attention. + → Similar meanings → nearby vectors (high cosine similarity). + → Different meanings → distant vectors (low cosine similarity). + + 3. COSINE SIMILARITY measures how "aligned" two vectors are. + → 1.0 = identical meaning, 0.0 = unrelated, -1.0 = opposite. + + 4. CROSS-LINGUAL EMBEDDINGS map equivalent phrases in different + languages to nearby vectors. "Maschinelles Lernen" ≈ "Machine + Learning" in embedding space. + + 5. 
The SAME PRINCIPLES apply to CODE EMBEDDINGS (next examples): + → Code is tokenized into sub-word pieces + → A transformer produces embedding vectors + → Similar code has similar vectors + → This enables semantic code search, clone detection, and RAG + + Check the two PNG files for visual confirmation: + • embedding_space_german.png — German word clusters + • embedding_space_crosslingual.png — DE/EN phrase alignment +""") diff --git a/Code embeddings/01_basic_embeddings.py b/Code embeddings/01_basic_embeddings.py new file mode 100644 index 0000000..1b26399 --- /dev/null +++ b/Code embeddings/01_basic_embeddings.py @@ -0,0 +1,231 @@ +""" +============================================================================ +Example 1: Computing Code Embeddings and Measuring Similarity +============================================================================ +AISE501 – AI in Software Engineering I +Fachhochschule Graubünden + +GOAL: + Load a pre-trained code embedding model, embed several code snippets, + and compute pairwise cosine similarities to see which snippets the + model considers semantically similar. + +WHAT YOU WILL LEARN: + - How to load a code embedding model with PyTorch + - How code is tokenized and converted to vectors + - How cosine similarity reveals semantic relationships + - That similar functionality → high similarity, different purpose → low + +HARDWARE: + Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac). 
def get_device() -> torch.device:
    """Pick the fastest PyTorch backend available on this machine.

    Preference order:
      - "cuda" -> NVIDIA GPUs
      - "mps"  -> Apple Silicon GPUs
      - "cpu"  -> always available, slower

    Returns:
        torch.device: the selected device.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    # torch.backends.mps was only added in PyTorch 1.12. On older builds the
    # attribute lookup itself raises AttributeError, so probe for it instead
    # of crashing before we can fall back to the CPU.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")
def embed_code(code_text: "str | list[str]") -> torch.Tensor:
    """
    Convert one code snippet (or a batch of snippets) into embedding vectors.

    Pipeline:
        raw code → tokens → per-token embeddings → one vector → unit vector

    A transformer outputs one vector *per token*; to compare whole snippets
    with cosine similarity we need a single vector per snippet. This function
    handles tokenization, the forward pass, mean pooling and L2 normalization.

    Args:
        code_text: A single snippet, or a non-empty list of snippets that is
            embedded in one batched forward pass.

    Returns:
        For a single string: a 1-D unit vector (768 dimensions for the model
        used in this file). For a list: a 2-D tensor with one unit vector per
        snippet, in input order.
    """
    single_input = isinstance(code_text, str)
    batch = [code_text] if single_input else list(code_text)

    # ── Step 1: Tokenization ──────────────────────────────────────────────
    # padding=True pads every snippet in the batch to the longest one; the
    # attention mask marks real tokens (1) vs. padding (0).
    # truncation/max_length cut inputs off at 512 tokens.
    inputs = tokenizer(
        batch,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    ).to(DEVICE)

    # ── Step 2: Forward pass ──────────────────────────────────────────────
    # no_grad(): inference only, so skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state  # [batch, seq_len, hidden_dim]

    # ── Step 3: Mean pooling over real tokens ─────────────────────────────
    # The model still emits (non-zero) hidden states at padded positions, so
    # they must be masked out before averaging or they would skew the mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # Guard the denominator so a row without any real token cannot divide by 0.
    token_counts = mask.sum(dim=1).clamp(min=1.0)
    embedding = summed / token_counts

    # ── Step 4: L2 normalization ──────────────────────────────────────────
    # With unit-length vectors, cosine similarity reduces to a dot product.
    embedding = F.normalize(embedding, p=2, dim=1)

    # Keep the original contract for a single string: drop the batch dim.
    return embedding[0] if single_input else embedding
+""") diff --git a/Code embeddings/02_text_to_code_search.py b/Code embeddings/02_text_to_code_search.py new file mode 100644 index 0000000..0c3d85d --- /dev/null +++ b/Code embeddings/02_text_to_code_search.py @@ -0,0 +1,251 @@ +""" +============================================================================ +Example 2: Text-to-Code Semantic Search +============================================================================ +AISE501 – AI in Software Engineering I +Fachhochschule Graubünden + +GOAL: + Build a mini code search engine: given a natural language query like + "sort a list", find the most relevant code snippet from a collection. + This is the core mechanism behind semantic code search in tools like + Cursor, GitHub Copilot, and code search engines. + +WHAT YOU WILL LEARN: + - How the SAME embedding model maps both text and code into a shared + vector space — this is what makes text-to-code search possible. + - How to build a simple search index and query it. + - Why embedding-based search beats keyword search for code. + +HARDWARE: + Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac). 
def get_device() -> torch.device:
    """Return the preferred torch device: CUDA, then MPS, then CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    # torch.backends.mps is absent before PyTorch 1.12; look it up safely so
    # older installs fall through to the CPU instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")
+code_database = [ + { + "name": "binary_search", + "code": """ +def binary_search(arr, target): + low, high = 0, len(arr) - 1 + while low <= high: + mid = (low + high) // 2 + if arr[mid] == target: + return mid + elif arr[mid] < target: + low = mid + 1 + else: + high = mid - 1 + return -1 +""" + }, + { + "name": "merge_sort", + "code": """ +def merge_sort(arr): + if len(arr) <= 1: + return arr + mid = len(arr) // 2 + left = merge_sort(arr[:mid]) + right = merge_sort(arr[mid:]) + return merge(left, right) +""" + }, + { + "name": "read_json_file", + "code": """ +import json +def read_json_file(path): + with open(path, 'r') as f: + return json.load(f) +""" + }, + { + "name": "calculate_average", + "code": """ +def calculate_average(numbers): + if not numbers: + return 0.0 + return sum(numbers) / len(numbers) +""" + }, + { + "name": "connect_database", + "code": """ +import sqlite3 +def connect_database(db_path): + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + return conn, cursor +""" + }, + { + "name": "send_http_request", + "code": """ +import requests +def send_http_request(url, method='GET', data=None): + if method == 'GET': + response = requests.get(url) + else: + response = requests.post(url, json=data) + return response.json() +""" + }, + { + "name": "flatten_nested_list", + "code": """ +def flatten(nested_list): + result = [] + for item in nested_list: + if isinstance(item, list): + result.extend(flatten(item)) + else: + result.append(item) + return result +""" + }, + { + "name": "count_words", + "code": """ +def count_words(text): + words = text.lower().split() + word_count = {} + for word in words: + word_count[word] = word_count.get(word, 0) + 1 + return word_count +""" + }, + { + "name": "validate_email", + "code": """ +import re +def validate_email(email): + pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$' + return bool(re.match(pattern, email)) +""" + }, + { + "name": "fibonacci", + "code": """ +def fibonacci(n): + if n <= 1: + 
def embed_text(text: "str | list[str]") -> torch.Tensor:
    """Embed text or code into L2-normalized vector(s).

    Args:
        text: A single string, or a non-empty list of strings that is
            embedded in one batched forward pass.

    Returns:
        A 1-D unit vector for a single string, or a 2-D tensor with one unit
        vector per input (in input order) for a list.
    """
    single_input = isinstance(text, str)
    batch = [text] if single_input else list(text)

    inputs = tokenizer(
        batch, return_tensors="pt", truncation=True, max_length=512, padding=True
    ).to(DEVICE)

    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state

    # Mean pooling over non-padding tokens. The denominator is clamped so a
    # row without real tokens cannot trigger a division by zero.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

    unit = F.normalize(pooled, p=2, dim=1)
    # A single string keeps the original 1-D return shape.
    return unit[0] if single_input else unit
torch.mv(code_matrix.cpu(), query_vec.cpu()) + + # Rank results by similarity (highest first) + ranked_indices = torch.argsort(similarities, descending=True) + + print(f'\nQuery: "{query}"') + print(f" Rank Score Function") + print(f" ---- ----- --------") + for rank, idx in enumerate(ranked_indices[:3]): # show top 3 + score = similarities[idx].item() + name = code_database[idx]["name"] + marker = " ← best match" if rank == 0 else "" + print(f" {rank+1:4d} {score:.3f} {name}{marker}") + +print("\n" + "=" * 70) +print("KEY OBSERVATIONS:") +print("=" * 70) +print(""" +1. The model maps NATURAL LANGUAGE queries and CODE into the same vector + space. This is why "sort a list" finds merge_sort and "find an element + in a sorted array" finds binary_search — even though the queries + contain none of the function identifiers. + +2. This is fundamentally different from grep/keyword search: + - grep "sort" would miss functions named "order" or "arrange" + - grep "find element" would miss "binary_search" + Embeddings understand MEANING, not just string matching. + +3. This is exactly how Cursor, Copilot, and other AI coding tools + retrieve relevant code from your project to feed into the LLM. +""") diff --git a/Code embeddings/03_cross_language.py b/Code embeddings/03_cross_language.py new file mode 100644 index 0000000..593a2ed --- /dev/null +++ b/Code embeddings/03_cross_language.py @@ -0,0 +1,199 @@ +""" +============================================================================ +Example 3: Cross-Language Code Similarity +============================================================================ +AISE501 – AI in Software Engineering I +Fachhochschule Graubünden + +GOAL: + Demonstrate that code embeddings capture FUNCTIONALITY, not syntax. + The same algorithm written in Python, JavaScript, Java, and C++ + should produce similar embedding vectors — even though the surface + syntax is completely different. 
def get_device() -> torch.device:
    """Return the preferred torch device: CUDA, then MPS, then CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    # torch.backends.mps is absent before PyTorch 1.12; look it up safely so
    # older installs fall through to the CPU instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")
def embed_code(code: "str | list[str]") -> torch.Tensor:
    """Embed code into L2-normalized vector(s).

    Args:
        code: A single snippet, or a non-empty list of snippets that is
            embedded in one batched forward pass.

    Returns:
        A 1-D unit vector for a single snippet, or a 2-D tensor with one unit
        vector per snippet (in input order) for a list.
    """
    single_input = isinstance(code, str)
    batch = [code] if single_input else list(code)

    inputs = tokenizer(
        batch, return_tensors="pt", truncation=True, max_length=512, padding=True
    ).to(DEVICE)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state

    # Masked mean pooling: only real (non-padding) tokens contribute. The
    # clamp keeps the division safe if a row has no real tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

    unit = F.normalize(pooled, p=2, dim=1)
    # A single snippet keeps the original 1-D return shape.
    return unit[0] if single_input else unit
{len(embeddings)} embeddings computed.\n") + +# ── Compute similarity matrix ───────────────────────────────────────────── +names = list(embeddings.keys()) +n = len(names) + +print("=" * 70) +print("CROSS-LANGUAGE SIMILARITY MATRIX") +print("=" * 70) + +# Print header (abbreviated names for readability) +short_names = [n.replace("factorial_", "F:").replace("reverse_", "R:") for n in names] + +print(f"\n{'':14s}", end="") +for sn in short_names: + print(f"{sn:>10s}", end="") +print() + +for i in range(n): + print(f"{short_names[i]:14s}", end="") + for j in range(n): + sim = torch.dot(embeddings[names[i]].cpu(), embeddings[names[j]].cpu()).item() + print(f"{sim:10.3f}", end="") + print() + +# ── Compute average within-task and across-task similarities ────────────── +factorial_names = [n for n in names if "factorial" in n] +reverse_names = [n for n in names if "reverse" in n] + +within_factorial = [] +within_reverse = [] +across_tasks = [] + +for i, n1 in enumerate(names): + for j, n2 in enumerate(names): + if i >= j: + continue + sim = torch.dot(embeddings[n1].cpu(), embeddings[n2].cpu()).item() + if n1 in factorial_names and n2 in factorial_names: + within_factorial.append(sim) + elif n1 in reverse_names and n2 in reverse_names: + within_reverse.append(sim) + else: + across_tasks.append(sim) + +print("\n" + "=" * 70) +print("ANALYSIS") +print("=" * 70) +print(f"\nAvg similarity WITHIN factorial (across languages): " + f"{sum(within_factorial)/len(within_factorial):.3f}") +print(f"Avg similarity WITHIN reverse (across languages): " + f"{sum(within_reverse)/len(within_reverse):.3f}") +print(f"Avg similarity ACROSS tasks (factorial vs reverse): " + f"{sum(across_tasks)/len(across_tasks):.3f}") + +print(""" +EXPECTED RESULT: + Within-task similarity should be MUCH HIGHER than across-task similarity. + This proves that the embedding model groups code by WHAT IT DOES, + not by WHAT LANGUAGE it is written in. 
+ + factorial_python ≈ factorial_java ≈ factorial_cpp ≈ factorial_javascript + reverse_python ≈ reverse_java ≈ reverse_cpp ≈ reverse_javascript + factorial_* ≠ reverse_* + + This is what enables cross-language code search: you can find a Java + implementation by providing a Python query, or vice versa. +""") diff --git a/Code embeddings/04_clone_detection.py b/Code embeddings/04_clone_detection.py new file mode 100644 index 0000000..03a7c3a --- /dev/null +++ b/Code embeddings/04_clone_detection.py @@ -0,0 +1,237 @@ +""" +============================================================================ +Example 4: Code Clone Detection +============================================================================ +AISE501 – AI in Software Engineering I +Fachhochschule Graubünden + +GOAL: + Detect code clones (duplicate/similar code) in a collection of + functions using embeddings. We simulate a real-world scenario + where a codebase contains multiple near-duplicate implementations + that should be refactored into a single function. + +WHAT YOU WILL LEARN: + - The four types of code clones (Type 1–4) + - How embeddings detect clones that text-based tools miss + - Ranking-based clone detection via cosine similarity + - Practical application: finding refactoring opportunities + +CLONE TYPES: + Type 1: Exact copy (trivial — grep can find these) + Type 2: Renamed variables (grep misses these) + Type 3: Modified structure (added/removed lines) + Type 4: Same functionality, completely different implementation + +HARDWARE: + Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac). 
def get_device() -> torch.device:
    """Return the preferred torch device: CUDA, then MPS, then CPU."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    # torch.backends.mps is absent before PyTorch 1.12; look it up safely so
    # older installs fall through to the CPU instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    return torch.device("cpu")
+ "utils/find_max.py": """ +def find_max(numbers): + result = numbers[0] + for candidate in numbers[1:]: + if candidate > result: + result = candidate + return result +""", + "legacy/find_max_old.py": """ +def find_max(numbers): + result = numbers[0] + for candidate in numbers[1:]: + if candidate > result: + result = candidate + return result +""", + "analytics/top_scorer.py": """ +import heapq +def fetch_top_element(collection): + return heapq.nlargest(1, collection)[0] +""", + "stats/dominant_value.py": """ +def extract_peak(dataset): + dataset = sorted(dataset, reverse=True) + return dataset[0] +""", + + # ── Clone group 2: String reversal ── + # Two implementations with zero lexical overlap — slicing vs index-based. + "text/flip_text.py": """ +def flip_text(content): + return content[::-1] +""", + "helpers/mirror.py": """ +def mirror_characters(phrase): + output = [] + idx = len(phrase) - 1 + while idx >= 0: + output.append(phrase[idx]) + idx -= 1 + return ''.join(output) +""", + + # ── Not a clone: completely different functionality ── + # Each uses a different Python construct and domain to ensure + # they don't cluster with each other or with the clone groups. 
def embed_code(code: "str | list[str]") -> torch.Tensor:
    """Embed code into L2-normalized vector(s).

    Args:
        code: A single snippet, or a non-empty list of snippets that is
            embedded in one batched forward pass.

    Returns:
        A 1-D unit vector for a single snippet, or a 2-D tensor with one unit
        vector per snippet (in input order) for a list.
    """
    single_input = isinstance(code, str)
    batch = [code] if single_input else list(code)

    inputs = tokenizer(
        batch, return_tensors="pt", truncation=True, max_length=512, padding=True
    ).to(DEVICE)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state

    # Masked mean pooling: only real (non-padding) tokens contribute. The
    # clamp keeps the division safe if a row has no real tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

    unit = F.normalize(pooled, p=2, dim=1)
    # A single snippet keeps the original 1-D return shape.
    return unit[0] if single_input else unit


def short_name(path):
    """Extract a readable label from the file path.

    Takes the last "/"-separated component and drops a trailing ".py".

    Args:
        path: A "/"-separated file path such as "utils/find_max.py".

    Returns:
        The file name without its ".py" extension; names without that
        extension are returned unchanged.
    """
    filename = path.split("/")[-1]
    # Strip only the extension: str.replace(".py", "") would also delete a
    # ".py" that happens to occur in the middle of the name.
    if filename.endswith(".py"):
        return filename[: -len(".py")]
    return filename
─────────────────────────────────────── +print() +print(f"{'BEST MATCH':>{header_w}}", end="") +for i in range(n): + best_j, best_sim = -1, -1.0 + for j in range(n): + if i != j and sim_matrix[(i, j)] > best_sim: + best_sim = sim_matrix[(i, j)] + best_j = j + print(f"{labels[best_j]:>{col_w}}", end="") +print() + +print(f"{'(similarity)':>{header_w}}", end="") +for i in range(n): + best_sim = max(sim_matrix[(i, j)] for j in range(n) if i != j) + print(f"{best_sim:>{col_w}.3f}", end="") +print() + +print(f""" +{'=' * 70} +INTERPRETATION: +{'=' * 70} + +HOW TO READ THE TABLE: + Each cell shows the cosine similarity between two functions. + The BEST MATCH row shows which other function is most similar + to each column — these are the clone candidates a developer + would investigate. + +EXPECTED CLONE GROUPS: + + 1. find_max ↔ find_max_old (Type 1: exact copy) + → Similarity ≈ 1.000 + + 2. find_max / fetch_top_element / extract_peak (Type 4 clones) + → Same purpose (find the largest value), completely different + code: for-loop vs heapq.nlargest() vs sorted(reverse=True) + → Zero shared identifiers between implementations + + 3. flip_text ↔ mirror_characters (Type 4 clone) + → Same purpose (reverse a string), completely different code: + slicing ([::-1]) vs while-loop with index + → Zero shared identifiers + +NON-CLONES: + square_root, is_leap_year, format_currency each use a different + domain and code structure. Their best matches should have low + similarity compared to the clone groups. + +KEY INSIGHT: + The clone groups share NO common words (besides Python keywords + like def/return/if). grep or any text-matching tool would never + find these clones. Only semantic understanding — which is what + embeddings provide — can detect that these functions do the same + thing despite having completely different code. 
+""") diff --git a/Code embeddings/05_visualize_embeddings.py b/Code embeddings/05_visualize_embeddings.py new file mode 100644 index 0000000..fb8d93a --- /dev/null +++ b/Code embeddings/05_visualize_embeddings.py @@ -0,0 +1,216 @@ +""" +============================================================================ +Example 5: Visualizing Code Embeddings with PCA and t-SNE +============================================================================ +AISE501 – AI in Software Engineering I +Fachhochschule Graubünden + +GOAL: + Reduce 768-dimensional code embeddings to 2D and plot them. + This makes the embedding space visible: you can SEE that similar + code clusters together and different code is far apart. + +WHAT YOU WILL LEARN: + - How PCA projects high-dimensional vectors to 2D (linear reduction) + - How t-SNE creates a non-linear 2D map that preserves neighborhoods + - How to interpret embedding space visualizations + - That code functionality determines position, not syntax or language + +OUTPUT: + Saves two PNG plots: code_embeddings_pca.png and code_embeddings_tsne.png + +HARDWARE: + Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac). 
+============================================================================ +""" + +import torch +import numpy as np +from transformers import AutoTokenizer, AutoModel +import torch.nn.functional as F +from sklearn.decomposition import PCA +from sklearn.manifold import TSNE +import matplotlib.pyplot as plt +import matplotlib + +# Use a non-interactive backend so the script works in headless environments +matplotlib.use("Agg") + +# ── Device selection ────────────────────────────────────────────────────── +def get_device(): + if torch.cuda.is_available(): + return torch.device("cuda") + elif torch.backends.mps.is_available(): + return torch.device("mps") + return torch.device("cpu") + +DEVICE = get_device() +print(f"Using device: {DEVICE}\n") + +# ── Load model ──────────────────────────────────────────────────────────── +MODEL_NAME = "flax-sentence-embeddings/st-codesearch-distilroberta-base" +print(f"Loading model: {MODEL_NAME} ...") +tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) +model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE) +model.eval() +print("Model loaded.\n") + +# ── Code snippets organized by CATEGORY ─────────────────────────────────── +# Each category represents a type of task. We expect snippets within the +# same category to cluster together in the embedding space. 
+categories = { + "Sorting": { + "bubble_sort_py": "def bubble_sort(arr):\n n = len(arr)\n for i in range(n):\n for j in range(n-i-1):\n if arr[j] > arr[j+1]:\n arr[j], arr[j+1] = arr[j+1], arr[j]\n return arr", + "quick_sort_py": "def quick_sort(a):\n if len(a) <= 1: return a\n p = a[0]\n return quick_sort([x for x in a[1:] if x < p]) + [p] + quick_sort([x for x in a[1:] if x >= p])", + "sort_js": "function sortArray(arr) { return arr.sort((a, b) => a - b); }", + "insertion_sort": "def insertion_sort(arr):\n for i in range(1, len(arr)):\n key = arr[i]\n j = i - 1\n while j >= 0 and arr[j] > key:\n arr[j+1] = arr[j]\n j -= 1\n arr[j+1] = key\n return arr", + }, + "File I/O": { + "read_json": "import json\ndef read_json(path):\n with open(path) as f:\n return json.load(f)", + "write_file": "def write_file(path, content):\n with open(path, 'w') as f:\n f.write(content)", + "read_csv": "import csv\ndef read_csv(path):\n with open(path) as f:\n return list(csv.reader(f))", + "read_yaml": "import yaml\ndef read_yaml(path):\n with open(path) as f:\n return yaml.safe_load(f)", + }, + "String ops": { + "reverse_str": "def reverse(s): return s[::-1]", + "capitalize": "def capitalize_words(s): return ' '.join(w.capitalize() for w in s.split())", + "count_chars": "def count_chars(s):\n return {c: s.count(c) for c in set(s)}", + "is_palindrome": "def is_palindrome(s): return s == s[::-1]", + }, + "Math": { + "factorial": "def factorial(n):\n r = 1\n for i in range(2, n+1): r *= i\n return r", + "fibonacci": "def fib(n):\n a, b = 0, 1\n for _ in range(n): a, b = b, a+b\n return a", + "gcd": "def gcd(a, b):\n while b: a, b = b, a % b\n return a", + "is_prime": "def is_prime(n):\n if n < 2: return False\n for i in range(2, int(n**0.5)+1):\n if n % i == 0: return False\n return True", + }, + "Networking": { + "http_get": "import requests\ndef http_get(url): return requests.get(url).json()", + "fetch_url": "import urllib.request\ndef fetch(url):\n with urllib.request.urlopen(url) 
as r:\n return r.read().decode()", + "post_data": "import requests\ndef post_json(url, data): return requests.post(url, json=data).status_code", + "download_file": "import urllib.request\ndef download(url, path): urllib.request.urlretrieve(url, path)", + }, +} + + +def embed_code(code: str) -> np.ndarray: + """Embed code via attention-masked mean pooling; return an L2-normalized NumPy vector.""" + inputs = tokenizer( + code, return_tensors="pt", truncation=True, max_length=512, padding=True + ).to(DEVICE) + with torch.no_grad(): + outputs = model(**inputs) + mask = inputs["attention_mask"].unsqueeze(-1) + embedding = (outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1) + return F.normalize(embedding, p=2, dim=1).squeeze(0).cpu().numpy() + + +# ── Compute embeddings ──────────────────────────────────────────────────── +print("Computing embeddings...") +all_embeddings = [] +all_labels = [] +all_categories = [] + +for category, snippets in categories.items(): + for label, code in snippets.items(): + vec = embed_code(code) + all_embeddings.append(vec) + all_labels.append(label) + all_categories.append(category) + print(f" [{category:12s}] {label}") + +# Convert to numpy matrix: shape [num_snippets, 768] +X = np.stack(all_embeddings) +print(f"\nEmbedding matrix: {X.shape[0]} snippets × {X.shape[1]} dimensions\n") + +# ── Color map for categories ────────────────────────────────────────────── +category_names = list(categories.keys()) +colors = plt.cm.Set1(np.linspace(0, 1, len(category_names))) +color_map = {cat: colors[i] for i, cat in enumerate(category_names)} +point_colors = [color_map[cat] for cat in all_categories] + +# ── Plot 1: PCA ────────────────────────────────────────────────────────── +# PCA finds the two directions of maximum variance in the 768-dim space +# and projects all points onto those two directions. 
+print("Computing PCA (2 components)...") +pca = PCA(n_components=2) +X_pca = pca.fit_transform(X) + +fig, ax = plt.subplots(figsize=(10, 8)) +for i, (x, y) in enumerate(X_pca): + ax.scatter(x, y, c=[point_colors[i]], s=100, edgecolors="black", linewidth=0.5, zorder=3) + ax.annotate(all_labels[i], (x, y), fontsize=7, ha="center", va="bottom", + xytext=(0, 6), textcoords="offset points") + +# Legend +for cat in category_names: + ax.scatter([], [], c=[color_map[cat]], s=80, label=cat, edgecolors="black", linewidth=0.5) +ax.legend(loc="best", fontsize=9, title="Category", title_fontsize=10) + +variance_explained = pca.explained_variance_ratio_ +ax.set_title(f"Code Embeddings — PCA Projection\n" + f"(PC1: {variance_explained[0]:.1%} variance, PC2: {variance_explained[1]:.1%} variance)", + fontsize=13) +ax.set_xlabel("Principal Component 1", fontsize=11) +ax.set_ylabel("Principal Component 2", fontsize=11) +ax.grid(True, alpha=0.3) +fig.tight_layout() +fig.savefig("code_embeddings_pca.png", dpi=150) +print(f" Saved: code_embeddings_pca.png") +print(f" Variance explained: PC1={variance_explained[0]:.1%}, PC2={variance_explained[1]:.1%}\n") + +# ── Plot 2: t-SNE ──────────────────────────────────────────────────────── +# t-SNE is a non-linear method that preserves LOCAL neighborhood structure. +# Points that are close in 768-dim space stay close in 2D. +# Perplexity controls the balance between local and global structure. 
+print("Computing t-SNE (this may take a few seconds)...") +tsne = TSNE(n_components=2, perplexity=5, random_state=42, max_iter=1000) +X_tsne = tsne.fit_transform(X) + +fig, ax = plt.subplots(figsize=(10, 8)) +for i, (x, y) in enumerate(X_tsne): + ax.scatter(x, y, c=[point_colors[i]], s=100, edgecolors="black", linewidth=0.5, zorder=3) + ax.annotate(all_labels[i], (x, y), fontsize=7, ha="center", va="bottom", + xytext=(0, 6), textcoords="offset points") + +for cat in category_names: + ax.scatter([], [], c=[color_map[cat]], s=80, label=cat, edgecolors="black", linewidth=0.5) +ax.legend(loc="best", fontsize=9, title="Category", title_fontsize=10) + +ax.set_title("Code Embeddings — t-SNE Projection\n" + "(non-linear dimensionality reduction)", fontsize=13) +ax.set_xlabel("t-SNE Dimension 1", fontsize=11) +ax.set_ylabel("t-SNE Dimension 2", fontsize=11) +ax.grid(True, alpha=0.3) +fig.tight_layout() +fig.savefig("code_embeddings_tsne.png", dpi=150) +print(f" Saved: code_embeddings_tsne.png\n") + +print("=" * 70) +print("INTERPRETATION") +print("=" * 70) +print(f""" +Both plots project {X.shape[1]}-dimensional embedding vectors to 2D: + +PCA (Principal Component Analysis): + - Linear projection onto the two axes of maximum variance. + - Preserves global structure: large distances are meaningful. + - Good for seeing overall separation between categories. + - The % variance tells you how much information is retained. + +t-SNE (t-distributed Stochastic Neighbor Embedding): + - Non-linear: distorts distances but preserves neighborhoods. + - Points that are close in the original space stay close in 2D. + - Better at revealing tight clusters within categories. + - Distances BETWEEN clusters are not meaningful. 
+ +EXPECTED RESULT: + You should see 5 distinct clusters, one per category: + - Sorting functions (bubble, quick, insertion, JS sort) cluster together + - File I/O functions cluster together + - String operations cluster together + - Math functions cluster together + - Networking functions cluster together + + This visually confirms that code embeddings organize code by + PURPOSE, not by surface syntax or programming language. +""") diff --git a/Code embeddings/06_pca_denoising.py b/Code embeddings/06_pca_denoising.py new file mode 100644 index 0000000..88981ae --- /dev/null +++ b/Code embeddings/06_pca_denoising.py @@ -0,0 +1,716 @@ +""" +============================================================================ +Example 6: PCA Denoising — Can Fewer Dimensions Improve Similarity? +============================================================================ +AISE501 – AI in Software Engineering I +Fachhochschule Graubünden + +HYPOTHESIS: + Embedding vectors live in a 768-dimensional space, but most of the + semantic signal may be concentrated in a small number of principal + components. The remaining dimensions could add "noise" that dilutes + cosine similarity. If true, projecting embeddings onto a small PCA + subspace should INCREASE similarity within semantic groups and + DECREASE similarity across groups — making code search sharper. + +WHAT YOU WILL LEARN: + - How PCA decomposes the embedding space into ranked components + - How to measure retrieval quality (intra- vs inter-group similarity) + - Whether dimensionality reduction helps or hurts in practice + - The concept of an "optimal" embedding dimension for a given task + +OUTPUT: + Saves pca_denoising_analysis.png with six sub-plots. + +HARDWARE: + Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac). 
+============================================================================ +""" + +import torch +import numpy as np +from transformers import AutoTokenizer, AutoModel +import torch.nn.functional as F +from sklearn.decomposition import PCA +import matplotlib.pyplot as plt +import matplotlib +matplotlib.use("Agg") + +# ── Device selection ────────────────────────────────────────────────────── +def get_device(): + if torch.cuda.is_available(): + return torch.device("cuda") + elif torch.backends.mps.is_available(): + return torch.device("mps") + return torch.device("cpu") + +DEVICE = get_device() +print(f"Using device: {DEVICE}\n") + +# ── Load model ──────────────────────────────────────────────────────────── +MODEL_NAME = "flax-sentence-embeddings/st-codesearch-distilroberta-base" +print(f"Loading model: {MODEL_NAME} ...") +tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) +model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE) +model.eval() +print("Model loaded.\n") + +# ── Code snippets organized into semantic GROUPS ────────────────────────── +# We need clear groups so we can measure intra-group vs inter-group similarity. 
+groups = { + "Sorting": { + "bubble_sort": """ +def bubble_sort(arr): + n = len(arr) + for i in range(n): + for j in range(0, n - i - 1): + if arr[j] > arr[j + 1]: + arr[j], arr[j + 1] = arr[j + 1], arr[j] + return arr""", + "quick_sort": """ +def quick_sort(arr): + if len(arr) <= 1: + return arr + pivot = arr[len(arr) // 2] + left = [x for x in arr if x < pivot] + middle = [x for x in arr if x == pivot] + right = [x for x in arr if x > pivot] + return quick_sort(left) + middle + quick_sort(right)""", + "merge_sort": """ +def merge_sort(arr): + if len(arr) <= 1: + return arr + mid = len(arr) // 2 + left = merge_sort(arr[:mid]) + right = merge_sort(arr[mid:]) + merged = [] + i = j = 0 + while i < len(left) and j < len(right): + if left[i] <= right[j]: + merged.append(left[i]); i += 1 + else: + merged.append(right[j]); j += 1 + return merged + left[i:] + right[j:]""", + "insertion_sort": """ +def insertion_sort(arr): + for i in range(1, len(arr)): + key = arr[i] + j = i - 1 + while j >= 0 and arr[j] > key: + arr[j + 1] = arr[j] + j -= 1 + arr[j + 1] = key + return arr""", + "selection_sort": """ +def selection_sort(arr): + for i in range(len(arr)): + min_idx = i + for j in range(i + 1, len(arr)): + if arr[j] < arr[min_idx]: + min_idx = j + arr[i], arr[min_idx] = arr[min_idx], arr[i] + return arr""", + "heap_sort": """ +def heap_sort(arr): + import heapq + heapq.heapify(arr) + return [heapq.heappop(arr) for _ in range(len(arr))]""", + }, + "File I/O": { + "read_json": """ +import json +def read_json(path): + with open(path, 'r') as f: + return json.load(f)""", + "write_file": """ +def write_file(path, content): + with open(path, 'w') as f: + f.write(content)""", + "read_csv": """ +import csv +def read_csv(path): + with open(path, 'r') as f: + reader = csv.reader(f) + return list(reader)""", + "read_yaml": """ +import yaml +def load_yaml(path): + with open(path, 'r') as f: + return yaml.safe_load(f)""", + "write_json": """ +import json +def write_json(path, data): + 
with open(path, 'w') as f: + json.dump(data, f, indent=2)""", + "read_lines": """ +def read_lines(path): + with open(path, 'r') as f: + return f.readlines()""", + }, + "Math": { + "factorial": """ +def factorial(n): + if n <= 1: + return 1 + return n * factorial(n - 1)""", + "fibonacci": """ +def fibonacci(n): + a, b = 0, 1 + for _ in range(n): + a, b = b, a + b + return a""", + "gcd": """ +def gcd(a, b): + while b: + a, b = b, a % b + return a""", + "is_prime": """ +def is_prime(n): + if n < 2: + return False + for i in range(2, int(n**0.5) + 1): + if n % i == 0: + return False + return True""", + "power": """ +def power(base, exp): + if exp == 0: + return 1 + if exp % 2 == 0: + half = power(base, exp // 2) + return half * half + return base * power(base, exp - 1)""", + "sum_digits": """ +def sum_digits(n): + total = 0 + while n > 0: + total += n % 10 + n //= 10 + return total""", + }, + "Networking": { + "http_get": """ +import requests +def http_get(url): + response = requests.get(url) + return response.json()""", + "post_data": """ +import requests +def post_data(url, payload): + response = requests.post(url, json=payload) + return response.status_code, response.json()""", + "fetch_url": """ +import urllib.request +def fetch_url(url): + with urllib.request.urlopen(url) as resp: + return resp.read().decode('utf-8')""", + "download_file": """ +import urllib.request +def download_file(url, dest): + urllib.request.urlretrieve(url, dest) + return dest""", + "http_put": """ +import requests +def http_put(url, data): + response = requests.put(url, json=data) + return response.status_code""", + "http_delete": """ +import requests +def http_delete(url): + response = requests.delete(url) + return response.status_code""", + }, + "String ops": { + "reverse_str": """ +def reverse_string(s): + return s[::-1]""", + "is_palindrome": """ +def is_palindrome(s): + s = s.lower().replace(' ', '') + return s == s[::-1]""", + "count_vowels": """ +def count_vowels(s): + return sum(1 
for c in s.lower() if c in 'aeiou')""", + "capitalize_words": """ +def capitalize_words(s): + return ' '.join(w.capitalize() for w in s.split())""", + "remove_duplicates": """ +def remove_duplicate_chars(s): + seen = set() + result = [] + for c in s: + if c not in seen: + seen.add(c) + result.append(c) + return ''.join(result)""", + "count_words": """ +def count_words(text): + words = text.lower().split() + freq = {} + for w in words: + freq[w] = freq.get(w, 0) + 1 + return freq""", + }, + "Data structures": { + "stack_push_pop": """ +class Stack: + def __init__(self): + self.items = [] + def push(self, item): + self.items.append(item) + def pop(self): + return self.items.pop()""", + "queue_impl": """ +from collections import deque +class Queue: + def __init__(self): + self.items = deque() + def enqueue(self, item): + self.items.append(item) + def dequeue(self): + return self.items.popleft()""", + "linked_list": """ +class Node: + def __init__(self, val): + self.val = val + self.next = None +class LinkedList: + def __init__(self): + self.head = None + def append(self, val): + node = Node(val) + if not self.head: + self.head = node + return + curr = self.head + while curr.next: + curr = curr.next + curr.next = node""", + "binary_tree": """ +class TreeNode: + def __init__(self, val): + self.val = val + self.left = None + self.right = None +def inorder(root): + if root: + yield from inorder(root.left) + yield root.val + yield from inorder(root.right)""", + "hash_map": """ +class HashMap: + def __init__(self, size=256): + self.buckets = [[] for _ in range(size)] + def put(self, key, value): + idx = hash(key) % len(self.buckets) + for i, (k, v) in enumerate(self.buckets[idx]): + if k == key: + self.buckets[idx][i] = (key, value) + return + self.buckets[idx].append((key, value))""", + "priority_queue": """ +import heapq +class PriorityQueue: + def __init__(self): + self.heap = [] + def push(self, priority, item): + heapq.heappush(self.heap, (priority, item)) + def 
pop(self): + return heapq.heappop(self.heap)[1]""", + }, +} + + +def embed_code(code: str) -> torch.Tensor: + """Embed code via attention-masked mean pooling; return an L2-normalized vector (still on DEVICE).""" + inputs = tokenizer( + code, return_tensors="pt", truncation=True, max_length=512, padding=True + ).to(DEVICE) + with torch.no_grad(): + outputs = model(**inputs) + mask = inputs["attention_mask"].unsqueeze(-1) + embedding = (outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1) + return F.normalize(embedding, p=2, dim=1).squeeze(0) + + +# ── Step 1: Compute all embeddings ──────────────────────────────────────── +print("Computing embeddings...") +all_names = [] +all_labels = [] +all_vectors = [] + +for group_name, snippets in groups.items(): + for snippet_name, code in snippets.items(): + vec = embed_code(code).cpu().numpy() + all_names.append(snippet_name) + all_labels.append(group_name) + all_vectors.append(vec) + print(f" [{group_name:12s}] {snippet_name}") + +X = np.stack(all_vectors) # shape: [N, 768] +N, D = X.shape +print(f"\nEmbedding matrix: {N} snippets × {D} dimensions\n") + +# ── Step 2: Define similarity metrics ───────────────────────────────────── +def cosine_matrix(vectors: np.ndarray) -> np.ndarray: + """Compute pairwise cosine similarity between rows; each row is L2-normalized here first.""" + norms = np.linalg.norm(vectors, axis=1, keepdims=True) + norms = np.maximum(norms, 1e-10) + normed = vectors / norms + return normed @ normed.T + +def compute_metrics(sim_matrix, labels): + """ + Compute intra-group (same category) and inter-group (different category) + average similarities. The GAP between them measures discriminability. 
+ """ + intra_sims = [] + inter_sims = [] + n = len(labels) + for i in range(n): + for j in range(i + 1, n): + if labels[i] == labels[j]: + intra_sims.append(sim_matrix[i, j]) + else: + inter_sims.append(sim_matrix[i, j]) + intra_mean = np.mean(intra_sims) + inter_mean = np.mean(inter_sims) + gap = intra_mean - inter_mean + return intra_mean, inter_mean, gap + + +# ── Step 3: Sweep across PCA dimensions ────────────────────────────────── +# PCA can have at most min(N, D) components; cap accordingly +max_components = min(N, D) +dims_to_test = sorted(set( + k for k in [2, 3, 5, 8, 10, 15, 20, 30, 50, 75, 100, 150, 200, + 300, 400, 500, 600, D] + if k <= max_components +)) +dims_to_test.append(D) # always include full dimensionality as baseline + +print("=" * 70) +print("PCA DENOISING EXPERIMENT") +print("=" * 70) +print(f"\n{'Components':>12s} {'Intra-Group':>12s} {'Inter-Group':>12s} " + f"{'Gap':>8s} {'vs Full':>8s}") +print("-" * 62) + +results = [] +for k in dims_to_test: + if k >= D: + # Full dimensionality — no PCA, just use original vectors + X_reduced = X.copy() + actual_k = D + else: + pca = PCA(n_components=k, random_state=42) + X_reduced = pca.fit_transform(X) + actual_k = k + + sim = cosine_matrix(X_reduced) + intra, inter, gap = compute_metrics(sim, all_labels) + results.append((actual_k, intra, inter, gap)) + +# Compute full-dim gap for comparison +full_intra, full_inter, full_gap = results[-1][1], results[-1][2], results[-1][3] + +for k, intra, inter, gap in results: + delta = gap - full_gap + delta_str = f"{delta:+.4f}" if k < D else " (base)" + print(f"{k:>12d} {intra:>12.4f} {inter:>12.4f} {gap:>8.4f} {delta_str:>8s}") + +# ── Step 4: Find the optimal dimensionality ────────────────────────────── +dims_arr = np.array([r[0] for r in results]) +gaps_arr = np.array([r[3] for r in results]) +best_idx = np.argmax(gaps_arr) +best_k, best_gap = int(dims_arr[best_idx]), gaps_arr[best_idx] + +print(f"\n{'=' * 70}") +print(f"BEST DIMENSIONALITY: {best_k} 
components") +print(f" Gap (intra - inter): {best_gap:.4f} vs {full_gap:.4f} at full 768-d") +print(f" Improvement: {best_gap - full_gap:+.4f}") +print(f"{'=' * 70}") + +# ── Step 5: Show detailed comparison at optimal k vs full ──────────────── +print(f"\n── Detailed Similarity Matrix at k={best_k} vs k={D} ──\n") + +if best_k < D: + pca_best = PCA(n_components=best_k, random_state=42) + X_best = pca_best.fit_transform(X) +else: + X_best = X.copy() + +sim_full = cosine_matrix(X) +sim_best = cosine_matrix(X_best) + +# Show a selection of interesting pairs +print(f"{'Snippet A':>20s} {'Snippet B':>20s} {'Full 768d':>10s} " + f"{'PCA {0}d'.format(best_k):>10s} {'Change':>8s}") +print("-" * 78) + +interesting_pairs = [ + # Intra-group: should be high + ("bubble_sort", "quick_sort"), + ("bubble_sort", "merge_sort"), + ("read_json", "read_csv"), + ("http_get", "fetch_url"), + ("factorial", "fibonacci"), + ("reverse_str", "is_palindrome"), + ("stack_push_pop", "queue_impl"), + # Inter-group: should be low + ("bubble_sort", "read_json"), + ("factorial", "http_get"), + ("reverse_str", "download_file"), + ("is_prime", "write_file"), + ("stack_push_pop", "count_vowels"), +] + +for n1, n2 in interesting_pairs: + i = all_names.index(n1) + j = all_names.index(n2) + s_full = sim_full[i, j] + s_best = sim_best[i, j] + same = all_labels[i] == all_labels[j] + marker = "SAME" if same else "DIFF" + change = s_best - s_full + print(f"{n1:>20s} {n2:>20s} {s_full:>10.4f} {s_best:>10.4f} " + f"{change:>+8.4f} [{marker}]") + + +# ── Step 6: Text-to-code search comparison ──────────────────────────────── +print(f"\n── Text-to-Code Search: Full 768d vs PCA {best_k}d ──\n") + +search_queries = [ + ("sort a list of numbers", "Sorting"), + ("read a JSON config file", "File I/O"), + ("compute factorial recursively", "Math"), + ("make an HTTP GET request", "Networking"), + ("check if a number is prime", "Math"), +] + +if best_k < D: + pca_search = PCA(n_components=best_k, random_state=42) + 
X_search = pca_search.fit_transform(X) +else: + X_search = X.copy() + pca_search = None + +for query, expected_group in search_queries: + q_vec = embed_code(query).cpu().numpy().reshape(1, -1) + + # Full dimension search + q_norm = q_vec / np.linalg.norm(q_vec) + X_norm = X / np.linalg.norm(X, axis=1, keepdims=True) + scores_full = (X_norm @ q_norm.T).flatten() + + # PCA-reduced search + if pca_search is not None: + q_reduced = pca_search.transform(q_vec) + else: + q_reduced = q_vec.copy() + q_r_norm = q_reduced / np.linalg.norm(q_reduced) + X_s_norm = X_search / np.linalg.norm(X_search, axis=1, keepdims=True) + scores_pca = (X_s_norm @ q_r_norm.T).flatten() + + top_full = np.argsort(-scores_full)[:3] + top_pca = np.argsort(-scores_pca)[:3] + + print(f' Query: "{query}"') + print(f' Full 768d: {all_names[top_full[0]]:>16s} ({scores_full[top_full[0]]:.3f})' + f' {all_names[top_full[1]]:>16s} ({scores_full[top_full[1]]:.3f})' + f' {all_names[top_full[2]]:>16s} ({scores_full[top_full[2]]:.3f})') + print(f' PCA {best_k:>3d}d: {all_names[top_pca[0]]:>16s} ({scores_pca[top_pca[0]]:.3f})' + f' {all_names[top_pca[1]]:>16s} ({scores_pca[top_pca[1]]:.3f})' + f' {all_names[top_pca[2]]:>16s} ({scores_pca[top_pca[2]]:.3f})') + + full_correct = all_labels[top_full[0]] == expected_group + pca_correct = all_labels[top_pca[0]] == expected_group + print(f' Full correct: {full_correct} | PCA correct: {pca_correct}') + print() + + +# ── Step 7: Visualization ───────────────────────────────────────────────── +# Six-panel figure for a comprehensive visual analysis. 
+ +group_colors = { + "Sorting": "#1f77b4", "File I/O": "#ff7f0e", "Math": "#2ca02c", + "Networking": "#d62728", "String ops": "#9467bd", "Data structures": "#8c564b", +} +label_colors = [group_colors[g] for g in all_labels] +unique_groups = list(dict.fromkeys(all_labels)) + +fig = plt.figure(figsize=(20, 13)) +fig.suptitle("PCA Denoising Analysis — Can Fewer Dimensions Improve Code Similarity?", + fontsize=15, fontweight="bold", y=0.98) + +# ── Row 1 ── + +# Plot 1: Intra/inter similarity vs number of PCA components +ax1 = fig.add_subplot(2, 3, 1) +dims_plot = [r[0] for r in results] +intra_plot = [r[1] for r in results] +inter_plot = [r[2] for r in results] +ax1.fill_between(dims_plot, inter_plot, intra_plot, alpha=0.15, color="tab:green") +ax1.plot(dims_plot, intra_plot, "o-", color="tab:blue", linewidth=2, + label="Intra-group (same category)", markersize=6) +ax1.plot(dims_plot, inter_plot, "s-", color="tab:red", linewidth=2, + label="Inter-group (different category)", markersize=6) +ax1.axvline(x=best_k, color="green", linestyle="--", alpha=0.7, + label=f"Best gap at k={best_k}") +ax1.set_xlabel("Number of PCA Components", fontsize=10) +ax1.set_ylabel("Average Cosine Similarity", fontsize=10) +ax1.set_title("(a) Intra- vs Inter-Group Similarity", fontsize=11, fontweight="bold") +ax1.legend(fontsize=7, loc="center right") +ax1.set_xscale("log") +ax1.grid(True, alpha=0.3) + +# Plot 2: Gap (discriminability) vs number of PCA components +ax2 = fig.add_subplot(2, 3, 2) +gaps_plot = [r[3] for r in results] +ax2.plot(dims_plot, gaps_plot, "D-", color="tab:green", linewidth=2, markersize=7) +ax2.axvline(x=best_k, color="green", linestyle="--", alpha=0.7, + label=f"Best k={best_k} (gap={best_gap:.3f})") +ax2.axhline(y=full_gap, color="gray", linestyle=":", alpha=0.7, + label=f"Full 768d (gap={full_gap:.3f})") +ax2.fill_between(dims_plot, full_gap, gaps_plot, alpha=0.12, color="tab:green", + where=[g > full_gap for g in gaps_plot]) +ax2.set_xlabel("Number of PCA 
Components", fontsize=10) +ax2.set_ylabel("Gap (Intra − Inter)", fontsize=10) +ax2.set_title("(b) Discriminability vs Dimensionality", fontsize=11, fontweight="bold") +ax2.legend(fontsize=8) +ax2.set_xscale("log") +ax2.grid(True, alpha=0.3) + +# Plot 3: Cumulative variance explained +pca_full = PCA(n_components=min(N, D), random_state=42) +pca_full.fit(X) +cumvar = np.cumsum(pca_full.explained_variance_ratio_) * 100 +ax3 = fig.add_subplot(2, 3, 3) +ax3.plot(range(1, len(cumvar) + 1), cumvar, "-", color="tab:purple", linewidth=2) +ax3.axvline(x=best_k, color="green", linestyle="--", alpha=0.7, + label=f"Best k={best_k}") +for threshold in [90, 95, 99]: + k_thresh = np.searchsorted(cumvar, threshold) + 1 + if k_thresh <= len(cumvar): + ax3.axhline(y=threshold, color="gray", linestyle=":", alpha=0.4) + ax3.annotate(f"{threshold}% → k={k_thresh}", xy=(k_thresh, threshold), + fontsize=8, color="gray", ha="left", + xytext=(k_thresh + 1, threshold - 2)) +ax3.set_xlabel("Number of PCA Components", fontsize=10) +ax3.set_ylabel("Cumulative Variance Explained (%)", fontsize=10) +ax3.set_title("(c) Variance Concentration", fontsize=11, fontweight="bold") +ax3.legend(fontsize=8) +ax3.set_xscale("log") +ax3.grid(True, alpha=0.3) + +# ── Row 2 ── + +# Plot 4 & 5: Side-by-side heatmaps (full vs PCA-denoised) +# Sort indices by group for a block-diagonal structure +sorted_idx = sorted(range(N), key=lambda i: all_labels[i]) +sorted_names = [all_names[i] for i in sorted_idx] +sorted_labels = [all_labels[i] for i in sorted_idx] + +sim_full_sorted = sim_full[np.ix_(sorted_idx, sorted_idx)] +sim_best_sorted = sim_best[np.ix_(sorted_idx, sorted_idx)] + +for panel_idx, (mat, title_str) in enumerate([ + (sim_full_sorted, f"(d) Similarity Heatmap — Full 768d"), + (sim_best_sorted, f"(e) Similarity Heatmap — PCA {best_k}d (Denoised)"), +]): + ax = fig.add_subplot(2, 3, 4 + panel_idx) + im = ax.imshow(mat, cmap="RdBu_r", vmin=-1, vmax=1, aspect="auto") + ax.set_xticks(range(N)) + 
ax.set_yticks(range(N)) + ax.set_xticklabels(sorted_names, rotation=90, fontsize=5) + ax.set_yticklabels(sorted_names, fontsize=5) + + # Draw group boundary lines + prev_label = sorted_labels[0] + for i, lab in enumerate(sorted_labels): + if lab != prev_label: + ax.axhline(y=i - 0.5, color="black", linewidth=1) + ax.axvline(x=i - 0.5, color="black", linewidth=1) + prev_label = lab + + ax.set_title(title_str, fontsize=11, fontweight="bold") + plt.colorbar(im, ax=ax, shrink=0.8, label="Cosine Similarity") + +# Plot 6: Bar chart comparing specific pairs at full vs PCA +ax6 = fig.add_subplot(2, 3, 6) +pair_labels = [] +full_scores = [] +pca_scores = [] +pair_colors = [] + +for n1, n2 in interesting_pairs: + i = all_names.index(n1) + j = all_names.index(n2) + pair_labels.append(f"{n1}\nvs {n2}") + full_scores.append(sim_full[i, j]) + pca_scores.append(sim_best[i, j]) + pair_colors.append("#2ca02c" if all_labels[i] == all_labels[j] else "#d62728") + +y_pos = np.arange(len(pair_labels)) +bar_h = 0.35 +bars_full = ax6.barh(y_pos + bar_h / 2, full_scores, bar_h, label="Full 768d", + color="tab:blue", alpha=0.7) +bars_pca = ax6.barh(y_pos - bar_h / 2, pca_scores, bar_h, label=f"PCA {best_k}d", + color="tab:orange", alpha=0.7) + +# Color labels by same/different group +for i, (yl, col) in enumerate(zip(pair_labels, pair_colors)): + ax6.annotate("●", xy=(-0.05, y_pos[i]), fontsize=10, color=col, + ha="right", va="center", fontweight="bold", + annotation_clip=False) + +ax6.set_yticks(y_pos) +ax6.set_yticklabels(pair_labels, fontsize=6) +ax6.set_xlabel("Cosine Similarity", fontsize=10) +ax6.set_title("(f) Pair Comparison: Full vs PCA Denoised", fontsize=11, fontweight="bold") +ax6.legend(fontsize=8) +ax6.axvline(x=0, color="black", linewidth=0.5) +ax6.set_xlim(-1.1, 1.1) +ax6.grid(True, axis="x", alpha=0.3) +ax6.invert_yaxis() + +# Custom legend for the dots +from matplotlib.lines import Line2D +dot_legend = [Line2D([0], [0], marker="o", color="w", markerfacecolor="#2ca02c", + 
markersize=8, label="Same group"), + Line2D([0], [0], marker="o", color="w", markerfacecolor="#d62728", + markersize=8, label="Different group")] +ax6.legend(handles=[bars_full, bars_pca] + dot_legend, fontsize=7, loc="lower right") + +plt.tight_layout(rect=[0, 0, 1, 0.96]) +plt.savefig("pca_denoising_analysis.png", dpi=150, bbox_inches="tight") +print(f"\nSaved: pca_denoising_analysis.png") + +# ── Summary ─────────────────────────────────────────────────────────────── +print(f""" +{'=' * 70} +CONCLUSIONS +{'=' * 70} + +1. VARIANCE CONCENTRATION: + The first few PCA components capture a disproportionate amount of + variance. This means the embedding space has low effective + dimensionality — most of the 768 dimensions are semi-redundant. + +2. DENOISING EFFECT: + At k={best_k}, the gap between intra-group and inter-group similarity + is {best_gap:.4f} (vs {full_gap:.4f} at full 768d). + {'PCA denoising IMPROVED discriminability by removing noisy dimensions.' if best_gap > full_gap else 'Full dimensionality was already optimal for this dataset.'} + +3. PRACTICAL IMPLICATIONS: + - For retrieval (code search), moderate PCA reduction can sharpen + results while also reducing storage and computation. + - Too few dimensions (k=2,3) lose important signal. + - Too many dimensions may retain noise that dilutes similarity. + - The "sweet spot" depends on the dataset and task. + +4. TRADE-OFF: + PCA denoising is a post-hoc technique. Newer embedding models are + trained with Matryoshka Representation Learning (MRL) that makes + the FIRST k dimensions maximally informative by design. 
+""") diff --git a/Code embeddings/README.md b/Code embeddings/README.md new file mode 100644 index 0000000..121c383 --- /dev/null +++ b/Code embeddings/README.md @@ -0,0 +1,93 @@ +# Code Embeddings — Hands-On Examples + +**AISE501 – AI in Software Engineering I** +Fachhochschule Graubünden — Spring Semester 2026 + +## Overview + +Seven self-contained Python programs that demonstrate how embedding +models work. Each script loads a pre-trained model, embeds text or code +snippets, and explores a different capability of embeddings. + +| # | Script | What it demonstrates | +|---|--------|---------------------| +| 0 | `00_tokens_and_embeddings_intro.py` | Tokenization basics and general text embeddings (German) | +| 1 | `01_basic_embeddings.py` | Compute code embeddings and pairwise cosine similarity | +| 2 | `02_text_to_code_search.py` | Semantic search: find code from natural language queries | +| 3 | `03_cross_language.py` | Same algorithm in 4 languages → similar embeddings | +| 4 | `04_clone_detection.py` | Detect duplicate/similar code in a simulated codebase | +| 5 | `05_visualize_embeddings.py` | PCA and t-SNE plots of the embedding space | +| 6 | `06_pca_denoising.py` | PCA denoising: fewer dimensions can improve similarity | + +## Setup + +### 1. Create a virtual environment (recommended) + +```bash +python -m venv venv + +# macOS / Linux +source venv/bin/activate + +# Windows +venv\Scripts\activate +``` + +### 2. Install dependencies + +```bash +pip install -r requirements.txt +``` + +**PyTorch GPU support:** + +- **Apple Silicon Mac (M1/M2/M3/M4):** MPS acceleration works + out of the box with the standard PyTorch install. No extra steps needed. +- **NVIDIA GPU (Windows/Linux):** Install the CUDA version of PyTorch. + See https://pytorch.org/get-started/locally/ for the correct command + for your CUDA version. +- **CPU only:** Everything works on CPU too, just a bit slower. + +### 3. 
Run any example + +```bash +python 00_tokens_and_embeddings_intro.py +python 01_basic_embeddings.py +python 02_text_to_code_search.py +python 03_cross_language.py +python 04_clone_detection.py +python 05_visualize_embeddings.py +python 06_pca_denoising.py +``` + +The first run will download the model (~300 MB). Subsequent runs +use the cached model. + +## Model + +All code embedding examples (01–06) use **st-codesearch-distilroberta-base** +(82M parameters), a DistilRoBERTa model fine-tuned on 1.38 million +code-comment pairs from CodeSearchNet using contrastive learning +(MultipleNegativesRankingLoss). It produces 768-dimensional embedding +vectors optimized for matching natural language descriptions to code, +making it ideal for semantic code search and similarity tasks. + +The introductory example (00) uses **paraphrase-multilingual-mpnet-base-v2** +for demonstrating general language embeddings with German text. + +## Hardware Requirements + +- **RAM:** 1 GB free (for the model) +- **Disk:** ~500 MB (for the downloaded model, cached in `~/.cache/huggingface/`) +- **GPU:** Optional — all scripts auto-detect and use: + - CUDA (NVIDIA GPUs) + - MPS (Apple Silicon) + - CPU (fallback) + +## Expected Output + +Each script prints structured output with explanations. Example 5 +saves two PNG images (`code_embeddings_pca.png` and +`code_embeddings_tsne.png`) showing the embedding space. Example 6 +saves `pca_denoising_analysis.png` with six sub-plots analyzing +optimal embedding dimensions. 
diff --git a/Code embeddings/code_embeddings_pca.png b/Code embeddings/code_embeddings_pca.png new file mode 100644 index 0000000..88b4b75 Binary files /dev/null and b/Code embeddings/code_embeddings_pca.png differ diff --git a/Code embeddings/code_embeddings_tsne.png b/Code embeddings/code_embeddings_tsne.png new file mode 100644 index 0000000..842df8d Binary files /dev/null and b/Code embeddings/code_embeddings_tsne.png differ diff --git a/Code embeddings/embedding_space_crosslingual.png b/Code embeddings/embedding_space_crosslingual.png new file mode 100644 index 0000000..a9c5553 Binary files /dev/null and b/Code embeddings/embedding_space_crosslingual.png differ diff --git a/Code embeddings/embedding_space_german.png b/Code embeddings/embedding_space_german.png new file mode 100644 index 0000000..11d84f3 Binary files /dev/null and b/Code embeddings/embedding_space_german.png differ diff --git a/Code embeddings/pca_denoising_analysis.png b/Code embeddings/pca_denoising_analysis.png new file mode 100644 index 0000000..8461c6d Binary files /dev/null and b/Code embeddings/pca_denoising_analysis.png differ diff --git a/Code embeddings/requirements.txt b/Code embeddings/requirements.txt new file mode 100644 index 0000000..c903590 --- /dev/null +++ b/Code embeddings/requirements.txt @@ -0,0 +1,6 @@ +torch +transformers +sentence-transformers +scikit-learn +matplotlib +numpy diff --git a/Prompting Exercise/.DS_Store b/Prompting Exercise/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/Prompting Exercise/.DS_Store differ diff --git a/Prompting Exercise/analyze_me.py b/Prompting Exercise/analyze_me.py new file mode 100644 index 0000000..d2b8b83 --- /dev/null +++ b/Prompting Exercise/analyze_me.py @@ -0,0 +1,67 @@ +""" +analyze_me.py – A data-processing script used in Exercise 2 +============================================================== +This file contains several realistic bugs and style issues. 
+Do NOT fix them manually — in Exercise 2 the LLM will help you find them! + +Can you spot the issues yourself before asking the LLM? +""" + + +def calculate_statistics(numbers): + total = 0 + for n in numbers: + total = total + n + average = total / len(numbers) # Bug 1: ZeroDivisionError when list is empty + + min_val = numbers[0] # Bug 2: IndexError when list is empty + max_val = numbers[0] + for n in numbers: + if n < min_val: + min_val = n + if n > max_val: + max_val = n + + variance = 0 + for n in numbers: + variance = variance + (n - average) ** 2 + variance = variance / len(numbers) # Bug 3: population variance (÷N), not sample variance (÷N-1) + + return { + "count": len(numbers), + "sum": total, + "average": average, + "min": min_val, + "max": max_val, + "variance": variance, + } + + +def process_data(filename): + numbers = [] + f = open(filename) # Bug 4: no context manager (file may not be closed on error) + for line in f: + numbers.append(int(line.strip())) # Bug 5: int() crashes on floats and blank lines + f.close() + + result = calculate_statistics(numbers) + print("Statistics:", result) + return result + + +def normalize(numbers, method="minmax"): + if method == "minmax": + mn = min(numbers) + mx = max(numbers) + return [(x - mn) / mx - mn for x in numbers] # Bug 6: operator-precedence error + elif method == "zscore": + stats = calculate_statistics(numbers) + std = stats["variance"] ** 0.5 + return [(x - stats["average"]) / std for x in numbers] + else: + print("Unknown normalisation method") # Bug 7: should raise ValueError, not just print + + +if __name__ == "__main__": + sample = [4, 8, 15, 16, 23, 42] + print(calculate_statistics(sample)) diff --git a/Prompting Exercise/analyze_me_blind.py b/Prompting Exercise/analyze_me_blind.py new file mode 100644 index 0000000..27cb4b3 --- /dev/null +++ b/Prompting Exercise/analyze_me_blind.py @@ -0,0 +1,67 @@ +""" +analyze_me.py – A data-processing script used in Exercise 2 
+============================================================== +This file contains several realistic bugs and style issues. +Do NOT fix them manually — in Exercise 2 the LLM will help you find them! + +Can you spot the issues yourself before asking the LLM? +""" + + +def calculate_statistics(numbers): + total = 0 + for n in numbers: + total = total + n + average = total / len(numbers) + + min_val = numbers[0] + max_val = numbers[0] + for n in numbers: + if n < min_val: + min_val = n + if n > max_val: + max_val = n + + variance = 0 + for n in numbers: + variance = variance + (n - average) ** 2 + variance = variance / len(numbers) + + return { + "count": len(numbers), + "sum": total, + "average": average, + "min": min_val, + "max": max_val, + "variance": variance, + } + + +def process_data(filename): + numbers = [] + f = open(filename) + for line in f: + numbers.append(int(line.strip())) + f.close() + + result = calculate_statistics(numbers) + print("Statistics:", result) + return result + + +def normalize(numbers, method="minmax"): + if method == "minmax": + mn = min(numbers) + mx = max(numbers) + return [(x - mn) / mx - mn for x in numbers] + elif method == "zscore": + stats = calculate_statistics(numbers) + std = stats["variance"] ** 0.5 + return [(x - stats["average"]) / std for x in numbers] + else: + print("Unknown normalisation method") + + +if __name__ == "__main__": + sample = [4, 8, 15, 16, 23, 42] + print(calculate_statistics(sample)) diff --git a/Prompting Exercise/analyze_me_blind_fix.py b/Prompting Exercise/analyze_me_blind_fix.py new file mode 100644 index 0000000..517a90e --- /dev/null +++ b/Prompting Exercise/analyze_me_blind_fix.py @@ -0,0 +1,89 @@ +import sys + +""" +analyze_me.py – A data-processing script used in Exercise 2 +============================================================== +This file contains several realistic bugs and style issues. +Do NOT fix them manually — in Exercise 2 the LLM will help you find them! 
+ +Can you spot the issues yourself before asking the LLM? +""" + + +def calculate_statistics(numbers): + if not numbers: + raise ValueError("Cannot calculate statistics for an empty list.") + + total = 0 + for n in numbers: + total = total + n + average = total / len(numbers) + + min_val = numbers[0] + max_val = numbers[0] + for n in numbers: + if n < min_val: + min_val = n + if n > max_val: + max_val = n + + variance = 0 + for n in numbers: + variance = variance + (n - average) ** 2 + variance = variance / len(numbers) + + return { + "count": len(numbers), + "sum": total, + "average": average, + "min": min_val, + "max": max_val, + "variance": variance, + } + + +def process_data(filename): + numbers = [] + try: + with open(filename, 'r') as file_handle: + for line in file_handle: + stripped_line = line.strip() + if stripped_line: + numbers.append(int(stripped_line)) + except FileNotFoundError: + print(f"Error: File '{filename}' not found.") + raise + except ValueError as e: + print(f"Error: Invalid integer in file: {e}") + raise + + result = calculate_statistics(numbers) + print("Statistics:", result) + return result + + +def normalize(numbers, method="minmax"): + if not numbers: + raise ValueError("Cannot normalize an empty list.") + + if method == "minmax": + mn = min(numbers) + mx = max(numbers) + if mx == mn: + return [0.0 for _ in numbers] + return [(x - mn) / (mx - mn) for x in numbers] + elif method == "zscore": + stats = calculate_statistics(numbers) + std = stats["variance"] ** 0.5 + if std == 0: + return [0.0 for _ in numbers] + return [(x - stats["average"]) / std for x in numbers] + else: + print("Unknown normalization method") + return [] + + +if __name__ == "__main__": + sample = [4, 8, 15, 16, 23, 42] + print(calculate_statistics(sample)) + diff --git a/Prompting Exercise/analyze_me_direct.py b/Prompting Exercise/analyze_me_direct.py new file mode 100644 index 0000000..9897271 --- /dev/null +++ b/Prompting Exercise/analyze_me_direct.py @@ -0,0 
+1,192 @@ +""" +analyze_me.py – A data-processing script used in Exercise 2 +============================================================== +This module provides robust functions for calculating statistics, +processing data files, and normalizing numeric lists. + +All functions include PEP-484 type hints and NumPy-style docstrings. +""" + +from typing import List, Dict, Union, Any + + +def calculate_statistics(numbers: List[Union[int, float]]) -> Dict[str, Any]: + """ + Calculate basic statistics for a list of numbers. + + Parameters + ---------- + numbers : List[Union[int, float]] + The list of numeric values to analyze. + + Returns + ------- + Dict[str, Any] + A dictionary containing count, sum, average, min, max, and variance. + If the input list is empty, returns a dictionary with zero values + for all fields except count (which is 0). + + Notes + ----- + - Variance is calculated using the sample variance formula (dividing by N-1). + - If the list is empty, the function returns early to avoid division by zero + or index errors. + """ + count = len(numbers) + + if count == 0: + return { + "count": 0, + "sum": 0.0, + "average": 0.0, + "min": 0.0, + "max": 0.0, + "variance": 0.0, + } + + total = sum(numbers) + average = total / count + + min_val = min(numbers) + max_val = max(numbers) + + # Calculate sample variance (divide by N-1) + variance_sum = sum((n - average) ** 2 for n in numbers) + variance = variance_sum / (count - 1) + + return { + "count": count, + "sum": total, + "average": average, + "min": min_val, + "max": max_val, + "variance": variance, + } + + +def process_data(filename: str) -> Dict[str, Any]: + """ + Read numeric data from a file and calculate statistics. + + Parameters + ---------- + filename : str + Path to the input file containing one number per line. + Blank lines and non-numeric lines are skipped. + + Returns + ------- + Dict[str, Any] + The statistics dictionary returned by calculate_statistics(). 
+ + Raises + ------ + FileNotFoundError + If the specified file does not exist. + ValueError + If the file cannot be read or contains no valid numbers. + """ + numbers: List[Union[int, float]] = [] + + try: + with open(filename, 'r') as f: + for line in f: + stripped = line.strip() + if not stripped: + continue + try: + # Attempt to parse as float to handle both int and float + numbers.append(float(stripped)) + except ValueError: + # Skip non-numeric lines + continue + except FileNotFoundError: + raise FileNotFoundError(f"File not found: {filename}") + except IOError as e: + raise IOError(f"Error reading file {filename}: {e}") + + if not numbers: + raise ValueError(f"No valid numeric data found in {filename}") + + result = calculate_statistics(numbers) + print("Statistics:", result) + return result + + +def normalize(numbers: List[Union[int, float]], method: str = "minmax") -> List[float]: + """ + Normalize a list of numbers using the specified method. + + Parameters + ---------- + numbers : List[Union[int, float]] + The list of numeric values to normalize. + method : str, optional + The normalization method to use. Options are: + - "minmax": Min-Max normalization to [0, 1] + - "zscore": Z-score normalization (standardization) + + Returns + ------- + List[float] + The normalized list of numbers. + + Raises + ------ + ValueError + If an unknown normalization method is provided, or if the list is empty. + ZeroDivisionError + If the range is zero for minmax or standard deviation is zero for zscore. 
+ """ + if not numbers: + raise ValueError("Cannot normalize an empty list.") + + if method == "minmax": + mn = min(numbers) + mx = max(numbers) + range_val = mx - mn + if range_val == 0: + # If all values are the same, return zeros or handle as needed + return [0.0 for _ in numbers] + return [(x - mn) / range_val for x in numbers] + + elif method == "zscore": + stats = calculate_statistics(numbers) + std = stats["variance"] ** 0.5 + if std == 0: + # If standard deviation is zero, all values are the same + return [0.0 for _ in numbers] + return [(x - stats["average"]) / std for x in numbers] + + else: + raise ValueError(f"Unknown normalization method: '{method}'. " + f"Supported methods: 'minmax', 'zscore'.") + + +if __name__ == "__main__": + # Basic sanity checks + sample = [4, 8, 15, 16, 23, 42] + + print("Testing calculate_statistics:") + stats = calculate_statistics(sample) + print(stats) + + print("\nTesting normalize (minmax):") + normalized_minmax = normalize(sample, "minmax") + print(normalized_minmax) + + print("\nTesting normalize (zscore):") + normalized_zscore = normalize(sample, "zscore") + print(normalized_zscore) + + print("\nTesting empty list handling:") + empty_stats = calculate_statistics([]) + print(empty_stats) + + print("\nTesting unknown method error:") + try: + normalize(sample, "unknown") + except ValueError as e: + print(f"Caught expected error: {e}") + + print("\nAll sanity checks passed!") \ No newline at end of file diff --git a/Prompting Exercise/analyze_me_fix.py b/Prompting Exercise/analyze_me_fix.py new file mode 100644 index 0000000..cf7df3f --- /dev/null +++ b/Prompting Exercise/analyze_me_fix.py @@ -0,0 +1,89 @@ +import sys + +""" +analyze_me.py – A data-processing script used in Exercise 2 +============================================================== +This file contains several realistic bugs and style issues. +Do NOT fix them manually — in Exercise 2 the LLM will help you find them! 
+ +Can you spot the issues yourself before asking the LLM? +""" + + +def calculate_statistics(numbers): + if not numbers: + return { + "count": 0, + "sum": 0, + "average": 0.0, + "min": None, + "max": None, + "variance": 0.0, + } + + total = 0 + for n in numbers: + total = total + n + average = total / len(numbers) + + min_val = numbers[0] + max_val = numbers[0] + for n in numbers: + if n < min_val: + min_val = n + if n > max_val: + max_val = n + + variance = 0 + for n in numbers: + variance = variance + (n - average) ** 2 + variance = variance / (len(numbers) - 1) if len(numbers) > 1 else 0.0 + + return { + "count": len(numbers), + "sum": total, + "average": average, + "min": min_val, + "max": max_val, + "variance": variance, + } + + +def process_data(filename): + numbers = [] + with open(filename) as file_handle: + for line in file_handle: + stripped = line.strip() + if not stripped: + continue + try: + numbers.append(float(stripped)) + except ValueError: + continue + + result = calculate_statistics(numbers) + print("Statistics:", result) + return result + + +def normalize(numbers, method="minmax"): + if method == "minmax": + mn = min(numbers) + mx = max(numbers) + if mx == mn: + return [0.0 for _ in numbers] + return [(x - mn) / (mx - mn) for x in numbers] + elif method == "zscore": + stats = calculate_statistics(numbers) + std = stats["variance"] ** 0.5 + if std == 0: + return [0.0 for _ in numbers] + return [(x - stats["average"]) / std for x in numbers] + else: + raise ValueError(f"Unknown normalization method: {method}") + + +if __name__ == "__main__": + sample = [4, 8, 15, 16, 23, 42] + print(calculate_statistics(sample)) + diff --git a/Prompting Exercise/analyze_me_fixed.py b/Prompting Exercise/analyze_me_fixed.py new file mode 100644 index 0000000..ca0fd98 --- /dev/null +++ b/Prompting Exercise/analyze_me_fixed.py @@ -0,0 +1,216 @@ +""" +analyze_me.py – A data-processing script used in Exercise 2 
+============================================================== +This file contains several realistic bugs and style issues. +Do NOT fix them manually — in Exercise 2 the LLM will help you find them! + +Can you spot the issues yourself before asking the LLM? +""" + + +def calculate_statistics(numbers: list[float]) -> dict[str, float]: + """ + Calculate basic statistical measures for a list of numbers. + + This function computes the count, sum, average, minimum, maximum, and + variance (population variance) of the provided list of numbers. + + Parameters + ---------- + numbers : list[float] + A list of numeric values to analyze. + + Returns + ------- + dict[str, float] + A dictionary containing the following keys: + - 'count': The number of elements in the list. + - 'sum': The sum of all elements. + - 'average': The arithmetic mean of the elements. + - 'min': The minimum value in the list. + - 'max': The maximum value in the list. + - 'variance': The population variance of the elements. + + Raises + ------ + ZeroDivisionError + If the input list is empty, division by zero will occur when + calculating the average and variance. + IndexError + If the input list is empty, accessing the first element for min/max + will raise an error. 
+ """ + # Step 2 – Implement empty list handling in calculate_statistics + if not numbers: + return { + "count": 0, + "sum": 0.0, + "average": 0.0, + "min": 0.0, + "max": 0.0, + "variance": 0.0, + } + + total = 0 + for n in numbers: + total = total + n + average = total / len(numbers) # Bug 1: ZeroDivisionError when list is empty + + min_val = numbers[0] # Bug 2: IndexError when list is empty + max_val = numbers[0] + for n in numbers: + if n < min_val: + min_val = n + if n > max_val: + max_val = n + + variance = 0 + for n in numbers: + variance = variance + (n - average) ** 2 + + # Step 3 – Correct variance calculation to use sample variance + count = len(numbers) + if count > 1: + variance = variance / (count - 1) + else: + variance = 0.0 + + return { + "count": len(numbers), + "sum": total, + "average": average, + "min": min_val, + "max": max_val, + "variance": variance, + } + + +# Step 4 – Define type hints and docstrings for process_data +def process_data(filename: str) -> dict[str, float]: + """ + Read numeric data from a file and compute statistics. + + This function opens a text file, reads each line, converts it to an integer, + and collects the values into a list. It then passes this list to + calculate_statistics to compute and return the statistical summary. + + Parameters + ---------- + filename : str + The path to the text file containing one number per line. + + Returns + ------- + dict[str, float] + A dictionary containing the statistical measures computed from the file data. + + Raises + ------ + FileNotFoundError + If the specified file does not exist. + ValueError + If a line in the file cannot be converted to an integer. 
+ """ + numbers = [] + # Step 5 – Implement context manager and robust line parsing in process_data + with open(filename) as f: + for line in f: + stripped = line.strip() + if not stripped: + continue + try: + # Attempt to convert to float first to handle both ints and floats + value = float(stripped) + numbers.append(value) + except ValueError: + # Skip lines that cannot be converted to a number + continue + + result = calculate_statistics(numbers) + print("Statistics:", result) + return result + + +# Step 6 – Define type hints and docstrings for normalize +def normalize(numbers: list[float], method: str = "minmax") -> list[float]: + """ + Normalize a list of numbers using the specified method. + + This function applies either 'minmax' scaling or 'zscore' standardization + to the input list of numbers. + + Parameters + ---------- + numbers : list[float] + A list of numeric values to normalize. + method : str, optional + The normalization method to use. Options are: + - 'minmax': Scales values to the range [0, 1]. + - 'zscore': Standardizes values to have mean 0 and standard deviation 1. + Default is 'minmax'. + + Returns + ------- + list[float] + A list of normalized values. + + Raises + ------ + ValueError + If an unknown normalization method is provided. + ZeroDivisionError + If 'minmax' is used on a list where all values are identical (range is 0), + or if 'zscore' is used on a list with zero standard deviation. 
+ + Examples + -------- + >>> normalize([1, 2, 3, 4, 5]) + [0.0, 0.25, 0.5, 0.75, 1.0] + """ + if method == "minmax": + mn = min(numbers) + mx = max(numbers) + # Step 7 – Fix operator precedence bug in minmax normalization + return [(x - mn) / (mx - mn) for x in numbers] + elif method == "zscore": + stats = calculate_statistics(numbers) + std = stats["variance"] ** 0.5 + return [(x - stats["average"]) / std for x in numbers] + else: + # Step 8 – Replace print statement with ValueError for unknown methods + raise ValueError(f"Unknown normalisation method: {method}") + + +if __name__ == "__main__": + # Step 9 – Implement and verify main block sanity checks + sample = [4, 8, 15, 16, 23, 42] + stats = calculate_statistics(sample) + + # Verify expected values for sample data + expected_sum = 4 + 8 + 15 + 16 + 23 + 42 + expected_count = 6 + expected_avg = expected_sum / expected_count + + assert stats["count"] == expected_count, f"Count mismatch: {stats['count']} != {expected_count}" + assert stats["sum"] == expected_sum, f"Sum mismatch: {stats['sum']} != {expected_sum}" + assert abs(stats["average"] - expected_avg) < 1e-9, f"Average mismatch: {stats['average']} != {expected_avg}" + assert stats["min"] == 4, f"Min mismatch: {stats['min']} != 4" + assert stats["max"] == 42, f"Max mismatch: {stats['max']} != 42" + + # Test empty list handling + empty_stats = calculate_statistics([]) + assert empty_stats["count"] == 0, "Empty list count should be 0" + assert empty_stats["sum"] == 0.0, "Empty list sum should be 0.0" + assert empty_stats["average"] == 0.0, "Empty list average should be 0.0" + assert empty_stats["min"] == 0.0, "Empty list min should be 0.0" + assert empty_stats["max"] == 0.0, "Empty list max should be 0.0" + assert empty_stats["variance"] == 0.0, "Empty list variance should be 0.0" + + # Test normalization + normalized = normalize([1, 2, 3, 4, 5]) + expected_normalized = [0.0, 0.25, 0.5, 0.75, 1.0] + assert len(normalized) == 5, "Normalized list length 
mismatch" + for i, val in enumerate(normalized): + assert abs(val - expected_normalized[i]) < 1e-9, f"Normalized value mismatch at index {i}" + + print("All sanity checks passed!") \ No newline at end of file diff --git a/Prompting Exercise/ex01_xml_prompting.py b/Prompting Exercise/ex01_xml_prompting.py new file mode 100644 index 0000000..0ccc6e8 --- /dev/null +++ b/Prompting Exercise/ex01_xml_prompting.py @@ -0,0 +1,142 @@ +""" +Exercise 1 – Basic XML Structured Prompting +============================================ +AISE501 · Prompting in Coding · Spring Semester 2026 + +Learning goals +-------------- +* Connect to the local LLM server and send your first prompt. +* Understand the difference between unstructured and XML-structured prompts. +* See how structure helps the model parse and prioritise different parts + of your request. + +Tasks +----- +Part A Run the unstructured prompt (already done for you). Read the response. +Part B Complete the XML-structured version of the same request (TODOs 1-3). +Part C Add a system prompt to set the response style (TODOs 4-5). + +""" + +from server_utils import chat, get_client, print_messages, print_separator + +client = get_client() + + +# ── Part A: Unstructured (Zero-Shot) Prompt ─────────────────────────────────── +# This section is complete. Run it, read the response, then move on. + +print_separator("Part A – Unstructured Prompt") + +unstructured_messages = [ + { + "role": "user", + "content": ( + "Explain what a Python list comprehension is, " + "give an example that filters even numbers from a list, " + "and list two common mistakes beginners make." + ), + } +] + +# print_messages(unstructured_messages) # ← always inspect what you send! +# response_a = chat(client, unstructured_messages) +# print(response_a) + + +# ── Part B: Structured Prompt with XML Tags ─────────────────────────────────── +# Use XML tags to structure the same request more precisely. 
+# Named sections help the model parse and prioritise your intent. + +print_separator("Part B – Structured Prompt with XML Tags") + +# TODO 1: Fill in the three XML sections below. +# Use the same topic as Part A but make each section specific. +# +# – the Python concept to explain +# – what the code example should demonstrate +# – two or three specific points you want covered in the answer +# +# Tip: XML tag names are arbitrary — choose names that make sense to a +# human reader and the model will understand them too. + +structured_content = """\ + + + Python list comprehensions + + + Filter even numbers from a list + + + Syntax overview and two common beginner mistakes + +""" + +# TODO 2: Build the messages list. +# Use structured_content as the content of a "user" message. +# +# Reminder: messages is a list of dicts with keys "role" and "content". +# "role" is one of "system", "user", or "assistant". + +structured_messages = [ + # TODO: add the user message dict here + { + "role": "user", + "content": structured_content, + } +] + +# TODO 3: Call chat() with structured_messages, store the result, print it. +# Compare the output with response_a above. +# Always call print_messages() before chat() to see the full prompt. + +# print_messages(structured_messages) +# response_b = chat(client, structured_messages) +# print(response_b) + + +# ── Part C: Adding a System Prompt ──────────────────────────────────────────── +# A system prompt lets you define a persona and global rules for every +# response in the conversation without repeating yourself each time. + +print_separator("Part C – Adding a System Prompt") + +# TODO 4: Write an XML-structured system prompt that defines: +# – who the LLM should be +# + Format your response in json + +""" + +# TODO 5: Build a messages list that puts the system prompt FIRST (role="system"), +# followed by the structured user message from Part B. +# Call chat() and print the result. 
+# +# Reflection: How did the system prompt change the answer compared to Part B? + +messages_c = [ + {"role": "system", "content": system_content}, + {"role": "user", "content": structured_content} +] +print_messages(messages_c) +response_c = chat(client, messages_c) +print(response_c) + + +# ── Reflection Questions ────────────────────────────────────────────────────── +print_separator("Reflection Questions") +print( + "1. How did XML structure change the format and depth of the response?\n" + "2. What happens if you use inconsistent or missing closing tags?\n" + "3. When would you NOT bother with XML structure?\n" + "4. How does the system prompt interact with the user message?\n" +) diff --git a/Prompting Exercise/ex01_xml_prompting_solution.py b/Prompting Exercise/ex01_xml_prompting_solution.py new file mode 100644 index 0000000..79ad237 --- /dev/null +++ b/Prompting Exercise/ex01_xml_prompting_solution.py @@ -0,0 +1,91 @@ +""" +Exercise 1 – SOLUTION – Basic XML Structured Prompting +======================================================= +AISE501 · Prompting in Coding · Spring Semester 2026 +""" + +from server_utils import chat, get_client, print_messages, print_separator + +client = get_client() +temperature_value=0.3 + +# ── Part A: Unstructured (Zero-Shot) Prompt ─────────────────────────────────── +print_separator("Part A – Unstructured Prompt") + +unstructured_messages = [ + { + "role": "user", + "content": ( + "Explain what a Python list comprehension is, " + "give an example that filters even numbers from a list, " + "and list two common mistakes beginners make." 
+ ), + } +] + +print_messages(unstructured_messages) +response_a = chat(client, unstructured_messages) +print(response_a) + + +# ── Part B: Structured Prompt with XML Tags ─────────────────────────────────── +print_separator("Part B – Structured Prompt with XML Tags") + +structured_content = """\ + + + Python list comprehensions + + + A list comprehension that takes a list of integers and returns only + the even numbers, using a conditional filter expression. + + + 1. The general syntax: [expression for item in iterable if condition] + 2. Two common beginner mistakes when writing list comprehensions + +""" + +structured_messages = [ + {"role": "user", "content": structured_content} +] + +print_messages(structured_messages) +response_b = chat(client, structured_messages, temperature=temperature_value) +print(response_b) + + +# ── Part C: Adding a System Prompt ──────────────────────────────────────────── +print_separator("Part C – Adding a System Prompt") + +system_content = """\ + + You are an experienced Python tutor. You teach Python to university students + who have basic programming knowledge but are new to idiomatic Python. + + + + Keep each answer under 200 words. Use at most one code block per response. +""" + +messages_c = [ + {"role": "system", "content": system_content}, + {"role": "user", "content": structured_content}, +] + +print_messages(messages_c) +response_c = chat(client, messages_c,temperature=temperature_value) +print(response_c) + + +# ── Reflection Questions ────────────────────────────────────────────────────── +print_separator("Reflection Questions") +print( + "1. How did XML structure change the format and depth of the response?\n" + "2. What happens if you use inconsistent or missing closing tags?\n" + "3. When would you NOT bother with XML structure?\n" + "4. 
How does the system prompt interact with the user message?\n" +) diff --git a/Prompting Exercise/ex02_persona_task_data.py b/Prompting Exercise/ex02_persona_task_data.py new file mode 100644 index 0000000..8f45be2 --- /dev/null +++ b/Prompting Exercise/ex02_persona_task_data.py @@ -0,0 +1,151 @@ +""" +Exercise 2 – Persona, Task, and Data in a Structured Prompt +============================================================ +AISE501 · Prompting in Coding · Spring Semester 2026 + +Learning goals +-------------- +* Use XML tags to separate three prompt concerns: WHO the LLM is, + WHAT it should do, and the DATA it should work with. +* Pass a real Python file as context (RAG-style) inside a tag. +* Iterate on the prompt to extract more specific information. + +The file analyze_me.py contains several bugs and style issues. +You will ask the LLM to find and explain them. + +Tasks +----- +Part A Build a structured prompt with , , and tags + and ask the LLM to review analyze_me.py (TODOs 1-4). +Part B Refine the prompt to request a prioritised bug list (TODOs 5-6). +Part C Ask for a corrected version of one specific function (TODO 7). + +""" + +from pathlib import Path + +from server_utils import chat, get_client, print_messages, print_separator + +client = get_client() + +# Read the file we want the LLM to analyse +code_to_review = Path("analyze_me.py").read_text() + + +# ── Part A: Persona + Task + Code ───────────────────────────────────────────── +print_separator("Part A – Structured Prompt: Persona / Task / Code") + +# TODO 1: Fill in the tag. +# Define a senior Python engineer who is rigorous about correctness +# and follows PEP-8 and best practices. + +# TODO 2: Fill in the tag. +# Ask the LLM to review the Python code and identify ALL bugs, +# listing each one with a short explanation of why it is a bug. + +# TODO 3: The tag already contains the file — do not change it. + +# TODO 4: Build the messages list using only a user message (no system prompt yet). 
+# Call chat() and print the result. + +prompt_a = f"""\ + + You are a Python engineer who is rigorous about correctness and follows PEP-8 and best practices. + + + + Review the Python code and identify ALL bugs, listing each one with a short explanation of why it is a bug. + + + +{code_to_review} +""" + +messages_a = [ + {"role": "user", "content": prompt_a} +] + +# print_messages(messages_a) +# response_a = chat(client, messages_a) +# print(response_a) + + +# ── Part B: Refine – Ask for a Prioritised Bug List ─────────────────────────── +print_separator("Part B – Refined Prompt: Prioritised Bug List") + +# TODO 5: Extend the from Part A to ask the LLM to: +# - Separate bugs by severity: Critical / Medium / Style +# - For each bug: state the line number, the problem, and a one-line fix hint +# +# Tip: add a tag that describes exactly how you want the answer +# structured (plain text for now — we tackle real machine output in Ex 3). + +# TODO 6: Build messages_b with a system prompt that reinforces the persona +# and a user message with the refined prompt. +# Call chat() and print the result. + +system_b = """\ + + You are a master python developer and teacher + + Format your response in json + +""" + +prompt_b = f"""\ + + You are a Python engineer who is rigorous about correctness and follows PEP-8 and best practices. + + + + Review the Python code and identify ALL bugs, listing each one with a short explanation of why it is a bug. + Separate bugs by severity: Critical / Medium / Style + or each bug: state the line number, the problem, and a one-line fix hint + + + + ... 
+ + + +{code_to_review} +""" + +messages_b = [ + {"role": "system", "content": system_b}, + {"role": "user", "content": prompt_b}, +] +print_messages(messages_b) +response_b = chat(client, messages_b) +print(response_b) + + +# ── Part C: Request a Corrected Function ────────────────────────────────────── +print_separator("Part C – Ask for a Corrected Function") + +# TODO 7: Pick one buggy function from analyze_me.py (e.g. calculate_statistics). +# Write a new user message — continuing the SAME conversation as Part B — +# that asks the LLM to rewrite that function with all bugs fixed, +# including proper type hints and a docstring. +# +# Key insight: you can reuse the model's previous response by appending it to +# the messages list as an "assistant" message, then adding a new "user" message. +# This is how multi-turn conversations work with the API. + +messages_c = messages_b + [ + {"role": "assistant", "content": response_b}, # LLM's previous answer + {"role": "user", "content": "Fix all bugs, keep the rest as it is"}, +] +print_messages(messages_c) +response_c = chat(client, messages_c) +print(response_c) + + +# ── Reflection Questions ────────────────────────────────────────────────────── +print_separator("Reflection Questions") +print( + "1. Did the LLM find all 7 bugs? Which did it miss?\n" + "2. How did the tag change the structure of the answer?\n" + "3. What is the advantage of continuing a conversation vs. starting fresh?\n" + "4. 
How would you scale this pattern to a large codebase (many files)?\n" +) diff --git a/Prompting Exercise/ex02_persona_task_data_solution.py b/Prompting Exercise/ex02_persona_task_data_solution.py new file mode 100644 index 0000000..cd3c586 --- /dev/null +++ b/Prompting Exercise/ex02_persona_task_data_solution.py @@ -0,0 +1,122 @@ +""" +Exercise 2 – SOLUTION – Persona, Task, and Data in a Structured Prompt +======================================================================= +AISE501 · Prompting in Coding · Spring Semester 2026 +""" + +from pathlib import Path + +from server_utils import chat, get_client, print_messages, print_separator + +client = get_client() + +code_to_review = Path("analyze_me.py").read_text() +temperature_value=1 + +# ── Part A: Persona + Task + Code ───────────────────────────────────────────── +print_separator("Part A – Structured Prompt: Persona / Task / Code") + +prompt_a = f"""\ + + You are a senior Python engineer with 10+ years of experience. + You are rigorous about correctness, follow PEP-8 strictly, and care + deeply about defensive programming and readable code. + + + + Review the Python code provided below. + Identify every bug and code-quality issue you can find. + For each issue, state what is wrong and why it is a problem. + + + +{code_to_review} +""" + +messages_a = [ + {"role": "user", "content": prompt_a} +] + +print_messages(messages_a) +response_a = chat(client, messages_a, temperature=temperature_value) +print(response_a) + + +# ── Part B: Refine – Ask for a Prioritised Bug List ─────────────────────────── +print_separator("Part B – Refined Prompt: Prioritised Bug List") + +system_b = """\ +You are a senior Python engineer performing a thorough code review. +Be concise, precise, and always refer to line numbers when available. +""" + +prompt_b = f"""\ + + You are a senior Python engineer with 10+ years of experience. 
+ You are rigorous about correctness, follow PEP-8, and care about + defensive programming and readable code. + + + + Review the Python code below. + Identify every bug and code-quality issue. + Classify each finding by severity: + - Critical : causes a crash or wrong result under normal use + - Medium : bad practice that will cause problems in production + - Style : violates PEP-8 or reduces readability + + + + For each finding produce exactly this structure (plain text): + [SEVERITY] Line : + Fix hint: + + Group findings under headings: ## Critical, ## Medium, ## Style + + + +{code_to_review} +""" + +messages_b = [ + {"role": "system", "content": system_b}, + {"role": "user", "content": prompt_b}, +] + +print_messages(messages_b) +response_b = chat(client, messages_b, temperature=temperature_value) +print(response_b) + + +# ── Part C: Request a Corrected Function ────────────────────────────────────── +print_separator("Part C – Ask for a Corrected Function") + +followup = """\ + + Rewrite only the `calculate_statistics` function with all bugs fixed. + Requirements: + - Handle an empty list gracefully (return None or raise ValueError with a clear message) + - Use sample variance (divide by N-1) + - Add full PEP-8 type hints + - Add a NumPy-style docstring + Return only the function code, no surrounding explanation. +""" + +messages_c = messages_b + [ + {"role": "assistant", "content": response_b}, + {"role": "user", "content": followup}, +] + +print_messages(messages_c) +response_c = chat(client, messages_c, temperature=temperature_value) +print(response_c) + + +# ── Reflection Questions ────────────────────────────────────────────────────── +print_separator("Reflection Questions") +print( + "1. Did the LLM find all 7 bugs? Which did it miss?\n" + "2. How did the tag change the structure of the answer?\n" + "3. What is the advantage of continuing a conversation vs. starting fresh?\n" + "4. 
How would you scale this pattern to a large codebase (many files)?\n" +) diff --git a/Prompting Exercise/ex03_structured_output.py b/Prompting Exercise/ex03_structured_output.py new file mode 100644 index 0000000..5b87616 --- /dev/null +++ b/Prompting Exercise/ex03_structured_output.py @@ -0,0 +1,231 @@ +""" +Exercise 3 – Structured Input and Structured Output +==================================================== +AISE501 · Prompting in Coding · Spring Semester 2026 + +Learning goals +-------------- +* Request machine-parseable output (JSON and YAML) from the LLM. +* Parse the JSON response in Python and use it programmatically. +* Build a second prompt dynamically from the parsed data. +* Understand why structured output is essential for LLM pipelines. + +Tasks +----- +Part A Ask the LLM to review analyze_me.py and return a JSON report (TODOs 1-4). +Part B Parse the JSON response and print a summary table (TODOs 5-6). +Part C Use the parsed data to build a follow-up prompt automatically (TODOs 7-8). +Part D Repeat Part A but request YAML instead of JSON (TODO 9). + +Estimated time: 40-50 minutes +""" + +import json +from pathlib import Path + +from server_utils import chat, chat_json, get_client, print_messages, print_separator + +client = get_client() + +code_to_review = Path("analyze_me.py").read_text() + + +# ── Part A: Structured Input → JSON Output ──────────────────────────────────── +print_separator("Part A – Request JSON Output") + +# TODO 1: Write a system prompt that instructs the model to ALWAYS respond +# with valid JSON and nothing else (no markdown fences, no explanation). + +system_a = """\ + + You are a master python tutor + + Only respond in a json format following the user provided schema + +""" + +# TODO 2: Write the user prompt. +# Use XML tags for , , and . 
+# +# In , specify the exact JSON schema you expect: +# +schema = """{ + "summary": "", + "bugs": [ + { + "id": 1, + "severity": "Critical|Medium|Style", + "line": , + "function": "", + "description": "", + "fix": "" + }, + ... + ], + "overall_quality": "Poor|Fair|Good|Excellent" +}""" +# +# Tip: paste the schema directly inside a tag in your prompt. + +prompt_a = f"""\ +TODO: Write your structured prompt here. +Include , , , and tags. + + + You are a Python engineer who is rigorous about correctness and follows PEP-8 and best practices. + + + + Review the Python code and identify ALL bugs. + Explain all the bugs you found the schema provided. + + + +{schema} + + + +{code_to_review} +""" + +messages_a = [ + # TODO 3: build the messages list (system + user) + {"role": "system", "content": system_a}, + {"role": "user", "content": prompt_a}, +] + +# TODO 4: call chat_json() and store the raw response string in raw_json_a. +# chat_json() adds response_format={"type": "json_object"} so the +# server guarantees the output is parseable by json.loads(). +print_messages(messages_a) +raw_json_a = chat_json(client, messages_a) +print("Raw response:") +print(raw_json_a) + + +# ── Part B: Parse the JSON and Display a Summary ────────────────────────────── +print_separator("Part B – Parse JSON and Print Summary") + +# TODO 5: Parse raw_json_a with json.loads(). +# Handle the case where the model returned malformed JSON +# (wrap in try/except and print a helpful error message). + +report = json.loads(raw_json_a) + +# TODO 6: Print a formatted summary table like this: +# +# Overall quality : Fair +# Summary : ... +# +# ID | Severity | Line | Function | Description +# ---+----------+------+-----------------------+--------------------------- +# 1 | Critical | 12 | calculate_statistics | ZeroDivisionError on ... +# 2 | ... +# +# Hint: use f-strings and ljust() / rjust() for alignment. 
+ +print(f"Overall quality : {report['overall_quality']}") +print(f"Summary : {report['summary']}\n") + +bugs = report.get("bugs", []) +if bugs: + headers = { + "id": "ID", + "severity": "Severity", + "line": "Line", + "function": "Function", + "description": "Description", + } + + # Compute column widths + widths = { + key: max(len(headers[key]), *(len(str(b[key])) for b in bugs)) + for key in headers + } + + # Header row + print( + f"{headers['id'].ljust(widths['id'])} | " + f"{headers['severity'].ljust(widths['severity'])} | " + f"{headers['line'].ljust(widths['line'])} | " + f"{headers['function'].ljust(widths['function'])} | " + f"{headers['description']}" + ) + + # Separator row + print( + f"{'-' * widths['id']}-+-" + f"{'-' * widths['severity']}-+-" + f"{'-' * widths['line']}-+-" + f"{'-' * widths['function']}-+-" + f"{'-' * widths['description']}" + ) + + # Data rows + for bug in bugs: + print( + f"{str(bug['id']).ljust(widths['id'])} | " + f"{bug['severity'].ljust(widths['severity'])} | " + f"{str(bug['line']).ljust(widths['line'])} | " + f"{bug['function'].ljust(widths['function'])} | " + f"{bug['description']}" + ) + +# ── Part C: Use the Parsed Data to Build a Follow-Up Prompt ────────────────── +print_separator("Part C – Dynamic Follow-Up Prompt from Parsed Data") + +# TODO 7: Select all bugs with severity "Critical" from the parsed report. +# Build a new user prompt that: +# - Lists each critical bug by ID and description +# - Asks the LLM to provide the corrected code for each one +# - Requests the output as a JSON OBJECT (not a bare array, because +# response_format=json_object requires an object at the top level): +# {"fixes": [{"bug_id": 1, "fixed_code": "..."}, ...]} +# +# Tip: wrap the schema in a {"fixes": [...]} object so chat_json() works. + +critical_bugs = [b for b in report["bugs"] if b["severity"] == "Critical"] + +followup_prompt = """\ +TODO: Build the follow-up prompt dynamically using the critical_bugs list. 
+ Loop over critical_bugs to embed each bug's description in the prompt. +""" + +# TODO 8: Continue the conversation (multi-turn) by appending the previous +# response and the new prompt, then call chat_json() and parse the result. +# Because the schema is {"fixes": [...]}, extract the list with ["fixes"]. + +# messages_c = messages_a + [ +# {"role": "assistant", "content": raw_json_a}, +# {"role": "user", "content": followup_prompt}, +# ] +# print_messages(messages_c) +# raw_json_c = chat_json(client, messages_c) +# fixes = json.loads(raw_json_c)["fixes"] +# for fix in fixes: +# print(f"\n--- Fix for bug {fix['bug_id']} ---") +# print(fix["fixed_code"]) + + +# ── Part D: Request YAML Instead of JSON ───────────────────────────────────── +print_separator("Part D – YAML Output") + +# TODO 9: Repeat Part A but ask for YAML output instead of JSON. +# Install PyYAML if needed: pip install pyyaml +# Parse the response with yaml.safe_load() and print the result. +# +# Question: Which format do you prefer for human-readable reports? For +# machine-to-machine pipelines? + +# import yaml +# ... + + +# ── Reflection Questions ────────────────────────────────────────────────────── +print_separator("Reflection Questions") +print( + "1. What can go wrong when asking an LLM to return JSON?\n" + "2. How did the tag influence the output structure?\n" + "3. Why is structured output important for building LLM pipelines?\n" + "4. When would you use JSON vs. YAML vs. 
plain text?\n" +) diff --git a/Prompting Exercise/ex03_structured_output_solution.py b/Prompting Exercise/ex03_structured_output_solution.py new file mode 100644 index 0000000..b2fa154 --- /dev/null +++ b/Prompting Exercise/ex03_structured_output_solution.py @@ -0,0 +1,188 @@ +""" +Exercise 3 – SOLUTION – Structured Input and Structured Output +============================================================== +AISE501 · Prompting in Coding · Spring Semester 2026 +""" + +import json +from pathlib import Path + +import yaml # pip install pyyaml + +from server_utils import chat, chat_json, get_client, print_messages, print_separator + +client = get_client() + +code_to_review = Path("analyze_me.py").read_text() + + +# ── Part A: Structured Input → JSON Output ──────────────────────────────────── +print_separator("Part A – Request JSON Output") + +system_a = """\ +You are a code-review assistant. You ALWAYS respond with valid JSON and +nothing else — no markdown code fences, no introductory text, no trailing +commentary. Your entire response must be parseable by json.loads(). +""" + +prompt_a = f"""\ + + You are a senior Python engineer performing a thorough, structured code review. + + + + Review the Python code below and return your findings as JSON. + Follow the schema defined in exactly. 
+ + + +{{ + "summary": "", + "bugs": [ + {{ + "id": 1, + "severity": "Critical|Medium|Style", + "line": , + "function": "", + "description": "", + "fix": "" + }} + ], + "overall_quality": "Poor|Fair|Good|Excellent" +}} + + + +{code_to_review} +""" + +messages_a = [ + {"role": "system", "content": system_a}, + {"role": "user", "content": prompt_a}, +] + +print_messages(messages_a) +raw_json_a = chat_json(client, messages_a) # response_format=json_object → always valid JSON +print("Raw response:") +print(raw_json_a) + + +# ── Part B: Parse the JSON and Display a Summary ────────────────────────────── +print_separator("Part B – Parse JSON and Print Summary") + +report = json.loads(raw_json_a) + + +print(f"Overall quality : {report['overall_quality']}") +print(f"Summary : {report['summary']}\n") + +col_w = [4, 10, 6, 24, 45] +header = ( + f"{'ID':<{col_w[0]}} | {'Severity':<{col_w[1]}} | {'Line':<{col_w[2]}} | " + f"{'Function':<{col_w[3]}} | {'Description':<{col_w[4]}}" +) +print(header) +print("-" * len(header)) + +for bug in report["bugs"]: + line_str = str(bug["line"]) if bug["line"] is not None else "—" + print( + f"{bug['id']:<{col_w[0]}} | " + f"{bug['severity']:<{col_w[1]}} | " + f"{line_str:<{col_w[2]}} | " + f"{bug['function']:<{col_w[3]}} | " + f"{bug['description'][:col_w[4]]}" + ) + + +# ── Part C: Use the Parsed Data to Build a Follow-Up Prompt ────────────────── +print_separator("Part C – Dynamic Follow-Up Prompt from Parsed Data") + +critical_bugs = [b for b in report["bugs"] if b["severity"] == "Critical"] + +if not critical_bugs: + print("No critical bugs found — nothing to fix.") +else: + lines = [] + for b in critical_bugs: + lines.append(f' - Bug {b["id"]} (line {b["line"]}): {b["description"]}') + bug_list_text = "\n".join(lines) + + followup_prompt = f"""\ + + The following critical bugs were found in analyze_me.py: + +{bug_list_text} + + For each bug, provide the corrected Python code snippet (the full function + is fine). 
Return your answer as a JSON object with this schema: + {{ + "fixes": [ + {{"bug_id": , "fixed_code": ""}} + ] + }} + No markdown, no explanation — only the JSON object. +""" + + messages_c = messages_a + [ + {"role": "assistant", "content": raw_json_a}, + {"role": "user", "content": followup_prompt}, + ] + + print_messages(messages_c) + raw_json_c = chat_json(client, messages_c) + + fixes = json.loads(raw_json_c)["fixes"] + for fix in fixes: + print(f"\n--- Fix for bug {fix['bug_id']} ---") + print(fix["fixed_code"]) + + +# ── Part D: Request YAML Instead of JSON ───────────────────────────────────── +print_separator("Part D – YAML Output") + +system_d = """\ +You are a code-review assistant. You ALWAYS respond with valid YAML and +nothing else — no markdown fences, no introductory text. +""" + +prompt_d = f"""\ + + You are a senior Python engineer performing a structured code review. + + + + Review the code below and return your findings as YAML. + Use the same fields as before: summary, bugs (with id/severity/line/ + function/description/fix), and overall_quality. + + + +{code_to_review} +""" + +messages_d = [ + {"role": "system", "content": system_d}, + {"role": "user", "content": prompt_d}, +] + +print_messages(messages_d) +raw_yaml = chat(client, messages_d, temperature=0.2) + +try: + yaml_report = yaml.safe_load(raw_yaml) + print(f"Parsed YAML – overall quality: {yaml_report.get('overall_quality')}") + print(f"Number of bugs found: {len(yaml_report.get('bugs', []))}") +except yaml.YAMLError as e: + print(f"ERROR: malformed YAML: {e}") + print(raw_yaml) + + +# ── Reflection Questions ────────────────────────────────────────────────────── +print_separator("Reflection Questions") +print( + "1. What can go wrong when asking an LLM to return JSON?\n" + "2. How did the tag influence the output structure?\n" + "3. Why is structured output important for building LLM pipelines?\n" + "4. When would you use JSON vs. YAML vs. 
plain text?\n" +) diff --git a/Prompting Exercise/ex04_cot_pipeline.py b/Prompting Exercise/ex04_cot_pipeline.py new file mode 100644 index 0000000..606515b --- /dev/null +++ b/Prompting Exercise/ex04_cot_pipeline.py @@ -0,0 +1,300 @@ +""" +Exercise 4 – Build Your Own Chain-of-Thought Pipeline +====================================================== +AISE501 · Prompting in Coding · Spring Semester 2026 + +Learning goals +-------------- +* Understand that reasoning models (o1, DeepSeek-R1, Qwen3 think mode) + generate a hidden "plan" before giving the final answer. +* Replicate this behaviour manually using multiple LLM calls: + Call 1 (Planning) – structured input → structured JSON plan + Calls 2…N (Execution) – iterate step-by-step, validating each step +* See why explicit reasoning steps improve answer quality for complex tasks. + +Background +---------- +When you disable Qwen3's built-in thinking mode (as we do in server_utils), +you get fast, direct answers — but no explicit reasoning. +In this exercise you rebuild that reasoning step yourself, step by step, +so you can inspect and control the thinking process. + +The problem +----------- +Given the buggy analyze_me.py from earlier exercises, design and implement +a corrected, production-ready version of the full module. + +Tasks +----- +Part A Planning phase: structured input → JSON reasoning plan (TODOs 1-5). +Part B Iterative execution: apply each plan step one at a time, + validating syntax after each step (TODOs 6-10). +Part C Reflection — compare with and without CoT (TODO 11). 
+ +Estimated time: 50-60 minutes +""" + +import json +import subprocess +import sys +from pathlib import Path + +from server_utils import ( + chat, chat_json, get_client, print_messages, print_separator, + strip_code_fences, +) + +client = get_client() + +code_to_fix = Path("analyze_me.py").read_text() + +# ── The Problem Statement ───────────────────────────────────────────────────── +# We will use this description in both phases so we define it once. + +PROBLEM = """\ +Rewrite the Python module analyze_me.py so that it is correct, +robust, and production-ready. + +Requirements: + 1. calculate_statistics() must handle empty lists without crashing. + 2. Use sample variance (divide by N-1). + 3. process_data() must use a context manager and handle non-numeric lines. + 4. normalize() must fix the operator-precedence bug and raise ValueError + for unknown methods. + 5. All functions must have PEP-484 type hints and NumPy-style docstrings. + 6. The module must pass basic sanity checks when run as __main__. +""" + + +# ── Part A: Planning Phase ──────────────────────────────────────────────────── +print_separator("Part A – Planning Phase (CoT Step 1)") + +# The goal of this phase is NOT to write the code — it is to produce a +# structured plan: what steps are needed and in what order? + +# TODO 1: Write a system prompt that instructs the model to act as a +# "software architect" whose job is ONLY to produce a plan, +# never to write the final code. +# IMPORTANT: explicitly forbid code snippets in all fields — +# use plain English only. This prevents unescaped quotes from +# breaking the JSON output. +# Enforce JSON-only output. + +system_plan = """\ +TODO: Write a system prompt for the planning phase. + The model should only reason and plan, not write code. + Enforce JSON-only output. 
+""" + +# TODO 2: Write the planning user prompt using XML tags: +# – embed the PROBLEM string +# – embed the buggy code_to_fix +# – ask for a step-by-step plan +# – specify the exact JSON schema for the plan: +# +# { +# "goal": "", +# "steps": [ +# { +# "step_id": 1, +# "title": "", +# "reasoning": "", +# "action": "", +# "depends_on": [] // list of step_ids this step depends on +# }, +# ... +# ] +# } + +prompt_plan = f"""\ +TODO: Write the planning prompt here. +Use , , , and tags. + + +{PROBLEM} + + + +{code_to_fix} +""" + +# TODO 3: Build messages_plan (system + user) and call chat_json(). +# Use chat_json() (not chat()) so the server enforces valid JSON via +# response_format={"type": "json_object"}. +# Use max_tokens=4096 — the plan can be long and would get cut off +# with the default 2048, producing truncated (unparseable) JSON. + +messages_plan = [ + # TODO: add system and user messages +] + +# print_messages(messages_plan) +# raw_plan = chat_json(client, messages_plan, max_tokens=4096) +# print("Raw plan JSON:") +# print(raw_plan) + + +# TODO 4: Parse raw_plan with json.loads(). +# Print each step in a readable format: +# Step 1 – +# Reasoning : <reasoning> +# Action : <action> + +# plan = json.loads(raw_plan) +# print(f"\nGoal: {plan['goal']}\n") +# for step in plan["steps"]: +# print(f"Step {step['step_id']} – {step['title']}") +# print(f" Reasoning : {step['reasoning']}") +# print(f" Action : {step['action']}\n") + + +# TODO 5: (Optional) Inspect the plan critically. +# Does the order of steps make sense? +# Are any steps missing? +# You can edit the plan dict before passing it to the execution phase. + + +# ── Part B: Iterative Execution Phase ──────────────────────────────────────── +print_separator("Part B – Iterative Execution Phase (CoT Step 2)") + +# KEY INSIGHT: Instead of dumping the entire plan into one big prompt +# (which would just be another one-shot), we iterate through each step +# individually. After every step we: +# 1. 
Feed the model only the CURRENT step + the accumulated code so far +# 2. Validate the output (syntax check via py_compile) +# 3. Use the validated output as input for the next step +# +# This mirrors how a real developer works: implement one change, verify it +# compiles, then move on. The model always works with CONCRETE code from +# the previous step rather than an abstract plan of what it intends to write. + +# TODO 6: Write a system prompt for the execution phase. +# The model should act as a developer who receives the current +# state of a module plus a single step to implement. +# It should apply ONLY that step and return the full updated module. + +system_exec = """\ +TODO: Write a system prompt for the step-by-step execution phase. + The model should apply ONE step at a time. +""" + + +# TODO 7: Complete the validate_syntax() function below. +# It should write code to a temp file and run py_compile on it. +# Return (True, "") if syntax is valid, (False, error_message) otherwise. + +def validate_syntax(code: str) -> tuple[bool, str]: + """Write code to a temp file and run py_compile to check syntax.""" + tmp = Path("_tmp_validate.py") + # TODO: write code to tmp, run py_compile, clean up, return result + tmp.unlink(missing_ok=True) + return True, "" # placeholder + + +# TODO 8: Implement the step-by-step execution loop. +# Start with current_code = code_to_fix (the original buggy code). +# For each step in plan["steps"]: +# a) Build a prompt with <current_code>, <step>, and <task> tags +# b) Call chat() with the prompt +# c) Strip code fences from the response +# d) Validate syntax using validate_syntax() +# e) If valid: update current_code +# f) If invalid: retry ONCE with error feedback +# g) Print the code after each step + +# current_code = code_to_fix +# +# for step in plan["steps"]: +# step_id = step["step_id"] +# print_separator(f"Executing Step {step_id} – {step['title']}") +# +# prompt_step = f"""\ +# TODO: Build the per-step prompt here. 
+# Include <current_code>, <step>, and <task> tags. +# Tell the model to apply ONLY this step.""" +# +# messages_step = [ +# {"role": "system", "content": system_exec}, +# {"role": "user", "content": prompt_step}, +# ] +# +# print_messages(messages_step) +# raw_response = chat(client, messages_step, temperature=0.2, max_tokens=4096) +# step_code = strip_code_fences(raw_response) +# +# # Validate syntax +# ok, error_msg = validate_syntax(step_code) +# if ok: +# print(f" [PASS] Step {step_id} – syntax OK") +# current_code = step_code +# else: +# print(f" [FAIL] Step {step_id} – syntax error: {error_msg}") +# # TODO: retry with error feedback (see TODO 9) +# +# print(f"\n--- Code after Step {step_id} ---") +# print(current_code) + + +# TODO 9: Implement the retry logic for syntax errors. +# When a step produces invalid syntax: +# a) Build a retry prompt with the <error> and the broken <code> +# b) Ask the model to fix the syntax error +# c) Validate again +# d) If still broken, keep the last valid code and continue + + +# TODO 10: Save the final result and run it as a validation. +# - Save current_code to "analyze_me_fixed.py" +# - Run it with subprocess and print the output + +# Path("analyze_me_fixed.py").write_text(current_code) +# print("\nSaved iterative CoT result to analyze_me_fixed.py") +# +# result = subprocess.run( +# [sys.executable, "analyze_me_fixed.py"], +# capture_output=True, text=True, +# ) +# print("STDOUT:", result.stdout) +# if result.stderr: +# print("STDERR:", result.stderr) +# print(f"Exit code: {result.returncode}") + + +# ── Part C: Compare With and Without CoT ───────────────────────────────────── +print_separator("Part C – Baseline: Direct Prompt Without CoT") + +# TODO 11: Send the same problem to the model in a SINGLE prompt with NO plan. +# Compare this response with the iterative CoT version. 
+ +direct_prompt = f"""\ +TODO: Write a direct, single-shot prompt asking the model to rewrite + analyze_me.py according to the PROBLEM requirements. + No plan, no iteration — just ask directly. + +<problem> +{PROBLEM} +</problem> + +<code language="python" filename="analyze_me.py"> +{code_to_fix} +</code>""" + +# messages_direct = [{"role": "user", "content": direct_prompt}] +# print_messages(messages_direct) +# direct_response = chat(client, messages_direct, temperature=0.3, max_tokens=4096) +# print(direct_response) + + +# ── Reflection Questions ────────────────────────────────────────────────────── +print_separator("Reflection Questions") +print( + "1. How did the iterative CoT output differ from the direct single-shot?\n" + "2. Did the validation step catch any syntax errors? How were they fixed?\n" + "3. What would happen if you gave the model a deliberately wrong plan?\n" + "4. How does this manual CoT pipeline relate to built-in thinking modes\n" + " in models like o1, DeepSeek-R1, and Qwen3 with think mode enabled?\n" + "5. What are the trade-offs of step-by-step iteration vs. one-shot?\n" + " (Think: latency, cost, error isolation, debuggability)\n" + "6. 
How could you extend the validation step beyond syntax checking?\n" + " (Hint: unit tests, type checking, linting)\n" +) diff --git a/Prompting Exercise/ex04_cot_pipeline_solution.py b/Prompting Exercise/ex04_cot_pipeline_solution.py new file mode 100644 index 0000000..ea766dd --- /dev/null +++ b/Prompting Exercise/ex04_cot_pipeline_solution.py @@ -0,0 +1,279 @@ +""" +Exercise 4 – SOLUTION – Build Your Own Chain-of-Thought Pipeline +================================================================ +AISE501 · Prompting in Coding · Spring Semester 2026 +""" + +import ast +import json +import subprocess +import sys +from pathlib import Path + +from server_utils import ( + chat, chat_json, get_client, print_messages, print_separator, + strip_code_fences, +) + +client = get_client() + +code_to_fix = Path("analyze_me.py").read_text() + +PROBLEM = """\ +Rewrite the Python module analyze_me.py so that it is correct, +robust, and production-ready. + +Requirements: + 1. calculate_statistics() must handle empty lists without crashing. + 2. Use sample variance (divide by N-1). + 3. process_data() must use a context manager and handle non-numeric lines. + 4. normalize() must fix the operator-precedence bug and raise ValueError + for unknown methods. + 5. All functions must have PEP-484 type hints and NumPy-style docstrings. + 6. The module must pass basic sanity checks when run as __main__. +""" + + +# ── Part A: Planning Phase ──────────────────────────────────────────────────── +print_separator("Part A – Planning Phase (CoT Step 1)") + +system_plan = """\ +You are a software architect. Your ONLY job right now is to produce a +structured reasoning plan. You must NOT write any Python code or code +snippets anywhere in your response — not in action fields, not in +reasoning fields, nowhere. Use plain English descriptions only. +Respond with valid JSON only (no markdown fences, no extra text). 
+""" + +prompt_plan = f"""\ +<problem> +{PROBLEM} +</problem> + +<code language="python" filename="analyze_me.py"> +{code_to_fix} +</code> + +<task> + Analyse the problem and the buggy code above. + Produce a step-by-step plan that a developer can follow to implement + the corrected module. Each step must be atomic and self-contained. +</task> + +<schema> +{{ + "goal": "<one-sentence goal>", + "steps": [ + {{ + "step_id": 1, + "title": "<short title>", + "reasoning": "<why this step is necessary>", + "action": "<concrete action to take — plain English only, no code>", + "depends_on": [] + }} + ] +}} +</schema>""" + +messages_plan = [ + {"role": "system", "content": system_plan}, + {"role": "user", "content": prompt_plan}, +] + +print_messages(messages_plan) +raw_plan = chat_json(client, messages_plan, max_tokens=4096) +print("Raw plan JSON:") +print(raw_plan) + +plan = json.loads(raw_plan) + +print(f"\nGoal: {plan['goal']}\n") +for step in plan["steps"]: + print(f"Step {step['step_id']} – {step['title']}") + print(f" Reasoning : {step['reasoning']}") + print(f" Action : {step['action']}") + deps = step.get("depends_on", []) + if deps: + print(f" Depends on: steps {deps}") + print() + + +# ── Part B: Iterative Execution Phase ──────────────────────────────────────── +print_separator("Part B – Iterative Execution Phase (CoT Step 2)") + +# Instead of dumping the entire plan into a single prompt, we iterate through +# each step individually. After every step we: +# 1. Feed the model only the CURRENT step + the accumulated code so far +# 2. Validate the output (syntax check via py_compile) +# 3. Use the validated output as input for the next step +# +# This mirrors how a real developer works: implement one change, verify it +# compiles, then move on. It also means the model always works with CONCRETE +# code from the previous step rather than an abstract plan of what it intends +# to write. + +system_exec = """\ +You are a senior Python developer. 
def validate_syntax_ast(code: str) -> tuple[bool, str]:
    """Check whether *code* parses as Python without touching the filesystem.

    Parameters
    ----------
    code : Python source text to check.

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when ``ast.parse`` accepts the source, otherwise
        ``(False, message)`` where *message* is the text of the parse error.
    """
    try:
        ast.parse(code)
    except (SyntaxError, ValueError) as exc:
        # ValueError: before Python 3.12 a NUL byte in the source is reported
        # as ValueError rather than SyntaxError.
        return False, str(exc)
    return True, ""


def validate_syntax(code: str) -> tuple[bool, str]:
    """Check *code* for syntax errors by byte-compiling it with ``py_compile``.

    The source is written to a file inside a throw-away temporary directory,
    so neither the ``.py`` file nor the generated ``.pyc`` is left behind and
    nothing is written to the working directory.

    Parameters
    ----------
    code : Python source text to check.

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when the source compiles, otherwise ``(False, message)``
        where *message* is the compiler's error report.
    """
    # Local imports: only this function needs these modules.
    import py_compile
    import tempfile

    with tempfile.TemporaryDirectory() as tmp_dir:
        source = Path(tmp_dir) / "_tmp_validate.py"
        # Python reads source files as UTF-8 by default; writing with the
        # locale encoding could turn valid non-ASCII text (for example an en
        # dash inside a comment) into a spurious syntax error.
        source.write_text(code, encoding="utf-8")
        try:
            py_compile.compile(
                str(source),
                cfile=str(source.with_suffix(".pyc")),
                # dfile keeps the random temp path out of the error message.
                dfile="_tmp_validate.py",
                doraise=True,
            )
        except py_compile.PyCompileError as exc:
            return False, exc.msg
    return True, ""
+</task>""" + + messages_step = [ + {"role": "system", "content": system_exec}, + {"role": "user", "content": prompt_step}, + ] + + print_messages(messages_step) + raw_response = chat(client, messages_step, temperature=0.2, max_tokens=4096) + step_code = strip_code_fences(raw_response) + + # ── Validate: syntax check before moving on ── + ok, error_msg = validate_syntax(step_code) + if ok: + print(f" [PASS] Step {step_id} – syntax OK") + current_code = step_code + else: + print(f" [FAIL] Step {step_id} – syntax error:\n{error_msg}") + print(" Retrying with error feedback...") + + # Give the model one chance to fix its own syntax error + retry_prompt = f"""\ +The code you returned has a syntax error: + +<error> +{error_msg} +</error> + +<code> +{step_code} +</code> + +<task> + Fix the syntax error and return the complete corrected module. + Do not include any explanation outside the code. +</task>""" + + messages_retry = [ + {"role": "system", "content": system_exec}, + {"role": "user", "content": retry_prompt}, + ] + + print_messages(messages_retry) + retry_response = chat(client, messages_retry, temperature=0.1, max_tokens=4096) + retry_code = strip_code_fences(retry_response) + + ok2, error_msg2 = validate_syntax(retry_code) + if ok2: + print(f" [PASS] Step {step_id} – retry syntax OK") + current_code = retry_code + else: + print(f" [FAIL] Step {step_id} – retry still has errors: {error_msg2}") + print(" Continuing with last valid code.") + + print(f"\n--- Code after Step {step_id} ---") + print(current_code) + print() + +# Save final result +Path("analyze_me_fixed.py").write_text(current_code) +print("\nSaved iterative CoT result to analyze_me_fixed.py") + +# Final validation: run the module +print_separator("Final Validation – Running analyze_me_fixed.py") +result = subprocess.run( + [sys.executable, "analyze_me_fixed.py"], + capture_output=True, text=True, +) +print("STDOUT:", result.stdout) +if result.stderr: + print("STDERR:", result.stderr) +print(f"Exit 
code: {result.returncode}") + + +# ── Part C: Baseline – Direct Prompt Without CoT ───────────────────────────── +print_separator("Part C – Baseline: Direct Prompt Without CoT") + +direct_prompt = f"""\ +<problem> +{PROBLEM} +</problem> + +<code language="python" filename="analyze_me.py"> +{code_to_fix} +</code> + +<task> + Rewrite the module so that it satisfies all requirements in <problem>. + Return only the corrected Python code. +</task>""" + +messages_direct = [{"role": "user", "content": direct_prompt}] +print_messages(messages_direct) +direct_response = chat(client, messages_direct, temperature=0.3, max_tokens=4096) +print(direct_response) + +Path("analyze_me_direct.py").write_text(strip_code_fences(direct_response)) +print("\nSaved direct-prompt result to analyze_me_direct.py") + +print( + "\nCompare analyze_me_fixed.py (CoT) with analyze_me_direct.py (direct).\n" + "Which is more complete? Which follows the requirements more closely?" +) + + +# ── Reflection Questions ────────────────────────────────────────────────────── +print_separator("Reflection Questions") +print( + "1. How did the iterative CoT output differ from the direct single-shot?\n" + "2. Did the validation step catch any syntax errors? How were they fixed?\n" + "3. What would happen if you gave the model a deliberately wrong plan?\n" + "4. How does this manual CoT pipeline relate to built-in thinking modes\n" + " in models like o1, DeepSeek-R1, and Qwen3 with think mode enabled?\n" + "5. What are the trade-offs of step-by-step iteration vs. one-shot?\n" + " (Think: latency, cost, error isolation, debuggability)\n" + "6. 
# ── Server configuration ──────────────────────────────────────────────────────
HOST = "silicon.fhgr.ch"
PORT = 7080
API_KEY = "EMPTY"
MODEL = "qwen3.5-35b-a3b"  # model ID served on silicon.fhgr.ch


def get_client() -> OpenAI:
    """Build an OpenAI-compatible client for the configured vLLM endpoint.

    The endpoint URL is assembled from the module-level ``HOST`` and ``PORT``
    constants; ``API_KEY`` is passed through unchanged.
    """
    return OpenAI(api_key=API_KEY, base_url=f"http://{HOST}:{PORT}/v1")


def list_models(client: OpenAI) -> list[str]:
    """Return the ID of every entry reported by ``client.models.list()``."""
    available = client.models.list().data
    return [model.id for model in available]
def _repair_json_strings(text: str) -> str:
    """
    Escape raw control characters that appear inside JSON string values.

    JSON does not allow unescaped characters below U+0020 inside a string.
    Newline, carriage return and tab are rewritten to their short escapes;
    any other control character becomes a ``\\uXXXX`` escape.  Characters
    outside strings (structural whitespace) and existing backslash escapes
    are copied through unchanged.

    Parameters
    ----------
    text : JSON-like text, possibly containing raw control characters.

    Returns
    -------
    str
        The text with control characters inside string values escaped.
    """
    short_escapes = {"\n": "\\n", "\r": "\\r", "\t": "\\t"}
    repaired: list[str] = []
    in_string = False
    escaped = False
    for ch in text:
        if escaped:
            # Character right after a backslash: part of an existing escape.
            repaired.append(ch)
            escaped = False
        elif in_string and ch == "\\":
            repaired.append(ch)
            escaped = True
        elif ch == '"':
            in_string = not in_string
            repaired.append(ch)
        elif in_string and ch < " ":
            repaired.append(short_escapes.get(ch) or f"\\u{ord(ch):04x}")
        else:
            repaired.append(ch)
    return "".join(repaired)


def _find_matching_close(text: str, start: int) -> int:
    """Return the index of the bracket that closes ``text[start]``, or -1.

    ``text[start]`` must be ``{`` or ``[``.  Only brackets of that same kind
    are counted, and everything inside a double-quoted string is ignored.
    """
    opener = text[start]
    closer = "}" if opener == "{" else "]"
    depth = 0
    in_string = False
    escaped = False
    for index, ch in enumerate(text[start:], start=start):
        if escaped:
            escaped = False
        elif in_string:
            if ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == opener:
            depth += 1
        elif ch == closer:
            depth -= 1
            if depth == 0:
                return index
    return -1


def extract_json(text: str) -> str:
    """
    Extract and repair a JSON object or array from an LLM response that may
    contain extra prose, markdown code fences, or unescaped control characters.

    Strategy:
      1. If the response contains a fenced block, look inside that block first
         (unless the response itself already starts with '{' or '[').
      2. Take the first '{' and the first '[', in order of appearance, and
         extract each up to its matching closing bracket.
      3. Repair unescaped control characters inside string values and return
         the first candidate that json.loads() accepts.

    Only the first '{' and the first '[' of each source are tried, so JSON
    preceded by several unrelated brackets may not be found.

    Parameters
    ----------
    text : Raw LLM response.

    Returns
    -------
    str
        The cleaned JSON string.  If nothing parses, the first balanced
        candidate (or, failing that, the repaired input) is returned so that
        json.loads can raise a meaningful error with context.
    """
    # Local imports, matching the file's existing function-scope import style.
    import json
    import re

    stripped = text.strip()
    sources = [text]
    fence = re.search(
        r"```(?:json)?\s*([\s\S]*?)\s*```", stripped, flags=re.IGNORECASE
    )
    if fence:
        if stripped[:1] in ("{", "["):
            # The response is already bare JSON; backticks found in it are
            # most likely part of a string value, so try the whole text first.
            sources.append(fence.group(1))
        else:
            sources.insert(0, fence.group(1))

    first_balanced = None
    for source in sources:
        # Whichever container opens first wins; a top-level array of objects
        # must not be reduced to its first object.
        starts = sorted(
            pos for pos in (source.find("{"), source.find("[")) if pos != -1
        )
        for start in starts:
            end = _find_matching_close(source, start)
            if end == -1:
                continue
            candidate = _repair_json_strings(source[start:end + 1])
            try:
                json.loads(candidate)
            except ValueError:
                # Bracketed prose such as "[see below]": remember it for the
                # fallback and keep looking.
                if first_balanced is None:
                    first_balanced = candidate
                continue
            return candidate

    if first_balanced is not None:
        return first_balanced
    return _repair_json_strings(sources[0])


def strip_code_fences(text: str) -> str:
    """Remove markdown code fences (```python ... ```) from LLM output.

    LLMs often wrap code in fences even when told not to, and sometimes add
    a sentence before or after the fenced block.  Call this before writing
    LLM-generated code to a .py file so it is directly executable.

    Parameters
    ----------
    text : Raw LLM response.

    Returns
    -------
    str
        The content of the first complete fenced block when the response has
        prose around it; otherwise the text with a leading and/or trailing
        fence removed.  Surrounding whitespace is stripped in all cases.
    """
    import re

    text = text.strip()
    if not (text.startswith("```") and text.endswith("```")):
        # Prose surrounds the code: pull out the first block whose opening
        # and closing fences each start a line.
        block = re.search(
            r"^```[^\n]*\n(.*?)^```[ \t]*$",
            text,
            flags=re.DOTALL | re.MULTILINE,
        )
        if block:
            return block.group(1).strip()
    # Plain case (fence at both ends) and truncated output (no closing fence).
    text = re.sub(r"^```\w*\n?", "", text)
    text = re.sub(r"\n?```\s*$", "", text)
    return text.strip()


def print_messages(messages: list[dict]) -> None:
    """Print the full messages list before sending it to the LLM.

    Each message is shown under a header carrying its upper-cased role,
    followed by its content, framed by double-line rules.

    Parameters
    ----------
    messages : List of {"role": ..., "content": ...} dicts
    """
    width = 64
    frame = "═" * width
    print("\n" + frame)
    print(" PROMPT SENT TO LLM")
    print(frame)
    for msg in messages:
        role = msg["role"].upper()
        # Pad the role header with a rule so every header has the same width.
        rule = "─" * max(0, width - len(role) - 6)
        print(f"\n── [{role}] " + rule)
        print(msg["content"])
    print("\n" + frame)


def print_separator(title: str = "") -> None:
    """Print a visual separator with an optional title."""
    rule = "─" * 64
    print("\n" + rule)
    if title:
        print(f" {title}")
    print(rule)
+""" + +from server_utils import get_client, list_models + +client = get_client() +models = list_models(client) +print(f"Models available: {models}") + +if models: + print("Connection OK.") +else: + print("WARNING: no models returned – check server address and port.") diff --git a/code_embeddings_pca.png b/code_embeddings_pca.png new file mode 100644 index 0000000..ead9793 Binary files /dev/null and b/code_embeddings_pca.png differ diff --git a/code_embeddings_tsne.png b/code_embeddings_tsne.png new file mode 100644 index 0000000..842df8d Binary files /dev/null and b/code_embeddings_tsne.png differ diff --git a/pca_denoising_analysis.png b/pca_denoising_analysis.png new file mode 100644 index 0000000..8461c6d Binary files /dev/null and b/pca_denoising_analysis.png differ diff --git a/Übung: Clean Code/Student Grade Calculator.py b/Übung: Clean Code/Student Grade Calculator.py new file mode 100644 index 0000000..e650a1c --- /dev/null +++ b/Übung: Clean Code/Student Grade Calculator.py @@ -0,0 +1,72 @@ +""" +Bad example +""" + +def calc (l) : + t =0 + for i in l: + t = t + i + a = t / len (l) + if a >=90: + g = "A" + elif a >=80: + g = "B" + elif a >=70: + g = "C" + elif a >=60: + g = "D" + else : + g = "F" + return g, a + +def doeverything (n, s1, s2, s3, s4, s5) : + print ("Processing student :"+ n) + l = [s1, s2, s3, s4, s5] + r = calc (l) + print ("Average :"+ str (r [1])) + print ("Grade :"+ r [0]) + if r[1] >= 60: + print ("Status : PASSED") + else: + print ("Status : FAILED") + return r + +# main program +x = "John" +doeverything (x,85,90,78,92,88) +print ("---") +y = "Jane" +doeverything (y,55,60,45,50,58) +print ("---") +z = "Bob" +doeverything (z,70,75,80,72,78) + +""" +[x] Naming conventions (variables, functions, classes) +[x] Code structure and indentation +[x] Magic numbers and constants +[x] Function length and single responsibility +[ ] DRY principle (Don’t Repeat Yourself) +[x] Comments and documentation +[x] Error handling +[x] Whitespace and 
def calculate_avg(points: list[int]) -> float:
    """Return the arithmetic mean of *points*.

    Raises
    ------
    ZeroDivisionError
        If *points* is empty.
    """
    return sum(points) / len(points)


def calculate_grade(point_avg: float) -> str:
    """Map an average score to its letter grade.

    Parameters
    ----------
    point_avg : Average score to grade.

    Returns
    -------
    str
        "A" for 90 and above, "B" for 80 and above, "C" for 70 and above,
        "D" for 60 and above, otherwise "F".
    """
    # Ordered from highest to lowest so the first threshold met decides.
    grade_thresholds = ((90, "A"), (80, "B"), (70, "C"), (60, "D"))
    for minimum, letter in grade_thresholds:
        if point_avg >= minimum:
            return letter
    return "F"