Add lessons week 1-5
172
.gitignore
vendored
Normal file
@ -0,0 +1,172 @@
|
|||||||
|
# ---> Python
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# UV
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
#uv.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
||||||
|
.pdm.toml
|
||||||
|
.pdm-python
|
||||||
|
.pdm-build/
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
**/.env
|
||||||
|
**/.venv
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
||||||
|
|
||||||
10
.idea/.gitignore
generated
vendored
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
|
# Ignored default folder with query files
|
||||||
|
/queries/
|
||||||
|
# Datasource local storage ignored files
|
||||||
|
/dataSources/
|
||||||
|
/dataSources.local.xml
|
||||||
|
# Editor-based HTTP Client requests
|
||||||
|
/httpRequests/
|
||||||
14
.idea/aise-501_aise_in_se_i.iml
generated
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$">
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||||
|
</content>
|
||||||
|
<orderEntry type="jdk" jdkName="Python 3.12 (aise-501_aise_in_se_i)" jdkType="Python SDK" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
<component name="PyDocumentationSettings">
|
||||||
|
<option name="format" value="PLAIN" />
|
||||||
|
<option name="myDocStringFormat" value="Plain" />
|
||||||
|
</component>
|
||||||
|
</module>
|
||||||
12
.idea/dataSources.xml
generated
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
|
||||||
|
<data-source source="LOCAL" name="postgres@localhost" uuid="be9eece5-a8ff-447a-a6a9-4660fffe89da">
|
||||||
|
<driver-ref>postgresql</driver-ref>
|
||||||
|
<synchronize>true</synchronize>
|
||||||
|
<jdbc-driver>org.postgresql.Driver</jdbc-driver>
|
||||||
|
<jdbc-url>jdbc:postgresql://localhost:5432/postgres</jdbc-url>
|
||||||
|
<working-dir>$ProjectFileDir$</working-dir>
|
||||||
|
</data-source>
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
6
.idea/data_source_mapping.xml
generated
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="DataSourcePerFileMappings">
|
||||||
|
<file url="file://$APPLICATION_CONFIG_DIR$/scratches/scratch_1.sql" value="be9eece5-a8ff-447a-a6a9-4660fffe89da" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
7
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<settings>
|
||||||
|
<option name="PROJECT_PROFILE" value="Default" />
|
||||||
|
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||||
|
<version value="1.0" />
|
||||||
|
</settings>
|
||||||
|
</component>
|
||||||
7
.idea/misc.xml
generated
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="Black">
|
||||||
|
<option name="sdkName" value="Python 3.12" />
|
||||||
|
</component>
|
||||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.12 (aise-501_aise_in_se_i)" project-jdk-type="Python SDK" />
|
||||||
|
</project>
|
||||||
8
.idea/modules.xml
generated
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/aise-501_aise_in_se_i.iml" filepath="$PROJECT_DIR$/.idea/aise-501_aise_in_se_i.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
6
.idea/vcs.xml
generated
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
||||||
272
AISE501 LLM Zugang/STUDENT_GUIDE.md
Normal file
@ -0,0 +1,272 @@
|
|||||||
|
# Student Guide — Qwen3.5 Inference Server
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
A **Qwen3.5** large language model is running on our GPU server. Two models
|
||||||
|
may be available at different times (your instructor will let you know which
|
||||||
|
one is active):
|
||||||
|
|
||||||
|
| Model | Params | Best for |
|
||||||
|
|-------|--------|----------|
|
||||||
|
| `qwen3.5-35b-a3b` | 35B (3B active) | Fast responses, everyday tasks |
|
||||||
|
| `qwen3.5-122b-a10b-fp8` | 122B (10B active) | Complex reasoning, coding, research |
|
||||||
|
|
||||||
|
There are **three ways** to interact with the model:
|
||||||
|
|
||||||
|
1. **Open WebUI** — ChatGPT-like interface in your browser (easiest)
|
||||||
|
2. **Streamlit App** — Local app with chat, file editor, and code execution
|
||||||
|
3. **Python SDK / curl** — Programmatic access via the OpenAI-compatible API
|
||||||
|
|
||||||
|
> **Note**: You must be on the fhgr network or VPN to reach the server.
|
||||||
|
|
||||||
|
## Connection Details
|
||||||
|
|
||||||
|
| Parameter | Value |
|
||||||
|
|------------------|---------------------------------------------|
|
||||||
|
| **Open WebUI** | `http://silicon.fhgr.ch:7081` |
|
||||||
|
| **API Base URL** | `http://silicon.fhgr.ch:7080/v1` |
|
||||||
|
| **Model** | *(check Open WebUI model selector or ask your instructor)* |
|
||||||
|
| **API Key** | *(ask your instructor — may be `EMPTY`)* |
|
||||||
|
|
||||||
|
> **Tip**: In Open WebUI, the model dropdown at the top automatically shows
|
||||||
|
> whichever model is currently running. For the API, use
|
||||||
|
> `curl http://silicon.fhgr.ch:7080/v1/models` to check.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Option 1: Open WebUI (Recommended)
|
||||||
|
|
||||||
|
The easiest way to chat with the model — no installation required.
|
||||||
|
|
||||||
|
### Getting Started
|
||||||
|
|
||||||
|
1. Make sure you are connected to the **university network** (or VPN).
|
||||||
|
2. Open your browser and go to **http://silicon.fhgr.ch:7081**
|
||||||
|
3. Click **"Sign Up"** to create a new account:
|
||||||
|
- Enter your **name** (e.g. your first and last name)
|
||||||
|
- Enter your **email** (use your university email)
|
||||||
|
- Choose a **password**
|
||||||
|
- Click **"Create Account"**
|
||||||
|
4. After signing up you are logged in automatically.
|
||||||
|
5. Select the model **qwen3.5-35b-a3b** from the model dropdown at the top.
|
||||||
|
6. Type a message and press Enter — you're chatting with the LLM.
|
||||||
|
|
||||||
|
### Returning Later
|
||||||
|
|
||||||
|
- Go to **http://silicon.fhgr.ch:7081** and click **"Sign In"**.
|
||||||
|
- Enter the email and password you used during sign-up.
|
||||||
|
- All your previous chats are still there.
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
- **Chat history** — all conversations are saved on the server and persist across sessions
|
||||||
|
- **Markdown rendering** with syntax-highlighted code blocks
|
||||||
|
- **Model selector** — auto-discovers available models from the server
|
||||||
|
- **Conversation branching** — edit previous messages and explore alternative responses
|
||||||
|
- **File upload** — attach files to your messages for the model to analyze
|
||||||
|
- **Search** — search across all your past conversations
|
||||||
|
|
||||||
|
### Tips
|
||||||
|
|
||||||
|
- Your account and chat history are stored on the server. You can log in
|
||||||
|
from any device on the university network.
|
||||||
|
- If you forget your password, ask your instructor to reset it via the
|
||||||
|
Admin Panel.
|
||||||
|
- The model works best when you provide clear, specific instructions.
|
||||||
|
- For code tasks, mention the programming language explicitly (e.g.
|
||||||
|
"Write a Python function that...").
|
||||||
|
- Long conversations use more context. Start a **New Chat** (top-left
|
||||||
|
button) when switching topics to get faster, more focused responses.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Option 2: Streamlit App (Chat + File Editor)
|
||||||
|
|
||||||
|
A local app with chat, file editing, and Python/LaTeX execution.
|
||||||
|
See the [Streamlit section below](#streamlit-chat--file-editor-app) for setup.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Option 3: Python SDK / curl
|
||||||
|
|
||||||
|
For programmatic access and scripting.
|
||||||
|
|
||||||
|
### Quick Start with Python
|
||||||
|
|
||||||
|
#### 1. Install the OpenAI SDK
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install openai
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. Simple Chat
|
||||||
|
|
||||||
|
```python
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
client = OpenAI(
|
||||||
|
base_url="http://silicon.fhgr.ch:7080/v1",
|
||||||
|
api_key="EMPTY", # replace if your instructor set a key
|
||||||
|
)
|
||||||
|
|
||||||
|
response = client.chat.completions.create(
|
||||||
|
model="qwen3.5-35b-a3b",
|
||||||
|
messages=[
|
||||||
|
{"role": "system", "content": "You are a helpful assistant."},
|
||||||
|
{"role": "user", "content": "Explain gradient descent in simple terms."},
|
||||||
|
],
|
||||||
|
max_tokens=1024,
|
||||||
|
temperature=0.7,
|
||||||
|
)
|
||||||
|
|
||||||
|
print(response.choices[0].message.content)
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. Streaming Responses
|
||||||
|
|
||||||
|
```python
|
||||||
|
stream = client.chat.completions.create(
|
||||||
|
model="qwen3.5-35b-a3b",
|
||||||
|
messages=[
|
||||||
|
{"role": "user", "content": "Write a haiku about machine learning."},
|
||||||
|
],
|
||||||
|
max_tokens=256,
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
for chunk in stream:
|
||||||
|
if chunk.choices[0].delta.content:
|
||||||
|
print(chunk.choices[0].delta.content, end="", flush=True)
|
||||||
|
print()
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Quick Start with curl
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl http://silicon.fhgr.ch:7080/v1/chat/completions \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "qwen3.5-35b-a3b",
|
||||||
|
"messages": [
|
||||||
|
{"role": "user", "content": "What is the capital of Switzerland?"}
|
||||||
|
],
|
||||||
|
"max_tokens": 256,
|
||||||
|
"temperature": 0.7
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recommended Parameters
|
||||||
|
|
||||||
|
| Parameter | Recommended | Notes |
|
||||||
|
|-----------------|-------------|----------------------------------------------|
|
||||||
|
| `temperature` | 0.7 | Lower = more deterministic, higher = creative |
|
||||||
|
| `max_tokens` | 1024–4096 | Increase for long-form output |
|
||||||
|
| `top_p` | 0.95 | Nucleus sampling |
|
||||||
|
| `stream` | `true` | Better UX for interactive use |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tips & Etiquette
|
||||||
|
|
||||||
|
- **Be mindful of context length**: Avoid excessively long prompts (>8K tokens) unless necessary.
|
||||||
|
- **Use streaming**: Makes responses feel faster and reduces perceived latency.
|
||||||
|
- **Don't spam requests**: The server is shared among ~15 students.
|
||||||
|
- **Check the model name**: Always pass the exact model ID currently served — query `http://silicon.fhgr.ch:7080/v1/models` if unsure.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Streamlit Chat & File Editor App
|
||||||
|
|
||||||
|
A web UI is included for chatting with the model and editing files. It runs
|
||||||
|
on your own machine and connects to the GPU server.
|
||||||
|
|
||||||
|
### Setup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Clone the repository
|
||||||
|
git clone https://gitea.fhgr.ch/herzogfloria/LLM_Inferenz_Server_1.git
|
||||||
|
cd LLM_Inferenz_Server_1
|
||||||
|
|
||||||
|
# Create a virtual environment and install dependencies
|
||||||
|
python3 -m venv .venv
|
||||||
|
source .venv/bin/activate # macOS / Linux
|
||||||
|
# .venv\Scripts\activate # Windows
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
streamlit run app.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Opens at `http://localhost:8501` in your browser.
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
**Chat Tab**
|
||||||
|
- Conversational interface with streaming responses
|
||||||
|
- "Save code" button extracts code from the LLM response and saves it to a
|
||||||
|
workspace file (strips markdown formatting automatically)
|
||||||
|
|
||||||
|
**File Editor Tab**
|
||||||
|
- Create and edit `.py`, `.tex`, `.html`, or any text file
|
||||||
|
- Syntax-highlighted preview of file content
|
||||||
|
- "Generate with LLM" button: describe a change in natural language and the
|
||||||
|
model rewrites the file (e.g. "add error handling", "fix the LaTeX formatting",
|
||||||
|
"translate comments to German")
|
||||||
|
|
||||||
|
**Sidebar Controls**
|
||||||
|
- **Connection**: API Base URL and API Key
|
||||||
|
- **LLM Parameters**: Adjustable for each request
|
||||||
|
|
||||||
|
| Parameter | Default | What it does |
|
||||||
|
|-----------|---------|--------------|
|
||||||
|
| Thinking Mode | Off | Toggle chain-of-thought reasoning (better for complex tasks, slower) |
|
||||||
|
| Temperature | 0.7 | Lower = predictable, higher = creative |
|
||||||
|
| Max Tokens | 4096 | Maximum response length |
|
||||||
|
| Top P | 0.95 | Nucleus sampling threshold |
|
||||||
|
| Presence Penalty | 0.0 | Encourage diverse topics |
|
||||||
|
|
||||||
|
- **File Manager**: Create new files and switch between them
|
||||||
|
|
||||||
|
All generated files are stored in a `workspace/` folder next to `app.py`.
|
||||||
|
|
||||||
|
> **Tip**: The app runs entirely on your local machine. Only the LLM requests
|
||||||
|
> go to the server — your files stay local.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Thinking Mode
|
||||||
|
|
||||||
|
By default, the model "thinks" before answering (internal chain-of-thought).
|
||||||
|
This is great for complex reasoning but adds latency for simple questions.
|
||||||
|
|
||||||
|
To disable thinking and get faster direct responses, add this to your API call:
|
||||||
|
|
||||||
|
```python
|
||||||
|
response = client.chat.completions.create(
|
||||||
|
model="qwen3.5-35b-a3b",
|
||||||
|
messages=[...],
|
||||||
|
max_tokens=1024,
|
||||||
|
extra_body={"chat_template_kwargs": {"enable_thinking": False}},
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
| Issue | Solution |
|
||||||
|
|-----------------------------|-----------------------------------------------------|
|
||||||
|
| Connection refused | Check you're on the university network / VPN |
|
||||||
|
| Model not found             | Use the exact model ID listed by `/v1/models` (e.g. `qwen3.5-35b-a3b`) |
|
||||||
|
| Slow responses | The model is shared — peak times may be slower |
|
||||||
|
| `401 Unauthorized` | Ask your instructor for the API key |
|
||||||
|
| Response cut off | Increase `max_tokens` in your request |
|
||||||
|
| Open WebUI login fails | Make sure you created an account first (Sign Up) |
|
||||||
|
| Open WebUI shows no models | The vLLM server may still be loading — wait a few minutes |
|
||||||
346
AISE501 LLM Zugang/app.py
Normal file
@ -0,0 +1,346 @@
|
|||||||
|
"""
|
||||||
|
Streamlit Chat & File Editor for Qwen3.5
|
||||||
|
|
||||||
|
A minimal interface to:
|
||||||
|
1. Chat with the local LLM (OpenAI-compatible API)
|
||||||
|
2. Edit, save, and generate code / LaTeX files
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
pip install streamlit openai
|
||||||
|
streamlit run app.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import streamlit as st
|
||||||
|
from openai import OpenAI
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Connection settings live in the sidebar so students can point the app at a
# different server or key without editing code.
st.sidebar.header("Connection")
API_BASE = st.sidebar.text_input("API Base URL", "http://silicon.fhgr.ch:7080/v1")
# Masked in the UI; "EMPTY" is the conventional placeholder for a no-auth server.
API_KEY = st.sidebar.text_input("API Key", "EMPTY", type="password")
# Every file the app creates or edits lives in ./workspace next to app.py.
WORKSPACE = Path("workspace")
WORKSPACE.mkdir(exist_ok=True)

client = OpenAI(base_url=API_BASE, api_key=API_KEY)
|
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data(ttl=30)
def fetch_models(base_url: str, api_key: str) -> list[str]:
    """Return the model IDs exposed by the inference server, or [] if unreachable.

    Results are cached for 30 seconds so the sidebar does not probe the
    server on every Streamlit rerun.
    """
    try:
        probe = OpenAI(base_url=base_url, api_key=api_key)
        listed = probe.models.list().data
    except Exception:
        # Best effort: an unreachable or misconfigured server yields an
        # empty list, and the caller falls back to a manual text input.
        return []
    return [entry.id for entry in listed]
|
||||||
|
|
||||||
|
|
||||||
|
# Offer whatever the server actually serves; fall back to a free-text model
# name (with a warning) when the server cannot be reached.
available_models = fetch_models(API_BASE, API_KEY)
if available_models:
    MODEL = st.sidebar.selectbox("Model", available_models)
else:
    MODEL = st.sidebar.text_input("Model (server unreachable)", "qwen3.5-35b-a3b")
    st.sidebar.warning("Could not fetch models from server.")

# ---------------------------------------------------------------------------
# Sidebar — LLM Parameters
# ---------------------------------------------------------------------------
st.sidebar.markdown("---")
st.sidebar.header("LLM Parameters")

# These widgets are re-read on every rerun, so changes apply to the next request.
thinking_mode = st.sidebar.toggle("Thinking Mode", value=False,
    help="Enable chain-of-thought reasoning. Better for complex tasks, slower for simple ones.")
temperature = st.sidebar.slider("Temperature", 0.0, 2.0, 0.7, 0.05,
    help="Lower = deterministic, higher = creative.")
max_tokens = st.sidebar.slider("Max Tokens", 256, 16384, 4096, 256,
    help="Maximum length of the response.")
top_p = st.sidebar.slider("Top P", 0.0, 1.0, 0.95, 0.05,
    help="Nucleus sampling: only consider tokens within this cumulative probability.")
presence_penalty = st.sidebar.slider("Presence Penalty", 0.0, 2.0, 0.0, 0.1,
    help="Penalize repeated topics. Higher values encourage the model to talk about new topics.")

# Maps a file suffix to the language tag used for syntax highlighting and for
# picking the right fenced block out of an LLM reply (see extract_code).
LANG_MAP = {
    ".py": "python", ".tex": "latex", ".js": "javascript",
    ".html": "html", ".css": "css", ".sh": "bash",
    ".json": "json", ".yaml": "yaml", ".yml": "yaml",
}

# Token budget used by trim_history() and the context-usage bar.
# NOTE(review): assumed to match the server's configured context window — confirm.
MAX_CONTEXT = 32768
|
||||||
|
|
||||||
|
|
||||||
|
def extract_code(text: str, lang: str = "") -> str:
    """Pick the most relevant fenced code block out of markdown *text*.

    Selection order:
    1. Blocks tagged with the requested *lang* (e.g. ```python), longest first.
    2. Otherwise the longest fenced block of any language.
    3. If no fenced block exists, the whole text, stripped.
    """
    fenced = re.findall(r"```(\w*)\n(.*?)```", text, re.DOTALL)
    if not fenced:
        return text.strip()

    wanted = lang.lower()
    preferred = [body for tag, body in fenced if tag.lower() == wanted]
    candidates = preferred if preferred else [body for _, body in fenced]
    return max(candidates, key=len).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def estimate_tokens(messages: list[dict]) -> int:
    """Crude token estimate for *messages*: roughly one token per 4 characters."""
    total_chars = 0
    for entry in messages:
        total_chars += len(entry["content"])
    return total_chars // 4
|
||||||
|
|
||||||
|
|
||||||
|
def trim_history(messages: list[dict], reserved: int) -> list[dict]:
    """Trim *messages* in place (oldest first) until the estimated size fits
    within MAX_CONTEXT minus *reserved* (the budget kept for the response).

    The most recent message is always kept. Returns the same list object.
    """
    budget = MAX_CONTEXT - reserved
    while len(messages) > 1 and estimate_tokens(messages) > budget:
        del messages[0]
    return messages
|
||||||
|
|
||||||
|
|
||||||
|
# File types the editor can execute, and the subprocess timeout in seconds.
RUNNABLE_EXTENSIONS = {".py", ".tex"}
RUN_TIMEOUT = 30
|
||||||
|
|
||||||
|
|
||||||
|
def run_file(file_path: Path) -> dict:
    """Execute a .py or .tex file and return {"stdout", "stderr", "rc"}.

    .py files run under python3; .tex files are compiled with pdflatex,
    writing artifacts (.pdf/.log/.aux) next to the source. Any other suffix
    yields rc=1 with an explanatory stderr. Timeouts and a missing
    interpreter/compiler yield rc=-1.
    """
    workdir = file_path.parent.resolve()
    extension = file_path.suffix

    if extension == ".py":
        command = ["python3", file_path.name]
    elif extension == ".tex":
        command = [
            "pdflatex",
            "-interaction=nonstopmode",
            f"-output-directory={workdir}",
            file_path.name,
        ]
    else:
        return {"stdout": "", "stderr": f"Unsupported file type: {extension}", "rc": 1}

    try:
        completed = subprocess.run(
            command,
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=RUN_TIMEOUT,
        )
    except subprocess.TimeoutExpired:
        return {"stdout": "", "stderr": f"Timed out after {RUN_TIMEOUT}s", "rc": -1}
    except FileNotFoundError as exc:
        # Interpreter/compiler not installed (e.g. no pdflatex on this machine).
        return {"stdout": "", "stderr": str(exc), "rc": -1}
    return {"stdout": completed.stdout, "stderr": completed.stderr, "rc": completed.returncode}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Sidebar — File Manager
# ---------------------------------------------------------------------------
st.sidebar.markdown("---")
st.sidebar.header("File Manager")

new_filename = st.sidebar.text_input("New file name", placeholder="main.tex")
if st.sidebar.button("Create File") and new_filename:
    # touch() creates an empty file (and is a no-op if it already exists).
    (WORKSPACE / new_filename).touch()
    st.sidebar.success(f"Created {new_filename}")
    st.rerun()

# Populate the "Open file" selector; the "(no files)" placeholder string is
# checked explicitly wherever selected_file is consumed below.
files = sorted(WORKSPACE.iterdir()) if WORKSPACE.exists() else []
file_names = [f.name for f in files if f.is_file()]
selected_file = st.sidebar.selectbox("Open file", file_names if file_names else ["(no files)"])

# ---------------------------------------------------------------------------
# Main Layout — Two Tabs
# ---------------------------------------------------------------------------
tab_chat, tab_editor = st.tabs(["Chat", "File Editor"])
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Tab 1: Chat
# ---------------------------------------------------------------------------
with tab_chat:
    st.header(f"Chat with {MODEL}")

    # Conversation history persists across Streamlit reruns via session_state.
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the whole conversation on every rerun (Streamlit redraws from scratch).
    for msg in st.session_state.messages:
        with st.chat_message(msg["role"]):
            st.markdown(msg["content"])

    if prompt := st.chat_input("Ask anything..."):
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        # Drop the oldest history so the prompt plus response budget fit the
        # context window (see trim_history).
        st.session_state.messages = trim_history(
            st.session_state.messages, reserved=max_tokens
        )

        with st.chat_message("assistant"):
            placeholder = st.empty()
            full_response = ""

            stream = client.chat.completions.create(
                model=MODEL,
                messages=st.session_state.messages,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                presence_penalty=presence_penalty,
                stream=True,
                # Server-side chat-template switch for the model's "thinking" mode.
                extra_body={"chat_template_kwargs": {"enable_thinking": thinking_mode}},
            )
            # Render tokens incrementally, with a cursor glyph while streaming.
            for chunk in stream:
                delta = chunk.choices[0].delta.content or ""
                full_response += delta
                placeholder.markdown(full_response + "▌")
            placeholder.markdown(full_response)

        st.session_state.messages.append({"role": "assistant", "content": full_response})

    # Context-usage bar under the conversation.
    if st.session_state.messages:
        used = estimate_tokens(st.session_state.messages)
        pct = min(used / MAX_CONTEXT, 1.0)
        label = f"Context: ~{used:,} / {MAX_CONTEXT:,} tokens"
        if pct > 0.8:
            label += " ⚠️ nearing limit — older messages will be trimmed"
        st.progress(pct, text=label)

    col_clear, col_save = st.columns([1, 3])
    with col_clear:
        if st.button("Clear Chat"):
            st.session_state.messages = []
            st.rerun()
    with col_save:
        if selected_file and selected_file != "(no files)":
            if st.button(f"Save code → {selected_file}"):
                # Pull the best-matching fenced code block out of the latest
                # reply and write it to the selected workspace file.
                last = st.session_state.messages[-1]["content"]
                suffix = Path(selected_file).suffix
                lang = LANG_MAP.get(suffix, "")
                code = extract_code(last, lang)
                (WORKSPACE / selected_file).write_text(code)
                st.success(f"Extracted code saved to workspace/{selected_file}")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Tab 2: File Editor
# ---------------------------------------------------------------------------
with tab_editor:
    st.header("File Editor")

    if selected_file and selected_file != "(no files)":
        file_path = WORKSPACE / selected_file
        content = file_path.read_text() if file_path.exists() else ""
        suffix = file_path.suffix
        lang = LANG_MAP.get(suffix, "text")
        runnable = suffix in RUNNABLE_EXTENSIONS

        # Runnable files (.py/.tex) get a side-by-side terminal column.
        if runnable:
            col_edit, col_term = st.columns([3, 2])
        else:
            col_edit = st.container()

        with col_edit:
            st.code(content, language=lang if lang != "text" else None, line_numbers=True)

            # The widget key includes a hash of the content so the editor
            # resets after the file is rewritten externally (e.g. by the LLM).
            edited = st.text_area(
                "Edit below:",
                value=content,
                height=400,
                key=f"editor_{selected_file}_{hash(content)}",
            )

            col_save, col_gen = st.columns(2)

            with col_save:
                if st.button("Save File"):
                    file_path.write_text(edited)
                    st.success(f"Saved {selected_file}")
                    st.rerun()

            with col_gen:
                gen_prompt = st.text_input(
                    "Generation instruction",
                    placeholder="e.g. Add error handling / Fix the LaTeX formatting",
                    key="gen_prompt",
                )
                if st.button("Generate with LLM") and gen_prompt:
                    with st.spinner("Generating..."):
                        # Non-streaming request: the model rewrites the whole
                        # file according to the instruction.
                        response = client.chat.completions.create(
                            model=MODEL,
                            messages=[
                                {"role": "system", "content": (
                                    f"You are a coding assistant. The user has a {lang} file. "
                                    "Return ONLY the raw file content inside a single code block. "
                                    "No explanations, no comments about changes."
                                )},
                                {"role": "user", "content": (
                                    f"Here is my {lang} file:\n\n```\n{edited}\n```\n\n"
                                    f"Instruction: {gen_prompt}"
                                )},
                            ],
                            max_tokens=max_tokens,
                            temperature=temperature,
                            top_p=top_p,
                            extra_body={"chat_template_kwargs": {"enable_thinking": thinking_mode}},
                        )
                    # Strip any markdown fences before writing back to disk.
                    result = response.choices[0].message.content
                    code = extract_code(result, lang)
                    file_path.write_text(code)
                    st.success("File updated by LLM")
                    st.rerun()

        if runnable:
            with col_term:
                run_label = "Compile LaTeX" if suffix == ".tex" else "Run Python"
                st.subheader("Terminal Output")

                if st.button(run_label, type="primary"):
                    # Persist the editor buffer first so the subprocess sees it.
                    file_path.write_text(edited)
                    with st.spinner(f"{'Compiling' if suffix == '.tex' else 'Running'}..."):
                        result = run_file(file_path)
                    # Keep the last run's output across reruns.
                    st.session_state["last_run"] = result

                result = st.session_state.get("last_run")
                if result:
                    if result["rc"] == 0:
                        st.success(f"Exit code: {result['rc']}")
                    else:
                        st.error(f"Exit code: {result['rc']}")

                    if result["stdout"]:
                        st.text_area(
                            "stdout",
                            value=result["stdout"],
                            height=300,
                            disabled=True,
                            key="run_stdout",
                        )
                    if result["stderr"]:
                        st.text_area(
                            "stderr",
                            value=result["stderr"],
                            height=200,
                            disabled=True,
                            key="run_stderr",
                        )
                    if not result["stdout"] and not result["stderr"]:
                        st.info("No output produced.")
                else:
                    st.caption(
                        f"Click **{run_label}** to execute the file "
                        f"(timeout: {RUN_TIMEOUT}s)."
                    )
    else:
        st.info("Create a file in the sidebar to start editing.")
|
||||||
2
AISE501 LLM Zugang/requirements.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
streamlit
|
||||||
|
openai
|
||||||
70
AISE501 LLM Zugang/test_server.py
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
"""
|
||||||
|
Quick test script to verify the vLLM server is running and responding.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
pip install openai
|
||||||
|
python test_server.py [--host HOST] [--port PORT] [--api-key KEY]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run a connectivity check, a chat completion, and a streaming test
    against an OpenAI-compatible (vLLM) inference server.

    Exits with status 1 if the server cannot be reached.
    """
    parser = argparse.ArgumentParser(description="Test vLLM inference server")
    parser.add_argument("--host", default="localhost", help="Server hostname")
    parser.add_argument("--port", default=7080, type=int, help="Server port")
    parser.add_argument("--api-key", default="EMPTY", help="API key")
    # Generalization: the model name used to be hard-coded; the default
    # preserves the previous behavior while allowing other deployments.
    parser.add_argument(
        "--model", default="qwen3.5-35b-a3b", help="Model name to query"
    )
    args = parser.parse_args()

    base_url = f"http://{args.host}:{args.port}/v1"
    model = args.model
    client = OpenAI(base_url=base_url, api_key=args.api_key)

    print(f"Connecting to {base_url} ...")

    print("\n--- Available Models ---")
    try:
        models = client.models.list()
        for m in models.data:
            print(f" {m.id}")
    except Exception as e:
        # Any failure here means the server is unreachable; abort early.
        print(f"ERROR: Cannot connect to server: {e}")
        sys.exit(1)

    print("\n--- Test Chat Completion ---")
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": "Create a latex document that derives and explains the principle component analysis (pca). Make a self contain document with introduction, derivation, examples of applications. This is for computer science undergraduate class."}
        ],
        max_tokens=16384,
        temperature=0.7,
    )
    print(f" Response: {response.choices[0].message.content}")
    print(f" Tokens: prompt={response.usage.prompt_tokens}, "
          f"completion={response.usage.completion_tokens}")

    print("\n--- Test Streaming ---")
    stream = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "user", "content": "Count from 1 to 5."}
        ],
        max_tokens=16384,
        temperature=0.7,
        stream=True,
    )
    print(" Response: ", end="")
    for chunk in stream:
        # Deltas may be None (e.g. role-only chunks); print only content.
        if chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
    print("\n")

    print("All tests passed!")


if __name__ == "__main__":
    main()
|
||||||
@ -0,0 +1,18 @@
|
|||||||
|
\relax
|
||||||
|
\providecommand \babel@aux [2]{\global \let \babel@toc \@gobbletwo }
|
||||||
|
\@nameuse{bbl@beforestart}
|
||||||
|
\providecommand\hyper@newdestlabel[2]{}
|
||||||
|
\providecommand\HyField@AuxAddToFields[1]{}
|
||||||
|
\providecommand\HyField@AuxAddToCoFields[2]{}
|
||||||
|
\babel@aux{english}{}
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {1}Overview}{2}{section.1}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {2}Violation 1: Unused and Poorly Formatted Imports}{2}{section.2}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {3}Violation 2: No Module Docstring or Documentation}{2}{section.3}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {4}Violation 3: Poor Naming Conventions}{3}{section.4}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {5}Violation 4: Formatting and Whitespace}{4}{section.5}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {6}Violation 5: Error Handling}{5}{section.6}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {7}Violation 6: Function Structure and Single Responsibility}{6}{section.7}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {8}Violation 7: Missing \texttt {\_\_main\_\_} Guard}{7}{section.8}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {9}Violation 8: String Concatenation Instead of f-Strings}{7}{section.9}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {10}Summary of Violations}{8}{section.10}\protected@file@percent }
|
||||||
|
\gdef \@abspage@last{8}
|
||||||
@ -0,0 +1,10 @@
|
|||||||
|
\BOOKMARK [1][-]{section.1}{\376\377\000O\000v\000e\000r\000v\000i\000e\000w}{}% 1
|
||||||
|
\BOOKMARK [1][-]{section.2}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0001\000:\000\040\000U\000n\000u\000s\000e\000d\000\040\000a\000n\000d\000\040\000P\000o\000o\000r\000l\000y\000\040\000F\000o\000r\000m\000a\000t\000t\000e\000d\000\040\000I\000m\000p\000o\000r\000t\000s}{}% 2
|
||||||
|
\BOOKMARK [1][-]{section.3}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0002\000:\000\040\000N\000o\000\040\000M\000o\000d\000u\000l\000e\000\040\000D\000o\000c\000s\000t\000r\000i\000n\000g\000\040\000o\000r\000\040\000D\000o\000c\000u\000m\000e\000n\000t\000a\000t\000i\000o\000n}{}% 3
|
||||||
|
\BOOKMARK [1][-]{section.4}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0003\000:\000\040\000P\000o\000o\000r\000\040\000N\000a\000m\000i\000n\000g\000\040\000C\000o\000n\000v\000e\000n\000t\000i\000o\000n\000s}{}% 4
|
||||||
|
\BOOKMARK [1][-]{section.5}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0004\000:\000\040\000F\000o\000r\000m\000a\000t\000t\000i\000n\000g\000\040\000a\000n\000d\000\040\000W\000h\000i\000t\000e\000s\000p\000a\000c\000e}{}% 5
|
||||||
|
\BOOKMARK [1][-]{section.6}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0005\000:\000\040\000E\000r\000r\000o\000r\000\040\000H\000a\000n\000d\000l\000i\000n\000g}{}% 6
|
||||||
|
\BOOKMARK [1][-]{section.7}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0006\000:\000\040\000F\000u\000n\000c\000t\000i\000o\000n\000\040\000S\000t\000r\000u\000c\000t\000u\000r\000e\000\040\000a\000n\000d\000\040\000S\000i\000n\000g\000l\000e\000\040\000R\000e\000s\000p\000o\000n\000s\000i\000b\000i\000l\000i\000t\000y}{}% 7
|
||||||
|
\BOOKMARK [1][-]{section.8}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0007\000:\000\040\000M\000i\000s\000s\000i\000n\000g\000\040\000\137\000\137\000m\000a\000i\000n\000\137\000\137\000\040\000G\000u\000a\000r\000d}{}% 8
|
||||||
|
\BOOKMARK [1][-]{section.9}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0008\000:\000\040\000S\000t\000r\000i\000n\000g\000\040\000C\000o\000n\000c\000a\000t\000e\000n\000a\000t\000i\000o\000n\000\040\000I\000n\000s\000t\000e\000a\000d\000\040\000o\000f\000\040\000f\000-\000S\000t\000r\000i\000n\000g\000s}{}% 9
|
||||||
|
\BOOKMARK [1][-]{section.10}{\376\377\000S\000u\000m\000m\000a\000r\000y\000\040\000o\000f\000\040\000V\000i\000o\000l\000a\000t\000i\000o\000n\000s}{}% 10
|
||||||
BIN
Clean Code exercise/example1_calculator/calculator_analysis.pdf
Normal file
415
Clean Code exercise/example1_calculator/calculator_analysis.tex
Normal file
@ -0,0 +1,415 @@
|
|||||||
|
\documentclass[12pt,a4paper]{article}
|
||||||
|
\usepackage[utf8]{inputenc}
|
||||||
|
\usepackage[T1]{fontenc}
|
||||||
|
\usepackage[english]{babel}
|
||||||
|
\usepackage{geometry}
|
||||||
|
\geometry{margin=2.5cm}
|
||||||
|
\usepackage{xcolor}
|
||||||
|
\usepackage{tcolorbox}
|
||||||
|
\usepackage{booktabs}
|
||||||
|
\usepackage{hyperref}
|
||||||
|
\usepackage{listings}
|
||||||
|
\usepackage{enumitem}
|
||||||
|
|
||||||
|
\definecolor{seblue}{rgb}{0.0,0.28,0.67}
|
||||||
|
\definecolor{segreen}{rgb}{0.13,0.55,0.13}
|
||||||
|
\definecolor{sered}{rgb}{0.7,0.13,0.13}
|
||||||
|
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}
|
||||||
|
\definecolor{codegreen}{rgb}{0,0.6,0}
|
||||||
|
\definecolor{codepurple}{rgb}{0.58,0,0.82}
|
||||||
|
|
||||||
|
\lstdefinestyle{pystyle}{
|
||||||
|
backgroundcolor=\color{backcolour},
|
||||||
|
commentstyle=\color{codegreen},
|
||||||
|
keywordstyle=\color{blue},
|
||||||
|
stringstyle=\color{codepurple},
|
||||||
|
basicstyle=\ttfamily\footnotesize,
|
||||||
|
breaklines=true,
|
||||||
|
keepspaces=true,
|
||||||
|
showstringspaces=false,
|
||||||
|
tabsize=4,
|
||||||
|
language=Python
|
||||||
|
}
|
||||||
|
\lstset{style=pystyle}
|
||||||
|
|
||||||
|
\newtcolorbox{badbox}{
|
||||||
|
colback=red!5!white,
|
||||||
|
colframe=sered,
|
||||||
|
title=Bad Code,
|
||||||
|
fonttitle=\bfseries\small,
|
||||||
|
boxrule=0.8pt, arc=2pt,
|
||||||
|
top=2pt, bottom=2pt, left=4pt, right=4pt
|
||||||
|
}
|
||||||
|
|
||||||
|
\newtcolorbox{goodbox}{
|
||||||
|
colback=green!5!white,
|
||||||
|
colframe=segreen,
|
||||||
|
title=Clean Code,
|
||||||
|
fonttitle=\bfseries\small,
|
||||||
|
boxrule=0.8pt, arc=2pt,
|
||||||
|
top=2pt, bottom=2pt, left=4pt, right=4pt
|
||||||
|
}
|
||||||
|
|
||||||
|
\newtcolorbox{principlebox}[1][]{
|
||||||
|
colback=blue!5!white,
|
||||||
|
colframe=seblue,
|
||||||
|
title=#1,
|
||||||
|
fonttitle=\bfseries\small,
|
||||||
|
boxrule=0.8pt, arc=2pt,
|
||||||
|
top=2pt, bottom=2pt, left=4pt, right=4pt
|
||||||
|
}
|
||||||
|
|
||||||
|
\title{\textcolor{seblue}{Code Analysis: Arithmetic Expression Calculator}\\[0.3em]
|
||||||
|
\large What Makes Code Bad and How to Fix It\\[0.3em]
|
||||||
|
\normalsize AISE501 -- AI in Software Engineering I}
|
||||||
|
\author{Dr.\ Florian Herzog}
|
||||||
|
\date{Spring Semester 2026}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
\maketitle
|
||||||
|
\tableofcontents
|
||||||
|
\newpage
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Overview}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
This document analyses two implementations of the same program --- an arithmetic expression calculator that parses and evaluates strings like \texttt{"3 + 5 * 2"} without using Python's \texttt{eval()}.
|
||||||
|
Both produce correct results, but the first version (\texttt{calculator\_bad.py}) violates numerous PEP\,8 and clean code principles, while the second (\texttt{calculator\_good.py}) follows them consistently.
|
||||||
|
|
||||||
|
The analysis is structured by violation category, with side-by-side comparisons of the bad and good code and references to the specific PEP\,8 rules or clean code principles that apply.
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 1: Unused and Poorly Formatted Imports}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
import sys,os,re;from typing import *
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\textbf{What is wrong:}
|
||||||
|
\begin{itemize}
|
||||||
|
\item \texttt{sys}, \texttt{os}, and \texttt{re} are imported but \textbf{never used} anywhere in the code.
|
||||||
|
\item Multiple imports are crammed onto \textbf{one line separated by commas}, violating PEP\,8's rule that imports should be on separate lines.
|
||||||
|
\item A \textbf{semicolon} joins two import statements on one line.
|
||||||
|
\item \texttt{from typing import *} is a \textbf{wildcard import} that pollutes the namespace.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
The good version has \textbf{no imports at all} --- the calculator uses only built-in Python features.
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{PEP\,8 -- Imports}: ``Imports should usually be on separate lines.'' Wildcard imports (\texttt{from X import *}) should be avoided.
|
||||||
|
\item \textbf{KISS}: Unused imports add unnecessary complexity.
|
||||||
|
\item \textbf{Clean Code}: Dead code (unused imports) confuses readers about dependencies.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 2: No Module Docstring or Documentation}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
# calculator program
|
||||||
|
def scicalc(s):
|
||||||
|
\end{lstlisting}
|
||||||
|
The only ``documentation'' is a single vague comment. No module docstring, no function docstrings.
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
"""Simple arithmetic expression calculator with a recursive-descent parser.
|
||||||
|
|
||||||
|
Supported operations: +, -, *, / and parentheses.
|
||||||
|
Does NOT use Python's eval().
|
||||||
|
|
||||||
|
Grammar:
|
||||||
|
expression = term (('+' | '-') term)*
|
||||||
|
term = factor (('*' | '/') factor)*
|
||||||
|
factor = NUMBER | '(' expression ')'
|
||||||
|
"""
|
||||||
|
\end{lstlisting}
|
||||||
|
The good version opens with a module docstring that explains the purpose, supported operations, and even the formal grammar. Every function also has a docstring.
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{PEP\,257}: All public modules, functions, classes, and methods should have docstrings.
|
||||||
|
\item \textbf{Clean Code -- Documentation}: Good documentation helps current and future developers understand the intent.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 3: Poor Naming Conventions}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
def scicalc(s): # What does "scicalc" mean?
|
||||||
|
def doPlusMinus(s,a,b):# camelCase, not snake_case
|
||||||
|
def doMulDiv(s,a,b): # "do" is vague
|
||||||
|
def getNum(s, a,b): # inconsistent spacing
|
||||||
|
t=s[a:b] # "t" for what?
|
||||||
|
c=t[i] # "c" for what?
|
||||||
|
L=doPlusMinus(...) # uppercase "L" for a local variable
|
||||||
|
R=doMulDiv(...) # uppercase "R" for a local variable
|
||||||
|
r=doPlusMinus(...) # "r" for result?
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
def tokenize(expression_text):
|
||||||
|
def parse_expression(tokens, position):
|
||||||
|
def parse_term(tokens, position):
|
||||||
|
def parse_factor(tokens, position):
|
||||||
|
def calculate(expression_text):
|
||||||
|
character = expression_text[position]
|
||||||
|
operator = tokens[position]
|
||||||
|
right_value, position = parse_term(tokens, position)
|
||||||
|
result, final_position = parse_expression(tokens, 0)
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\textbf{What is wrong in the bad version:}
|
||||||
|
\begin{itemize}
|
||||||
|
\item Function names use \textbf{camelCase} (\texttt{doPlusMinus}) instead of \textbf{snake\_case}.
|
||||||
|
\item Variable names are \textbf{single letters} (\texttt{s}, \texttt{a}, \texttt{b}, \texttt{t}, \texttt{c}, \texttt{r}) --- impossible to understand without reading every line.
|
||||||
|
\item \texttt{L} and \texttt{R} use \textbf{uppercase} for local variables, which PEP\,8 reserves for constants.
|
||||||
|
\item Names like \texttt{scicalc} are \textbf{abbreviations} that are not pronounceable or self-explanatory.
|
||||||
|
\item The list of test data is called \texttt{Data} (capitalised like a class) and results \texttt{Res}.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{PEP\,8 -- Naming}: Functions and variables use \texttt{lower\_case\_with\_underscores}. Constants use \texttt{UPPER\_CASE}.
|
||||||
|
\item \textbf{Clean Code -- Descriptive Names}: Names should reveal intent. A reader should know what a variable holds without tracing its assignment.
|
||||||
|
\item \textbf{Clean Code -- Pronounceable Names}: \texttt{scicalc} is not a word anyone would say in a conversation.
|
||||||
|
\item \textbf{Clean Code -- No Abbreviations}: \texttt{doPlusMinus} is better than \texttt{dPM}, but \texttt{parse\_expression} communicates the actual operation.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 4: Formatting and Whitespace}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
def scicalc(s):
|
||||||
|
s=s.replace(' ','') # 2-space indent
|
||||||
|
if s=='':return 0 # no spaces around ==
|
||||||
|
r=doPlusMinus(s,0,len(s))
|
||||||
|
return r
|
||||||
|
|
||||||
|
def doPlusMinus(s,a,b):
|
||||||
|
t=s[a:b]; level=0; i=len(t)-1 # 4-space indent, semicolons
|
||||||
|
while i>=0: # no space around >=
|
||||||
|
if level==0 and(c=='*' or c=='/'): # missing space before (
|
||||||
|
L = doMulDiv(s,a,a+i); R = getNum(s,a+i+1,b)
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
def parse_expression(tokens, position):
|
||||||
|
result, position = parse_term(tokens, position)
|
||||||
|
|
||||||
|
while position < len(tokens) and tokens[position] in ("+", "-"):
|
||||||
|
operator = tokens[position]
|
||||||
|
position += 1
|
||||||
|
right_value, position = parse_term(tokens, position)
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\textbf{What is wrong:}
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Inconsistent indentation}: \texttt{scicalc} uses 2 spaces, other functions use 4 spaces. PEP\,8 requires 4 spaces consistently.
|
||||||
|
\item \textbf{Semicolons} to put multiple statements on one line (\texttt{t=s[a:b]; level=0; i=len(t)-1}).
|
||||||
|
\item \textbf{Missing whitespace} around operators: \texttt{s=s.replace}, \texttt{i>=0}, \texttt{level==0 and(c==...}.
|
||||||
|
\item \textbf{No blank lines} between logical sections within functions or between function definitions. PEP\,8 requires two blank lines before and after top-level functions.
|
||||||
|
\item Multiple \texttt{return} or assignment statements \textbf{on the same line} as \texttt{if}: \texttt{if s=='':return 0}.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{PEP\,8 -- Indentation}: Use 4 spaces per indentation level.
|
||||||
|
\item \textbf{PEP\,8 -- Whitespace}: Surround binary operators with single spaces. Avoid compound statements on one line.
|
||||||
|
\item \textbf{PEP\,8 -- Blank Lines}: Two blank lines around top-level definitions.
|
||||||
|
\item \textbf{Zen of Python}: ``Sparse is better than dense.''
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 5: Error Handling}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
if R==0:print("ERROR division by zero!!!") ;return 0
|
||||||
|
\end{lstlisting}
|
||||||
|
\begin{lstlisting}
|
||||||
|
try:
|
||||||
|
x = float(t)
|
||||||
|
except:
|
||||||
|
print("bad number: "+t);x=0
|
||||||
|
return x
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
if right_value == 0:
|
||||||
|
raise ZeroDivisionError("Division by zero")
|
||||||
|
\end{lstlisting}
|
||||||
|
\begin{lstlisting}
|
||||||
|
try:
|
||||||
|
tokens = tokenize(expression_text)
|
||||||
|
result, final_position = parse_expression(tokens, 0)
|
||||||
|
...
|
||||||
|
except (ValueError, ZeroDivisionError) as error:
|
||||||
|
return f"Error: {error}"
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\textbf{What is wrong in the bad version:}
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Bare \texttt{except}} catches every exception including \texttt{KeyboardInterrupt} and \texttt{SystemExit} --- masking real bugs.
|
||||||
|
\item Errors are handled by \textbf{printing and returning a dummy value} (0), which silently produces wrong results. The caller has no way to know an error occurred.
|
||||||
|
\item The error message style is inconsistent: \texttt{"ERROR division by zero!!!"} vs.\ \texttt{"bad number: ..."}.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\textbf{What the good version does:}
|
||||||
|
\begin{itemize}
|
||||||
|
\item Errors \textbf{raise specific exceptions} (\texttt{ValueError}, \texttt{ZeroDivisionError}) at the point of detection.
|
||||||
|
\item The top-level \texttt{calculate()} function catches \textbf{only expected exceptions} and returns a formatted error string.
|
||||||
|
\item Errors \textbf{propagate} rather than being silently swallowed.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{PEP\,8 -- Exceptions}: Catch specific exceptions, never use bare \texttt{except}.
|
||||||
|
\item \textbf{Zen of Python}: ``Errors should never pass silently. Unless explicitly silenced.''
|
||||||
|
\item \textbf{Clean Code -- Error Handling}: Anticipate errors and handle them gracefully. Returning magic values (0 for an error) is an anti-pattern.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 6: Function Structure and Single Responsibility}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
The bad version has three intertwined functions (\texttt{doPlusMinus}, \texttt{doMulDiv}, \texttt{getNum}) that each take the \textbf{entire string plus two index parameters} and internally slice the string. Parsing, tokenisation, and evaluation are all mixed together.
|
||||||
|
\begin{lstlisting}
|
||||||
|
def doPlusMinus(s,a,b):
|
||||||
|
t=s[a:b]; level=0; i=len(t)-1
|
||||||
|
while i>=0:
|
||||||
|
...
|
||||||
|
L=doPlusMinus(s,a,a+i);R=doMulDiv(s,a+i+1,b)
|
||||||
|
...
|
||||||
|
return doMulDiv(s,a,b)
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
The good version separates \textbf{tokenisation} from \textbf{parsing}:
|
||||||
|
\begin{lstlisting}
|
||||||
|
tokens = tokenize(expression_text) # Step 1: tokenise
|
||||||
|
result, position = parse_expression(tokens, 0) # Step 2: parse
|
||||||
|
\end{lstlisting}
|
||||||
|
Each parser function has a single, clear responsibility:
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \texttt{tokenize()} -- converts text to tokens
|
||||||
|
\item \texttt{parse\_expression()} -- handles \texttt{+} and \texttt{-}
|
||||||
|
\item \texttt{parse\_term()} -- handles \texttt{*} and \texttt{/}
|
||||||
|
\item \texttt{parse\_factor()} -- handles numbers and parentheses
|
||||||
|
\item \texttt{calculate()} -- orchestrates the pipeline and error handling
|
||||||
|
\end{itemize}
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{SRP (Single Responsibility Principle)}: Each function should do one thing.
|
||||||
|
\item \textbf{SoC (Separation of Concerns)}: Tokenisation and parsing are different concerns.
|
||||||
|
\item \textbf{Clean Code -- Short Functions}: If a function takes more than a few minutes to comprehend, it should be refactored.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 7: Missing \texttt{\_\_main\_\_} Guard}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
main()
|
||||||
|
\end{lstlisting}
|
||||||
|
The bad version calls \texttt{main()} at the module level. If another script imports this file, the calculator runs immediately as a side effect.
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
\end{lstlisting}
|
||||||
|
The good version uses the standard \texttt{\_\_main\_\_} guard, so the module can be safely imported without executing the calculator.
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{Clean Code -- Avoid Side Effects}: Importing a module should not trigger execution.
|
||||||
|
\item \textbf{Python Best Practice}: The \texttt{if \_\_name\_\_ == "\_\_main\_\_"} guard is standard for all runnable scripts.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 8: String Concatenation Instead of f-Strings}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
print(d+" = "+str(Res))
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
print(f"{display_expr} = {result}")
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
String concatenation with \texttt{+} and manual \texttt{str()} calls is harder to read than f-strings, which are the idiomatic Python 3.6+ way to format output.
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{Pythonic Code}: Use f-strings for string formatting (readable, efficient).
|
||||||
|
\item \textbf{Clean Code -- Readability}: f-strings make the output format immediately visible.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Summary of Violations}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\small
|
||||||
|
\begin{tabular}{@{}rp{5cm}p{5.5cm}@{}}
|
||||||
|
\toprule
|
||||||
|
\textbf{\#} & \textbf{Violation} & \textbf{Principle / PEP\,8 Rule} \\
|
||||||
|
\midrule
|
||||||
|
1 & Unused imports, wildcard import, one-line imports & PEP\,8 Imports, KISS \\
|
||||||
|
2 & No docstrings or documentation & PEP\,257, Clean Code Documentation \\
|
||||||
|
3 & camelCase names, single-letter variables, abbreviations & PEP\,8 Naming, Descriptive Names \\
|
||||||
|
4 & Inconsistent indent, semicolons, missing whitespace & PEP\,8 Indentation \& Whitespace \\
|
||||||
|
5 & Bare except, silent error swallowing & PEP\,8 Exceptions, Zen of Python \\
|
||||||
|
6 & Mixed concerns, long tangled functions & SRP, SoC, Short Functions \\
|
||||||
|
7 & No \texttt{\_\_main\_\_} guard & Avoid Side Effects \\
|
||||||
|
8 & String concatenation instead of f-strings & Pythonic Code, Readability \\
|
||||||
|
\bottomrule
|
||||||
|
\end{tabular}
|
||||||
|
\end{center}
|
||||||
|
|
||||||
|
\end{document}
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
\babel@toc {english}{}\relax
|
||||||
|
\contentsline {section}{\numberline {1}Overview}{2}{section.1}%
|
||||||
|
\contentsline {section}{\numberline {2}Violation 1: Unused and Poorly Formatted Imports}{2}{section.2}%
|
||||||
|
\contentsline {section}{\numberline {3}Violation 2: No Module Docstring or Documentation}{2}{section.3}%
|
||||||
|
\contentsline {section}{\numberline {4}Violation 3: Poor Naming Conventions}{3}{section.4}%
|
||||||
|
\contentsline {section}{\numberline {5}Violation 4: Formatting and Whitespace}{4}{section.5}%
|
||||||
|
\contentsline {section}{\numberline {6}Violation 5: Error Handling}{5}{section.6}%
|
||||||
|
\contentsline {section}{\numberline {7}Violation 6: Function Structure and Single Responsibility}{6}{section.7}%
|
||||||
|
\contentsline {section}{\numberline {8}Violation 7: Missing \texttt {\_\_main\_\_} Guard}{7}{section.8}%
|
||||||
|
\contentsline {section}{\numberline {9}Violation 8: String Concatenation Instead of f-Strings}{7}{section.9}%
|
||||||
|
\contentsline {section}{\numberline {10}Summary of Violations}{8}{section.10}%
|
||||||
64
Clean Code exercise/example1_calculator/calculator_bad.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
import sys,os,re;from typing import *
|
||||||
|
|
||||||
|
# calculator program
|
||||||
|
def scicalc(s):
  # Evaluate an arithmetic expression string without eval().
  # NOTE(review): this file is the deliberately *bad* example for the
  # clean-code exercise (see calculator_analysis.tex) -- the 2-space
  # indent, missing docstrings and one-letter names are intentional
  # teaching material; do not "fix" them.
  s=s.replace(' ','')
  if s=='':return 0
  r=doPlusMinus(s,0,len(s))
  return r
|
||||||
|
|
||||||
|
def doPlusMinus(s,a,b):
    # Scan s[a:b] right-to-left for a top-level '+' or '-' (level tracks
    # parenthesis nesting) and recurse on both sides; if none is found,
    # the slice is a pure product/quotient and is delegated to doMulDiv.
    # NOTE(review): intentionally bad style (semicolons, one-letter
    # names, uppercase locals) kept for the clean-code exercise.
    t=s[a:b]; level=0; i=len(t)-1
    while i>=0:
        c=t[i]
        if c==')':level=level+1
        if c=='(':level=level-1
        if level==0 and (c=='+' or c=='-'):
            L=doPlusMinus(s,a,a+i);R=doMulDiv(s,a+i+1,b)
            if c=='+': return L+R
            else: return L-R
        i=i-1
    return doMulDiv(s,a,b)
|
||||||
|
|
||||||
|
def doMulDiv(s,a,b):
    # Same right-to-left top-level scan as doPlusMinus, but for '*' and
    # '/'; falls through to getNum when no operator is found.
    # NOTE(review): printing and returning 0 on division by zero is a
    # deliberate error-handling anti-pattern for the clean-code exercise.
    t=s[a:b];level=0;i=len(t)-1
    while i >= 0:
        c=t[i]
        if c==')':level+=1
        if c=='(':level-=1
        if level==0 and(c=='*' or c=='/'):
            L = doMulDiv(s,a,a+i); R = getNum(s,a+i+1,b)
            if c=='*':return L*R
            else:
                if R==0:print("ERROR division by zero!!!") ;return 0
                return L/R
        i -= 1
    return getNum(s,a,b)
|
||||||
|
|
||||||
|
def getNum(s, a,b):
    # Convert the slice s[a:b] to a float; a fully parenthesised slice
    # is unwrapped and re-parsed as an expression.
    # NOTE(review): the bare except and the dummy 0 return are
    # intentional anti-patterns kept for the clean-code exercise.
    t = s[a:b]
    if t[0]=='(' and t[-1]==')':
        return doPlusMinus(s,a+1,b-1)
    try:
        x = float(t)
    except:
        print("bad number: "+t);x=0
    return x
|
||||||
|
|
||||||
|
def main():
    # Run the calculator over a fixed set of demo expressions,
    # including edge cases ("", division by zero, invalid input).
    Data = [
        "3 + 5",
        "10 - 2 * 3",
        "( 4 + 6 ) * 2",
        "100 / ( 5 * 2 )",
        "3.5 + 2.5 * 4",
        "( 1 + 2 ) * ( 3 + 4 )",
        "",
        "10 / 0",
        "abc + 1",
    ]
    for d in Data:
        Res=scicalc(d)
        print(d+" = "+str(Res))


# NOTE(review): module-level call without an `if __name__ == "__main__"`
# guard is an intentional violation demonstrated in the exercise.
main()
|
||||||
153
Clean Code exercise/example1_calculator/calculator_good.py
Normal file
@ -0,0 +1,153 @@
|
|||||||
|
"""Simple arithmetic expression calculator with a recursive-descent parser.
|
||||||
|
|
||||||
|
Supported operations: +, -, *, / and parentheses.
|
||||||
|
Does NOT use Python's eval().
|
||||||
|
|
||||||
|
Grammar:
|
||||||
|
expression = term (('+' | '-') term)*
|
||||||
|
term = factor (('*' | '/') factor)*
|
||||||
|
factor = NUMBER | '(' expression ')'
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def tokenize(expression_text):
    """Split *expression_text* into a list of tokens.

    Numbers become ``float`` values; operators and parentheses stay as
    one-character strings.  Whitespace is skipped.

    Raises ValueError for any character outside the expression grammar.
    """
    tokens = []
    index = 0
    length = len(expression_text)

    while index < length:
        ch = expression_text[index]

        if ch.isspace():
            # Whitespace carries no meaning; just advance.
            index += 1
        elif ch in "+-*/()":
            tokens.append(ch)
            index += 1
        elif ch.isdigit() or ch == ".":
            # Consume the maximal run of digits/dots as one number.
            end = index
            while end < length and (
                expression_text[end].isdigit()
                or expression_text[end] == "."
            ):
                end += 1
            tokens.append(float(expression_text[index:end]))
            index = end
        else:
            raise ValueError(
                f"Unexpected character '{ch}' at position {index}"
            )

    return tokens
|
||||||
|
|
||||||
|
|
||||||
|
def parse_expression(tokens, position):
    """Parse an expression: term (('+' | '-') term)*."""
    value, position = parse_term(tokens, position)

    # Fold subsequent +/- terms left-to-right (left associativity).
    while position < len(tokens):
        operator = tokens[position]
        if operator not in ("+", "-"):
            break
        right, position = parse_term(tokens, position + 1)
        value = value + right if operator == "+" else value - right

    return value, position
|
||||||
|
|
||||||
|
|
||||||
|
def parse_term(tokens, position):
    """Parse a term: factor (('*' | '/') factor)*."""
    value, position = parse_factor(tokens, position)

    # Fold subsequent * and / factors left-to-right (left associativity).
    while position < len(tokens) and tokens[position] in ("*", "/"):
        operator = tokens[position]
        right, position = parse_factor(tokens, position + 1)
        if operator == "*":
            value = value * right
        else:
            # Raise explicitly so the message is uniform for the caller.
            if right == 0:
                raise ZeroDivisionError("Division by zero")
            value = value / right

    return value, position
|
||||||
|
|
||||||
|
|
||||||
|
def parse_factor(tokens, position):
    """Parse a factor: NUMBER | '(' expression ')'."""
    if position >= len(tokens):
        raise ValueError("Unexpected end of expression")

    token = tokens[position]

    # Numbers were converted to float during tokenization.
    if isinstance(token, float):
        return token, position + 1

    if token == "(":
        inner_value, position = parse_expression(tokens, position + 1)
        if position >= len(tokens) or tokens[position] != ")":
            raise ValueError("Missing closing parenthesis")
        return inner_value, position + 1

    raise ValueError(f"Unexpected token: {token}")
|
||||||
|
|
||||||
|
|
||||||
|
def calculate(expression_text):
    """Evaluate an arithmetic expression string and return the result.

    Returns an int when the result is a whole number, otherwise a float
    rounded to 10 decimal places, or an error message string on failure.
    """
    if not expression_text.strip():
        return "Error: empty expression"

    try:
        tokens = tokenize(expression_text)
        result, final_position = parse_expression(tokens, 0)

        if final_position != len(tokens):
            return f"Error: unexpected token '{tokens[final_position]}'"

        # float.is_integer() is safe for inf/nan, unlike int(result),
        # which raises an uncaught OverflowError when a huge numeric
        # literal overflows to float infinity.
        if result.is_integer():
            return int(result)
        return round(result, 10)

    except (ValueError, ZeroDivisionError) as error:
        return f"Error: {error}"
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the calculator on a set of test expressions."""
    test_expressions = [
        "3 + 5",
        "10 - 2 * 3",
        "(4 + 6) * 2",
        "100 / (5 * 2)",
        "3.5 + 2.5 * 4",
        "(1 + 2) * (3 + 4)",
        "",
        "10 / 0",
        "abc + 1",
    ]

    for expression in test_expressions:
        # The only falsy expression is "", which we label for readability.
        label = expression or "(empty)"
        print(f"{label} = {calculate(expression)}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
@ -0,0 +1,11 @@
|
|||||||
|
\relax
|
||||||
|
\providecommand \babel@aux [2]{\global \let \babel@toc \@gobbletwo }
|
||||||
|
\@nameuse{bbl@beforestart}
|
||||||
|
\providecommand\hyper@newdestlabel[2]{}
|
||||||
|
\providecommand\HyField@AuxAddToFields[1]{}
|
||||||
|
\providecommand\HyField@AuxAddToCoFields[2]{}
|
||||||
|
\babel@aux{english}{}
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {1}Use Case}{1}{section.1}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {2}Example Input / Output}{1}{section.2}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {3}Exercise}{1}{section.3}\protected@file@percent }
|
||||||
|
\gdef \@abspage@last{2}
|
||||||
@ -0,0 +1,3 @@
|
|||||||
|
\BOOKMARK [1][-]{section.1}{\376\377\000U\000s\000e\000\040\000C\000a\000s\000e}{}% 1
|
||||||
|
\BOOKMARK [1][-]{section.2}{\376\377\000E\000x\000a\000m\000p\000l\000e\000\040\000I\000n\000p\000u\000t\000\040\000/\000\040\000O\000u\000t\000p\000u\000t}{}% 2
|
||||||
|
\BOOKMARK [1][-]{section.3}{\376\377\000E\000x\000e\000r\000c\000i\000s\000e}{}% 3
|
||||||
BIN
Clean Code exercise/example1_calculator/calculator_usecase.pdf
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
\documentclass[12pt,a4paper]{article}
|
||||||
|
\usepackage[utf8]{inputenc}
|
||||||
|
\usepackage[T1]{fontenc}
|
||||||
|
\usepackage[english]{babel}
|
||||||
|
\usepackage{geometry}
|
||||||
|
\geometry{margin=2.5cm}
|
||||||
|
\usepackage{xcolor}
|
||||||
|
\usepackage{tcolorbox}
|
||||||
|
\usepackage{booktabs}
|
||||||
|
\usepackage{hyperref}
|
||||||
|
|
||||||
|
\definecolor{seblue}{rgb}{0.0,0.28,0.67}
|
||||||
|
|
||||||
|
\title{\textcolor{seblue}{Exercise 1: Arithmetic Expression Calculator}\\[0.3em]
|
||||||
|
\large AISE501 -- AI in Software Engineering I}
|
||||||
|
\author{Dr.\ Florian Herzog}
|
||||||
|
\date{Spring Semester 2026}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
\maketitle
|
||||||
|
|
||||||
|
\section{Use Case}
|
||||||
|
|
||||||
|
A user enters an arithmetic expression as a text string, for example \texttt{"3 + 5 * 2"}.
|
||||||
|
The program evaluates the expression and prints the result.
|
||||||
|
|
||||||
|
The calculator must:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Support the four basic operations: \texttt{+}, \texttt{-}, \texttt{*}, \texttt{/}
|
||||||
|
\item Respect standard operator precedence (\texttt{*} and \texttt{/} bind more tightly than \texttt{+} and \texttt{-})
|
||||||
|
\item Support parentheses for grouping, e.g.\ \texttt{"(4 + 6) * 2"}
|
||||||
|
\item Support decimal numbers, e.g.\ \texttt{"3.5 + 2.5"}
|
||||||
|
\item Handle errors gracefully (division by zero, invalid characters, empty input)
|
||||||
|
\item \textbf{Not} use Python's built-in \texttt{eval()} function
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\section{Example Input / Output}
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\begin{tabular}{ll}
|
||||||
|
\toprule
|
||||||
|
\textbf{Input Expression} & \textbf{Expected Output} \\
|
||||||
|
\midrule
|
||||||
|
\texttt{3 + 5} & \texttt{8} \\
|
||||||
|
\texttt{10 - 2 * 3} & \texttt{4} \\
|
||||||
|
\texttt{(4 + 6) * 2} & \texttt{20} \\
|
||||||
|
\texttt{100 / (5 * 2)} & \texttt{10} \\
|
||||||
|
\texttt{3.5 + 2.5 * 4} & \texttt{13.5} \\
|
||||||
|
\texttt{(1 + 2) * (3 + 4)} & \texttt{21} \\
|
||||||
|
\texttt{(empty)} & Error message \\
|
||||||
|
\texttt{10 / 0} & Error message \\
|
||||||
|
\texttt{abc + 1} & Error message \\
|
||||||
|
\bottomrule
|
||||||
|
\end{tabular}
|
||||||
|
\end{center}
|
||||||
|
|
||||||
|
\section{Exercise}
|
||||||
|
|
||||||
|
Two implementations are provided:
|
||||||
|
|
||||||
|
\begin{enumerate}
|
||||||
|
\item \textbf{\texttt{calculator\_bad.py}} -- A working but poorly written version that violates many clean code and PEP\,8 principles.
|
||||||
|
\item \textbf{\texttt{calculator\_good.py}} -- A clean, well-structured version following PEP\,8 and clean code best practices.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\subsection*{Tasks}
|
||||||
|
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Run both programs and verify they produce the same results.
|
||||||
|
\item Read the bad version and list all clean code / PEP\,8 violations you can find.
|
||||||
|
\item For each violation, explain which principle is broken and why it makes the code harder to read or maintain.
|
||||||
|
\item Compare your list with the good version to see how each issue was resolved.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\subsection*{Violations to Look For}
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\item Unused imports
|
||||||
|
\item Missing or misleading comments and docstrings
|
||||||
|
\item Poor variable and function names (abbreviations, single letters)
|
||||||
|
\item Inconsistent indentation and spacing
|
||||||
|
\item Multiple statements on one line (semicolons)
|
||||||
|
\item Missing whitespace around operators
|
||||||
|
\item No proper error handling (bare \texttt{except}, printing instead of raising)
|
||||||
|
\item Magic numbers and unclear logic flow
|
||||||
|
\item Missing \texttt{if \_\_name\_\_ == "\_\_main\_\_"} guard
|
||||||
|
\item No type clarity in function signatures
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\end{document}
|
||||||
25
Clean Code exercise/example2_bank/accounts.json
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
{
|
||||||
|
"accounts": [
|
||||||
|
{
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"holder": "Alice Mueller",
|
||||||
|
"balance": 5000.00,
|
||||||
|
"currency": "CHF",
|
||||||
|
"status": "active"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"account_id": "ACC-002",
|
||||||
|
"holder": "Bob Schneider",
|
||||||
|
"balance": 1200.50,
|
||||||
|
"currency": "CHF",
|
||||||
|
"status": "active"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"account_id": "ACC-003",
|
||||||
|
"holder": "Clara Brunner",
|
||||||
|
"balance": 300.00,
|
||||||
|
"currency": "CHF",
|
||||||
|
"status": "frozen"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
25
Clean Code exercise/example2_bank/accounts_updated_bad.json
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
{
|
||||||
|
"accounts": [
|
||||||
|
{
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"holder": "Alice Mueller",
|
||||||
|
"balance": 4550.0,
|
||||||
|
"currency": "CHF",
|
||||||
|
"status": "active"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"account_id": "ACC-002",
|
||||||
|
"holder": "Bob Schneider",
|
||||||
|
"balance": 1950.5,
|
||||||
|
"currency": "CHF",
|
||||||
|
"status": "active"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"account_id": "ACC-003",
|
||||||
|
"holder": "Clara Brunner",
|
||||||
|
"balance": 300.0,
|
||||||
|
"currency": "CHF",
|
||||||
|
"status": "frozen"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
25
Clean Code exercise/example2_bank/accounts_updated_good.json
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
{
|
||||||
|
"accounts": [
|
||||||
|
{
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"holder": "Alice Mueller",
|
||||||
|
"balance": 4550.0,
|
||||||
|
"currency": "CHF",
|
||||||
|
"status": "active"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"account_id": "ACC-002",
|
||||||
|
"holder": "Bob Schneider",
|
||||||
|
"balance": 1950.5,
|
||||||
|
"currency": "CHF",
|
||||||
|
"status": "active"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"account_id": "ACC-003",
|
||||||
|
"holder": "Clara Brunner",
|
||||||
|
"balance": 300.0,
|
||||||
|
"currency": "CHF",
|
||||||
|
"status": "frozen"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
20
Clean Code exercise/example2_bank/bank_analysis.aux
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
\relax
|
||||||
|
\providecommand \babel@aux [2]{\global \let \babel@toc \@gobbletwo }
|
||||||
|
\@nameuse{bbl@beforestart}
|
||||||
|
\providecommand\hyper@newdestlabel[2]{}
|
||||||
|
\providecommand\HyField@AuxAddToFields[1]{}
|
||||||
|
\providecommand\HyField@AuxAddToCoFields[2]{}
|
||||||
|
\babel@aux{english}{}
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {1}Overview}{2}{section.1}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {2}Violation 1: Unused Imports and Import Formatting}{2}{section.2}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {3}Violation 2: No Documentation or Docstrings}{2}{section.3}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {4}Violation 3: Implicit Data Model}{3}{section.4}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {5}Violation 4: Poor Naming}{4}{section.5}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {6}Violation 5: Formatting -- Semicolons and Dense Lines}{5}{section.6}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {7}Violation 6: No Context Managers for File I/O}{6}{section.7}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {8}Violation 7: God Function -- Single Responsibility Violation}{7}{section.8}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {9}Violation 8: Magic Strings Instead of Constants}{8}{section.9}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {10}Violation 9: Comparison with \texttt {None}}{8}{section.10}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {11}Violation 10: Missing \texttt {\_\_main\_\_} Guard and String Formatting}{9}{section.11}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {12}Summary of Violations}{10}{section.12}\protected@file@percent }
|
||||||
|
\gdef \@abspage@last{10}
|
||||||
12
Clean Code exercise/example2_bank/bank_analysis.out
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
\BOOKMARK [1][-]{section.1}{\376\377\000O\000v\000e\000r\000v\000i\000e\000w}{}% 1
|
||||||
|
\BOOKMARK [1][-]{section.2}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0001\000:\000\040\000U\000n\000u\000s\000e\000d\000\040\000I\000m\000p\000o\000r\000t\000s\000\040\000a\000n\000d\000\040\000I\000m\000p\000o\000r\000t\000\040\000F\000o\000r\000m\000a\000t\000t\000i\000n\000g}{}% 2
|
||||||
|
\BOOKMARK [1][-]{section.3}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0002\000:\000\040\000N\000o\000\040\000D\000o\000c\000u\000m\000e\000n\000t\000a\000t\000i\000o\000n\000\040\000o\000r\000\040\000D\000o\000c\000s\000t\000r\000i\000n\000g\000s}{}% 3
|
||||||
|
\BOOKMARK [1][-]{section.4}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0003\000:\000\040\000I\000m\000p\000l\000i\000c\000i\000t\000\040\000D\000a\000t\000a\000\040\000M\000o\000d\000e\000l}{}% 4
|
||||||
|
\BOOKMARK [1][-]{section.5}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0004\000:\000\040\000P\000o\000o\000r\000\040\000N\000a\000m\000i\000n\000g}{}% 5
|
||||||
|
\BOOKMARK [1][-]{section.6}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0005\000:\000\040\000F\000o\000r\000m\000a\000t\000t\000i\000n\000g\000\040\040\023\000\040\000S\000e\000m\000i\000c\000o\000l\000o\000n\000s\000\040\000a\000n\000d\000\040\000D\000e\000n\000s\000e\000\040\000L\000i\000n\000e\000s}{}% 6
|
||||||
|
\BOOKMARK [1][-]{section.7}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0006\000:\000\040\000N\000o\000\040\000C\000o\000n\000t\000e\000x\000t\000\040\000M\000a\000n\000a\000g\000e\000r\000s\000\040\000f\000o\000r\000\040\000F\000i\000l\000e\000\040\000I\000/\000O}{}% 7
|
||||||
|
\BOOKMARK [1][-]{section.8}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0007\000:\000\040\000G\000o\000d\000\040\000F\000u\000n\000c\000t\000i\000o\000n\000\040\040\023\000\040\000S\000i\000n\000g\000l\000e\000\040\000R\000e\000s\000p\000o\000n\000s\000i\000b\000i\000l\000i\000t\000y\000\040\000V\000i\000o\000l\000a\000t\000i\000o\000n}{}% 8
|
||||||
|
\BOOKMARK [1][-]{section.9}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0008\000:\000\040\000M\000a\000g\000i\000c\000\040\000S\000t\000r\000i\000n\000g\000s\000\040\000I\000n\000s\000t\000e\000a\000d\000\040\000o\000f\000\040\000C\000o\000n\000s\000t\000a\000n\000t\000s}{}% 9
|
||||||
|
\BOOKMARK [1][-]{section.10}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0009\000:\000\040\000C\000o\000m\000p\000a\000r\000i\000s\000o\000n\000\040\000w\000i\000t\000h\000\040\000N\000o\000n\000e}{}% 10
|
||||||
|
\BOOKMARK [1][-]{section.11}{\376\377\000V\000i\000o\000l\000a\000t\000i\000o\000n\000\040\0001\0000\000:\000\040\000M\000i\000s\000s\000i\000n\000g\000\040\000\137\000\137\000m\000a\000i\000n\000\137\000\137\000\040\000G\000u\000a\000r\000d\000\040\000a\000n\000d\000\040\000S\000t\000r\000i\000n\000g\000\040\000F\000o\000r\000m\000a\000t\000t\000i\000n\000g}{}% 11
|
||||||
|
\BOOKMARK [1][-]{section.12}{\376\377\000S\000u\000m\000m\000a\000r\000y\000\040\000o\000f\000\040\000V\000i\000o\000l\000a\000t\000i\000o\000n\000s}{}% 12
|
||||||
BIN
Clean Code exercise/example2_bank/bank_analysis.pdf
Normal file
526
Clean Code exercise/example2_bank/bank_analysis.tex
Normal file
@ -0,0 +1,526 @@
|
|||||||
|
\documentclass[12pt,a4paper]{article}
|
||||||
|
\usepackage[utf8]{inputenc}
|
||||||
|
\usepackage[T1]{fontenc}
|
||||||
|
\usepackage[english]{babel}
|
||||||
|
\usepackage{geometry}
|
||||||
|
\geometry{margin=2.5cm}
|
||||||
|
\usepackage{xcolor}
|
||||||
|
\usepackage{tcolorbox}
|
||||||
|
\usepackage{booktabs}
|
||||||
|
\usepackage{hyperref}
|
||||||
|
\usepackage{listings}
|
||||||
|
\usepackage{enumitem}
|
||||||
|
|
||||||
|
\definecolor{seblue}{rgb}{0.0,0.28,0.67}
|
||||||
|
\definecolor{segreen}{rgb}{0.13,0.55,0.13}
|
||||||
|
\definecolor{sered}{rgb}{0.7,0.13,0.13}
|
||||||
|
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}
|
||||||
|
\definecolor{codegreen}{rgb}{0,0.6,0}
|
||||||
|
\definecolor{codepurple}{rgb}{0.58,0,0.82}
|
||||||
|
|
||||||
|
\lstdefinestyle{pystyle}{
|
||||||
|
backgroundcolor=\color{backcolour},
|
||||||
|
commentstyle=\color{codegreen},
|
||||||
|
keywordstyle=\color{blue},
|
||||||
|
stringstyle=\color{codepurple},
|
||||||
|
basicstyle=\ttfamily\footnotesize,
|
||||||
|
breaklines=true,
|
||||||
|
keepspaces=true,
|
||||||
|
showstringspaces=false,
|
||||||
|
tabsize=4,
|
||||||
|
language=Python
|
||||||
|
}
|
||||||
|
\lstset{style=pystyle}
|
||||||
|
|
||||||
|
\newtcolorbox{badbox}{
|
||||||
|
colback=red!5!white,
|
||||||
|
colframe=sered,
|
||||||
|
title=Bad Code,
|
||||||
|
fonttitle=\bfseries\small,
|
||||||
|
boxrule=0.8pt, arc=2pt,
|
||||||
|
top=2pt, bottom=2pt, left=4pt, right=4pt
|
||||||
|
}
|
||||||
|
|
||||||
|
\newtcolorbox{goodbox}{
|
||||||
|
colback=green!5!white,
|
||||||
|
colframe=segreen,
|
||||||
|
title=Clean Code,
|
||||||
|
fonttitle=\bfseries\small,
|
||||||
|
boxrule=0.8pt, arc=2pt,
|
||||||
|
top=2pt, bottom=2pt, left=4pt, right=4pt
|
||||||
|
}
|
||||||
|
|
||||||
|
\newtcolorbox{principlebox}[1][]{
|
||||||
|
colback=blue!5!white,
|
||||||
|
colframe=seblue,
|
||||||
|
title=#1,
|
||||||
|
fonttitle=\bfseries\small,
|
||||||
|
boxrule=0.8pt, arc=2pt,
|
||||||
|
top=2pt, bottom=2pt, left=4pt, right=4pt
|
||||||
|
}
|
||||||
|
|
||||||
|
\title{\textcolor{seblue}{Code Analysis: Bank Account Transaction Processor}\\[0.3em]
|
||||||
|
\large What Makes Code Bad and How to Fix It\\[0.3em]
|
||||||
|
\normalsize AISE501 -- AI in Software Engineering I}
|
||||||
|
\author{Dr.\ Florian Herzog}
|
||||||
|
\date{Spring Semester 2026}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
\maketitle
|
||||||
|
\tableofcontents
|
||||||
|
\newpage
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Overview}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
This document analyses two implementations of a bank account transaction processor.
|
||||||
|
Both read account state and transactions from JSON files, validate each transaction, apply valid ones, reject invalid ones, and write results.
|
||||||
|
Both produce identical output, but \texttt{bank\_bad.py} violates many PEP\,8 and clean code principles, while \texttt{bank\_good.py} follows them consistently.
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 1: Unused Imports and Import Formatting}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
import json,sys,os,copy;from datetime import datetime
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
import json
|
||||||
|
from typing import TypedDict, Optional
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\textbf{What is wrong:}
|
||||||
|
\begin{itemize}
|
||||||
|
\item \texttt{sys}, \texttt{os}, \texttt{copy}, and \texttt{datetime} are imported but \textbf{never used}.
|
||||||
|
\item All imports are \textbf{on a single line} separated by commas, with a semicolon joining two import statements.
|
||||||
|
\item PEP\,8 requires each import on its own line and groups separated by blank lines (standard library, third-party, local).
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{PEP\,8 -- Imports}: Imports should be on separate lines. Remove unused imports.
|
||||||
|
\item \textbf{KISS}: Unused imports add noise and suggest false dependencies.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 2: No Documentation or Docstrings}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
The file has \textbf{no module docstring} and \textbf{no function docstrings}. The only comment in the entire file is:
|
||||||
|
\begin{lstlisting}
|
||||||
|
# find account
|
||||||
|
...
|
||||||
|
# print results
|
||||||
|
\end{lstlisting}
|
||||||
|
These comments describe \textit{what} the next line does (which is already obvious from the code), not \textit{why}.
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
"""Bank account transaction processor.
|
||||||
|
|
||||||
|
Reads account state and a list of transactions from JSON files,
|
||||||
|
validates and applies each transaction, then writes updated account
|
||||||
|
state and a transaction log (accepted / rejected) to output files.
|
||||||
|
"""
|
||||||
|
\end{lstlisting}
|
||||||
|
Every function has a docstring:
|
||||||
|
\begin{lstlisting}
|
||||||
|
def validate_common(
|
||||||
|
account: Optional[Account],
|
||||||
|
amount: float,
|
||||||
|
) -> Optional[str]:
|
||||||
|
"""Run validations shared by all transaction types.
|
||||||
|
|
||||||
|
Returns an error message string, or None if valid.
|
||||||
|
"""
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{PEP\,257}: All public modules and functions should have docstrings.
|
||||||
|
\item \textbf{Clean Code -- Comments}: Don't add noise comments that just restate the code. Comments should explain \textit{why}, not \textit{what}.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 3: Implicit Data Model}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
The bad version operates on raw dictionaries with no type declarations.
|
||||||
|
A reader must trace through the JSON file and every dictionary access to understand the data shape:
|
||||||
|
\begin{lstlisting}
|
||||||
|
def proc(accs,txns):
|
||||||
|
for t in txns:
|
||||||
|
tp=t['type'];aid=t['account_id'];amt=t['amount'];tid=t['id']
|
||||||
|
a=None
|
||||||
|
for x in accs:
|
||||||
|
if x['account_id']==aid:a=x
|
||||||
|
\end{lstlisting}
|
||||||
|
What fields does \texttt{t} have? What fields does \texttt{a} have? There is no way to know without reading the JSON file.
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
The good version defines explicit data types:
|
||||||
|
\begin{lstlisting}
|
||||||
|
class Account(TypedDict):
|
||||||
|
"""A bank account with its current state."""
|
||||||
|
account_id: str
|
||||||
|
holder: str
|
||||||
|
balance: float
|
||||||
|
currency: str
|
||||||
|
status: str # "active" or "frozen"
|
||||||
|
|
||||||
|
class Transaction(TypedDict, total=False):
|
||||||
|
"""A financial transaction to be processed."""
|
||||||
|
id: str
|
||||||
|
type: str # "deposit", "withdrawal", or "transfer"
|
||||||
|
account_id: str
|
||||||
|
amount: float
|
||||||
|
description: str
|
||||||
|
to_account_id: str # only for transfers
|
||||||
|
status: str # added after processing
|
||||||
|
reason: str # added on rejection
|
||||||
|
\end{lstlisting}
|
||||||
|
All function signatures carry type annotations:
|
||||||
|
\begin{lstlisting}
|
||||||
|
def find_account(accounts: list[Account], account_id: str) -> Optional[Account]:
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{Zen of Python}: ``Explicit is better than implicit.''
|
||||||
|
\item \textbf{Clean Code -- Readability}: A reader should understand the data contract without tracing through runtime data.
|
||||||
|
\item \textbf{PEP\,484 / PEP\,589}: Use type hints and \texttt{TypedDict} to document the structure of dictionary-based data.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 4: Poor Naming}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
def loadJ(p): # "J" for JSON? "p" for path?
|
||||||
|
def saveJ(p,d): # "d" for data?
|
||||||
|
def proc(accs,txns): # "proc" does what exactly?
|
||||||
|
ok=[];bad=[] # acceptable vs. rejected
|
||||||
|
tp=t['type'] # "tp" is unpronounceable
|
||||||
|
aid=t['account_id'] # "aid" looks like "aid" (help)
|
||||||
|
amt=t['amount'] # "amt" -- abbreviation
|
||||||
|
tid=t['id'] # "tid" -- never used again!
|
||||||
|
a=None # "a" for account
|
||||||
|
ta=None # "ta" for target account
|
||||||
|
for x in accs: # "x" for what?
|
||||||
|
D=loadJ(...) # capital "D" for a local variable
|
||||||
|
T=loadJ(...) # capital "T" for a local variable
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
def load_json(file_path):
|
||||||
|
def save_json(file_path, data):
|
||||||
|
def find_account(accounts, account_id):
|
||||||
|
def validate_common(account, amount):
|
||||||
|
def process_deposit(accounts, transaction):
|
||||||
|
def process_withdrawal(accounts, transaction):
|
||||||
|
def process_transfer(accounts, transaction):
|
||||||
|
def process_all_transactions(accounts, transactions):
|
||||||
|
def print_results(accounts, accepted, rejected):
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\textbf{What is wrong:}
|
||||||
|
\begin{itemize}
|
||||||
|
\item Function names use \textbf{abbreviations} (\texttt{loadJ}, \texttt{saveJ}, \texttt{proc}) instead of descriptive snake\_case names.
|
||||||
|
\item Variable names are \textbf{single letters or short abbreviations} (\texttt{a}, \texttt{t}, \texttt{x}, \texttt{tp}, \texttt{aid}, \texttt{amt}, \texttt{ta}).
|
||||||
|
\item \texttt{tid} is assigned but \textbf{never used} --- dead code.
|
||||||
|
\item \texttt{D} and \texttt{T} use \textbf{uppercase}, suggesting constants, but they are local variables.
|
||||||
|
\item The name \texttt{ok} for accepted transactions and \texttt{bad} for rejected ones is \textbf{imprecise}.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{PEP\,8 -- Naming}: Functions and variables use \texttt{lower\_case\_with\_underscores}. Constants use \texttt{UPPER\_CASE}.
|
||||||
|
\item \textbf{Clean Code -- Descriptive Names}: ``Other developers should figure out what a variable stores just by reading its name.''
|
||||||
|
\item \textbf{Clean Code -- Consistent Vocabulary}: Don't mix \texttt{ok}/\texttt{bad} with \texttt{accepted}/\texttt{rejected}.
|
||||||
|
\item \textbf{Clean Code -- No Abbreviations}: \texttt{amt}, \texttt{tp}, \texttt{tid} are not words.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 5: Formatting -- Semicolons and Dense Lines}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
f=open(p,'r');d=json.load(f);f.close();return d
|
||||||
|
\end{lstlisting}
|
||||||
|
\begin{lstlisting}
|
||||||
|
tp=t['type'];aid=t['account_id'];amt=t['amount'];tid=t['id']
|
||||||
|
\end{lstlisting}
|
||||||
|
\begin{lstlisting}
|
||||||
|
a['balance']=a['balance']+amt;t['status']='accepted';ok.append(t)
|
||||||
|
\end{lstlisting}
|
||||||
|
\begin{lstlisting}
|
||||||
|
if a==None:
|
||||||
|
t['reason']='account not found';bad.append(t);continue
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
Every statement is on its own line with proper whitespace:
|
||||||
|
\begin{lstlisting}
|
||||||
|
account = find_account(accounts, transaction["account_id"])
|
||||||
|
error = validate_common(account, transaction["amount"])
|
||||||
|
if error:
|
||||||
|
return False, error
|
||||||
|
|
||||||
|
account["balance"] += transaction["amount"]
|
||||||
|
return True, "accepted"
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\textbf{What is wrong:}
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Semicolons} pack 3--4 statements onto one line, making it nearly impossible to follow the logic.
|
||||||
|
\item \textbf{No whitespace} around \texttt{=} and after commas.
|
||||||
|
\item Control flow (\texttt{continue}) is \textbf{hidden at the end of a dense line}.
|
||||||
|
\item PEP\,8 explicitly states: ``Compound statements (multiple statements on the same line) are generally discouraged.''
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{PEP\,8 -- Compound Statements}: Generally discouraged. Each statement on its own line.
|
||||||
|
\item \textbf{PEP\,8 -- Whitespace}: Surround operators with spaces. Space after commas.
|
||||||
|
\item \textbf{Zen of Python}: ``Readability counts.'' ``Sparse is better than dense.''
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 6: No Context Managers for File I/O}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
def loadJ(p):
|
||||||
|
f=open(p,'r');d=json.load(f);f.close();return d
|
||||||
|
|
||||||
|
def saveJ(p,d):
|
||||||
|
f=open(p,'w');json.dump(d,f,indent=2);f.close()
|
||||||
|
\end{lstlisting}
|
||||||
|
If \texttt{json.load(f)} raises an exception, the file is \textbf{never closed} because \texttt{f.close()} is skipped. This is a resource leak.
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
def load_json(file_path: str) -> dict:
|
||||||
|
"""Read and parse a JSON file, returning the parsed data."""
|
||||||
|
with open(file_path, "r", encoding="utf-8") as file_handle:
|
||||||
|
return json.load(file_handle)
|
||||||
|
\end{lstlisting}
|
||||||
|
The \texttt{with} statement guarantees the file is closed even if an exception occurs.
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{Pythonic Code}: Always use context managers (\texttt{with}) for resource management.
|
||||||
|
\item \textbf{Clean Code -- Error Handling}: Code should be robust against exceptions. Manual \texttt{open}/\texttt{close} is error-prone.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 7: God Function -- Single Responsibility Violation}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
The function \texttt{proc()} is 38 lines long and handles \textbf{all of the following} in a single function:
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item Finding accounts by ID
|
||||||
|
\item Validating account status
|
||||||
|
\item Validating amounts
|
||||||
|
\item Processing deposits
|
||||||
|
\item Processing withdrawals
|
||||||
|
\item Processing transfers (including finding the target account)
|
||||||
|
\item Handling unknown transaction types
|
||||||
|
\item Building accepted and rejected lists
|
||||||
|
\end{itemize}
|
||||||
|
\begin{lstlisting}
|
||||||
|
def proc(accs,txns):
|
||||||
|
ok=[];bad=[]
|
||||||
|
for t in txns:
|
||||||
|
... # 35 lines of nested if/elif/else with continue
|
||||||
|
return accs,ok,bad
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
The good version splits this into \textbf{seven focused functions}:
|
||||||
|
\begin{lstlisting}
|
||||||
|
def find_account(accounts, account_id): # lookup
|
||||||
|
def validate_common(account, amount): # shared validation
|
||||||
|
def process_deposit(accounts, transaction): # deposit logic
|
||||||
|
def process_withdrawal(accounts, transaction):# withdrawal logic
|
||||||
|
def process_transfer(accounts, transaction): # transfer logic
|
||||||
|
def process_all_transactions(accounts, transactions): # orchestration
|
||||||
|
def print_results(accounts, accepted, rejected): # output
|
||||||
|
\end{lstlisting}
|
||||||
|
A dispatch dictionary replaces the \texttt{if/elif} chain:
|
||||||
|
\begin{lstlisting}
|
||||||
|
TRANSACTION_HANDLERS = {
|
||||||
|
"deposit": process_deposit,
|
||||||
|
"withdrawal": process_withdrawal,
|
||||||
|
"transfer": process_transfer,
|
||||||
|
}
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{SRP (Single Responsibility Principle)}: Each function should have one reason to change.
|
||||||
|
\item \textbf{DRY (Don't Repeat Yourself)}: The amount validation (\texttt{amt<=0}) is duplicated for deposits and transfers in the bad version; \texttt{validate\_common()} eliminates this.
|
||||||
|
\item \textbf{Clean Code -- Short Functions}: Functions should be comprehensible in a few minutes.
|
||||||
|
\item \textbf{Open-Closed Principle}: Adding a new transaction type in the bad version requires modifying the \texttt{proc()} function. In the good version, you add a new handler function and register it in the dictionary.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 8: Magic Strings Instead of Constants}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
if a['status']!='active': # magic string
|
||||||
|
...
|
||||||
|
if tp=='deposit': # magic string
|
||||||
|
...
|
||||||
|
\end{lstlisting}
|
||||||
|
The strings \texttt{'active'}, \texttt{'deposit'}, \texttt{'withdrawal'}, and \texttt{'transfer'} appear throughout the code as \textbf{literals}. If the status name ever changed, every occurrence would need to be found and updated.
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
ACTIVE_STATUS = "active"
|
||||||
|
...
|
||||||
|
if account["status"] != ACTIVE_STATUS:
|
||||||
|
\end{lstlisting}
|
||||||
|
Transaction types are handled via the \texttt{TRANSACTION\_HANDLERS} dictionary, so the string literals appear only \textbf{once} in the handler registration.
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{Clean Code -- No Magic Numbers/Strings}: Use named constants for values that carry domain meaning.
|
||||||
|
\item \textbf{DRY}: The same literal repeated in multiple places is a maintenance risk.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 9: Comparison with \texttt{None}}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
if a==None:
|
||||||
|
...
|
||||||
|
if ta==None:
|
||||||
|
...
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
if account is None:
|
||||||
|
...
|
||||||
|
if target is None:
|
||||||
|
...
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
PEP\,8 explicitly states: ``Comparisons to singletons like \texttt{None} should always be done with \texttt{is} or \texttt{is not}, never the equality operators.''
|
||||||
|
The \texttt{is} operator checks \textbf{identity} (the correct test for \texttt{None}), while \texttt{==} checks \textbf{equality} and can be overridden by custom \texttt{\_\_eq\_\_} methods.
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{PEP\,8 -- Programming Recommendations}: Use \texttt{is None}, not \texttt{== None}.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Violation 10: Missing \texttt{\_\_main\_\_} Guard and String Formatting}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{badbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
main()
|
||||||
|
\end{lstlisting}
|
||||||
|
\begin{lstlisting}
|
||||||
|
print(" "+a['account_id']+" "+a['holder']+": "+str(a['balance'])
|
||||||
|
+" "+a['currency']+" ("+a['status']+")")
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{badbox}
|
||||||
|
|
||||||
|
\begin{goodbox}
|
||||||
|
\begin{lstlisting}
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
\end{lstlisting}
|
||||||
|
\begin{lstlisting}
|
||||||
|
print(
|
||||||
|
f" {account['account_id']} {account['holder']}: "
|
||||||
|
f"{account['balance']:.2f} {account['currency']} "
|
||||||
|
f"({account['status']})"
|
||||||
|
)
|
||||||
|
\end{lstlisting}
|
||||||
|
\end{goodbox}
|
||||||
|
|
||||||
|
\textbf{What is wrong:}
|
||||||
|
\begin{itemize}
|
||||||
|
\item No \texttt{\_\_main\_\_} guard means importing the module triggers execution.
|
||||||
|
\item String concatenation with \texttt{+} and \texttt{str()} is harder to read than f-strings.
|
||||||
|
\item The bad version does not format numbers (\texttt{str(5000.0)} vs.\ \texttt{5000.00}).
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\begin{principlebox}[Principles Violated]
|
||||||
|
\begin{itemize}[nosep]
|
||||||
|
\item \textbf{Clean Code -- Avoid Side Effects}: Importing should not trigger execution.
|
||||||
|
\item \textbf{Pythonic Code}: Use f-strings for string formatting.
|
||||||
|
\end{itemize}
|
||||||
|
\end{principlebox}
|
||||||
|
|
||||||
|
% ============================================
|
||||||
|
\section{Summary of Violations}
|
||||||
|
% ============================================
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\small
|
||||||
|
\begin{tabular}{@{}rp{4.5cm}p{5.5cm}@{}}
|
||||||
|
\toprule
|
||||||
|
\textbf{\#} & \textbf{Violation} & \textbf{Principle / PEP\,8 Rule} \\
|
||||||
|
\midrule
|
||||||
|
1 & Unused imports, one-line format & PEP\,8 Imports, KISS \\
|
||||||
|
2 & No docstrings, noise comments & PEP\,257, Clean Code Documentation \\
|
||||||
|
3 & Implicit data model (raw dicts) & Explicit $>$ Implicit, PEP\,484/589 \\
|
||||||
|
4 & Abbreviations, single-letter names & PEP\,8 Naming, Descriptive Names \\
|
||||||
|
5 & Semicolons, dense lines, no whitespace & PEP\,8 Whitespace, Zen of Python \\
|
||||||
|
6 & Manual file open/close & Pythonic Code, Context Managers \\
|
||||||
|
7 & God function (38-line \texttt{proc}) & SRP, DRY, Open-Closed Principle \\
|
||||||
|
8 & Magic strings & No Magic Numbers, DRY \\
|
||||||
|
9 & \texttt{== None} instead of \texttt{is None} & PEP\,8 Programming Recommendations \\
|
||||||
|
10 & No \texttt{\_\_main\_\_} guard, string concat & Side Effects, Pythonic Code \\
|
||||||
|
\bottomrule
|
||||||
|
\end{tabular}
|
||||||
|
\end{center}
|
||||||
|
|
||||||
|
\end{document}
|
||||||
13
Clean Code exercise/example2_bank/bank_analysis.toc
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
\babel@toc {english}{}\relax
|
||||||
|
\contentsline {section}{\numberline {1}Overview}{2}{section.1}%
|
||||||
|
\contentsline {section}{\numberline {2}Violation 1: Unused Imports and Import Formatting}{2}{section.2}%
|
||||||
|
\contentsline {section}{\numberline {3}Violation 2: No Documentation or Docstrings}{2}{section.3}%
|
||||||
|
\contentsline {section}{\numberline {4}Violation 3: Implicit Data Model}{3}{section.4}%
|
||||||
|
\contentsline {section}{\numberline {5}Violation 4: Poor Naming}{4}{section.5}%
|
||||||
|
\contentsline {section}{\numberline {6}Violation 5: Formatting -- Semicolons and Dense Lines}{5}{section.6}%
|
||||||
|
\contentsline {section}{\numberline {7}Violation 6: No Context Managers for File I/O}{6}{section.7}%
|
||||||
|
\contentsline {section}{\numberline {8}Violation 7: God Function -- Single Responsibility Violation}{7}{section.8}%
|
||||||
|
\contentsline {section}{\numberline {9}Violation 8: Magic Strings Instead of Constants}{8}{section.9}%
|
||||||
|
\contentsline {section}{\numberline {10}Violation 9: Comparison with \texttt {None}}{8}{section.10}%
|
||||||
|
\contentsline {section}{\numberline {11}Violation 10: Missing \texttt {\_\_main\_\_} Guard and String Formatting}{9}{section.11}%
|
||||||
|
\contentsline {section}{\numberline {12}Summary of Violations}{10}{section.12}%
|
||||||
62
Clean Code exercise/example2_bank/bank_bad.py
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
import json,sys,os,copy;from datetime import datetime
|
||||||
|
|
||||||
|
def loadJ(p):
    """Read and parse the JSON file at path *p*, returning the data.

    NOTE(review): deliberately bad style for the clean-code exercise --
    no context manager, so the handle leaks if json.load raises
    (Violation 6 in the accompanying analysis document).
    """
    f=open(p,'r');d=json.load(f);f.close();return d
||||||
|
|
||||||
|
def saveJ(p,d):
    """Write *d* to path *p* as indented JSON.

    NOTE(review): deliberately bad style -- manual open/close instead of
    a `with` block; the file leaks if json.dump raises (Violation 6).
    """
    f=open(p,'w');json.dump(d,f,indent=2);f.close()
||||||
|
|
||||||
|
def proc(accs,txns):
    """Validate and apply every transaction in *txns* against *accs*.

    Returns (accs, ok, bad) -- the mutated account list plus the
    accepted and rejected transaction lists.

    NOTE(review): this is the intentionally bad "god function" of the
    clean-code exercise (Violation 7): semicolon-packed lines, magic
    strings, ``== None`` comparisons and duplicated validation are all
    deliberate and documented in the accompanying LaTeX analysis.
    """
    ok=[];bad=[]
    for t in txns:
        # Unpack transaction fields into terse aliases (tid is unused).
        tp=t['type'];aid=t['account_id'];amt=t['amount'];tid=t['id']
        # find account
        a=None
        for x in accs:
            if x['account_id']==aid:a=x
        # Shared validations: account exists, is active, amount positive.
        if a==None:
            t['reason']='account not found';bad.append(t);continue
        if a['status']!='active':
            t['reason']='account not active';bad.append(t);continue
        if amt<=0 and tp!='withdrawal':
            if tp=='deposit':t['reason']='invalid amount';bad.append(t);continue
            if tp=='transfer':t['reason']='invalid amount';bad.append(t);continue
        if amt<=0 and tp=='withdrawal':
            t['reason']='invalid amount';bad.append(t);continue
        # Per-type processing via an if/elif chain (replaced by a
        # dispatch dict in bank_good.py).
        if tp=='deposit':
            a['balance']=a['balance']+amt;t['status']='accepted';ok.append(t)
        elif tp=='withdrawal':
            if a['balance']>=amt:
                a['balance']=a['balance']-amt;t['status']='accepted';ok.append(t)
            else:
                t['reason']='insufficient funds';t['status']='rejected';bad.append(t)
        elif tp=='transfer':
            # Locate the transfer's target account.
            ta=None
            for x in accs:
                if x['account_id']==t.get('to_account_id',''):ta=x
            if ta==None:t['reason']='target account not found';bad.append(t);continue
            if ta['status']!='active':t['reason']='target account not active';bad.append(t);continue
            if a['balance']>=amt:
                a['balance']=a['balance']-amt;ta['balance']=ta['balance']+amt
                t['status']='accepted';ok.append(t)
            else:
                t['reason']='insufficient funds';t['status']='rejected';bad.append(t)
        else:
            t['reason']='unknown type';bad.append(t)
    return accs,ok,bad
|
||||||
|
|
||||||
|
def main():
    """Load inputs, process all transactions, print and save results.

    NOTE(review): intentionally bad -- hard-coded file paths, ``+``
    string concatenation instead of f-strings, semicolon-packed lines,
    and the unguarded module-level ``main()`` call below runs on import
    (Violation 10 in the accompanying analysis).
    """
    D=loadJ('accounts.json');T=loadJ('transactions.json')
    accs=D['accounts'];txns=T['transactions']
    accs,ok,bad=proc(accs,txns)
    # print results
    print("=== UPDATED ACCOUNTS ===")
    for a in accs:print(" "+a['account_id']+" "+a['holder']+": "+str(a['balance'])+" "+a['currency']+" ("+a['status']+")")
    print("\n=== ACCEPTED ("+str(len(ok))+") ===")
    for t in ok:print(" "+t['id']+" "+t['type']+" "+str(t['amount'])+" -> "+t.get('description',''))
    print("\n=== REJECTED ("+str(len(bad))+") ===")
    for t in bad:print(" "+t['id']+" "+t['type']+" "+str(t['amount'])+" -> "+t.get('reason','unknown'))
    saveJ('accounts_updated_bad.json',{"accounts":accs})
    saveJ('transaction_log_bad.json',{"accepted":ok,"rejected":bad})


# NOTE(review): missing `if __name__ == "__main__":` guard is intentional.
main()
||||||
280
Clean Code exercise/example2_bank/bank_good.py
Normal file
@ -0,0 +1,280 @@
|
|||||||
|
"""Bank account transaction processor.
|
||||||
|
|
||||||
|
Reads account state and a list of transactions from JSON files,
|
||||||
|
validates and applies each transaction, then writes updated account
|
||||||
|
state and a transaction log (accepted / rejected) to output files.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from typing import TypedDict, Optional
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Explicit data model -- defines the exact shape of every data structure
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class Account(TypedDict):
    """A bank account with its current state."""

    account_id: str  # unique identifier, e.g. "ACC-001"
    holder: str  # account holder's display name
    balance: float  # current balance, in units of `currency`
    currency: str  # currency code, e.g. "CHF"
    status: str  # "active" or "frozen"
|
|
||||||
|
|
||||||
|
class Transaction(TypedDict, total=False):
    """A financial transaction to be processed.

    Declared with total=False because some keys are situational:
    to_account_id only exists for transfers, and status/reason are
    attached to the dict during processing.
    """
    id: str  # transaction identifier, e.g. "TXN-001"
    type: str  # "deposit", "withdrawal", or "transfer"
    account_id: str  # account the transaction applies to (the source, for transfers)
    amount: float  # must be positive to be accepted
    description: str  # free-text label shown in reports
    to_account_id: str  # only for transfers
    status: str  # added after processing: "accepted" / "rejected"
    reason: str  # added on rejection
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Constants
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Input files read at startup.
ACCOUNTS_INPUT = "accounts.json"
TRANSACTIONS_INPUT = "transactions.json"

# Output files written after processing.
ACCOUNTS_OUTPUT = "accounts_updated_good.json"
TRANSACTION_LOG_OUTPUT = "transaction_log_good.json"

# The only account status allowed to send or receive money.
ACTIVE_STATUS = "active"
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# File I/O
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def load_json(file_path: str) -> dict:
    """Parse the JSON document stored at *file_path* and return it."""
    with open(file_path, "r", encoding="utf-8") as source:
        parsed = json.load(source)
    return parsed
|
|
||||||
|
|
||||||
|
def save_json(file_path: str, data: dict) -> None:
    """Serialize *data* to *file_path* as indented, non-ASCII-safe JSON."""
    # Render first, then write in one call; output is identical to
    # streaming json.dump into the handle.
    rendered = json.dumps(data, indent=2, ensure_ascii=False)
    with open(file_path, "w", encoding="utf-8") as sink:
        sink.write(rendered)
|
|
||||||
|
|
||||||
|
def load_accounts(file_path: str) -> list[Account]:
    """Return the list stored under the "accounts" key of *file_path*."""
    return load_json(file_path)["accounts"]
|
|
||||||
|
|
||||||
|
def load_transactions(file_path: str) -> list[Transaction]:
    """Return the list stored under the "transactions" key of *file_path*."""
    return load_json(file_path)["transactions"]
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Account lookup
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def find_account(accounts: list[Account], account_id: str) -> Optional[Account]:
    """Return the account whose ID matches *account_id*, or None if absent."""
    matches = (candidate for candidate in accounts
               if candidate["account_id"] == account_id)
    return next(matches, None)
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Validation
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def validate_common(
    account: Optional[Account],
    amount: float,
) -> Optional[str]:
    """Apply the checks shared by every transaction type.

    Returns a rejection reason, or None when the transaction may proceed.
    Check order matters: existence, then status, then amount.
    """
    if account is None:
        return "account not found"

    status = account["status"]
    if status != ACTIVE_STATUS:
        return f"account is {status}"

    # Treat a missing/None amount the same as a non-positive one.
    amount_ok = amount is not None and amount > 0
    return None if amount_ok else "amount must be positive"
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Transaction handlers -- one function per transaction type
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def process_deposit(
    accounts: list[Account],
    transaction: Transaction,
) -> tuple[bool, str]:
    """Credit the transaction amount to its account. Returns (success, reason)."""
    amount = transaction["amount"]
    account = find_account(accounts, transaction["account_id"])

    problem = validate_common(account, amount)
    if problem is not None:
        return False, problem

    account["balance"] += amount
    return True, "accepted"
||||||
|
|
||||||
|
|
||||||
|
def process_withdrawal(
    accounts: list[Account],
    transaction: Transaction,
) -> tuple[bool, str]:
    """Debit the transaction amount from its account. Returns (success, reason)."""
    amount = transaction["amount"]
    account = find_account(accounts, transaction["account_id"])

    problem = validate_common(account, amount)
    if problem is not None:
        return False, problem
    # Balance check is specific to debits, so it is not in validate_common.
    if amount > account["balance"]:
        return False, "insufficient funds"

    account["balance"] -= amount
    return True, "accepted"
||||||
|
|
||||||
|
|
||||||
|
def process_transfer(
    accounts: list[Account],
    transaction: Transaction,
) -> tuple[bool, str]:
    """Move the amount from the source to the target account.

    Returns (success, reason); failure reasons name the failing side.
    """
    amount = transaction["amount"]
    source = find_account(accounts, transaction["account_id"])

    problem = validate_common(source, amount)
    if problem is not None:
        return False, f"source: {problem}"

    # Resolve and validate the receiving side.
    target = find_account(accounts, transaction.get("to_account_id", ""))
    if target is None:
        return False, "target account not found"
    if target["status"] != ACTIVE_STATUS:
        return False, f"target account is {target['status']}"

    if amount > source["balance"]:
        return False, "insufficient funds"

    source["balance"] -= amount
    target["balance"] += amount
    return True, "accepted"
||||||
|
|
||||||
|
|
||||||
|
# Dispatch table: transaction "type" string -> handler function.
# Supporting a new transaction type only requires registering it here,
# so the processing loop never grows an if/elif chain.
TRANSACTION_HANDLERS = {
    "deposit": process_deposit,
    "withdrawal": process_withdrawal,
    "transfer": process_transfer,
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Processing
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def process_all_transactions(
    accounts: list[Account],
    transactions: list[Transaction],
) -> tuple[list[Transaction], list[Transaction]]:
    """Run every transaction through its registered handler.

    Returns (accepted_transactions, rejected_transactions). Each
    transaction dict is annotated in place with 'status' and, on
    rejection, a 'reason'.
    """
    accepted: list[Transaction] = []
    rejected: list[Transaction] = []

    def _reject(txn: Transaction, why: str) -> None:
        # Annotate and file a failed transaction.
        txn["status"] = "rejected"
        txn["reason"] = why
        rejected.append(txn)

    for txn in transactions:
        kind = txn.get("type", "")
        handler = TRANSACTION_HANDLERS.get(kind)

        if handler is None:
            _reject(txn, f"unknown transaction type '{kind}'")
            continue

        succeeded, reason = handler(accounts, txn)
        if succeeded:
            txn["status"] = "accepted"
            accepted.append(txn)
        else:
            _reject(txn, reason)

    return accepted, rejected
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Output
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def print_results(
    accounts: list[Account],
    accepted: list[Transaction],
    rejected: list[Transaction],
) -> None:
    """Write a human-readable processing summary to stdout."""

    def _txn_line(txn: Transaction, tail: str) -> None:
        # Shared "id type amount" prefix for both transaction sections.
        print(f"  {txn['id']}  {txn['type']:12s} {txn['amount']:>10.2f}  " + tail)

    print("=== UPDATED ACCOUNTS ===")
    for account in accounts:
        print(
            f"  {account['account_id']}  {account['holder']}: "
            f"{account['balance']:.2f} {account['currency']} "
            f"({account['status']})"
        )

    print(f"\n=== ACCEPTED TRANSACTIONS ({len(accepted)}) ===")
    for txn in accepted:
        _txn_line(txn, txn.get("description", ""))

    print(f"\n=== REJECTED TRANSACTIONS ({len(rejected)}) ===")
    for txn in rejected:
        _txn_line(txn, f"Reason: {txn.get('reason', 'unknown')}")
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Main
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def main() -> None:
    """Entry point: read inputs, apply transactions, report and persist."""
    accounts: list[Account] = load_accounts(ACCOUNTS_INPUT)
    transactions: list[Transaction] = load_transactions(TRANSACTIONS_INPUT)

    accepted, rejected = process_all_transactions(accounts, transactions)
    print_results(accounts, accepted, rejected)

    # Persist both the mutated account state and the full audit log.
    transaction_log = {
        "accepted": accepted,
        "rejected": rejected,
    }
    save_json(ACCOUNTS_OUTPUT, {"accounts": accounts})
    save_json(TRANSACTION_LOG_OUTPUT, transaction_log)

    print(f"\nOutput written to {ACCOUNTS_OUTPUT} and {TRANSACTION_LOG_OUTPUT}")
||||||
|
|
||||||
|
|
||||||
|
# Entry-point guard: run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
||||||
16
Clean Code exercise/example2_bank/bank_usecase.aux
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
\relax
|
||||||
|
\providecommand \babel@aux [2]{\global \let \babel@toc \@gobbletwo }
|
||||||
|
\@nameuse{bbl@beforestart}
|
||||||
|
\providecommand\hyper@newdestlabel[2]{}
|
||||||
|
\providecommand\HyField@AuxAddToFields[1]{}
|
||||||
|
\providecommand\HyField@AuxAddToCoFields[2]{}
|
||||||
|
\babel@aux{english}{}
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {1}Use Case}{1}{section.1}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {2}Input Files}{1}{section.2}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Account State (\texttt {accounts.json})}{1}{subsection.2.1}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Transactions (\texttt {transactions.json})}{1}{subsection.2.2}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {3}Validation Rules}{1}{section.3}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {4}Output}{2}{section.4}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {5}Expected Results}{2}{section.5}\protected@file@percent }
|
||||||
|
\@writefile{toc}{\contentsline {section}{\numberline {6}Exercise}{2}{section.6}\protected@file@percent }
|
||||||
|
\gdef \@abspage@last{3}
|
||||||
8
Clean Code exercise/example2_bank/bank_usecase.out
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
\BOOKMARK [1][-]{section.1}{\376\377\000U\000s\000e\000\040\000C\000a\000s\000e}{}% 1
|
||||||
|
\BOOKMARK [1][-]{section.2}{\376\377\000I\000n\000p\000u\000t\000\040\000F\000i\000l\000e\000s}{}% 2
|
||||||
|
\BOOKMARK [2][-]{subsection.2.1}{\376\377\000A\000c\000c\000o\000u\000n\000t\000\040\000S\000t\000a\000t\000e\000\040\000\050\000a\000c\000c\000o\000u\000n\000t\000s\000.\000j\000s\000o\000n\000\051}{section.2}% 3
|
||||||
|
\BOOKMARK [2][-]{subsection.2.2}{\376\377\000T\000r\000a\000n\000s\000a\000c\000t\000i\000o\000n\000s\000\040\000\050\000t\000r\000a\000n\000s\000a\000c\000t\000i\000o\000n\000s\000.\000j\000s\000o\000n\000\051}{section.2}% 4
|
||||||
|
\BOOKMARK [1][-]{section.3}{\376\377\000V\000a\000l\000i\000d\000a\000t\000i\000o\000n\000\040\000R\000u\000l\000e\000s}{}% 5
|
||||||
|
\BOOKMARK [1][-]{section.4}{\376\377\000O\000u\000t\000p\000u\000t}{}% 6
|
||||||
|
\BOOKMARK [1][-]{section.5}{\376\377\000E\000x\000p\000e\000c\000t\000e\000d\000\040\000R\000e\000s\000u\000l\000t\000s}{}% 7
|
||||||
|
\BOOKMARK [1][-]{section.6}{\376\377\000E\000x\000e\000r\000c\000i\000s\000e}{}% 8
|
||||||
BIN
Clean Code exercise/example2_bank/bank_usecase.pdf
Normal file
152
Clean Code exercise/example2_bank/bank_usecase.tex
Normal file
@ -0,0 +1,152 @@
|
|||||||
|
\documentclass[12pt,a4paper]{article}
|
||||||
|
\usepackage[utf8]{inputenc}
|
||||||
|
\usepackage[T1]{fontenc}
|
||||||
|
\usepackage[english]{babel}
|
||||||
|
\usepackage{geometry}
|
||||||
|
\geometry{margin=2.5cm}
|
||||||
|
\usepackage{xcolor}
|
||||||
|
\usepackage{tcolorbox}
|
||||||
|
\usepackage{booktabs}
|
||||||
|
\usepackage{hyperref}
|
||||||
|
\usepackage{listings}
|
||||||
|
|
||||||
|
\definecolor{seblue}{rgb}{0.0,0.28,0.67}
|
||||||
|
\definecolor{backcolour}{rgb}{0.95,0.95,0.92}
|
||||||
|
|
||||||
|
\lstdefinestyle{json}{
|
||||||
|
backgroundcolor=\color{backcolour},
|
||||||
|
basicstyle=\ttfamily\small,
|
||||||
|
breaklines=true,
|
||||||
|
showstringspaces=false,
|
||||||
|
tabsize=2
|
||||||
|
}
|
||||||
|
|
||||||
|
\title{\textcolor{seblue}{Exercise 2: Bank Account Transaction Processor}\\[0.3em]
|
||||||
|
\large AISE501 -- AI in Software Engineering I}
|
||||||
|
\author{Dr.\ Florian Herzog}
|
||||||
|
\date{Spring Semester 2026}
|
||||||
|
|
||||||
|
\begin{document}
|
||||||
|
\maketitle
|
||||||
|
|
||||||
|
\section{Use Case}
|
||||||
|
|
||||||
|
A simple bank system maintains a set of customer accounts, each with a balance, currency, and status (\texttt{active} or \texttt{frozen}).
|
||||||
|
A series of transactions is submitted for processing.
|
||||||
|
The program must validate each transaction, apply valid ones, reject invalid ones, and produce output files recording the results.
|
||||||
|
|
||||||
|
\section{Input Files}
|
||||||
|
|
||||||
|
\subsection{Account State (\texttt{accounts.json})}
|
||||||
|
|
||||||
|
A JSON file containing an array of account objects:
|
||||||
|
|
||||||
|
\begin{lstlisting}[style=json]
|
||||||
|
{
|
||||||
|
"accounts": [
|
||||||
|
{
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"holder": "Alice Mueller",
|
||||||
|
"balance": 5000.00,
|
||||||
|
"currency": "CHF",
|
||||||
|
"status": "active"
|
||||||
|
},
|
||||||
|
...
|
||||||
|
]
|
||||||
|
}
|
||||||
|
\end{lstlisting}
|
||||||
|
|
||||||
|
\subsection{Transactions (\texttt{transactions.json})}
|
||||||
|
|
||||||
|
A JSON file containing an array of transaction objects.
|
||||||
|
Each transaction has a \texttt{type} (\texttt{deposit}, \texttt{withdrawal}, or \texttt{transfer}), an \texttt{account\_id}, an \texttt{amount}, and a \texttt{description}.
|
||||||
|
Transfers additionally have a \texttt{to\_account\_id}.
|
||||||
|
|
||||||
|
\section{Validation Rules}
|
||||||
|
|
||||||
|
A transaction is \textbf{rejected} if any of these conditions apply:
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\begin{tabular}{ll}
|
||||||
|
\toprule
|
||||||
|
\textbf{Condition} & \textbf{Applies to} \\
|
||||||
|
\midrule
|
||||||
|
Account ID does not exist & All types \\
|
||||||
|
Account status is not \texttt{active} & All types \\
|
||||||
|
Amount is zero or negative & All types \\
|
||||||
|
Balance is less than withdrawal amount & Withdrawal, Transfer \\
|
||||||
|
Target account does not exist & Transfer \\
|
||||||
|
Target account is not \texttt{active} & Transfer \\
|
||||||
|
Unknown transaction type & -- \\
|
||||||
|
\bottomrule
|
||||||
|
\end{tabular}
|
||||||
|
\end{center}
|
||||||
|
|
||||||
|
\section{Output}
|
||||||
|
|
||||||
|
The program produces:
|
||||||
|
|
||||||
|
\begin{enumerate}
|
||||||
|
\item \textbf{Console output} -- A summary of updated account balances, accepted transactions, and rejected transactions with reasons.
|
||||||
|
\item \textbf{Updated account state} (\texttt{accounts\_updated.json}) -- The accounts JSON with balances modified by accepted transactions.
|
||||||
|
\item \textbf{Transaction log} (\texttt{transaction\_log.json}) -- Two arrays: \texttt{accepted} and \texttt{rejected}, each transaction annotated with its \texttt{status} and (for rejections) a \texttt{reason}.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\section{Expected Results}
|
||||||
|
|
||||||
|
Given the provided input files, the expected outcome is:
|
||||||
|
|
||||||
|
\begin{center}
|
||||||
|
\small
|
||||||
|
\begin{tabular}{lllp{5cm}}
|
||||||
|
\toprule
|
||||||
|
\textbf{TXN ID} & \textbf{Type} & \textbf{Result} & \textbf{Reason (if rejected)} \\
|
||||||
|
\midrule
|
||||||
|
TXN-001 & deposit & Accepted & -- \\
|
||||||
|
TXN-002 & withdrawal & Accepted & -- \\
|
||||||
|
TXN-003 & withdrawal & Rejected & Insufficient funds \\
|
||||||
|
TXN-004 & deposit & Rejected & Negative amount \\
|
||||||
|
TXN-005 & deposit & Rejected & Account is frozen \\
|
||||||
|
TXN-006 & transfer & Accepted & -- \\
|
||||||
|
TXN-007 & withdrawal & Rejected & Account not found \\
|
||||||
|
TXN-008 & deposit & Rejected & Zero amount \\
|
||||||
|
\bottomrule
|
||||||
|
\end{tabular}
|
||||||
|
\end{center}
|
||||||
|
|
||||||
|
\section{Exercise}
|
||||||
|
|
||||||
|
Two implementations are provided:
|
||||||
|
|
||||||
|
\begin{enumerate}
|
||||||
|
\item \textbf{\texttt{bank\_bad.py}} -- A working but poorly written version that violates many clean code and PEP\,8 principles.
|
||||||
|
\item \textbf{\texttt{bank\_good.py}} -- A clean, well-structured version following PEP\,8 and clean code best practices.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\subsection*{Tasks}
|
||||||
|
|
||||||
|
\begin{enumerate}
|
||||||
|
\item Run both programs and verify they produce the same results.
|
||||||
|
\item Read the bad version and list all clean code / PEP\,8 violations you can find.
|
||||||
|
\item For each violation, explain which principle is broken and why it makes the code harder to read or maintain.
|
||||||
|
\item Compare your list with the good version to see how each issue was resolved.
|
||||||
|
\end{enumerate}
|
||||||
|
|
||||||
|
\subsection*{Violations to Look For}
|
||||||
|
|
||||||
|
\begin{itemize}
|
||||||
|
\item Unused imports (\texttt{sys}, \texttt{os}, \texttt{copy}, \texttt{datetime})
|
||||||
|
\item No docstrings or module documentation
|
||||||
|
\item Single-letter and abbreviated variable names (\texttt{a}, \texttt{t}, \texttt{d}, \texttt{tp}, \texttt{tid})
|
||||||
|
\item Multiple statements per line (semicolons)
|
||||||
|
\item No whitespace around operators and after commas
|
||||||
|
\item Manual file open/close instead of context managers (\texttt{with})
|
||||||
|
\item One giant function doing all validation (violates Single Responsibility)
|
||||||
|
\item Duplicated validation logic for deposit/transfer amount checks
|
||||||
|
\item No constants for file paths
|
||||||
|
\item Missing \texttt{if \_\_name\_\_ == "\_\_main\_\_"} guard
|
||||||
|
\item Inconsistent error handling and status assignment
|
||||||
|
\item Hard-to-follow control flow with nested \texttt{if}/\texttt{elif}/\texttt{continue}
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\end{document}
|
||||||
72
Clean Code exercise/example2_bank/transaction_log_bad.json
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
{
|
||||||
|
"accepted": [
|
||||||
|
{
|
||||||
|
"id": "TXN-001",
|
||||||
|
"type": "deposit",
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"amount": 500.0,
|
||||||
|
"description": "Salary payment",
|
||||||
|
"status": "accepted"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-002",
|
||||||
|
"type": "withdrawal",
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"amount": 200.0,
|
||||||
|
"description": "ATM withdrawal",
|
||||||
|
"status": "accepted"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-006",
|
||||||
|
"type": "transfer",
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"to_account_id": "ACC-002",
|
||||||
|
"amount": 750.0,
|
||||||
|
"description": "Transfer to Bob",
|
||||||
|
"status": "accepted"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rejected": [
|
||||||
|
{
|
||||||
|
"id": "TXN-003",
|
||||||
|
"type": "withdrawal",
|
||||||
|
"account_id": "ACC-002",
|
||||||
|
"amount": 1500.0,
|
||||||
|
"description": "Rent payment - exceeds balance",
|
||||||
|
"reason": "insufficient funds",
|
||||||
|
"status": "rejected"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-004",
|
||||||
|
"type": "deposit",
|
||||||
|
"account_id": "ACC-002",
|
||||||
|
"amount": -100.0,
|
||||||
|
"description": "Invalid negative deposit",
|
||||||
|
"reason": "invalid amount"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-005",
|
||||||
|
"type": "deposit",
|
||||||
|
"account_id": "ACC-003",
|
||||||
|
"amount": 1000.0,
|
||||||
|
"description": "Deposit to frozen account",
|
||||||
|
"reason": "account not active"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-007",
|
||||||
|
"type": "withdrawal",
|
||||||
|
"account_id": "ACC-999",
|
||||||
|
"amount": 50.0,
|
||||||
|
"description": "Unknown account",
|
||||||
|
"reason": "account not found"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-008",
|
||||||
|
"type": "deposit",
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"amount": 0,
|
||||||
|
"description": "Zero-amount deposit",
|
||||||
|
"reason": "invalid amount"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
76
Clean Code exercise/example2_bank/transaction_log_good.json
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
{
|
||||||
|
"accepted": [
|
||||||
|
{
|
||||||
|
"id": "TXN-001",
|
||||||
|
"type": "deposit",
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"amount": 500.0,
|
||||||
|
"description": "Salary payment",
|
||||||
|
"status": "accepted"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-002",
|
||||||
|
"type": "withdrawal",
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"amount": 200.0,
|
||||||
|
"description": "ATM withdrawal",
|
||||||
|
"status": "accepted"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-006",
|
||||||
|
"type": "transfer",
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"to_account_id": "ACC-002",
|
||||||
|
"amount": 750.0,
|
||||||
|
"description": "Transfer to Bob",
|
||||||
|
"status": "accepted"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"rejected": [
|
||||||
|
{
|
||||||
|
"id": "TXN-003",
|
||||||
|
"type": "withdrawal",
|
||||||
|
"account_id": "ACC-002",
|
||||||
|
"amount": 1500.0,
|
||||||
|
"description": "Rent payment - exceeds balance",
|
||||||
|
"status": "rejected",
|
||||||
|
"reason": "insufficient funds"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-004",
|
||||||
|
"type": "deposit",
|
||||||
|
"account_id": "ACC-002",
|
||||||
|
"amount": -100.0,
|
||||||
|
"description": "Invalid negative deposit",
|
||||||
|
"status": "rejected",
|
||||||
|
"reason": "amount must be positive"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-005",
|
||||||
|
"type": "deposit",
|
||||||
|
"account_id": "ACC-003",
|
||||||
|
"amount": 1000.0,
|
||||||
|
"description": "Deposit to frozen account",
|
||||||
|
"status": "rejected",
|
||||||
|
"reason": "account is frozen"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-007",
|
||||||
|
"type": "withdrawal",
|
||||||
|
"account_id": "ACC-999",
|
||||||
|
"amount": 50.0,
|
||||||
|
"description": "Unknown account",
|
||||||
|
"status": "rejected",
|
||||||
|
"reason": "account not found"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-008",
|
||||||
|
"type": "deposit",
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"amount": 0,
|
||||||
|
"description": "Zero-amount deposit",
|
||||||
|
"status": "rejected",
|
||||||
|
"reason": "amount must be positive"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
61
Clean Code exercise/example2_bank/transactions.json
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
{
|
||||||
|
"transactions": [
|
||||||
|
{
|
||||||
|
"id": "TXN-001",
|
||||||
|
"type": "deposit",
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"amount": 500.00,
|
||||||
|
"description": "Salary payment"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-002",
|
||||||
|
"type": "withdrawal",
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"amount": 200.00,
|
||||||
|
"description": "ATM withdrawal"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-003",
|
||||||
|
"type": "withdrawal",
|
||||||
|
"account_id": "ACC-002",
|
||||||
|
"amount": 1500.00,
|
||||||
|
"description": "Rent payment - exceeds balance"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-004",
|
||||||
|
"type": "deposit",
|
||||||
|
"account_id": "ACC-002",
|
||||||
|
"amount": -100.00,
|
||||||
|
"description": "Invalid negative deposit"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-005",
|
||||||
|
"type": "deposit",
|
||||||
|
"account_id": "ACC-003",
|
||||||
|
"amount": 1000.00,
|
||||||
|
"description": "Deposit to frozen account"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-006",
|
||||||
|
"type": "transfer",
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"to_account_id": "ACC-002",
|
||||||
|
"amount": 750.00,
|
||||||
|
"description": "Transfer to Bob"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-007",
|
||||||
|
"type": "withdrawal",
|
||||||
|
"account_id": "ACC-999",
|
||||||
|
"amount": 50.00,
|
||||||
|
"description": "Unknown account"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TXN-008",
|
||||||
|
"type": "deposit",
|
||||||
|
"account_id": "ACC-001",
|
||||||
|
"amount": 0,
|
||||||
|
"description": "Zero-amount deposit"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
486
Code embeddings/00_tokens_and_embeddings_intro.py
Normal file
@ -0,0 +1,486 @@
|
|||||||
|
"""
|
||||||
|
============================================================================
|
||||||
|
Example 0: Tokens, Embeddings, and Language Similarity — An Introduction
|
||||||
|
============================================================================
|
||||||
|
AISE501 – AI in Software Engineering I
|
||||||
|
Fachhochschule Graubünden
|
||||||
|
|
||||||
|
GOAL:
|
||||||
|
Before we look at CODE embeddings, we need to understand the
|
||||||
|
foundational concepts: tokenization and text embeddings. This script
|
||||||
|
walks through the full pipeline step by step, using German words
|
||||||
|
and phrases so you can build intuition in your native language.
|
||||||
|
|
||||||
|
The pipeline is: Text → Tokens → Token IDs → Embedding Vectors
|
||||||
|
|
||||||
|
WHAT YOU WILL LEARN:
|
||||||
|
1. How text is split into TOKENS (sub-word units)
|
||||||
|
2. How tokens are mapped to integer IDs (the model's vocabulary)
|
||||||
|
3. How token IDs become dense EMBEDDING VECTORS (768 dimensions)
|
||||||
|
4. How cosine similarity measures meaning — similar phrases are
|
||||||
|
close in vector space, different phrases are far apart
|
||||||
|
5. How to VISUALIZE the embedding space in 2D using PCA
|
||||||
|
|
||||||
|
LANGUAGE:
|
||||||
|
All examples use German words and phrases to make the concepts
|
||||||
|
tangible. The model (multilingual) handles German natively.
|
||||||
|
|
||||||
|
HARDWARE:
|
||||||
|
Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac).
|
||||||
|
============================================================================
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from transformers import AutoTokenizer, AutoModel, BertTokenizer
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from sklearn.decomposition import PCA
|
||||||
|
import matplotlib
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
matplotlib.use("Agg")
|
||||||
|
|
||||||
|
# ── Device selection ──────────────────────────────────────────────────────
|
||||||
|
def get_device():
    """Pick the best available torch device: CUDA, then Apple MPS, then CPU."""
    # Probe accelerators in preference order; fall back to CPU.
    for backend, is_available in (
        ("cuda", torch.cuda.is_available),
        ("mps", torch.backends.mps.is_available),
    ):
        if is_available():
            return torch.device(backend)
    return torch.device("cpu")
|
||||||
|
|
||||||
|
DEVICE = get_device()
|
||||||
|
print(f"Using device: {DEVICE}\n")
|
||||||
|
|
||||||
|
# ── Load a MULTILINGUAL EMBEDDING model ───────────────────────────────────
|
||||||
|
# We use paraphrase-multilingual-mpnet-base-v2: a sentence embedding model
|
||||||
|
# fine-tuned for semantic similarity across 50+ languages including German.
|
||||||
|
# It uses an XLM-RoBERTa backbone and produces 768-dimensional embeddings
|
||||||
|
# where cosine similarity directly reflects semantic similarity.
|
||||||
|
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
|
||||||
|
|
||||||
|
print(f"Loading model: {MODEL_NAME} ...")
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||||
|
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
|
||||||
|
model.eval()
|
||||||
|
print("Model loaded.\n")
|
||||||
|
|
||||||
|
# ── Load a German-only tokenizer for comparison ──────────────────────────
|
||||||
|
# gbert-base uses WordPiece trained exclusively on German text (~31k vocab).
|
||||||
|
# We only load its tokenizer — no model weights needed.
|
||||||
|
GERMAN_TOKENIZER_NAME = "deepset/gbert-base"
|
||||||
|
print(f"Loading German tokenizer: {GERMAN_TOKENIZER_NAME} ...")
|
||||||
|
german_tokenizer = BertTokenizer.from_pretrained(GERMAN_TOKENIZER_NAME)
|
||||||
|
print("German tokenizer loaded.\n")
|
||||||
|
|
||||||
|
|
||||||
|
# ══════════════════════════════════════════════════════════════════════════
|
||||||
|
# PART 1: TOKENIZATION — How text becomes numbers
|
||||||
|
# ══════════════════════════════════════════════════════════════════════════
|
||||||
|
print("=" * 70)
|
||||||
|
print("PART 1: TOKENIZATION")
|
||||||
|
print("=" * 70)
|
||||||
|
print("""
|
||||||
|
Neural networks cannot read text — they only understand numbers.
|
||||||
|
TOKENIZATION is the first step: splitting text into sub-word pieces
|
||||||
|
called TOKENS, then mapping each token to an integer ID.
|
||||||
|
|
||||||
|
We compare two tokenizers:
|
||||||
|
• gbert (German-only, ~31k vocab) — trained exclusively on German text
|
||||||
|
• mpnet (multilingual, ~250k vocab) — trained on 100+ languages
|
||||||
|
""")
|
||||||
|
|
||||||
|
german_words = [
|
||||||
|
"Fachhochschule",
|
||||||
|
"Softwareentwicklung",
|
||||||
|
"Künstliche Intelligenz",
|
||||||
|
"Programmiersprache",
|
||||||
|
"Datenbank",
|
||||||
|
"Maschinelles Lernen",
|
||||||
|
"Graubünden",
|
||||||
|
"unhappiness", # English comparison
|
||||||
|
]
|
||||||
|
|
||||||
|
# ── 1a: German-only tokenizer (gbert / WordPiece) ────────────────────────
|
||||||
|
print("─── 1a: German-Only Tokenizer (gbert, WordPiece, 31k vocab) ───\n")
|
||||||
|
print(f"{'Word/Phrase':<28s} {'#':>3s} {'Tokens'}")
|
||||||
|
print("-" * 90)
|
||||||
|
|
||||||
|
for word in german_words:
|
||||||
|
ids = german_tokenizer.encode(word, add_special_tokens=False)
|
||||||
|
toks = german_tokenizer.convert_ids_to_tokens(ids)
|
||||||
|
print(f"{word:<28s} {len(toks):3d} {' | '.join(toks)}")
|
||||||
|
|
||||||
|
# ── 1b: Multilingual tokenizer (mpnet / SentencePiece) ───────────────────
# Fix: the header print had an f-prefix but no placeholders (Ruff F541);
# a plain string literal is correct and avoids implying interpolation.
print("\n─── 1b: Multilingual Tokenizer (mpnet, SentencePiece, 250k vocab) ───\n")
print(f"{'Word/Phrase':<28s} {'#':>3s} {'Tokens'}")
print("-" * 90)

# Show, for each sample word, how many sub-word pieces the multilingual
# tokenizer produces and what those pieces are.
for word in german_words:
    ids = tokenizer.encode(word, add_special_tokens=False)
    toks = tokenizer.convert_ids_to_tokens(ids)
    print(f"{word:<28s} {len(toks):3d} {' | '.join(toks)}")
|
||||||
|
|
||||||
|
print("""
|
||||||
|
KEY OBSERVATIONS:
|
||||||
|
• The GERMAN tokenizer keeps common words intact: "Fachhochschule" is
|
||||||
|
a SINGLE token, "Programmiersprache" splits at the natural compound
|
||||||
|
boundary "Programmier" + "sprache".
|
||||||
|
• The MULTILINGUAL tokenizer fragments German more aggressively:
|
||||||
|
"Fachhochschule" → 4 tokens ("Fach", "ho", "ch", "schule"), because
|
||||||
|
its 250k vocabulary is shared across 100+ languages — German gets
|
||||||
|
a smaller budget per word.
|
||||||
|
• Both tokenizers use STATISTICAL sub-word splitting (not morphological
|
||||||
|
analysis). The German tokenizer simply has more German-specific
|
||||||
|
entries because its entire vocabulary is dedicated to one language.
|
||||||
|
• Trade-off: the multilingual tokenizer needs more tokens per German
|
||||||
|
word, but it enables CROSS-LINGUAL capabilities (comparing German
|
||||||
|
and English in the same embedding space — see Part 3b).
|
||||||
|
• The rest of this script uses the multilingual model for embeddings.
|
||||||
|
""")
|
||||||
|
|
||||||
|
|
||||||
|
# ══════════════════════════════════════════════════════════════════════════
|
||||||
|
# PART 2: FROM TOKENS TO EMBEDDING VECTORS
|
||||||
|
# ══════════════════════════════════════════════════════════════════════════
|
||||||
|
print("=" * 70)
|
||||||
|
print("PART 2: FROM TOKENS TO EMBEDDING VECTORS")
|
||||||
|
print("=" * 70)
|
||||||
|
print("""
|
||||||
|
Each token ID is looked up in an EMBEDDING TABLE — a large matrix where
|
||||||
|
each row is a dense vector (768 dimensions in this model, up to 4096 in
|
||||||
|
large LLMs). The transformer then refines these vectors through 12 layers
|
||||||
|
of self-attention, producing contextual embeddings where each token's
|
||||||
|
vector depends on ALL surrounding tokens.
|
||||||
|
""")
|
||||||
|
|
||||||
|
example_sentence = "Der Student lernt Programmieren an der Fachhochschule"
|
||||||
|
|
||||||
|
inputs = tokenizer(example_sentence, return_tensors="pt").to(DEVICE)
|
||||||
|
token_ids = inputs["input_ids"].squeeze().tolist()
|
||||||
|
tokens = tokenizer.convert_ids_to_tokens(token_ids)
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
outputs = model(**inputs)
|
||||||
|
|
||||||
|
# outputs.last_hidden_state: shape [1, num_tokens, 768]
|
||||||
|
hidden_states = outputs.last_hidden_state.squeeze(0)
|
||||||
|
|
||||||
|
print(f'Sentence: "{example_sentence}"\n')
|
||||||
|
print(f"{'Pos':>4s} {'Token':<20s} {'ID':>7s} {'Vector (first 8 of 768 dims)...'}")
|
||||||
|
print("-" * 80)
|
||||||
|
|
||||||
|
for i, (tok, tid) in enumerate(zip(tokens, token_ids)):
|
||||||
|
vec = hidden_states[i].cpu().numpy()
|
||||||
|
vec_preview = " ".join(f"{v:+.3f}" for v in vec[:8])
|
||||||
|
print(f"{i:4d} {tok:<20s} {tid:7d} [{vec_preview} ...]")
|
||||||
|
|
||||||
|
print(f"""
|
||||||
|
KEY OBSERVATIONS:
|
||||||
|
• Each token becomes a vector of {hidden_states.shape[1]} numbers.
|
||||||
|
• These numbers are NOT random — they encode the token's meaning
|
||||||
|
IN CONTEXT. The vector for "Fachhochschule" here is different from
|
||||||
|
the vector for "Fachhochschule" in a different sentence.
|
||||||
|
• The full sentence has {len(tokens)} tokens, producing a matrix of
|
||||||
|
shape [{len(tokens)} × {hidden_states.shape[1]}].
|
||||||
|
• To get a single vector for the whole sentence, we average all
|
||||||
|
token vectors (mean pooling).
|
||||||
|
""")
|
||||||
|
|
||||||
|
|
||||||
|
# ══════════════════════════════════════════════════════════════════════════
|
||||||
|
# PART 3: MEASURING SIMILARITY BETWEEN WORDS
|
||||||
|
# ══════════════════════════════════════════════════════════════════════════
|
||||||
|
print("=" * 70)
|
||||||
|
print("PART 3: WORD AND PHRASE SIMILARITY")
|
||||||
|
print("=" * 70)
|
||||||
|
print("""
|
||||||
|
If embeddings capture meaning, then SIMILAR words should have SIMILAR
|
||||||
|
vectors (high cosine similarity) and DIFFERENT words should have
|
||||||
|
DIFFERENT vectors (low cosine similarity). Let's test this with German.
|
||||||
|
""")
|
||||||
|
|
||||||
|
|
||||||
|
def embed_text(text: str) -> torch.Tensor:
    """Embed a word or phrase into a single normalized vector."""
    encoded = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=128, padding=True
    ).to(DEVICE)
    with torch.no_grad():
        model_output = model(**encoded)
    # Mean-pool token vectors, weighting by the attention mask so that
    # padding positions contribute nothing to the sentence vector.
    attention = encoded["attention_mask"].unsqueeze(-1)
    summed = (model_output.last_hidden_state * attention).sum(dim=1)
    pooled = summed / attention.sum(dim=1)
    # L2-normalize so a dot product equals cosine similarity.
    return F.normalize(pooled, p=2, dim=1).squeeze(0)
|
||||||
|
|
||||||
|
|
||||||
|
# ── 3a: Single word similarities ─────────────────────────────────────────
|
||||||
|
print("─── 3a: Single Word Similarities ───\n")
|
||||||
|
|
||||||
|
word_pairs = [
|
||||||
|
# Semantically SIMILAR pairs (synonyms or near-synonyms)
|
||||||
|
("Auto", "Fahrzeug"), # car / vehicle — near-synonyms
|
||||||
|
("Arzt", "Doktor"), # physician / doctor — synonyms
|
||||||
|
("Programmierer", "Entwickler"), # programmer / developer
|
||||||
|
("schnell", "rasch"), # fast / swift — synonyms
|
||||||
|
("Haus", "Gebäude"), # house / building — closely related
|
||||||
|
|
||||||
|
# SAME CATEGORY but different concepts
|
||||||
|
("Hund", "Katze"), # dog / cat — both pets, but different!
|
||||||
|
("Montag", "Freitag"), # Monday / Friday — both weekdays
|
||||||
|
|
||||||
|
# Semantically UNRELATED pairs
|
||||||
|
("Hund", "Mathematik"), # dog vs math
|
||||||
|
("Auto", "Philosophie"), # car vs philosophy
|
||||||
|
("schnell", "Datenbank"), # fast vs database
|
||||||
|
]
|
||||||
|
|
||||||
|
print(f"{'Word A':<20s} {'Word B':<20s} {'Cosine Sim':>10s} {'Relationship'}")
|
||||||
|
print("-" * 75)
|
||||||
|
|
||||||
|
for w1, w2 in word_pairs:
|
||||||
|
v1, v2 = embed_text(w1), embed_text(w2)
|
||||||
|
sim = torch.dot(v1.cpu(), v2.cpu()).item()
|
||||||
|
if sim > 0.6:
|
||||||
|
rel = "synonyms/close"
|
||||||
|
elif sim > 0.3:
|
||||||
|
rel = "related"
|
||||||
|
else:
|
||||||
|
rel = "unrelated"
|
||||||
|
bar = "█" * int(max(0, sim) * 30)
|
||||||
|
print(f"{w1:<20s} {w2:<20s} {sim:10.3f} {bar} ({rel})")
|
||||||
|
|
||||||
|
print("""
|
||||||
|
KEY OBSERVATIONS:
|
||||||
|
→ Synonyms (Auto/Fahrzeug, Arzt/Doktor) have HIGHEST similarity.
|
||||||
|
→ Same-category but different concepts (Hund/Katze) have MODERATE
|
||||||
|
similarity — they share context (both are pets) but a dog is NOT
|
||||||
|
a cat. The model captures this nuance!
|
||||||
|
→ Completely unrelated words (Hund/Mathematik) have LOW similarity.
|
||||||
|
→ Embedding similarity reflects MEANING OVERLAP, not just category.
|
||||||
|
""")
|
||||||
|
|
||||||
|
# ── 3b: Phrase/sentence similarities ─────────────────────────────────────
|
||||||
|
print("─── 3b: Phrase and Sentence Similarities ───\n")
|
||||||
|
|
||||||
|
phrases = {
|
||||||
|
"ML_de": "Maschinelles Lernen ist ein Teilgebiet der Informatik",
|
||||||
|
"ML_en": "Machine learning is a subfield of computer science",
|
||||||
|
"DL_de": "Deep Learning verwendet neuronale Netze mit vielen Schichten",
|
||||||
|
"Koch": "Der Koch bereitet das Abendessen in der Küche vor",
|
||||||
|
"Wetter": "Morgen wird es regnen und kalt sein",
|
||||||
|
"Prog": "Python ist eine beliebte Programmiersprache",
|
||||||
|
}
|
||||||
|
|
||||||
|
phrase_embeddings = {name: embed_text(text) for name, text in phrases.items()}
|
||||||
|
|
||||||
|
names = list(phrases.keys())
|
||||||
|
print(f"{'':>10s}", end="")
|
||||||
|
for n in names:
|
||||||
|
print(f"{n:>10s}", end="")
|
||||||
|
print()
|
||||||
|
|
||||||
|
for n1 in names:
|
||||||
|
print(f"{n1:>10s}", end="")
|
||||||
|
for n2 in names:
|
||||||
|
sim = torch.dot(phrase_embeddings[n1].cpu(),
|
||||||
|
phrase_embeddings[n2].cpu()).item()
|
||||||
|
print(f"{sim:10.3f}", end="")
|
||||||
|
print()
|
||||||
|
|
||||||
|
print("""
|
||||||
|
KEY OBSERVATIONS:
|
||||||
|
• "Maschinelles Lernen..." (German) and "Machine learning..." (English)
|
||||||
|
should have HIGH similarity — the model understands both languages
|
||||||
|
and maps equivalent meanings to nearby vectors.
|
||||||
|
• ML and Deep Learning sentences should be moderately similar (related
|
||||||
|
topics in computer science).
|
||||||
|
• The cooking sentence and weather sentence should be DISSIMILAR to
|
||||||
|
the tech sentences — completely different topics.
|
||||||
|
• This CROSS-LINGUAL capability is what makes multilingual embeddings
|
||||||
|
so powerful.
|
||||||
|
""")
|
||||||
|
|
||||||
|
|
||||||
|
# ══════════════════════════════════════════════════════════════════════════
|
||||||
|
# PART 4: VISUALIZING THE EMBEDDING SPACE
|
||||||
|
# ══════════════════════════════════════════════════════════════════════════
|
||||||
|
print("=" * 70)
|
||||||
|
print("PART 4: VISUALIZING THE EMBEDDING SPACE")
|
||||||
|
print("=" * 70)
|
||||||
|
print("""
|
||||||
|
768 dimensions are impossible to visualize. We use PCA to project the
|
||||||
|
vectors down to 2D while preserving as much structure as possible.
|
||||||
|
If the embeddings truly capture meaning, we should see CLUSTERS of
|
||||||
|
related words in the 2D plot.
|
||||||
|
""")
|
||||||
|
|
||||||
|
# Groups of German words organized by semantic category
|
||||||
|
word_groups = {
|
||||||
|
"Tiere": ["Hund", "Katze", "Pferd", "Vogel", "Fisch", "Kuh"],
|
||||||
|
"Technik": ["Computer", "Software", "Programmieren", "Datenbank",
|
||||||
|
"Algorithmus", "Internet"],
|
||||||
|
"Essen": ["Brot", "Käse", "Apfel", "Suppe", "Kuchen", "Wurst"],
|
||||||
|
"Natur": ["Berg", "Fluss", "Wald", "See", "Wiese", "Schnee"],
|
||||||
|
"Berufe": ["Arzt", "Lehrer", "Ingenieur", "Koch", "Pilot", "Anwalt"],
|
||||||
|
}
|
||||||
|
|
||||||
|
all_words = []
|
||||||
|
all_categories = []
|
||||||
|
all_vectors = []
|
||||||
|
|
||||||
|
print("Computing embeddings for word groups...")
|
||||||
|
for category, words in word_groups.items():
|
||||||
|
for word in words:
|
||||||
|
vec = embed_text(word).cpu().numpy()
|
||||||
|
all_words.append(word)
|
||||||
|
all_categories.append(category)
|
||||||
|
all_vectors.append(vec)
|
||||||
|
print(f" {category}: {', '.join(words)}")
|
||||||
|
|
||||||
|
X = np.stack(all_vectors)
|
||||||
|
print(f"\nEmbedding matrix: {X.shape[0]} words × {X.shape[1]} dimensions")
|
||||||
|
|
||||||
|
# ── PCA to 2D ────────────────────────────────────────────────────────────
|
||||||
|
pca = PCA(n_components=2)
|
||||||
|
X_2d = pca.fit_transform(X)
|
||||||
|
|
||||||
|
# ── Plot ──────────────────────────────────────────────────────────────────
|
||||||
|
category_names = list(word_groups.keys())
|
||||||
|
cmap = plt.cm.Set1
|
||||||
|
colors = {cat: cmap(i / len(category_names)) for i, cat in enumerate(category_names)}
|
||||||
|
|
||||||
|
fig, ax = plt.subplots(figsize=(12, 9))
|
||||||
|
|
||||||
|
for i, (word, cat) in enumerate(zip(all_words, all_categories)):
|
||||||
|
x, y = X_2d[i]
|
||||||
|
ax.scatter(x, y, c=[colors[cat]], s=120, edgecolors="black",
|
||||||
|
linewidth=0.5, zorder=3)
|
||||||
|
ax.annotate(word, (x, y), fontsize=9, ha="center", va="bottom",
|
||||||
|
xytext=(0, 7), textcoords="offset points",
|
||||||
|
fontweight="bold")
|
||||||
|
|
||||||
|
for cat in category_names:
|
||||||
|
ax.scatter([], [], c=[colors[cat]], s=100, label=cat,
|
||||||
|
edgecolors="black", linewidth=0.5)
|
||||||
|
|
||||||
|
ax.legend(loc="best", fontsize=11, title="Kategorie", title_fontsize=12,
|
||||||
|
framealpha=0.9)
|
||||||
|
|
||||||
|
var = pca.explained_variance_ratio_
|
||||||
|
ax.set_title(
|
||||||
|
"Deutsche Wörter im Embedding-Raum (768D → 2D via PCA)\n"
|
||||||
|
f"PC1: {var[0]:.1%} Varianz, PC2: {var[1]:.1%} Varianz",
|
||||||
|
fontsize=14, fontweight="bold"
|
||||||
|
)
|
||||||
|
ax.set_xlabel("Hauptkomponente 1 (PC1)", fontsize=12)
|
||||||
|
ax.set_ylabel("Hauptkomponente 2 (PC2)", fontsize=12)
|
||||||
|
ax.grid(True, alpha=0.3)
|
||||||
|
fig.tight_layout()
fig.savefig("embedding_space_german.png", dpi=150)
# Fix: the message had an f-prefix with no placeholders (Ruff F541);
# a plain string is the correct form.
print("\nSaved: embedding_space_german.png")
|
||||||
|
|
||||||
|
# ── Second plot: Phrases including cross-lingual ──────────────────────────
|
||||||
|
print("\nComputing phrase embeddings for visualization...")
|
||||||
|
|
||||||
|
viz_phrases = {
|
||||||
|
# German CS phrases
|
||||||
|
"Maschinelles Lernen": "Technik (DE)",
|
||||||
|
"Neuronale Netze": "Technik (DE)",
|
||||||
|
"Softwareentwicklung": "Technik (DE)",
|
||||||
|
"Künstliche Intelligenz": "Technik (DE)",
|
||||||
|
# English equivalents
|
||||||
|
"Machine Learning": "Technik (EN)",
|
||||||
|
"Neural Networks": "Technik (EN)",
|
||||||
|
"Software Development": "Technik (EN)",
|
||||||
|
"Artificial Intelligence": "Technik (EN)",
|
||||||
|
# German everyday phrases
|
||||||
|
"Guten Morgen": "Alltag (DE)",
|
||||||
|
"Wie geht es Ihnen": "Alltag (DE)",
|
||||||
|
"Das Wetter ist schön": "Alltag (DE)",
|
||||||
|
"Ich gehe einkaufen": "Alltag (DE)",
|
||||||
|
# English everyday phrases
|
||||||
|
"Good morning": "Alltag (EN)",
|
||||||
|
"How are you": "Alltag (EN)",
|
||||||
|
"The weather is nice": "Alltag (EN)",
|
||||||
|
"I am going shopping": "Alltag (EN)",
|
||||||
|
}
|
||||||
|
|
||||||
|
phrase_labels = list(viz_phrases.keys())
|
||||||
|
phrase_cats = list(viz_phrases.values())
|
||||||
|
phrase_vecs = np.stack([embed_text(p).cpu().numpy() for p in phrase_labels])
|
||||||
|
|
||||||
|
pca2 = PCA(n_components=2)
|
||||||
|
P_2d = pca2.fit_transform(phrase_vecs)
|
||||||
|
|
||||||
|
cat_colors = {
|
||||||
|
"Technik (DE)": "#1f77b4",
|
||||||
|
"Technik (EN)": "#aec7e8",
|
||||||
|
"Alltag (DE)": "#d62728",
|
||||||
|
"Alltag (EN)": "#ff9896",
|
||||||
|
}
|
||||||
|
|
||||||
|
fig2, ax2 = plt.subplots(figsize=(12, 9))
|
||||||
|
|
||||||
|
for i, (label, cat) in enumerate(zip(phrase_labels, phrase_cats)):
|
||||||
|
x, y = P_2d[i]
|
||||||
|
marker = "o" if "(DE)" in cat else "s" # circle=German, square=English
|
||||||
|
ax2.scatter(x, y, c=cat_colors[cat], s=140, marker=marker,
|
||||||
|
edgecolors="black", linewidth=0.5, zorder=3)
|
||||||
|
ax2.annotate(label, (x, y), fontsize=8, ha="center", va="bottom",
|
||||||
|
xytext=(0, 8), textcoords="offset points")
|
||||||
|
|
||||||
|
for cat, color in cat_colors.items():
|
||||||
|
marker = "o" if "(DE)" in cat else "s"
|
||||||
|
ax2.scatter([], [], c=color, s=100, marker=marker, label=cat,
|
||||||
|
edgecolors="black", linewidth=0.5)
|
||||||
|
|
||||||
|
ax2.legend(loc="best", fontsize=10, title="Kategorie & Sprache",
|
||||||
|
title_fontsize=11, framealpha=0.9)
|
||||||
|
|
||||||
|
var2 = pca2.explained_variance_ratio_
|
||||||
|
ax2.set_title(
|
||||||
|
"Cross-lingual Embeddings: Deutsche & Englische Phrasen\n"
|
||||||
|
f"PC1: {var2[0]:.1%} Varianz, PC2: {var2[1]:.1%} Varianz",
|
||||||
|
fontsize=14, fontweight="bold"
|
||||||
|
)
|
||||||
|
ax2.set_xlabel("Hauptkomponente 1 (PC1)", fontsize=12)
|
||||||
|
ax2.set_ylabel("Hauptkomponente 2 (PC2)", fontsize=12)
|
||||||
|
ax2.grid(True, alpha=0.3)
|
||||||
|
fig2.tight_layout()
fig2.savefig("embedding_space_crosslingual.png", dpi=150)
# Fix: the message had an f-prefix with no placeholders (Ruff F541);
# a plain string is the correct form.
print("Saved: embedding_space_crosslingual.png")
|
||||||
|
|
||||||
|
print(f"""
|
||||||
|
{'=' * 70}
|
||||||
|
SUMMARY: THE FULL PIPELINE
|
||||||
|
{'=' * 70}
|
||||||
|
|
||||||
|
Text → Tokens → Token IDs → Embeddings
|
||||||
|
"Fachhochschule" [▁Fach, ho, [28356, 497, [0.012, -0.34,
|
||||||
|
ch, schule] 206, 72460] 0.88, ...]
|
||||||
|
(768 dimensions)
|
||||||
|
|
||||||
|
1. TOKENIZATION splits text into statistical sub-word pieces.
|
||||||
|
→ Splits are based on frequency, not German morphology.
|
||||||
|
→ Each token maps to an integer ID from the vocabulary.
|
||||||
|
|
||||||
|
2. EMBEDDING VECTORS are 768-dimensional representations of meaning.
|
||||||
|
→ Computed by the transformer's 12 layers of self-attention.
|
||||||
|
→ Similar meanings → nearby vectors (high cosine similarity).
|
||||||
|
→ Different meanings → distant vectors (low cosine similarity).
|
||||||
|
|
||||||
|
3. COSINE SIMILARITY measures how "aligned" two vectors are.
|
||||||
|
→ 1.0 = identical meaning, 0.0 = unrelated, -1.0 = opposite.
|
||||||
|
|
||||||
|
4. CROSS-LINGUAL EMBEDDINGS map equivalent phrases in different
|
||||||
|
languages to nearby vectors. "Maschinelles Lernen" ≈ "Machine
|
||||||
|
Learning" in embedding space.
|
||||||
|
|
||||||
|
5. The SAME PRINCIPLES apply to CODE EMBEDDINGS (next examples):
|
||||||
|
→ Code is tokenized into sub-word pieces
|
||||||
|
→ A transformer produces embedding vectors
|
||||||
|
→ Similar code has similar vectors
|
||||||
|
→ This enables semantic code search, clone detection, and RAG
|
||||||
|
|
||||||
|
Check the two PNG files for visual confirmation:
|
||||||
|
• embedding_space_german.png — German word clusters
|
||||||
|
• embedding_space_crosslingual.png — DE/EN phrase alignment
|
||||||
|
""")
|
||||||
231
Code embeddings/01_basic_embeddings.py
Normal file
@ -0,0 +1,231 @@
|
|||||||
|
"""
|
||||||
|
============================================================================
|
||||||
|
Example 1: Computing Code Embeddings and Measuring Similarity
|
||||||
|
============================================================================
|
||||||
|
AISE501 – AI in Software Engineering I
|
||||||
|
Fachhochschule Graubünden
|
||||||
|
|
||||||
|
GOAL:
|
||||||
|
Load a pre-trained code embedding model, embed several code snippets,
|
||||||
|
and compute pairwise cosine similarities to see which snippets the
|
||||||
|
model considers semantically similar.
|
||||||
|
|
||||||
|
WHAT YOU WILL LEARN:
|
||||||
|
- How to load a code embedding model with PyTorch
|
||||||
|
- How code is tokenized and converted to vectors
|
||||||
|
- How cosine similarity reveals semantic relationships
|
||||||
|
- That similar functionality → high similarity, different purpose → low
|
||||||
|
|
||||||
|
HARDWARE:
|
||||||
|
Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac).
|
||||||
|
============================================================================
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from transformers import AutoTokenizer, AutoModel
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
# ── Device selection ──────────────────────────────────────────────────────
|
||||||
|
# PyTorch supports three backends:
|
||||||
|
# - "cuda" → NVIDIA GPUs (Linux/Windows)
|
||||||
|
# - "mps" → Apple Silicon GPUs (macOS M1/M2/M3/M4)
|
||||||
|
# - "cpu" → always available, slower
|
||||||
|
def get_device():
    """Return the preferred torch device (CUDA > MPS > CPU)."""
    if torch.cuda.is_available():
        chosen = "cuda"
    elif torch.backends.mps.is_available():
        chosen = "mps"
    else:
        chosen = "cpu"
    return torch.device(chosen)
|
||||||
|
|
||||||
|
DEVICE = get_device()
|
||||||
|
print(f"Using device: {DEVICE}\n")
|
||||||
|
|
||||||
|
# ── Load model and tokenizer ─────────────────────────────────────────────
|
||||||
|
# We use st-codesearch-distilroberta-base — a DistilRoBERTa model (82M params)
|
||||||
|
# specifically fine-tuned on 1.38M code-comment pairs from CodeSearchNet using
|
||||||
|
# contrastive learning. It produces 768-dim embeddings optimized for matching
|
||||||
|
# natural language descriptions to code, making it ideal for code search and
|
||||||
|
# similarity tasks.
|
||||||
|
MODEL_NAME = "flax-sentence-embeddings/st-codesearch-distilroberta-base"
|
||||||
|
|
||||||
|
print(f"Loading model: {MODEL_NAME} ...")
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||||
|
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
|
||||||
|
model.eval() # disable dropout — we want deterministic embeddings
|
||||||
|
print("Model loaded.\n")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Define code snippets to compare ──────────────────────────────────────
|
||||||
|
# We intentionally include:
|
||||||
|
# - Two sorting functions (similar purpose, different implementation)
|
||||||
|
# - A function that does something completely different (JSON parsing)
|
||||||
|
# - A sorting function in a different style (list comprehension)
|
||||||
|
snippets = {
|
||||||
|
"bubble_sort": """
|
||||||
|
def bubble_sort(arr):
|
||||||
|
n = len(arr)
|
||||||
|
for i in range(n):
|
||||||
|
for j in range(0, n - i - 1):
|
||||||
|
if arr[j] > arr[j + 1]:
|
||||||
|
arr[j], arr[j + 1] = arr[j + 1], arr[j]
|
||||||
|
return arr
|
||||||
|
""",
|
||||||
|
"quick_sort": """
|
||||||
|
def quick_sort(arr):
|
||||||
|
if len(arr) <= 1:
|
||||||
|
return arr
|
||||||
|
pivot = arr[len(arr) // 2]
|
||||||
|
left = [x for x in arr if x < pivot]
|
||||||
|
middle = [x for x in arr if x == pivot]
|
||||||
|
right = [x for x in arr if x > pivot]
|
||||||
|
return quick_sort(left) + middle + quick_sort(right)
|
||||||
|
""",
|
||||||
|
"sorted_builtin": """
|
||||||
|
def sort_list(data):
|
||||||
|
return sorted(data)
|
||||||
|
""",
|
||||||
|
"parse_json": """
|
||||||
|
import json
|
||||||
|
|
||||||
|
def parse_config(filepath):
|
||||||
|
with open(filepath, 'r') as f:
|
||||||
|
config = json.load(f)
|
||||||
|
return config
|
||||||
|
""",
|
||||||
|
"read_csv": """
|
||||||
|
import csv
|
||||||
|
|
||||||
|
def read_csv_file(filepath):
|
||||||
|
rows = []
|
||||||
|
with open(filepath, 'r') as f:
|
||||||
|
reader = csv.reader(f)
|
||||||
|
for row in reader:
|
||||||
|
rows.append(row)
|
||||||
|
return rows
|
||||||
|
""",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def embed_code(code_text: str) -> torch.Tensor:
    """
    Map a code snippet to a single 768-dimensional unit vector.

    Pipeline (as covered in the lecture):
        raw code → token IDs → contextual token embeddings
                 → mean-pooled snippet vector → L2-normalized unit vector

    The transformer emits one contextual vector per token; mean pooling
    collapses them into one snippet-level vector, and L2 normalization puts
    it on the unit hypersphere so cosine similarity reduces to a plain dot
    product — the same convention production embedding systems use.

    Returns:
        torch.Tensor of shape [768] with unit L2 norm.
    """
    # Tokenization: sub-word token IDs plus an attention mask (1 = real
    # token, 0 = padding). Truncation at 512 tokens matches the context
    # length this model was trained with; longer input would be
    # out-of-distribution.
    encoded = tokenizer(
        code_text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    ).to(DEVICE)

    # Forward pass. We only do inference, so gradient tracking is disabled
    # to save memory and time. The output holds one contextual 768-dim
    # vector per token: last_hidden_state has shape [1, seq_len, 768].
    with torch.no_grad():
        model_out = model(**encoded)

    # Mean pooling over REAL tokens only: zero out the padding positions
    # via the mask, sum the survivors, then divide by the count of real
    # tokens (not the padded length) so padding cannot dilute the average.
    mask = encoded["attention_mask"].unsqueeze(-1)            # [1, seq_len, 1]
    summed = (model_out.last_hidden_state * mask).sum(dim=1)  # [1, 768]
    pooled = summed / mask.sum(dim=1)

    # L2-normalize: with ‖a‖ = ‖b‖ = 1, cos(θ) = a · b, so downstream
    # comparisons become cheap dot products.
    unit = F.normalize(pooled, p=2, dim=1)

    return unit.squeeze(0)  # drop the batch dimension → shape [768]
|
||||||
|
|
||||||
|
|
||||||
|
# ── Compute embeddings for all snippets ───────────────────────────────────
|
||||||
|
print("Computing embeddings...")
embeddings = {}  # snippet name → 768-dim unit vector
for name, code in snippets.items():
    embeddings[name] = embed_code(code)
    # NOTE(review): tokenizer.encode presumably includes special tokens in
    # this count — confirm against the tokenizer's defaults.
    num_tokens = len(tokenizer.encode(code))
    print(f" {name:20s} → {num_tokens:3d} tokens → vector of dim {embeddings[name].shape[0]}")

print()
|
||||||
|
|
||||||
|
# ── Compute pairwise cosine similarities ──────────────────────────────────
|
||||||
|
# cosine_similarity = dot product of unit vectors (we already normalized above)
|
||||||
|
names = list(embeddings.keys())

# Header row: one right-aligned column per snippet name.
print("Pairwise Cosine Similarities:")
print(f"{'':22s}", end="")
for n in names:
    print(f"{n:>16s}", end="")
print()

# One row per snippet. Vectors were L2-normalized above, so the dot
# product IS the cosine similarity. (The original loops bound enumerate
# indices that were never used — iterate the names directly.)
for n1 in names:
    print(f"{n1:22s}", end="")
    for n2 in names:
        sim = torch.dot(embeddings[n1].cpu(), embeddings[n2].cpu()).item()
        print(f"{sim:16.3f}", end="")
    print()
|
||||||
|
|
||||||
|
# ── Interpretation ────────────────────────────────────────────────────────
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("INTERPRETATION:")
|
||||||
|
print("=" * 70)
|
||||||
|
print("""
|
||||||
|
- bubble_sort, quick_sort, and sorted_builtin should have HIGH similarity
|
||||||
|
(all perform sorting, despite very different implementations).
|
||||||
|
- parse_json and read_csv should be similar to each other (both read files)
|
||||||
|
but DISSIMILAR to the sorting functions (different purpose).
|
||||||
|
- This demonstrates that code embeddings capture WHAT code does,
|
||||||
|
not just HOW it looks syntactically.
|
||||||
|
""")
|
||||||
251
Code embeddings/02_text_to_code_search.py
Normal file
@ -0,0 +1,251 @@
|
|||||||
|
"""
|
||||||
|
============================================================================
|
||||||
|
Example 2: Text-to-Code Semantic Search
|
||||||
|
============================================================================
|
||||||
|
AISE501 – AI in Software Engineering I
|
||||||
|
Fachhochschule Graubünden
|
||||||
|
|
||||||
|
GOAL:
|
||||||
|
Build a mini code search engine: given a natural language query like
|
||||||
|
"sort a list", find the most relevant code snippet from a collection.
|
||||||
|
This is the core mechanism behind semantic code search in tools like
|
||||||
|
Cursor, GitHub Copilot, and code search engines.
|
||||||
|
|
||||||
|
WHAT YOU WILL LEARN:
|
||||||
|
- How the SAME embedding model maps both text and code into a shared
|
||||||
|
vector space — this is what makes text-to-code search possible.
|
||||||
|
- How to build a simple search index and query it.
|
||||||
|
- Why embedding-based search beats keyword search for code.
|
||||||
|
|
||||||
|
HARDWARE:
|
||||||
|
Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac).
|
||||||
|
============================================================================
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from transformers import AutoTokenizer, AutoModel
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
# ── Device selection ──────────────────────────────────────────────────────
|
||||||
|
def get_device():
    """Select the fastest available torch backend (CUDA → MPS → CPU)."""
    # Probe backends in preference order; fall back to CPU, which is
    # always available.
    probes = (
        ("cuda", torch.cuda.is_available),
        ("mps", torch.backends.mps.is_available),
    )
    for name, available in probes:
        if available():
            return torch.device(name)
    return torch.device("cpu")
|
||||||
|
|
||||||
|
DEVICE = get_device()
|
||||||
|
print(f"Using device: {DEVICE}\n")
|
||||||
|
|
||||||
|
# ── Load model ────────────────────────────────────────────────────────────
|
||||||
|
# Code-search encoder producing 768-dim embeddings for both text and code.
MODEL_NAME = "flax-sentence-embeddings/st-codesearch-distilroberta-base"
print(f"Loading model: {MODEL_NAME} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()  # inference only: disable dropout for deterministic embeddings
print("Model loaded.\n")
|
||||||
|
|
||||||
|
# ── Code "database" ──────────────────────────────────────────────────────
|
||||||
|
# Imagine these are functions in a large codebase that we want to search.
|
||||||
|
code_database = [
|
||||||
|
{
|
||||||
|
"name": "binary_search",
|
||||||
|
"code": """
|
||||||
|
def binary_search(arr, target):
|
||||||
|
low, high = 0, len(arr) - 1
|
||||||
|
while low <= high:
|
||||||
|
mid = (low + high) // 2
|
||||||
|
if arr[mid] == target:
|
||||||
|
return mid
|
||||||
|
elif arr[mid] < target:
|
||||||
|
low = mid + 1
|
||||||
|
else:
|
||||||
|
high = mid - 1
|
||||||
|
return -1
|
||||||
|
"""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "merge_sort",
|
||||||
|
"code": """
|
||||||
|
def merge_sort(arr):
|
||||||
|
if len(arr) <= 1:
|
||||||
|
return arr
|
||||||
|
mid = len(arr) // 2
|
||||||
|
left = merge_sort(arr[:mid])
|
||||||
|
right = merge_sort(arr[mid:])
|
||||||
|
return merge(left, right)
|
||||||
|
"""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "read_json_file",
|
||||||
|
"code": """
|
||||||
|
import json
|
||||||
|
def read_json_file(path):
|
||||||
|
with open(path, 'r') as f:
|
||||||
|
return json.load(f)
|
||||||
|
"""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "calculate_average",
|
||||||
|
"code": """
|
||||||
|
def calculate_average(numbers):
|
||||||
|
if not numbers:
|
||||||
|
return 0.0
|
||||||
|
return sum(numbers) / len(numbers)
|
||||||
|
"""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "connect_database",
|
||||||
|
"code": """
|
||||||
|
import sqlite3
|
||||||
|
def connect_database(db_path):
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
return conn, cursor
|
||||||
|
"""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "send_http_request",
|
||||||
|
"code": """
|
||||||
|
import requests
|
||||||
|
def send_http_request(url, method='GET', data=None):
|
||||||
|
if method == 'GET':
|
||||||
|
response = requests.get(url)
|
||||||
|
else:
|
||||||
|
response = requests.post(url, json=data)
|
||||||
|
return response.json()
|
||||||
|
"""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "flatten_nested_list",
|
||||||
|
"code": """
|
||||||
|
def flatten(nested_list):
|
||||||
|
result = []
|
||||||
|
for item in nested_list:
|
||||||
|
if isinstance(item, list):
|
||||||
|
result.extend(flatten(item))
|
||||||
|
else:
|
||||||
|
result.append(item)
|
||||||
|
return result
|
||||||
|
"""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "count_words",
|
||||||
|
"code": """
|
||||||
|
def count_words(text):
|
||||||
|
words = text.lower().split()
|
||||||
|
word_count = {}
|
||||||
|
for word in words:
|
||||||
|
word_count[word] = word_count.get(word, 0) + 1
|
||||||
|
return word_count
|
||||||
|
"""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "validate_email",
|
||||||
|
"code": """
|
||||||
|
import re
|
||||||
|
def validate_email(email):
|
||||||
|
pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$'
|
||||||
|
return bool(re.match(pattern, email))
|
||||||
|
"""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "fibonacci",
|
||||||
|
"code": """
|
||||||
|
def fibonacci(n):
|
||||||
|
if n <= 1:
|
||||||
|
return n
|
||||||
|
a, b = 0, 1
|
||||||
|
for _ in range(2, n + 1):
|
||||||
|
a, b = b, a + b
|
||||||
|
return b
|
||||||
|
"""
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def embed_text(text: str) -> torch.Tensor:
    """Encode text or code as a 768-dim, L2-normalized embedding vector.

    The same model handles natural-language queries and code snippets,
    which is what places both in a shared vector space for search.
    """
    # Tokenize (truncate to the model's 512-token context) and move to device.
    batch = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    ).to(DEVICE)

    # Inference-only forward pass; keep the per-token contextual vectors.
    with torch.no_grad():
        hidden = model(**batch).last_hidden_state

    # Mean-pool over real (non-padding) tokens using the attention mask.
    keep = batch["attention_mask"].unsqueeze(-1)
    pooled = (hidden * keep).sum(dim=1) / keep.sum(dim=1)

    # Unit-normalize (dot product == cosine similarity), drop the batch dim.
    return F.normalize(pooled, p=2, dim=1).squeeze(0)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Step 1: Index the code database ───────────────────────────────────────
|
||||||
|
# In a real system this would be stored in a vector database (ChromaDB,
|
||||||
|
# Pinecone, pgvector). Here we keep it simple with a list of tensors.
|
||||||
|
print("Indexing code database...")
code_vectors = []
for entry in code_database:
    # One 768-dim unit vector per snippet, in database order (index i of
    # code_vectors corresponds to code_database[i]).
    vec = embed_text(entry["code"])
    code_vectors.append(vec)
    print(f" Indexed: {entry['name']}")

# Stack into a matrix: shape [num_snippets, embedding_dim]
code_matrix = torch.stack(code_vectors)
print(f"\nIndex built: {code_matrix.shape[0]} snippets, {code_matrix.shape[1]} dimensions\n")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Step 2: Search with natural language queries ──────────────────────────
|
||||||
|
queries = [
|
||||||
|
"sort a list of numbers",
|
||||||
|
"find an element in a sorted array",
|
||||||
|
"compute the mean of a list",
|
||||||
|
"make an HTTP API call",
|
||||||
|
"open and read a JSON file",
|
||||||
|
"check if an email address is valid",
|
||||||
|
"count word frequencies in a string",
|
||||||
|
"generate fibonacci numbers",
|
||||||
|
"connect to a SQL database",
|
||||||
|
"flatten a nested list into a single list",
|
||||||
|
]
|
||||||
|
|
||||||
|
print("=" * 70)
|
||||||
|
print("SEMANTIC CODE SEARCH RESULTS")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
for query in queries:
|
||||||
|
# Embed the natural language query with the SAME model
|
||||||
|
query_vec = embed_text(query)
|
||||||
|
|
||||||
|
# Compute cosine similarity against all code embeddings
|
||||||
|
# Because vectors are normalized, dot product = cosine similarity
|
||||||
|
similarities = torch.mv(code_matrix.cpu(), query_vec.cpu())
|
||||||
|
|
||||||
|
# Rank results by similarity (highest first)
|
||||||
|
ranked_indices = torch.argsort(similarities, descending=True)
|
||||||
|
|
||||||
|
print(f'\nQuery: "{query}"')
|
||||||
|
print(f" Rank Score Function")
|
||||||
|
print(f" ---- ----- --------")
|
||||||
|
for rank, idx in enumerate(ranked_indices[:3]): # show top 3
|
||||||
|
score = similarities[idx].item()
|
||||||
|
name = code_database[idx]["name"]
|
||||||
|
marker = " ← best match" if rank == 0 else ""
|
||||||
|
print(f" {rank+1:4d} {score:.3f} {name}{marker}")
|
||||||
|
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("KEY OBSERVATIONS:")
|
||||||
|
print("=" * 70)
|
||||||
|
print("""
|
||||||
|
1. The model maps NATURAL LANGUAGE queries and CODE into the same vector
|
||||||
|
space. This is why "sort a list" finds merge_sort and "find an element
|
||||||
|
in a sorted array" finds binary_search — even though the queries
|
||||||
|
contain none of the function identifiers.
|
||||||
|
|
||||||
|
2. This is fundamentally different from grep/keyword search:
|
||||||
|
- grep "sort" would miss functions named "order" or "arrange"
|
||||||
|
- grep "find element" would miss "binary_search"
|
||||||
|
Embeddings understand MEANING, not just string matching.
|
||||||
|
|
||||||
|
3. This is exactly how Cursor, Copilot, and other AI coding tools
|
||||||
|
retrieve relevant code from your project to feed into the LLM.
|
||||||
|
""")
|
||||||
199
Code embeddings/03_cross_language.py
Normal file
@ -0,0 +1,199 @@
|
|||||||
|
"""
|
||||||
|
============================================================================
|
||||||
|
Example 3: Cross-Language Code Similarity
|
||||||
|
============================================================================
|
||||||
|
AISE501 – AI in Software Engineering I
|
||||||
|
Fachhochschule Graubünden
|
||||||
|
|
||||||
|
GOAL:
|
||||||
|
Demonstrate that code embeddings capture FUNCTIONALITY, not syntax.
|
||||||
|
The same algorithm written in Python, JavaScript, Java, and C++
|
||||||
|
should produce similar embedding vectors — even though the surface
|
||||||
|
syntax is completely different.
|
||||||
|
|
||||||
|
WHAT YOU WILL LEARN:
|
||||||
|
- Code embedding models create a language-agnostic semantic space.
|
||||||
|
- Functionally equivalent code clusters together regardless of language.
|
||||||
|
- This enables cross-language code search (e.g., find the Java
|
||||||
|
equivalent of a Python function).
|
||||||
|
|
||||||
|
HARDWARE:
|
||||||
|
Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac).
|
||||||
|
============================================================================
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from transformers import AutoTokenizer, AutoModel
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
# ── Device selection ──────────────────────────────────────────────────────
|
||||||
|
def get_device():
    """Return the preferred torch.device: CUDA, else Apple MPS, else CPU."""
    kind = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )
    return torch.device(kind)
|
||||||
|
|
||||||
|
DEVICE = get_device()
|
||||||
|
print(f"Using device: {DEVICE}\n")
|
||||||
|
|
||||||
|
# ── Load model ────────────────────────────────────────────────────────────
|
||||||
|
MODEL_NAME = "flax-sentence-embeddings/st-codesearch-distilroberta-base"
|
||||||
|
print(f"Loading model: {MODEL_NAME} ...")
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||||
|
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
|
||||||
|
model.eval()
|
||||||
|
print("Model loaded.\n")
|
||||||
|
|
||||||
|
# ── Same algorithm in four languages ──────────────────────────────────────
|
||||||
|
# Task A: Factorial — a simple recursive/iterative computation
|
||||||
|
# Task B: Reverse a string
|
||||||
|
# If embeddings are truly semantic, Task A functions should cluster together
|
||||||
|
# and Task B functions should cluster together, regardless of language.
|
||||||
|
|
||||||
|
code_snippets = {
|
||||||
|
# ── Task A: Factorial ──
|
||||||
|
"factorial_python": """
|
||||||
|
def factorial(n):
|
||||||
|
result = 1
|
||||||
|
for i in range(2, n + 1):
|
||||||
|
result *= i
|
||||||
|
return result
|
||||||
|
""",
|
||||||
|
"factorial_javascript": """
|
||||||
|
function factorial(n) {
|
||||||
|
let result = 1;
|
||||||
|
for (let i = 2; i <= n; i++) {
|
||||||
|
result *= i;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
""",
|
||||||
|
"factorial_java": """
|
||||||
|
public static int factorial(int n) {
|
||||||
|
int result = 1;
|
||||||
|
for (int i = 2; i <= n; i++) {
|
||||||
|
result *= i;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
""",
|
||||||
|
"factorial_cpp": """
|
||||||
|
int factorial(int n) {
|
||||||
|
int result = 1;
|
||||||
|
for (int i = 2; i <= n; i++) {
|
||||||
|
result *= i;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
""",
|
||||||
|
|
||||||
|
# ── Task B: Reverse a string ──
|
||||||
|
"reverse_python": """
|
||||||
|
def reverse_string(s):
|
||||||
|
return s[::-1]
|
||||||
|
""",
|
||||||
|
"reverse_javascript": """
|
||||||
|
function reverseString(s) {
|
||||||
|
return s.split('').reverse().join('');
|
||||||
|
}
|
||||||
|
""",
|
||||||
|
"reverse_java": """
|
||||||
|
public static String reverseString(String s) {
|
||||||
|
return new StringBuilder(s).reverse().toString();
|
||||||
|
}
|
||||||
|
""",
|
||||||
|
"reverse_cpp": """
|
||||||
|
std::string reverseString(std::string s) {
|
||||||
|
std::reverse(s.begin(), s.end());
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
""",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def embed_code(code: str) -> torch.Tensor:
    """Return a 768-dim, L2-normalized embedding vector for *code*."""
    # Sub-word tokenization, capped at the model's 512-token context window.
    tokens = tokenizer(
        code, return_tensors="pt", truncation=True, max_length=512, padding=True
    ).to(DEVICE)

    # Forward pass without gradient bookkeeping — inference only.
    with torch.no_grad():
        token_states = model(**tokens).last_hidden_state

    # Average the contextual vectors of real tokens; the attention mask
    # zeroes out padding so it cannot skew the mean.
    valid = tokens["attention_mask"].unsqueeze(-1)
    mean_vec = (token_states * valid).sum(dim=1) / valid.sum(dim=1)

    # Project onto the unit hypersphere and drop the batch dimension.
    return F.normalize(mean_vec, p=2, dim=1).squeeze(0)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Compute all embeddings ────────────────────────────────────────────────
|
||||||
|
print("Computing embeddings for all snippets...")
embeddings = {}  # snippet name → 768-dim unit vector
for name, code in code_snippets.items():
    embeddings[name] = embed_code(code)
print(f"Done. {len(embeddings)} embeddings computed.\n")
|
||||||
|
|
||||||
|
# ── Compute similarity matrix ─────────────────────────────────────────────
|
||||||
|
names = list(embeddings.keys())
|
||||||
|
n = len(names)
|
||||||
|
|
||||||
|
print("=" * 70)
|
||||||
|
print("CROSS-LANGUAGE SIMILARITY MATRIX")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
# Print header (abbreviated names for readability)
|
||||||
|
short_names = [n.replace("factorial_", "F:").replace("reverse_", "R:") for n in names]
|
||||||
|
|
||||||
|
print(f"\n{'':14s}", end="")
|
||||||
|
for sn in short_names:
|
||||||
|
print(f"{sn:>10s}", end="")
|
||||||
|
print()
|
||||||
|
|
||||||
|
for i in range(n):
|
||||||
|
print(f"{short_names[i]:14s}", end="")
|
||||||
|
for j in range(n):
|
||||||
|
sim = torch.dot(embeddings[names[i]].cpu(), embeddings[names[j]].cpu()).item()
|
||||||
|
print(f"{sim:10.3f}", end="")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# ── Compute average within-task and across-task similarities ──────────────
|
||||||
|
# Partition snippet names by task so pairs can be bucketed below.
factorial_names = [n for n in names if "factorial" in n]
reverse_names = [n for n in names if "reverse" in n]

within_factorial = []  # similarities between factorial implementations
within_reverse = []    # similarities between string-reversal implementations
across_tasks = []      # similarities between one factorial and one reversal

# Visit each unordered pair exactly once (the i >= j guard skips the
# diagonal and mirrored pairs). Vectors are unit-length, so the dot
# product is the cosine similarity.
for i, n1 in enumerate(names):
    for j, n2 in enumerate(names):
        if i >= j:
            continue
        sim = torch.dot(embeddings[n1].cpu(), embeddings[n2].cpu()).item()
        if n1 in factorial_names and n2 in factorial_names:
            within_factorial.append(sim)
        elif n1 in reverse_names and n2 in reverse_names:
            within_reverse.append(sim)
        else:
            across_tasks.append(sim)
|
||||||
|
|
||||||
|
print("\n" + "=" * 70)
|
||||||
|
print("ANALYSIS")
|
||||||
|
print("=" * 70)
|
||||||
|
print(f"\nAvg similarity WITHIN factorial (across languages): "
|
||||||
|
f"{sum(within_factorial)/len(within_factorial):.3f}")
|
||||||
|
print(f"Avg similarity WITHIN reverse (across languages): "
|
||||||
|
f"{sum(within_reverse)/len(within_reverse):.3f}")
|
||||||
|
print(f"Avg similarity ACROSS tasks (factorial vs reverse): "
|
||||||
|
f"{sum(across_tasks)/len(across_tasks):.3f}")
|
||||||
|
|
||||||
|
print("""
|
||||||
|
EXPECTED RESULT:
|
||||||
|
Within-task similarity should be MUCH HIGHER than across-task similarity.
|
||||||
|
This proves that the embedding model groups code by WHAT IT DOES,
|
||||||
|
not by WHAT LANGUAGE it is written in.
|
||||||
|
|
||||||
|
factorial_python ≈ factorial_java ≈ factorial_cpp ≈ factorial_javascript
|
||||||
|
reverse_python ≈ reverse_java ≈ reverse_cpp ≈ reverse_javascript
|
||||||
|
factorial_* ≠ reverse_*
|
||||||
|
|
||||||
|
This is what enables cross-language code search: you can find a Java
|
||||||
|
implementation by providing a Python query, or vice versa.
|
||||||
|
""")
|
||||||
237
Code embeddings/04_clone_detection.py
Normal file
@ -0,0 +1,237 @@
|
|||||||
|
"""
|
||||||
|
============================================================================
|
||||||
|
Example 4: Code Clone Detection
|
||||||
|
============================================================================
|
||||||
|
AISE501 – AI in Software Engineering I
|
||||||
|
Fachhochschule Graubünden
|
||||||
|
|
||||||
|
GOAL:
|
||||||
|
Detect code clones (duplicate/similar code) in a collection of
|
||||||
|
functions using embeddings. We simulate a real-world scenario
|
||||||
|
where a codebase contains multiple near-duplicate implementations
|
||||||
|
that should be refactored into a single function.
|
||||||
|
|
||||||
|
WHAT YOU WILL LEARN:
|
||||||
|
- The four types of code clones (Type 1–4)
|
||||||
|
- How embeddings detect clones that text-based tools miss
|
||||||
|
- Ranking-based clone detection via cosine similarity
|
||||||
|
- Practical application: finding refactoring opportunities
|
||||||
|
|
||||||
|
CLONE TYPES:
|
||||||
|
Type 1: Exact copy (trivial — grep can find these)
|
||||||
|
Type 2: Renamed variables (grep misses these)
|
||||||
|
Type 3: Modified structure (added/removed lines)
|
||||||
|
Type 4: Same functionality, completely different implementation
|
||||||
|
|
||||||
|
HARDWARE:
|
||||||
|
Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac).
|
||||||
|
============================================================================
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from transformers import AutoTokenizer, AutoModel
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from itertools import combinations
|
||||||
|
|
||||||
|
# ── Device selection ──────────────────────────────────────────────────────
|
||||||
|
def get_device():
    """Choose the best available backend: CUDA, then Apple MPS, then CPU."""
    # Insertion order of this dict encodes the preference order.
    probes = {
        "cuda": torch.cuda.is_available,
        "mps": torch.backends.mps.is_available,
    }
    chosen = next((name for name, ok in probes.items() if ok()), "cpu")
    return torch.device(chosen)
|
||||||
|
|
||||||
|
DEVICE = get_device()
|
||||||
|
print(f"Using device: {DEVICE}\n")
|
||||||
|
|
||||||
|
# ── Load model ────────────────────────────────────────────────────────────
|
||||||
|
MODEL_NAME = "flax-sentence-embeddings/st-codesearch-distilroberta-base"
|
||||||
|
print(f"Loading model: {MODEL_NAME} ...")
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||||
|
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
|
||||||
|
model.eval()
|
||||||
|
print("Model loaded.\n")
|
||||||
|
|
||||||
|
# ── Simulated codebase ────────────────────────────────────────────────────
|
||||||
|
# These functions simulate what you'd find in a messy, real-world codebase
|
||||||
|
# where different developers wrote similar functionality independently.
|
||||||
|
#
|
||||||
|
# IMPORTANT: The clone groups share ZERO common words (besides Python
|
||||||
|
# keywords). This demonstrates that embeddings capture semantics, not
|
||||||
|
# surface-level text overlap. grep would never find these.
|
||||||
|
codebase = {
|
||||||
|
# ── Clone group 1: Computing the maximum of a list ──
|
||||||
|
# Three completely different implementations — no shared identifiers,
|
||||||
|
# no shared structure, but identical purpose.
|
||||||
|
"utils/find_max.py": """
|
||||||
|
def find_max(numbers):
|
||||||
|
result = numbers[0]
|
||||||
|
for candidate in numbers[1:]:
|
||||||
|
if candidate > result:
|
||||||
|
result = candidate
|
||||||
|
return result
|
||||||
|
""",
|
||||||
|
"legacy/find_max_old.py": """
|
||||||
|
def find_max(numbers):
|
||||||
|
result = numbers[0]
|
||||||
|
for candidate in numbers[1:]:
|
||||||
|
if candidate > result:
|
||||||
|
result = candidate
|
||||||
|
return result
|
||||||
|
""",
|
||||||
|
"analytics/top_scorer.py": """
|
||||||
|
import heapq
|
||||||
|
def fetch_top_element(collection):
|
||||||
|
return heapq.nlargest(1, collection)[0]
|
||||||
|
""",
|
||||||
|
"stats/dominant_value.py": """
|
||||||
|
def extract_peak(dataset):
|
||||||
|
dataset = sorted(dataset, reverse=True)
|
||||||
|
return dataset[0]
|
||||||
|
""",
|
||||||
|
|
||||||
|
# ── Clone group 2: String reversal ──
|
||||||
|
# Two implementations with zero lexical overlap — slicing vs index-based.
|
||||||
|
"text/flip_text.py": """
|
||||||
|
def flip_text(content):
|
||||||
|
return content[::-1]
|
||||||
|
""",
|
||||||
|
"helpers/mirror.py": """
|
||||||
|
def mirror_characters(phrase):
|
||||||
|
output = []
|
||||||
|
idx = len(phrase) - 1
|
||||||
|
while idx >= 0:
|
||||||
|
output.append(phrase[idx])
|
||||||
|
idx -= 1
|
||||||
|
return ''.join(output)
|
||||||
|
""",
|
||||||
|
|
||||||
|
# ── Not a clone: completely different functionality ──
|
||||||
|
# Each uses a different Python construct and domain to ensure
|
||||||
|
# they don't cluster with each other or with the clone groups.
|
||||||
|
"math/square_root.py": """
|
||||||
|
def square_root(x):
|
||||||
|
return x ** 0.5
|
||||||
|
""",
|
||||||
|
"calendar/leap_year.py": """
|
||||||
|
def is_leap_year(year):
|
||||||
|
return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
|
||||||
|
""",
|
||||||
|
"formatting/currency.py": """
|
||||||
|
def format_currency(amount, symbol="$"):
|
||||||
|
return f"{symbol}{amount:,.2f}"
|
||||||
|
""",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def embed_code(code: str) -> torch.Tensor:
    """Embed *code* as a unit-length 768-dim vector (mean-pooled encoder output)."""
    enc = tokenizer(
        code, return_tensors="pt", truncation=True, max_length=512, padding=True
    ).to(DEVICE)

    with torch.no_grad():
        out = model(**enc)

    # Mask-weighted mean over token vectors: padding positions contribute
    # zero to the sum, and the divisor counts only real tokens.
    weights = enc["attention_mask"].unsqueeze(-1)
    pooled = (out.last_hidden_state * weights).sum(dim=1) / weights.sum(dim=1)

    normalized = F.normalize(pooled, p=2, dim=1)
    return normalized.squeeze(0)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Embed all functions ───────────────────────────────────────────────────
|
||||||
|
print("Embedding all functions in the codebase...")
embeddings = {}  # file path → 768-dim unit vector
for path, code in codebase.items():
    embeddings[path] = embed_code(code)
    print(f" {path}")
print()
|
||||||
|
|
||||||
|
# ── Compute pairwise similarity matrix ────────────────────────────────────
|
||||||
|
# Fixed ordering of file paths for building and printing the matrix.
paths = list(embeddings.keys())
n = len(paths)
|
||||||
|
|
||||||
|
def short_name(path):
    """Extract a readable label from the file path."""
    filename = path.rsplit("/", 1)[-1]
    return filename.replace(".py", "")
|
||||||
|
|
||||||
|
labels = [short_name(p) for p in paths]
|
||||||
|
|
||||||
|
sim_matrix = {}
|
||||||
|
for i in range(n):
|
||||||
|
for j in range(n):
|
||||||
|
sim = torch.dot(embeddings[paths[i]].cpu(), embeddings[paths[j]].cpu()).item()
|
||||||
|
sim_matrix[(i, j)] = sim
|
||||||
|
|
||||||
|
# ── Print similarity matrix ───────────────────────────────────────────────
|
||||||
|
col_w = max(len(l) for l in labels) + 2
|
||||||
|
header_w = col_w
|
||||||
|
|
||||||
|
print("=" * 70)
|
||||||
|
print("SIMILARITY MATRIX")
|
||||||
|
print("=" * 70)
|
||||||
|
|
||||||
|
print(f"\n{'':>{header_w}}", end="")
|
||||||
|
for label in labels:
|
||||||
|
print(f"{label:>{col_w}}", end="")
|
||||||
|
print()
|
||||||
|
|
||||||
|
for i in range(n):
|
||||||
|
print(f"{labels[i]:>{header_w}}", end="")
|
||||||
|
for j in range(n):
|
||||||
|
print(f"{sim_matrix[(i, j)]:>{col_w}.3f}", end="")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# ── Most similar match per function ───────────────────────────────────────
|
||||||
|
print()
|
||||||
|
print(f"{'BEST MATCH':>{header_w}}", end="")
|
||||||
|
for i in range(n):
|
||||||
|
best_j, best_sim = -1, -1.0
|
||||||
|
for j in range(n):
|
||||||
|
if i != j and sim_matrix[(i, j)] > best_sim:
|
||||||
|
best_sim = sim_matrix[(i, j)]
|
||||||
|
best_j = j
|
||||||
|
print(f"{labels[best_j]:>{col_w}}", end="")
|
||||||
|
print()
|
||||||
|
|
||||||
|
print(f"{'(similarity)':>{header_w}}", end="")
|
||||||
|
for i in range(n):
|
||||||
|
best_sim = max(sim_matrix[(i, j)] for j in range(n) if i != j)
|
||||||
|
print(f"{best_sim:>{col_w}.3f}", end="")
|
||||||
|
print()
|
||||||
|
|
||||||
|
print(f"""
|
||||||
|
{'=' * 70}
|
||||||
|
INTERPRETATION:
|
||||||
|
{'=' * 70}
|
||||||
|
|
||||||
|
HOW TO READ THE TABLE:
|
||||||
|
Each cell shows the cosine similarity between two functions.
|
||||||
|
The BEST MATCH row shows which other function is most similar
|
||||||
|
to each column — these are the clone candidates a developer
|
||||||
|
would investigate.
|
||||||
|
|
||||||
|
EXPECTED CLONE GROUPS:
|
||||||
|
|
||||||
|
1. find_max ↔ find_max_old (Type 1: exact copy)
|
||||||
|
→ Similarity ≈ 1.000
|
||||||
|
|
||||||
|
2. find_max / fetch_top_element / extract_peak (Type 4 clones)
|
||||||
|
→ Same purpose (find the largest value), completely different
|
||||||
|
code: for-loop vs heapq.nlargest() vs sorted(reverse=True)
|
||||||
|
→ Zero shared identifiers between implementations
|
||||||
|
|
||||||
|
3. flip_text ↔ mirror_characters (Type 4 clone)
|
||||||
|
→ Same purpose (reverse a string), completely different code:
|
||||||
|
slicing ([::-1]) vs while-loop with index
|
||||||
|
→ Zero shared identifiers
|
||||||
|
|
||||||
|
NON-CLONES:
|
||||||
|
square_root, is_leap_year, format_currency each use a different
|
||||||
|
domain and code structure. Their best matches should have low
|
||||||
|
similarity compared to the clone groups.
|
||||||
|
|
||||||
|
KEY INSIGHT:
|
||||||
|
The clone groups share NO common words (besides Python keywords
|
||||||
|
like def/return/if). grep or any text-matching tool would never
|
||||||
|
find these clones. Only semantic understanding — which is what
|
||||||
|
embeddings provide — can detect that these functions do the same
|
||||||
|
thing despite having completely different code.
|
||||||
|
""")
|
||||||
216
Code embeddings/05_visualize_embeddings.py
Normal file
@ -0,0 +1,216 @@
|
|||||||
|
"""
|
||||||
|
============================================================================
|
||||||
|
Example 5: Visualizing Code Embeddings with PCA and t-SNE
|
||||||
|
============================================================================
|
||||||
|
AISE501 – AI in Software Engineering I
|
||||||
|
Fachhochschule Graubünden
|
||||||
|
|
||||||
|
GOAL:
|
||||||
|
Reduce 768-dimensional code embeddings to 2D and plot them.
|
||||||
|
This makes the embedding space visible: you can SEE that similar
|
||||||
|
code clusters together and different code is far apart.
|
||||||
|
|
||||||
|
WHAT YOU WILL LEARN:
|
||||||
|
- How PCA projects high-dimensional vectors to 2D (linear reduction)
|
||||||
|
- How t-SNE creates a non-linear 2D map that preserves neighborhoods
|
||||||
|
- How to interpret embedding space visualizations
|
||||||
|
- That code functionality determines position, not syntax or language
|
||||||
|
|
||||||
|
OUTPUT:
|
||||||
|
Saves two PNG plots: code_embeddings_pca.png and code_embeddings_tsne.png
|
||||||
|
|
||||||
|
HARDWARE:
|
||||||
|
Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac).
|
||||||
|
============================================================================
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from transformers import AutoTokenizer, AutoModel
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from sklearn.decomposition import PCA
|
||||||
|
from sklearn.manifold import TSNE
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import matplotlib
|
||||||
|
|
||||||
|
# Use a non-interactive backend so the script works in headless environments
|
||||||
|
matplotlib.use("Agg")
|
||||||
|
|
||||||
|
# ── Device selection ──────────────────────────────────────────────────────
|
||||||
|
def get_device():
    """Pick the best available torch device: CUDA, then MPS, then CPU."""
    for name, available in (
        ("cuda", torch.cuda.is_available),
        ("mps", torch.backends.mps.is_available),
    ):
        if available():
            return torch.device(name)
    return torch.device("cpu")
|
||||||
|
|
||||||
|
DEVICE = get_device()
|
||||||
|
print(f"Using device: {DEVICE}\n")
|
||||||
|
|
||||||
|
# ── Load model ────────────────────────────────────────────────────────────
|
||||||
|
MODEL_NAME = "flax-sentence-embeddings/st-codesearch-distilroberta-base"
|
||||||
|
print(f"Loading model: {MODEL_NAME} ...")
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||||
|
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
|
||||||
|
model.eval()
|
||||||
|
print("Model loaded.\n")
|
||||||
|
|
||||||
|
# ── Code snippets organized by CATEGORY ───────────────────────────────────
|
||||||
|
# Each category represents a type of task. We expect snippets within the
|
||||||
|
# same category to cluster together in the embedding space.
|
||||||
|
categories = {
|
||||||
|
"Sorting": {
|
||||||
|
"bubble_sort_py": "def bubble_sort(arr):\n n = len(arr)\n for i in range(n):\n for j in range(n-i-1):\n if arr[j] > arr[j+1]:\n arr[j], arr[j+1] = arr[j+1], arr[j]\n return arr",
|
||||||
|
"quick_sort_py": "def quick_sort(a):\n if len(a) <= 1: return a\n p = a[0]\n return quick_sort([x for x in a[1:] if x < p]) + [p] + quick_sort([x for x in a[1:] if x >= p])",
|
||||||
|
"sort_js": "function sortArray(arr) { return arr.sort((a, b) => a - b); }",
|
||||||
|
"insertion_sort": "def insertion_sort(arr):\n for i in range(1, len(arr)):\n key = arr[i]\n j = i - 1\n while j >= 0 and arr[j] > key:\n arr[j+1] = arr[j]\n j -= 1\n arr[j+1] = key\n return arr",
|
||||||
|
},
|
||||||
|
"File I/O": {
|
||||||
|
"read_json": "import json\ndef read_json(path):\n with open(path) as f:\n return json.load(f)",
|
||||||
|
"write_file": "def write_file(path, content):\n with open(path, 'w') as f:\n f.write(content)",
|
||||||
|
"read_csv": "import csv\ndef read_csv(path):\n with open(path) as f:\n return list(csv.reader(f))",
|
||||||
|
"read_yaml": "import yaml\ndef read_yaml(path):\n with open(path) as f:\n return yaml.safe_load(f)",
|
||||||
|
},
|
||||||
|
"String ops": {
|
||||||
|
"reverse_str": "def reverse(s): return s[::-1]",
|
||||||
|
"capitalize": "def capitalize_words(s): return ' '.join(w.capitalize() for w in s.split())",
|
||||||
|
"count_chars": "def count_chars(s):\n return {c: s.count(c) for c in set(s)}",
|
||||||
|
"is_palindrome": "def is_palindrome(s): return s == s[::-1]",
|
||||||
|
},
|
||||||
|
"Math": {
|
||||||
|
"factorial": "def factorial(n):\n r = 1\n for i in range(2, n+1): r *= i\n return r",
|
||||||
|
"fibonacci": "def fib(n):\n a, b = 0, 1\n for _ in range(n): a, b = b, a+b\n return a",
|
||||||
|
"gcd": "def gcd(a, b):\n while b: a, b = b, a % b\n return a",
|
||||||
|
"is_prime": "def is_prime(n):\n if n < 2: return False\n for i in range(2, int(n**0.5)+1):\n if n % i == 0: return False\n return True",
|
||||||
|
},
|
||||||
|
"Networking": {
|
||||||
|
"http_get": "import requests\ndef http_get(url): return requests.get(url).json()",
|
||||||
|
"fetch_url": "import urllib.request\ndef fetch(url):\n with urllib.request.urlopen(url) as r:\n return r.read().decode()",
|
||||||
|
"post_data": "import requests\ndef post_json(url, data): return requests.post(url, json=data).status_code",
|
||||||
|
"download_file": "import urllib.request\ndef download(url, path): urllib.request.urlretrieve(url, path)",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def embed_code(code: str) -> np.ndarray:
    """Embed code into a normalized vector.

    Returns a 1-D NumPy array (the value is moved to CPU and converted
    with ``.numpy()``; the previous ``-> torch.Tensor`` annotation was
    incorrect). The vector is the attention-mask-weighted mean of the
    model's last hidden states, L2-normalized to unit length.
    """
    inputs = tokenizer(
        code, return_tensors="pt", truncation=True, max_length=512, padding=True
    ).to(DEVICE)
    with torch.no_grad():
        outputs = model(**inputs)
    # Zero out padding positions so they don't dilute the mean pooling.
    mask = inputs["attention_mask"].unsqueeze(-1)
    embedding = (outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1)
    return F.normalize(embedding, p=2, dim=1).squeeze(0).cpu().numpy()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Compute embeddings ────────────────────────────────────────────────────
|
||||||
|
print("Computing embeddings...")
|
||||||
|
all_embeddings = []
|
||||||
|
all_labels = []
|
||||||
|
all_categories = []
|
||||||
|
|
||||||
|
for category, snippets in categories.items():
|
||||||
|
for label, code in snippets.items():
|
||||||
|
vec = embed_code(code)
|
||||||
|
all_embeddings.append(vec)
|
||||||
|
all_labels.append(label)
|
||||||
|
all_categories.append(category)
|
||||||
|
print(f" [{category:12s}] {label}")
|
||||||
|
|
||||||
|
# Convert to numpy matrix: shape [num_snippets, 768]
|
||||||
|
X = np.stack(all_embeddings)
|
||||||
|
print(f"\nEmbedding matrix: {X.shape[0]} snippets × {X.shape[1]} dimensions\n")
|
||||||
|
|
||||||
|
# ── Color map for categories ──────────────────────────────────────────────
|
||||||
|
category_names = list(categories.keys())
|
||||||
|
colors = plt.cm.Set1(np.linspace(0, 1, len(category_names)))
|
||||||
|
color_map = {cat: colors[i] for i, cat in enumerate(category_names)}
|
||||||
|
point_colors = [color_map[cat] for cat in all_categories]
|
||||||
|
|
||||||
|
# ── Plot 1: PCA ──────────────────────────────────────────────────────────
|
||||||
|
# PCA finds the two directions of maximum variance in the 768-dim space
|
||||||
|
# and projects all points onto those two directions.
|
||||||
|
print("Computing PCA (2 components)...")
|
||||||
|
pca = PCA(n_components=2)
|
||||||
|
X_pca = pca.fit_transform(X)
|
||||||
|
|
||||||
|
fig, ax = plt.subplots(figsize=(10, 8))
|
||||||
|
for i, (x, y) in enumerate(X_pca):
|
||||||
|
ax.scatter(x, y, c=[point_colors[i]], s=100, edgecolors="black", linewidth=0.5, zorder=3)
|
||||||
|
ax.annotate(all_labels[i], (x, y), fontsize=7, ha="center", va="bottom",
|
||||||
|
xytext=(0, 6), textcoords="offset points")
|
||||||
|
|
||||||
|
# Legend
|
||||||
|
for cat in category_names:
|
||||||
|
ax.scatter([], [], c=[color_map[cat]], s=80, label=cat, edgecolors="black", linewidth=0.5)
|
||||||
|
ax.legend(loc="best", fontsize=9, title="Category", title_fontsize=10)
|
||||||
|
|
||||||
|
variance_explained = pca.explained_variance_ratio_
|
||||||
|
ax.set_title(f"Code Embeddings — PCA Projection\n"
|
||||||
|
f"(PC1: {variance_explained[0]:.1%} variance, PC2: {variance_explained[1]:.1%} variance)",
|
||||||
|
fontsize=13)
|
||||||
|
ax.set_xlabel("Principal Component 1", fontsize=11)
|
||||||
|
ax.set_ylabel("Principal Component 2", fontsize=11)
|
||||||
|
ax.grid(True, alpha=0.3)
|
||||||
|
fig.tight_layout()
|
||||||
|
fig.savefig("code_embeddings_pca.png", dpi=150)
|
||||||
|
print(f" Saved: code_embeddings_pca.png")
|
||||||
|
print(f" Variance explained: PC1={variance_explained[0]:.1%}, PC2={variance_explained[1]:.1%}\n")
|
||||||
|
|
||||||
|
# ── Plot 2: t-SNE ────────────────────────────────────────────────────────
|
||||||
|
# t-SNE is a non-linear method that preserves LOCAL neighborhood structure.
|
||||||
|
# Points that are close in 768-dim space stay close in 2D.
|
||||||
|
# Perplexity controls the balance between local and global structure.
|
||||||
|
print("Computing t-SNE (this may take a few seconds)...")
|
||||||
|
tsne = TSNE(n_components=2, perplexity=5, random_state=42, max_iter=1000)
|
||||||
|
X_tsne = tsne.fit_transform(X)
|
||||||
|
|
||||||
|
fig, ax = plt.subplots(figsize=(10, 8))
|
||||||
|
for i, (x, y) in enumerate(X_tsne):
|
||||||
|
ax.scatter(x, y, c=[point_colors[i]], s=100, edgecolors="black", linewidth=0.5, zorder=3)
|
||||||
|
ax.annotate(all_labels[i], (x, y), fontsize=7, ha="center", va="bottom",
|
||||||
|
xytext=(0, 6), textcoords="offset points")
|
||||||
|
|
||||||
|
for cat in category_names:
|
||||||
|
ax.scatter([], [], c=[color_map[cat]], s=80, label=cat, edgecolors="black", linewidth=0.5)
|
||||||
|
ax.legend(loc="best", fontsize=9, title="Category", title_fontsize=10)
|
||||||
|
|
||||||
|
ax.set_title("Code Embeddings — t-SNE Projection\n"
|
||||||
|
"(non-linear dimensionality reduction)", fontsize=13)
|
||||||
|
ax.set_xlabel("t-SNE Dimension 1", fontsize=11)
|
||||||
|
ax.set_ylabel("t-SNE Dimension 2", fontsize=11)
|
||||||
|
ax.grid(True, alpha=0.3)
|
||||||
|
fig.tight_layout()
|
||||||
|
fig.savefig("code_embeddings_tsne.png", dpi=150)
|
||||||
|
print(f" Saved: code_embeddings_tsne.png\n")
|
||||||
|
|
||||||
|
print("=" * 70)
|
||||||
|
print("INTERPRETATION")
|
||||||
|
print("=" * 70)
|
||||||
|
print(f"""
|
||||||
|
Both plots project {X.shape[1]}-dimensional embedding vectors to 2D:
|
||||||
|
|
||||||
|
PCA (Principal Component Analysis):
|
||||||
|
- Linear projection onto the two axes of maximum variance.
|
||||||
|
- Preserves global structure: large distances are meaningful.
|
||||||
|
- Good for seeing overall separation between categories.
|
||||||
|
- The % variance tells you how much information is retained.
|
||||||
|
|
||||||
|
t-SNE (t-distributed Stochastic Neighbor Embedding):
|
||||||
|
- Non-linear: distorts distances but preserves neighborhoods.
|
||||||
|
- Points that are close in the original space stay close in 2D.
|
||||||
|
- Better at revealing tight clusters within categories.
|
||||||
|
- Distances BETWEEN clusters are not meaningful.
|
||||||
|
|
||||||
|
EXPECTED RESULT:
|
||||||
|
You should see 5 distinct clusters, one per category:
|
||||||
|
- Sorting functions (bubble, quick, insertion, JS sort) cluster together
|
||||||
|
- File I/O functions cluster together
|
||||||
|
- String operations cluster together
|
||||||
|
- Math functions cluster together
|
||||||
|
- Networking functions cluster together
|
||||||
|
|
||||||
|
This visually confirms that code embeddings organize code by
|
||||||
|
PURPOSE, not by surface syntax or programming language.
|
||||||
|
""")
|
||||||
716
Code embeddings/06_pca_denoising.py
Normal file
@ -0,0 +1,716 @@
|
|||||||
|
"""
|
||||||
|
============================================================================
|
||||||
|
Example 6: PCA Denoising — Can Fewer Dimensions Improve Similarity?
|
||||||
|
============================================================================
|
||||||
|
AISE501 – AI in Software Engineering I
|
||||||
|
Fachhochschule Graubünden
|
||||||
|
|
||||||
|
HYPOTHESIS:
|
||||||
|
Embedding vectors live in a 768-dimensional space, but most of the
|
||||||
|
semantic signal may be concentrated in a small number of principal
|
||||||
|
components. The remaining dimensions could add "noise" that dilutes
|
||||||
|
cosine similarity. If true, projecting embeddings onto a small PCA
|
||||||
|
subspace should INCREASE similarity within semantic groups and
|
||||||
|
DECREASE similarity across groups — making code search sharper.
|
||||||
|
|
||||||
|
WHAT YOU WILL LEARN:
|
||||||
|
- How PCA decomposes the embedding space into ranked components
|
||||||
|
- How to measure retrieval quality (intra- vs inter-group similarity)
|
||||||
|
- Whether dimensionality reduction helps or hurts in practice
|
||||||
|
- The concept of an "optimal" embedding dimension for a given task
|
||||||
|
|
||||||
|
OUTPUT:
|
||||||
|
Saves pca_denoising_analysis.png with three sub-plots.
|
||||||
|
|
||||||
|
HARDWARE:
|
||||||
|
Works on CPU, CUDA (NVIDIA), and MPS (Apple Silicon Mac).
|
||||||
|
============================================================================
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
from transformers import AutoTokenizer, AutoModel
|
||||||
|
import torch.nn.functional as F
|
||||||
|
from sklearn.decomposition import PCA
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import matplotlib
|
||||||
|
matplotlib.use("Agg")
|
||||||
|
|
||||||
|
# ── Device selection ──────────────────────────────────────────────────────
|
||||||
|
def get_device():
    """Return the preferred compute device (CUDA > MPS > CPU fallback)."""
    if torch.cuda.is_available():
        device_name = "cuda"
    elif torch.backends.mps.is_available():
        device_name = "mps"
    else:
        device_name = "cpu"
    return torch.device(device_name)
|
||||||
|
|
||||||
|
DEVICE = get_device()
|
||||||
|
print(f"Using device: {DEVICE}\n")
|
||||||
|
|
||||||
|
# ── Load model ────────────────────────────────────────────────────────────
|
||||||
|
MODEL_NAME = "flax-sentence-embeddings/st-codesearch-distilroberta-base"
|
||||||
|
print(f"Loading model: {MODEL_NAME} ...")
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||||
|
model = AutoModel.from_pretrained(MODEL_NAME).to(DEVICE)
|
||||||
|
model.eval()
|
||||||
|
print("Model loaded.\n")
|
||||||
|
|
||||||
|
# ── Code snippets organized into semantic GROUPS ──────────────────────────
|
||||||
|
# We need clear groups so we can measure intra-group vs inter-group similarity.
|
||||||
|
groups = {
|
||||||
|
"Sorting": {
|
||||||
|
"bubble_sort": """
|
||||||
|
def bubble_sort(arr):
|
||||||
|
n = len(arr)
|
||||||
|
for i in range(n):
|
||||||
|
for j in range(0, n - i - 1):
|
||||||
|
if arr[j] > arr[j + 1]:
|
||||||
|
arr[j], arr[j + 1] = arr[j + 1], arr[j]
|
||||||
|
return arr""",
|
||||||
|
"quick_sort": """
|
||||||
|
def quick_sort(arr):
|
||||||
|
if len(arr) <= 1:
|
||||||
|
return arr
|
||||||
|
pivot = arr[len(arr) // 2]
|
||||||
|
left = [x for x in arr if x < pivot]
|
||||||
|
middle = [x for x in arr if x == pivot]
|
||||||
|
right = [x for x in arr if x > pivot]
|
||||||
|
return quick_sort(left) + middle + quick_sort(right)""",
|
||||||
|
"merge_sort": """
|
||||||
|
def merge_sort(arr):
|
||||||
|
if len(arr) <= 1:
|
||||||
|
return arr
|
||||||
|
mid = len(arr) // 2
|
||||||
|
left = merge_sort(arr[:mid])
|
||||||
|
right = merge_sort(arr[mid:])
|
||||||
|
merged = []
|
||||||
|
i = j = 0
|
||||||
|
while i < len(left) and j < len(right):
|
||||||
|
if left[i] <= right[j]:
|
||||||
|
merged.append(left[i]); i += 1
|
||||||
|
else:
|
||||||
|
merged.append(right[j]); j += 1
|
||||||
|
return merged + left[i:] + right[j:]""",
|
||||||
|
"insertion_sort": """
|
||||||
|
def insertion_sort(arr):
|
||||||
|
for i in range(1, len(arr)):
|
||||||
|
key = arr[i]
|
||||||
|
j = i - 1
|
||||||
|
while j >= 0 and arr[j] > key:
|
||||||
|
arr[j + 1] = arr[j]
|
||||||
|
j -= 1
|
||||||
|
arr[j + 1] = key
|
||||||
|
return arr""",
|
||||||
|
"selection_sort": """
|
||||||
|
def selection_sort(arr):
|
||||||
|
for i in range(len(arr)):
|
||||||
|
min_idx = i
|
||||||
|
for j in range(i + 1, len(arr)):
|
||||||
|
if arr[j] < arr[min_idx]:
|
||||||
|
min_idx = j
|
||||||
|
arr[i], arr[min_idx] = arr[min_idx], arr[i]
|
||||||
|
return arr""",
|
||||||
|
"heap_sort": """
|
||||||
|
def heap_sort(arr):
|
||||||
|
import heapq
|
||||||
|
heapq.heapify(arr)
|
||||||
|
return [heapq.heappop(arr) for _ in range(len(arr))]""",
|
||||||
|
},
|
||||||
|
"File I/O": {
|
||||||
|
"read_json": """
|
||||||
|
import json
|
||||||
|
def read_json(path):
|
||||||
|
with open(path, 'r') as f:
|
||||||
|
return json.load(f)""",
|
||||||
|
"write_file": """
|
||||||
|
def write_file(path, content):
|
||||||
|
with open(path, 'w') as f:
|
||||||
|
f.write(content)""",
|
||||||
|
"read_csv": """
|
||||||
|
import csv
|
||||||
|
def read_csv(path):
|
||||||
|
with open(path, 'r') as f:
|
||||||
|
reader = csv.reader(f)
|
||||||
|
return list(reader)""",
|
||||||
|
"read_yaml": """
|
||||||
|
import yaml
|
||||||
|
def load_yaml(path):
|
||||||
|
with open(path, 'r') as f:
|
||||||
|
return yaml.safe_load(f)""",
|
||||||
|
"write_json": """
|
||||||
|
import json
|
||||||
|
def write_json(path, data):
|
||||||
|
with open(path, 'w') as f:
|
||||||
|
json.dump(data, f, indent=2)""",
|
||||||
|
"read_lines": """
|
||||||
|
def read_lines(path):
|
||||||
|
with open(path, 'r') as f:
|
||||||
|
return f.readlines()""",
|
||||||
|
},
|
||||||
|
"Math": {
|
||||||
|
"factorial": """
|
||||||
|
def factorial(n):
|
||||||
|
if n <= 1:
|
||||||
|
return 1
|
||||||
|
return n * factorial(n - 1)""",
|
||||||
|
"fibonacci": """
|
||||||
|
def fibonacci(n):
|
||||||
|
a, b = 0, 1
|
||||||
|
for _ in range(n):
|
||||||
|
a, b = b, a + b
|
||||||
|
return a""",
|
||||||
|
"gcd": """
|
||||||
|
def gcd(a, b):
|
||||||
|
while b:
|
||||||
|
a, b = b, a % b
|
||||||
|
return a""",
|
||||||
|
"is_prime": """
|
||||||
|
def is_prime(n):
|
||||||
|
if n < 2:
|
||||||
|
return False
|
||||||
|
for i in range(2, int(n**0.5) + 1):
|
||||||
|
if n % i == 0:
|
||||||
|
return False
|
||||||
|
return True""",
|
||||||
|
"power": """
|
||||||
|
def power(base, exp):
|
||||||
|
if exp == 0:
|
||||||
|
return 1
|
||||||
|
if exp % 2 == 0:
|
||||||
|
half = power(base, exp // 2)
|
||||||
|
return half * half
|
||||||
|
return base * power(base, exp - 1)""",
|
||||||
|
"sum_digits": """
|
||||||
|
def sum_digits(n):
|
||||||
|
total = 0
|
||||||
|
while n > 0:
|
||||||
|
total += n % 10
|
||||||
|
n //= 10
|
||||||
|
return total""",
|
||||||
|
},
|
||||||
|
"Networking": {
|
||||||
|
"http_get": """
|
||||||
|
import requests
|
||||||
|
def http_get(url):
|
||||||
|
response = requests.get(url)
|
||||||
|
return response.json()""",
|
||||||
|
"post_data": """
|
||||||
|
import requests
|
||||||
|
def post_data(url, payload):
|
||||||
|
response = requests.post(url, json=payload)
|
||||||
|
return response.status_code, response.json()""",
|
||||||
|
"fetch_url": """
|
||||||
|
import urllib.request
|
||||||
|
def fetch_url(url):
|
||||||
|
with urllib.request.urlopen(url) as resp:
|
||||||
|
return resp.read().decode('utf-8')""",
|
||||||
|
"download_file": """
|
||||||
|
import urllib.request
|
||||||
|
def download_file(url, dest):
|
||||||
|
urllib.request.urlretrieve(url, dest)
|
||||||
|
return dest""",
|
||||||
|
"http_put": """
|
||||||
|
import requests
|
||||||
|
def http_put(url, data):
|
||||||
|
response = requests.put(url, json=data)
|
||||||
|
return response.status_code""",
|
||||||
|
"http_delete": """
|
||||||
|
import requests
|
||||||
|
def http_delete(url):
|
||||||
|
response = requests.delete(url)
|
||||||
|
return response.status_code""",
|
||||||
|
},
|
||||||
|
"String ops": {
|
||||||
|
"reverse_str": """
|
||||||
|
def reverse_string(s):
|
||||||
|
return s[::-1]""",
|
||||||
|
"is_palindrome": """
|
||||||
|
def is_palindrome(s):
|
||||||
|
s = s.lower().replace(' ', '')
|
||||||
|
return s == s[::-1]""",
|
||||||
|
"count_vowels": """
|
||||||
|
def count_vowels(s):
|
||||||
|
return sum(1 for c in s.lower() if c in 'aeiou')""",
|
||||||
|
"capitalize_words": """
|
||||||
|
def capitalize_words(s):
|
||||||
|
return ' '.join(w.capitalize() for w in s.split())""",
|
||||||
|
"remove_duplicates": """
|
||||||
|
def remove_duplicate_chars(s):
|
||||||
|
seen = set()
|
||||||
|
result = []
|
||||||
|
for c in s:
|
||||||
|
if c not in seen:
|
||||||
|
seen.add(c)
|
||||||
|
result.append(c)
|
||||||
|
return ''.join(result)""",
|
||||||
|
"count_words": """
|
||||||
|
def count_words(text):
|
||||||
|
words = text.lower().split()
|
||||||
|
freq = {}
|
||||||
|
for w in words:
|
||||||
|
freq[w] = freq.get(w, 0) + 1
|
||||||
|
return freq""",
|
||||||
|
},
|
||||||
|
"Data structures": {
|
||||||
|
"stack_push_pop": """
|
||||||
|
class Stack:
|
||||||
|
def __init__(self):
|
||||||
|
self.items = []
|
||||||
|
def push(self, item):
|
||||||
|
self.items.append(item)
|
||||||
|
def pop(self):
|
||||||
|
return self.items.pop()""",
|
||||||
|
"queue_impl": """
|
||||||
|
from collections import deque
|
||||||
|
class Queue:
|
||||||
|
def __init__(self):
|
||||||
|
self.items = deque()
|
||||||
|
def enqueue(self, item):
|
||||||
|
self.items.append(item)
|
||||||
|
def dequeue(self):
|
||||||
|
return self.items.popleft()""",
|
||||||
|
"linked_list": """
|
||||||
|
class Node:
|
||||||
|
def __init__(self, val):
|
||||||
|
self.val = val
|
||||||
|
self.next = None
|
||||||
|
class LinkedList:
|
||||||
|
def __init__(self):
|
||||||
|
self.head = None
|
||||||
|
def append(self, val):
|
||||||
|
node = Node(val)
|
||||||
|
if not self.head:
|
||||||
|
self.head = node
|
||||||
|
return
|
||||||
|
curr = self.head
|
||||||
|
while curr.next:
|
||||||
|
curr = curr.next
|
||||||
|
curr.next = node""",
|
||||||
|
"binary_tree": """
|
||||||
|
class TreeNode:
|
||||||
|
def __init__(self, val):
|
||||||
|
self.val = val
|
||||||
|
self.left = None
|
||||||
|
self.right = None
|
||||||
|
def inorder(root):
|
||||||
|
if root:
|
||||||
|
yield from inorder(root.left)
|
||||||
|
yield root.val
|
||||||
|
yield from inorder(root.right)""",
|
||||||
|
"hash_map": """
|
||||||
|
class HashMap:
|
||||||
|
def __init__(self, size=256):
|
||||||
|
self.buckets = [[] for _ in range(size)]
|
||||||
|
def put(self, key, value):
|
||||||
|
idx = hash(key) % len(self.buckets)
|
||||||
|
for i, (k, v) in enumerate(self.buckets[idx]):
|
||||||
|
if k == key:
|
||||||
|
self.buckets[idx][i] = (key, value)
|
||||||
|
return
|
||||||
|
self.buckets[idx].append((key, value))""",
|
||||||
|
"priority_queue": """
|
||||||
|
import heapq
|
||||||
|
class PriorityQueue:
|
||||||
|
def __init__(self):
|
||||||
|
self.heap = []
|
||||||
|
def push(self, priority, item):
|
||||||
|
heapq.heappush(self.heap, (priority, item))
|
||||||
|
def pop(self):
|
||||||
|
return heapq.heappop(self.heap)[1]""",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def embed_code(code: str) -> torch.Tensor:
    """Embed a code snippet into a unit-length vector.

    The snippet is tokenized (truncation at 512 tokens), passed through
    the model with gradients disabled, mean-pooled over non-padding
    positions, and L2-normalized so dot products equal cosine similarity.
    """
    batch = tokenizer(
        code, return_tensors="pt", truncation=True, max_length=512, padding=True
    ).to(DEVICE)
    with torch.no_grad():
        hidden = model(**batch)
    # Weight by the attention mask so padding tokens contribute nothing.
    weights = batch["attention_mask"].unsqueeze(-1)
    mean_vec = (hidden.last_hidden_state * weights).sum(dim=1) / weights.sum(dim=1)
    return F.normalize(mean_vec, p=2, dim=1).squeeze(0)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Step 1: Compute all embeddings ────────────────────────────────────────
|
||||||
|
print("Computing embeddings...")
|
||||||
|
all_names = []
|
||||||
|
all_labels = []
|
||||||
|
all_vectors = []
|
||||||
|
|
||||||
|
for group_name, snippets in groups.items():
|
||||||
|
for snippet_name, code in snippets.items():
|
||||||
|
vec = embed_code(code).cpu().numpy()
|
||||||
|
all_names.append(snippet_name)
|
||||||
|
all_labels.append(group_name)
|
||||||
|
all_vectors.append(vec)
|
||||||
|
print(f" [{group_name:12s}] {snippet_name}")
|
||||||
|
|
||||||
|
X = np.stack(all_vectors) # shape: [N, 768]
|
||||||
|
N, D = X.shape
|
||||||
|
print(f"\nEmbedding matrix: {N} snippets × {D} dimensions\n")
|
||||||
|
|
||||||
|
# ── Step 2: Define similarity metrics ─────────────────────────────────────
|
||||||
|
def cosine_matrix(vectors):
    """Return the pairwise cosine-similarity matrix for row vectors.

    Rows are re-normalized defensively (norms floored at 1e-10 to avoid
    division by zero) so the result is valid even for near-zero rows.
    """
    lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
    safe_lengths = np.maximum(lengths, 1e-10)
    unit_rows = vectors / safe_lengths
    return unit_rows @ unit_rows.T
|
||||||
|
|
||||||
|
def compute_metrics(sim_matrix, labels):
    """
    Summarize how well the similarity matrix separates categories.

    Averages sim_matrix[i, j] over every unordered pair, split into pairs
    whose labels match (intra-group) and pairs whose labels differ
    (inter-group). Returns (intra_mean, inter_mean, gap) where
    gap = intra_mean - inter_mean; a larger gap means better
    discriminability.
    """
    count = len(labels)
    same_vals = []
    diff_vals = []
    for a in range(count):
        for b in range(a + 1, count):
            # Route each pair's similarity into the matching bucket.
            bucket = same_vals if labels[a] == labels[b] else diff_vals
            bucket.append(sim_matrix[a, b])
    mean_same = np.mean(same_vals)
    mean_diff = np.mean(diff_vals)
    return mean_same, mean_diff, mean_same - mean_diff
|
||||||
|
|
||||||
|
|
||||||
|
# ── Step 3: Sweep across PCA dimensions ──────────────────────────────────
# PCA can have at most min(N, D) components; cap accordingly
max_components = min(N, D)
dims_to_test = sorted(set(
    k for k in [2, 3, 5, 8, 10, 15, 20, 30, 50, 75, 100, 150, 200,
                300, 400, 500, 600, D]
    if k <= max_components
))
# Always include full dimensionality as the baseline — but only once.
# The previous unconditional append duplicated the D entry whenever
# N >= D (D already survives the filter above), producing a duplicate
# full-dim row in the table and a wasted recomputation.
if D not in dims_to_test:
    dims_to_test.append(D)

print("=" * 70)
print("PCA DENOISING EXPERIMENT")
print("=" * 70)
print(f"\n{'Components':>12s} {'Intra-Group':>12s} {'Inter-Group':>12s} "
      f"{'Gap':>8s} {'vs Full':>8s}")
print("-" * 62)

results = []  # (k, intra_mean, inter_mean, gap) per tested dimensionality
for k in dims_to_test:
    if k >= D:
        # Full dimensionality — no PCA, just use original vectors
        X_reduced = X.copy()
        actual_k = D
    else:
        pca = PCA(n_components=k, random_state=42)
        X_reduced = pca.fit_transform(X)
        actual_k = k

    sim = cosine_matrix(X_reduced)
    intra, inter, gap = compute_metrics(sim, all_labels)
    results.append((actual_k, intra, inter, gap))

# Compute full-dim gap for comparison; the D entry sorts last, so
# results[-1] is always the full-dimensionality baseline.
full_intra, full_inter, full_gap = results[-1][1], results[-1][2], results[-1][3]

for k, intra, inter, gap in results:
    delta = gap - full_gap
    delta_str = f"{delta:+.4f}" if k < D else " (base)"
    print(f"{k:>12d} {intra:>12.4f} {inter:>12.4f} {gap:>8.4f} {delta_str:>8s}")
|
||||||
|
|
||||||
|
# ── Step 4: Find the optimal dimensionality ──────────────────────────────
# Pick the component count whose intra/inter gap is largest.
dims_arr = np.array([r[0] for r in results])
gaps_arr = np.array([r[3] for r in results])
best_idx = np.argmax(gaps_arr)
best_k, best_gap = int(dims_arr[best_idx]), gaps_arr[best_idx]

print(f"\n{'=' * 70}")
print(f"BEST DIMENSIONALITY: {best_k} components")
print(f" Gap (intra - inter): {best_gap:.4f} vs {full_gap:.4f} at full 768-d")
print(f" Improvement: {best_gap - full_gap:+.4f}")
print(f"{'=' * 70}")

# ── Step 5: Show detailed comparison at optimal k vs full ────────────────
print(f"\n── Detailed Similarity Matrix at k={best_k} vs k={D} ──\n")

# Re-project onto the best k (skip PCA when the best is full dimensionality).
if best_k < D:
    pca_best = PCA(n_components=best_k, random_state=42)
    X_best = pca_best.fit_transform(X)
else:
    X_best = X.copy()

sim_full = cosine_matrix(X)
sim_best = cosine_matrix(X_best)

# Show a selection of interesting pairs
print(f"{'Snippet A':>20s} {'Snippet B':>20s} {'Full 768d':>10s} "
      f"{'PCA {0}d'.format(best_k):>10s} {'Change':>8s}")
print("-" * 78)
|
||||||
|
|
||||||
|
# Hand-picked snippet pairs: same-category pairs should score high,
# cross-category pairs should score low.
interesting_pairs = [
    # Intra-group: should be high
    ("bubble_sort", "quick_sort"),
    ("bubble_sort", "merge_sort"),
    ("read_json", "read_csv"),
    ("http_get", "fetch_url"),
    ("factorial", "fibonacci"),
    ("reverse_str", "is_palindrome"),
    ("stack_push_pop", "queue_impl"),
    # Inter-group: should be low
    ("bubble_sort", "read_json"),
    ("factorial", "http_get"),
    ("reverse_str", "download_file"),
    ("is_prime", "write_file"),
    ("stack_push_pop", "count_vowels"),
]

# Print each pair's similarity at full dimensionality vs the denoised space,
# plus the change and a SAME/DIFF category marker.
for n1, n2 in interesting_pairs:
    i = all_names.index(n1)
    j = all_names.index(n2)
    s_full = sim_full[i, j]
    s_best = sim_best[i, j]
    same = all_labels[i] == all_labels[j]
    marker = "SAME" if same else "DIFF"
    change = s_best - s_full
    print(f"{n1:>20s} {n2:>20s} {s_full:>10.4f} {s_best:>10.4f} "
          f"{change:>+8.4f} [{marker}]")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Step 6: Text-to-code search comparison ────────────────────────────────
print(f"\n── Text-to-Code Search: Full 768d vs PCA {best_k}d ──\n")

# (query text, category the top hit is expected to come from)
search_queries = [
    ("sort a list of numbers", "Sorting"),
    ("read a JSON config file", "File I/O"),
    ("compute factorial recursively", "Math"),
    ("make an HTTP GET request", "Networking"),
    ("check if a number is prime", "Math"),
]

# Fit a fresh PCA for the search comparison; pca_search stays None when the
# best dimensionality is already the full space.
if best_k < D:
    pca_search = PCA(n_components=best_k, random_state=42)
    X_search = pca_search.fit_transform(X)
else:
    X_search = X.copy()
    pca_search = None

for query, expected_group in search_queries:
    # Embed the natural-language query with the same model as the code.
    q_vec = embed_code(query).cpu().numpy().reshape(1, -1)

    # Full dimension search
    q_norm = q_vec / np.linalg.norm(q_vec)
    X_norm = X / np.linalg.norm(X, axis=1, keepdims=True)
    scores_full = (X_norm @ q_norm.T).flatten()

    # PCA-reduced search
    if pca_search is not None:
        # Project the query with the PCA fitted on the code embeddings.
        q_reduced = pca_search.transform(q_vec)
    else:
        q_reduced = q_vec.copy()
    q_r_norm = q_reduced / np.linalg.norm(q_reduced)
    X_s_norm = X_search / np.linalg.norm(X_search, axis=1, keepdims=True)
    scores_pca = (X_s_norm @ q_r_norm.T).flatten()

    # Top-3 hits in each space (argsort of negated scores = descending).
    top_full = np.argsort(-scores_full)[:3]
    top_pca = np.argsort(-scores_pca)[:3]

    print(f' Query: "{query}"')
    print(f' Full 768d: {all_names[top_full[0]]:>16s} ({scores_full[top_full[0]]:.3f})'
          f' {all_names[top_full[1]]:>16s} ({scores_full[top_full[1]]:.3f})'
          f' {all_names[top_full[2]]:>16s} ({scores_full[top_full[2]]:.3f})')
    print(f' PCA {best_k:>3d}d: {all_names[top_pca[0]]:>16s} ({scores_pca[top_pca[0]]:.3f})'
          f' {all_names[top_pca[1]]:>16s} ({scores_pca[top_pca[1]]:.3f})'
          f' {all_names[top_pca[2]]:>16s} ({scores_pca[top_pca[2]]:.3f})')

    # A search counts as correct when the top hit's category matches.
    full_correct = all_labels[top_full[0]] == expected_group
    pca_correct = all_labels[top_pca[0]] == expected_group
    print(f' Full correct: {full_correct} | PCA correct: {pca_correct}')
    print()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Step 7: Visualization ─────────────────────────────────────────────────
# Six-panel figure for a comprehensive visual analysis.

# Fixed color per category so every panel uses a consistent palette.
group_colors = {
    "Sorting": "#1f77b4", "File I/O": "#ff7f0e", "Math": "#2ca02c",
    "Networking": "#d62728", "String ops": "#9467bd", "Data structures": "#8c564b",
}
label_colors = [group_colors[g] for g in all_labels]
# dict.fromkeys preserves first-seen order while deduplicating labels.
unique_groups = list(dict.fromkeys(all_labels))

fig = plt.figure(figsize=(20, 13))
fig.suptitle("PCA Denoising Analysis — Can Fewer Dimensions Improve Code Similarity?",
             fontsize=15, fontweight="bold", y=0.98)

# ── Row 1 ──

# Plot 1: Intra/inter similarity vs number of PCA components
ax1 = fig.add_subplot(2, 3, 1)
dims_plot = [r[0] for r in results]
intra_plot = [r[1] for r in results]
inter_plot = [r[2] for r in results]
# Shade the gap between the two curves.
ax1.fill_between(dims_plot, inter_plot, intra_plot, alpha=0.15, color="tab:green")
ax1.plot(dims_plot, intra_plot, "o-", color="tab:blue", linewidth=2,
         label="Intra-group (same category)", markersize=6)
ax1.plot(dims_plot, inter_plot, "s-", color="tab:red", linewidth=2,
         label="Inter-group (different category)", markersize=6)
ax1.axvline(x=best_k, color="green", linestyle="--", alpha=0.7,
            label=f"Best gap at k={best_k}")
ax1.set_xlabel("Number of PCA Components", fontsize=10)
ax1.set_ylabel("Average Cosine Similarity", fontsize=10)
ax1.set_title("(a) Intra- vs Inter-Group Similarity", fontsize=11, fontweight="bold")
ax1.legend(fontsize=7, loc="center right")
ax1.set_xscale("log")
ax1.grid(True, alpha=0.3)

# Plot 2: Gap (discriminability) vs number of PCA components
ax2 = fig.add_subplot(2, 3, 2)
gaps_plot = [r[3] for r in results]
ax2.plot(dims_plot, gaps_plot, "D-", color="tab:green", linewidth=2, markersize=7)
ax2.axvline(x=best_k, color="green", linestyle="--", alpha=0.7,
            label=f"Best k={best_k} (gap={best_gap:.3f})")
ax2.axhline(y=full_gap, color="gray", linestyle=":", alpha=0.7,
            label=f"Full 768d (gap={full_gap:.3f})")
# Highlight only the regions that beat the full-dimensionality baseline.
ax2.fill_between(dims_plot, full_gap, gaps_plot, alpha=0.12, color="tab:green",
                 where=[g > full_gap for g in gaps_plot])
ax2.set_xlabel("Number of PCA Components", fontsize=10)
ax2.set_ylabel("Gap (Intra − Inter)", fontsize=10)
ax2.set_title("(b) Discriminability vs Dimensionality", fontsize=11, fontweight="bold")
ax2.legend(fontsize=8)
ax2.set_xscale("log")
ax2.grid(True, alpha=0.3)

# Plot 3: Cumulative variance explained
pca_full = PCA(n_components=min(N, D), random_state=42)
pca_full.fit(X)
cumvar = np.cumsum(pca_full.explained_variance_ratio_) * 100
ax3 = fig.add_subplot(2, 3, 3)
ax3.plot(range(1, len(cumvar) + 1), cumvar, "-", color="tab:purple", linewidth=2)
ax3.axvline(x=best_k, color="green", linestyle="--", alpha=0.7,
            label=f"Best k={best_k}")
# Annotate how many components reach the 90/95/99% variance thresholds.
for threshold in [90, 95, 99]:
    k_thresh = np.searchsorted(cumvar, threshold) + 1
    if k_thresh <= len(cumvar):
        ax3.axhline(y=threshold, color="gray", linestyle=":", alpha=0.4)
        ax3.annotate(f"{threshold}% → k={k_thresh}", xy=(k_thresh, threshold),
                     fontsize=8, color="gray", ha="left",
                     xytext=(k_thresh + 1, threshold - 2))
ax3.set_xlabel("Number of PCA Components", fontsize=10)
ax3.set_ylabel("Cumulative Variance Explained (%)", fontsize=10)
ax3.set_title("(c) Variance Concentration", fontsize=11, fontweight="bold")
ax3.legend(fontsize=8)
ax3.set_xscale("log")
ax3.grid(True, alpha=0.3)
|
||||||
|
|
||||||
|
# ── Row 2 ──

# Plot 4 & 5: Side-by-side heatmaps (full vs PCA-denoised)
# Sort indices by group for a block-diagonal structure
sorted_idx = sorted(range(N), key=lambda i: all_labels[i])
sorted_names = [all_names[i] for i in sorted_idx]
sorted_labels = [all_labels[i] for i in sorted_idx]

# Reorder both similarity matrices with the same permutation.
sim_full_sorted = sim_full[np.ix_(sorted_idx, sorted_idx)]
sim_best_sorted = sim_best[np.ix_(sorted_idx, sorted_idx)]

for panel_idx, (mat, title_str) in enumerate([
    (sim_full_sorted, f"(d) Similarity Heatmap — Full 768d"),
    (sim_best_sorted, f"(e) Similarity Heatmap — PCA {best_k}d (Denoised)"),
]):
    ax = fig.add_subplot(2, 3, 4 + panel_idx)
    im = ax.imshow(mat, cmap="RdBu_r", vmin=-1, vmax=1, aspect="auto")
    ax.set_xticks(range(N))
    ax.set_yticks(range(N))
    ax.set_xticklabels(sorted_names, rotation=90, fontsize=5)
    ax.set_yticklabels(sorted_names, fontsize=5)

    # Draw group boundary lines
    prev_label = sorted_labels[0]
    for i, lab in enumerate(sorted_labels):
        if lab != prev_label:
            # A label change marks a category boundary in the sorted order.
            ax.axhline(y=i - 0.5, color="black", linewidth=1)
            ax.axvline(x=i - 0.5, color="black", linewidth=1)
        prev_label = lab

    ax.set_title(title_str, fontsize=11, fontweight="bold")
    plt.colorbar(im, ax=ax, shrink=0.8, label="Cosine Similarity")

# Plot 6: Bar chart comparing specific pairs at full vs PCA
ax6 = fig.add_subplot(2, 3, 6)
pair_labels = []
full_scores = []
pca_scores = []
pair_colors = []

# Collect per-pair similarities and a green/red color for same/different group.
for n1, n2 in interesting_pairs:
    i = all_names.index(n1)
    j = all_names.index(n2)
    pair_labels.append(f"{n1}\nvs {n2}")
    full_scores.append(sim_full[i, j])
    pca_scores.append(sim_best[i, j])
    pair_colors.append("#2ca02c" if all_labels[i] == all_labels[j] else "#d62728")

y_pos = np.arange(len(pair_labels))
bar_h = 0.35
# Two horizontal bars per pair, offset above/below the shared y position.
bars_full = ax6.barh(y_pos + bar_h / 2, full_scores, bar_h, label="Full 768d",
                     color="tab:blue", alpha=0.7)
bars_pca = ax6.barh(y_pos - bar_h / 2, pca_scores, bar_h, label=f"PCA {best_k}d",
                    color="tab:orange", alpha=0.7)

# Color labels by same/different group
for i, (yl, col) in enumerate(zip(pair_labels, pair_colors)):
    ax6.annotate("●", xy=(-0.05, y_pos[i]), fontsize=10, color=col,
                 ha="right", va="center", fontweight="bold",
                 annotation_clip=False)

ax6.set_yticks(y_pos)
ax6.set_yticklabels(pair_labels, fontsize=6)
ax6.set_xlabel("Cosine Similarity", fontsize=10)
ax6.set_title("(f) Pair Comparison: Full vs PCA Denoised", fontsize=11, fontweight="bold")
ax6.legend(fontsize=8)
ax6.axvline(x=0, color="black", linewidth=0.5)
ax6.set_xlim(-1.1, 1.1)
ax6.grid(True, axis="x", alpha=0.3)
ax6.invert_yaxis()

# Custom legend for the dots
from matplotlib.lines import Line2D
dot_legend = [Line2D([0], [0], marker="o", color="w", markerfacecolor="#2ca02c",
                     markersize=8, label="Same group"),
              Line2D([0], [0], marker="o", color="w", markerfacecolor="#d62728",
                     markersize=8, label="Different group")]
# Second legend call replaces the earlier one, merging bars and dot markers.
ax6.legend(handles=[bars_full, bars_pca] + dot_legend, fontsize=7, loc="lower right")

plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.savefig("pca_denoising_analysis.png", dpi=150, bbox_inches="tight")
print(f"\nSaved: pca_denoising_analysis.png")
|
||||||
|
|
||||||
|
# ── Summary ───────────────────────────────────────────────────────────────
# Single f-string report; the conditional expression inside picks the
# conclusion sentence depending on whether PCA beat the full baseline.
print(f"""
{'=' * 70}
CONCLUSIONS
{'=' * 70}

1. VARIANCE CONCENTRATION:
The first few PCA components capture a disproportionate amount of
variance. This means the embedding space has low effective
dimensionality — most of the 768 dimensions are semi-redundant.

2. DENOISING EFFECT:
At k={best_k}, the gap between intra-group and inter-group similarity
is {best_gap:.4f} (vs {full_gap:.4f} at full 768d).
{'PCA denoising IMPROVED discriminability by removing noisy dimensions.' if best_gap > full_gap else 'Full dimensionality was already optimal for this dataset.'}

3. PRACTICAL IMPLICATIONS:
- For retrieval (code search), moderate PCA reduction can sharpen
results while also reducing storage and computation.
- Too few dimensions (k=2,3) lose important signal.
- Too many dimensions may retain noise that dilutes similarity.
- The "sweet spot" depends on the dataset and task.

4. TRADE-OFF:
PCA denoising is a post-hoc technique. Newer embedding models are
trained with Matryoshka Representation Learning (MRL) that makes
the FIRST k dimensions maximally informative by design.
""")
|
||||||
93
Code embeddings/README.md
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
# Code Embeddings — Hands-On Examples
|
||||||
|
|
||||||
|
**AISE501 – AI in Software Engineering I**
|
||||||
|
Fachhochschule Graubünden — Spring Semester 2026
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Seven self-contained Python programs that demonstrate how embedding
|
||||||
|
models work. Each script loads a pre-trained model, embeds text or code
|
||||||
|
snippets, and explores a different capability of embeddings.
|
||||||
|
|
||||||
|
| # | Script | What it demonstrates |
|
||||||
|
|---|--------|---------------------|
|
||||||
|
| 0 | `00_tokens_and_embeddings_intro.py` | Tokenization basics and general text embeddings (German) |
|
||||||
|
| 1 | `01_basic_embeddings.py` | Compute code embeddings and pairwise cosine similarity |
|
||||||
|
| 2 | `02_text_to_code_search.py` | Semantic search: find code from natural language queries |
|
||||||
|
| 3 | `03_cross_language.py` | Same algorithm in 4 languages → similar embeddings |
|
||||||
|
| 4 | `04_clone_detection.py` | Detect duplicate/similar code in a simulated codebase |
|
||||||
|
| 5 | `05_visualize_embeddings.py` | PCA and t-SNE plots of the embedding space |
|
||||||
|
| 6 | `06_pca_denoising.py` | PCA denoising: fewer dimensions can improve similarity |
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
### 1. Create a virtual environment (recommended)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m venv venv
|
||||||
|
|
||||||
|
# macOS / Linux
|
||||||
|
source venv/bin/activate
|
||||||
|
|
||||||
|
# Windows
|
||||||
|
venv\Scripts\activate
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Install dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
**PyTorch GPU support:**
|
||||||
|
|
||||||
|
- **Apple Silicon Mac (M1/M2/M3/M4):** MPS acceleration works
|
||||||
|
out of the box with the standard PyTorch install. No extra steps needed.
|
||||||
|
- **NVIDIA GPU (Windows/Linux):** Install the CUDA version of PyTorch.
|
||||||
|
See https://pytorch.org/get-started/locally/ for the correct command
|
||||||
|
for your CUDA version.
|
||||||
|
- **CPU only:** Everything works on CPU too, just a bit slower.
|
||||||
|
|
||||||
|
### 3. Run any example
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python 00_tokens_and_embeddings_intro.py
|
||||||
|
python 01_basic_embeddings.py
|
||||||
|
python 02_text_to_code_search.py
|
||||||
|
python 03_cross_language.py
|
||||||
|
python 04_clone_detection.py
|
||||||
|
python 05_visualize_embeddings.py
|
||||||
|
python 06_pca_denoising.py
|
||||||
|
```
|
||||||
|
|
||||||
|
The first run will download the model (~300 MB). Subsequent runs
|
||||||
|
use the cached model.
|
||||||
|
|
||||||
|
## Model
|
||||||
|
|
||||||
|
All code embedding examples (01–06) use **st-codesearch-distilroberta-base**
|
||||||
|
(82M parameters), a DistilRoBERTa model fine-tuned on 1.38 million
|
||||||
|
code-comment pairs from CodeSearchNet using contrastive learning
|
||||||
|
(MultipleNegativesRankingLoss). It produces 768-dimensional embedding
|
||||||
|
vectors optimized for matching natural language descriptions to code,
|
||||||
|
making it ideal for semantic code search and similarity tasks.
|
||||||
|
|
||||||
|
The introductory example (00) uses **paraphrase-multilingual-mpnet-base-v2**
|
||||||
|
for demonstrating general language embeddings with German text.
|
||||||
|
|
||||||
|
## Hardware Requirements
|
||||||
|
|
||||||
|
- **RAM:** 1 GB free (for the model)
|
||||||
|
- **Disk:** ~500 MB (for the downloaded model, cached in `~/.cache/huggingface/`)
|
||||||
|
- **GPU:** Optional — all scripts auto-detect and use:
|
||||||
|
- CUDA (NVIDIA GPUs)
|
||||||
|
- MPS (Apple Silicon)
|
||||||
|
- CPU (fallback)
|
||||||
|
|
||||||
|
## Expected Output
|
||||||
|
|
||||||
|
Each script prints structured output with explanations. Example 5
|
||||||
|
saves two PNG images (`code_embeddings_pca.png` and
|
||||||
|
`code_embeddings_tsne.png`) showing the embedding space. Example 6
|
||||||
|
saves `pca_denoising_analysis.png` with six sub-plots analyzing
optimal embedding dimensions.
|
||||||
BIN
Code embeddings/code_embeddings_pca.png
Normal file
|
After Width: | Height: | Size: 107 KiB |
BIN
Code embeddings/code_embeddings_tsne.png
Normal file
|
After Width: | Height: | Size: 104 KiB |
BIN
Code embeddings/embedding_space_crosslingual.png
Normal file
|
After Width: | Height: | Size: 132 KiB |
BIN
Code embeddings/embedding_space_german.png
Normal file
|
After Width: | Height: | Size: 140 KiB |
BIN
Code embeddings/pca_denoising_analysis.png
Normal file
|
After Width: | Height: | Size: 398 KiB |
6
Code embeddings/requirements.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
torch
|
||||||
|
transformers
|
||||||
|
sentence-transformers
|
||||||
|
scikit-learn
|
||||||
|
matplotlib
|
||||||
|
numpy
|
||||||
BIN
Prompting Exercise/.DS_Store
vendored
Normal file
67
Prompting Exercise/analyze_me.py
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
"""
|
||||||
|
analyze_me.py – A data-processing script used in Exercise 2
|
||||||
|
==============================================================
|
||||||
|
This file contains several realistic bugs and style issues.
|
||||||
|
Do NOT fix them manually — in Exercise 2 the LLM will help you find them!
|
||||||
|
|
||||||
|
Can you spot the issues yourself before asking the LLM?
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_statistics(numbers):
    """Return count, sum, average, min, max and variance of *numbers*.

    NOTE: intentionally buggy fixture for Exercise 2 — the inline "Bug N"
    comments mark the planted issues; do not repair them in this file.
    """
    total = 0
    for n in numbers:
        total = total + n
    average = total / len(numbers) # Bug 1: ZeroDivisionError when list is empty

    min_val = numbers[0] # Bug 2: IndexError when list is empty
    max_val = numbers[0]
    for n in numbers:
        if n < min_val:
            min_val = n
        if n > max_val:
            max_val = n

    variance = 0
    for n in numbers:
        variance = variance + (n - average) ** 2
    variance = variance / len(numbers) # Bug 3: population variance (÷N), not sample variance (÷N-1)

    return {
        "count": len(numbers),
        "sum": total,
        "average": average,
        "min": min_val,
        "max": max_val,
        "variance": variance,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def process_data(filename):
    """Read one integer per line from *filename*, print and return its stats.

    NOTE: intentionally buggy fixture for Exercise 2 — the inline "Bug N"
    comments mark the planted issues; do not repair them in this file.
    """
    numbers = []
    f = open(filename) # Bug 4: no context manager (file may not be closed on error)
    for line in f:
        numbers.append(int(line.strip())) # Bug 5: int() crashes on floats and blank lines
    f.close()

    result = calculate_statistics(numbers)
    print("Statistics:", result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(numbers, method="minmax"):
    """Normalize *numbers* with the "minmax" or "zscore" method.

    NOTE: intentionally buggy fixture for Exercise 2 — the inline "Bug N"
    comments mark the planted issues; do not repair them in this file.
    """
    if method == "minmax":
        mn = min(numbers)
        mx = max(numbers)
        return [(x - mn) / mx - mn for x in numbers] # Bug 6: operator-precedence error
    elif method == "zscore":
        stats = calculate_statistics(numbers)
        std = stats["variance"] ** 0.5
        return [(x - stats["average"]) / std for x in numbers]
    else:
        print("Unknown normalisation method") # Bug 7: should raise ValueError, not just print
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Quick demo run on a fixed sample.
    sample = [4, 8, 15, 16, 23, 42]
    print(calculate_statistics(sample))
|
||||||
67
Prompting Exercise/analyze_me_blind.py
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
"""
|
||||||
|
analyze_me.py – A data-processing script used in Exercise 2
|
||||||
|
==============================================================
|
||||||
|
This file contains several realistic bugs and style issues.
|
||||||
|
Do NOT fix them manually — in Exercise 2 the LLM will help you find them!
|
||||||
|
|
||||||
|
Can you spot the issues yourself before asking the LLM?
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_statistics(numbers):
    """Return count, sum, average, min, max and variance of *numbers*.

    NOTE: "blind" exercise fixture — see the module docstring; bugs are
    intentionally left unmarked for students to find.
    """
    total = 0
    for n in numbers:
        total = total + n
    average = total / len(numbers)

    min_val = numbers[0]
    max_val = numbers[0]
    for n in numbers:
        if n < min_val:
            min_val = n
        if n > max_val:
            max_val = n

    variance = 0
    for n in numbers:
        variance = variance + (n - average) ** 2
    variance = variance / len(numbers)

    return {
        "count": len(numbers),
        "sum": total,
        "average": average,
        "min": min_val,
        "max": max_val,
        "variance": variance,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def process_data(filename):
    """Read one integer per line from *filename*, print and return its stats.

    NOTE: "blind" exercise fixture — see the module docstring; bugs are
    intentionally left unmarked for students to find.
    """
    numbers = []
    f = open(filename)
    for line in f:
        numbers.append(int(line.strip()))
    f.close()

    result = calculate_statistics(numbers)
    print("Statistics:", result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(numbers, method="minmax"):
    """Normalize *numbers* with the "minmax" or "zscore" method.

    NOTE: "blind" exercise fixture — see the module docstring; bugs are
    intentionally left unmarked for students to find.
    """
    if method == "minmax":
        mn = min(numbers)
        mx = max(numbers)
        return [(x - mn) / mx - mn for x in numbers]
    elif method == "zscore":
        stats = calculate_statistics(numbers)
        std = stats["variance"] ** 0.5
        return [(x - stats["average"]) / std for x in numbers]
    else:
        print("Unknown normalisation method")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Quick demo run on a fixed sample.
    sample = [4, 8, 15, 16, 23, 42]
    print(calculate_statistics(sample))
|
||||||
89
Prompting Exercise/analyze_me_blind_fix.py
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
import sys
|
||||||
|
|
||||||
|
"""
|
||||||
|
analyze_me.py – A data-processing script used in Exercise 2
|
||||||
|
==============================================================
|
||||||
|
This file contains several realistic bugs and style issues.
|
||||||
|
Do NOT fix them manually — in Exercise 2 the LLM will help you find them!
|
||||||
|
|
||||||
|
Can you spot the issues yourself before asking the LLM?
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_statistics(numbers):
    """Return count, sum, average, min, max and variance of *numbers*.

    Raises ValueError for an empty list. The variance reported is the
    population variance (division by N, not N-1).
    """
    if not numbers:
        raise ValueError("Cannot calculate statistics for an empty list.")

    total = 0
    for n in numbers:
        total = total + n
    average = total / len(numbers)

    min_val = numbers[0]
    max_val = numbers[0]
    for n in numbers:
        if n < min_val:
            min_val = n
        if n > max_val:
            max_val = n

    variance = 0
    for n in numbers:
        variance = variance + (n - average) ** 2
    variance = variance / len(numbers)  # population variance (÷ N)

    return {
        "count": len(numbers),
        "sum": total,
        "average": average,
        "min": min_val,
        "max": max_val,
        "variance": variance,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def process_data(filename):
    """Read one integer per line from *filename*, print and return its stats.

    Blank lines are skipped. FileNotFoundError and ValueError (non-integer
    content) are reported on stdout and then re-raised to the caller.
    """
    numbers = []
    try:
        with open(filename, 'r') as file_handle:
            for line in file_handle:
                stripped_line = line.strip()
                if stripped_line:
                    numbers.append(int(stripped_line))
    except FileNotFoundError:
        # Fix: the message previously contained a garbled literal instead of
        # the actual path; interpolate the offending filename.
        print(f"Error: File '{filename}' not found.")
        raise
    except ValueError as e:
        print(f"Error: Invalid integer in file: {e}")
        raise

    result = calculate_statistics(numbers)
    print("Statistics:", result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(numbers, method="minmax"):
    """Normalize *numbers* using "minmax" or "zscore".

    Raises ValueError for an empty list. Constant input maps to all zeros
    (avoids division by zero). An unrecognized method prints a message and
    returns an empty list.
    """
    if not numbers:
        raise ValueError("Cannot normalize an empty list.")

    if method == "minmax":
        mn = min(numbers)
        mx = max(numbers)
        if mx == mn:
            # All values identical — no range to scale by.
            return [0.0 for _ in numbers]
        return [(x - mn) / (mx - mn) for x in numbers]
    elif method == "zscore":
        stats = calculate_statistics(numbers)
        std = stats["variance"] ** 0.5
        if std == 0:
            # Zero spread — every z-score is 0.
            return [0.0 for _ in numbers]
        return [(x - stats["average"]) / std for x in numbers]
    else:
        print("Unknown normalization method")
        return []
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Quick demo run on a fixed sample.
    sample = [4, 8, 15, 16, 23, 42]
    print(calculate_statistics(sample))
|
||||||
|
|
||||||
192
Prompting Exercise/analyze_me_direct.py
Normal file
@ -0,0 +1,192 @@
|
|||||||
|
"""
|
||||||
|
analyze_me.py – A data-processing script used in Exercise 2
|
||||||
|
==============================================================
|
||||||
|
This module provides robust functions for calculating statistics,
|
||||||
|
processing data files, and normalizing numeric lists.
|
||||||
|
|
||||||
|
All functions include PEP-484 type hints and NumPy-style docstrings.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import List, Dict, Union, Any
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_statistics(numbers: List[Union[int, float]]) -> Dict[str, Any]:
    """
    Calculate basic statistics for a list of numbers.

    Parameters
    ----------
    numbers : List[Union[int, float]]
        The list of numeric values to analyze.

    Returns
    -------
    Dict[str, Any]
        A dictionary containing count, sum, average, min, max, and variance.
        If the input list is empty, returns a dictionary with zero values
        for all fields except count (which is 0).

    Notes
    -----
    - Variance is calculated using the sample variance formula (dividing by N-1).
    - A single-element list has no spread, so its variance is reported as 0.0
      (the N-1 formula would otherwise divide by zero).
    - If the list is empty, the function returns early to avoid division by zero
      or index errors.
    """
    count = len(numbers)

    if count == 0:
        return {
            "count": 0,
            "sum": 0.0,
            "average": 0.0,
            "min": 0.0,
            "max": 0.0,
            "variance": 0.0,
        }

    total = sum(numbers)
    average = total / count

    min_val = min(numbers)
    max_val = max(numbers)

    # Calculate sample variance (divide by N-1). Guard the single-element
    # case, which previously raised ZeroDivisionError on (count - 1).
    if count == 1:
        variance = 0.0
    else:
        variance_sum = sum((n - average) ** 2 for n in numbers)
        variance = variance_sum / (count - 1)

    return {
        "count": count,
        "sum": total,
        "average": average,
        "min": min_val,
        "max": max_val,
        "variance": variance,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def process_data(filename: str) -> Dict[str, Any]:
    """
    Read numeric data from a file and calculate statistics.

    Parameters
    ----------
    filename : str
        Path to the input file containing one number per line.
        Blank lines and non-numeric lines are skipped.

    Returns
    -------
    Dict[str, Any]
        The statistics dictionary returned by calculate_statistics().

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist.
    IOError
        If the file cannot be read.
    ValueError
        If the file contains no valid numbers.
    """
    numbers: List[Union[int, float]] = []

    try:
        with open(filename, 'r') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue
                try:
                    # Parse as float so both integer and float lines are accepted.
                    numbers.append(float(stripped))
                except ValueError:
                    # Skip non-numeric lines rather than aborting the whole file.
                    continue
    except FileNotFoundError as e:
        # Include the offending path in the message (the previous f-strings had
        # no placeholder, so the filename was never actually reported).
        raise FileNotFoundError(f"File not found: {filename}") from e
    except IOError as e:
        raise IOError(f"Error reading file {filename}: {e}") from e

    if not numbers:
        raise ValueError(f"No valid numeric data found in {filename}")

    result = calculate_statistics(numbers)
    print("Statistics:", result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(numbers: List[Union[int, float]], method: str = "minmax") -> List[float]:
    """
    Normalize a list of numbers using the specified method.

    Parameters
    ----------
    numbers : List[Union[int, float]]
        The numeric values to rescale.
    method : str, optional
        Either "minmax" (scale into [0, 1], the default) or "zscore"
        (standardize to mean 0 and unit standard deviation).

    Returns
    -------
    List[float]
        The normalized values. When every input value is identical,
        a list of zeros is returned for both methods.

    Raises
    ------
    ValueError
        If the list is empty or the method name is not recognized.
    """
    if not numbers:
        raise ValueError("Cannot normalize an empty list.")

    if method == "minmax":
        lo, hi = min(numbers), max(numbers)
        span = hi - lo
        if span == 0:
            # Constant input: every value maps to 0.0.
            return [0.0] * len(numbers)
        return [(value - lo) / span for value in numbers]

    if method == "zscore":
        summary = calculate_statistics(numbers)
        mean = summary["average"]
        std_dev = summary["variance"] ** 0.5
        if std_dev == 0:
            # Zero spread: every value maps to 0.0.
            return [0.0] * len(numbers)
        return [(value - mean) / std_dev for value in numbers]

    raise ValueError(f"Unknown normalization method: '{method}'. "
                     f"Supported methods: 'minmax', 'zscore'.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Smoke-test each public helper with a fixed sample and show the output.
    sample_values = [4, 8, 15, 16, 23, 42]

    print("Testing calculate_statistics:")
    print(calculate_statistics(sample_values))

    print("\nTesting normalize (minmax):")
    print(normalize(sample_values, "minmax"))

    print("\nTesting normalize (zscore):")
    print(normalize(sample_values, "zscore"))

    print("\nTesting empty list handling:")
    print(calculate_statistics([]))

    print("\nTesting unknown method error:")
    try:
        normalize(sample_values, "unknown")
    except ValueError as e:
        print(f"Caught expected error: {e}")

    print("\nAll sanity checks passed!")
|
||||||
89
Prompting Exercise/analyze_me_fix.py
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
import sys
|
||||||
|
|
||||||
|
"""
|
||||||
|
analyze_me.py – A data-processing script used in Exercise 2
|
||||||
|
==============================================================
|
||||||
|
This file contains several realistic bugs and style issues.
|
||||||
|
Do NOT fix them manually — in Exercise 2 the LLM will help you find them!
|
||||||
|
|
||||||
|
Can you spot the issues yourself before asking the LLM?
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_statistics(numbers):
    """Return count, sum, average, min, max and sample variance for *numbers*.

    An empty input yields a zeroed summary with ``None`` for min and max.
    """
    if not numbers:
        return {
            "count": 0,
            "sum": 0,
            "average": 0.0,
            "min": None,
            "max": None,
            "variance": 0.0,
        }

    count = len(numbers)
    total = sum(numbers)
    average = total / count

    # Sample variance (N - 1 denominator); a single value has no spread.
    if count > 1:
        variance = sum((n - average) ** 2 for n in numbers) / (count - 1)
    else:
        variance = 0.0

    return {
        "count": count,
        "sum": total,
        "average": average,
        "min": min(numbers),
        "max": max(numbers),
        "variance": variance,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def process_data(filename):
    """Load one number per line from *filename* and report its statistics.

    Blank lines and lines that do not parse as numbers are ignored.
    """
    values = []
    with open(filename) as fh:
        for raw_line in fh:
            text = raw_line.strip()
            if not text:
                continue
            try:
                values.append(float(text))
            except ValueError:
                # Non-numeric line: skip it and keep reading.
                continue

    summary = calculate_statistics(values)
    print("Statistics:", summary)
    return summary
|
||||||
|
|
||||||
|
|
||||||
|
def normalize(numbers, method="minmax"):
    """Rescale *numbers* with min-max scaling or z-score standardization.

    Constant input maps to all zeros; an unrecognized *method* raises
    ValueError.
    """
    if method == "minmax":
        lo, hi = min(numbers), max(numbers)
        if hi == lo:
            return [0.0] * len(numbers)
        span = hi - lo
        return [(value - lo) / span for value in numbers]
    if method == "zscore":
        summary = calculate_statistics(numbers)
        std_dev = summary["variance"] ** 0.5
        if std_dev == 0:
            return [0.0] * len(numbers)
        mean = summary["average"]
        return [(value - mean) / std_dev for value in numbers]
    raise ValueError(f"Unknown normalization method: {method}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Quick manual smoke test with a fixed demo list.
    demo = [4, 8, 15, 16, 23, 42]
    print(calculate_statistics(demo))
|
||||||
|
|
||||||
216
Prompting Exercise/analyze_me_fixed.py
Normal file
@ -0,0 +1,216 @@
|
|||||||
|
"""
analyze_me_fixed.py – Corrected version of the Exercise 2 script
=================================================================
This is the fixed counterpart of analyze_me.py: the bugs and style issues
hunted in Exercise 2 have been addressed here (the step comments below mark
each fix).

Compare this file against analyze_me.py to check your own findings.
"""
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_statistics(numbers: list[float]) -> dict[str, float]:
    """
    Calculate basic statistical measures for a list of numbers.

    This function computes the count, sum, average, minimum, maximum, and
    sample variance (N - 1 denominator) of the provided list of numbers.

    Parameters
    ----------
    numbers : list[float]
        A list of numeric values to analyze.

    Returns
    -------
    dict[str, float]
        A dictionary containing the following keys:
        - 'count': The number of elements in the list.
        - 'sum': The sum of all elements.
        - 'average': The arithmetic mean of the elements.
        - 'min': The minimum value in the list.
        - 'max': The maximum value in the list.
        - 'variance': The sample variance of the elements (0.0 when the
          list has fewer than two elements).

    Notes
    -----
    An empty input does not raise; it returns an all-zero summary.
    (The previous docstring claimed population variance and documented
    ZeroDivisionError/IndexError for empty input — neither matched the code.)
    """
    # Step 2 – Implement empty list handling in calculate_statistics
    if not numbers:
        return {
            "count": 0,
            "sum": 0.0,
            "average": 0.0,
            "min": 0.0,
            "max": 0.0,
            "variance": 0.0,
        }

    count = len(numbers)
    total = sum(numbers)
    average = total / count

    min_val = min(numbers)
    max_val = max(numbers)

    # Step 3 – Correct variance calculation to use sample variance
    if count > 1:
        variance = sum((n - average) ** 2 for n in numbers) / (count - 1)
    else:
        variance = 0.0

    return {
        "count": count,
        "sum": total,
        "average": average,
        "min": min_val,
        "max": max_val,
        "variance": variance,
    }
|
||||||
|
|
||||||
|
|
||||||
|
# Step 4 – Define type hints and docstrings for process_data
def process_data(filename: str) -> dict[str, float]:
    """
    Read numeric data from a file and compute statistics.

    This function opens a text file, reads each line, converts it to a float
    (so both integer and decimal lines are accepted), and collects the values
    into a list. It then passes this list to calculate_statistics to compute
    and return the statistical summary.

    Parameters
    ----------
    filename : str
        The path to the text file containing one number per line.

    Returns
    -------
    dict[str, float]
        A dictionary containing the statistical measures computed from the
        file data.

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist.

    Notes
    -----
    Blank lines and lines that cannot be parsed as numbers are silently
    skipped, so no ValueError is raised for malformed lines.
    """
    numbers = []
    # Step 5 – Implement context manager and robust line parsing in process_data
    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            try:
                # Attempt to convert to float first to handle both ints and floats
                value = float(stripped)
                numbers.append(value)
            except ValueError:
                # Skip lines that cannot be converted to a number
                continue

    result = calculate_statistics(numbers)
    print("Statistics:", result)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
# Step 6 – Define type hints and docstrings for normalize
def normalize(numbers: list[float], method: str = "minmax") -> list[float]:
    """
    Normalize a list of numbers using the specified method.

    Supported methods are 'minmax' (scale into [0, 1]) and 'zscore'
    (standardize to mean 0 and standard deviation 1).

    Parameters
    ----------
    numbers : list[float]
        A list of numeric values to normalize.
    method : str, optional
        Either 'minmax' (the default) or 'zscore'.

    Returns
    -------
    list[float]
        A list of normalized values.

    Raises
    ------
    ValueError
        If an unknown normalization method is provided.
    ZeroDivisionError
        If 'minmax' is used on a list where all values are identical
        (range is 0), or if 'zscore' is used on a list with zero
        standard deviation.

    Examples
    --------
    >>> normalize([1, 2, 3, 4, 5])
    [0.0, 0.25, 0.5, 0.75, 1.0]
    """
    if method == "minmax":
        low, high = min(numbers), max(numbers)
        # Step 7 – Fix operator precedence bug in minmax normalization
        spread = high - low
        return [(value - low) / spread for value in numbers]
    if method == "zscore":
        summary = calculate_statistics(numbers)
        mean = summary["average"]
        std_dev = summary["variance"] ** 0.5
        return [(value - mean) / std_dev for value in numbers]
    # Step 8 – Replace print statement with ValueError for unknown methods
    raise ValueError(f"Unknown normalisation method: {method}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Step 9 – Implement and verify main block sanity checks
    sample = [4, 8, 15, 16, 23, 42]
    stats = calculate_statistics(sample)

    # Verify expected values for sample data
    expected_sum = 4 + 8 + 15 + 16 + 23 + 42
    expected_count = 6
    expected_avg = expected_sum / expected_count

    assert stats["count"] == expected_count, f"Count mismatch: {stats['count']} != {expected_count}"
    assert stats["sum"] == expected_sum, f"Sum mismatch: {stats['sum']} != {expected_sum}"
    # Float comparison uses a tolerance rather than exact equality.
    assert abs(stats["average"] - expected_avg) < 1e-9, f"Average mismatch: {stats['average']} != {expected_avg}"
    assert stats["min"] == 4, f"Min mismatch: {stats['min']} != 4"
    assert stats["max"] == 42, f"Max mismatch: {stats['max']} != 42"

    # Test empty list handling — must come back as an all-zero summary, not raise.
    empty_stats = calculate_statistics([])
    assert empty_stats["count"] == 0, "Empty list count should be 0"
    assert empty_stats["sum"] == 0.0, "Empty list sum should be 0.0"
    assert empty_stats["average"] == 0.0, "Empty list average should be 0.0"
    assert empty_stats["min"] == 0.0, "Empty list min should be 0.0"
    assert empty_stats["max"] == 0.0, "Empty list max should be 0.0"
    assert empty_stats["variance"] == 0.0, "Empty list variance should be 0.0"

    # Test normalization against hand-computed min-max values.
    normalized = normalize([1, 2, 3, 4, 5])
    expected_normalized = [0.0, 0.25, 0.5, 0.75, 1.0]
    assert len(normalized) == 5, "Normalized list length mismatch"
    for i, val in enumerate(normalized):
        assert abs(val - expected_normalized[i]) < 1e-9, f"Normalized value mismatch at index {i}"

    print("All sanity checks passed!")
|
||||||
142
Prompting Exercise/ex01_xml_prompting.py
Normal file
@ -0,0 +1,142 @@
|
|||||||
|
"""
Exercise 1 – Basic XML Structured Prompting
============================================
AISE501 · Prompting in Coding · Spring Semester 2026

Learning goals
--------------
* Connect to the local LLM server and send your first prompt.
* Understand the difference between unstructured and XML-structured prompts.
* See how structure helps the model parse and prioritise different parts
  of your request.

Tasks
-----
Part A   Run the unstructured prompt (already done for you). Read the response.
Part B   Complete the XML-structured version of the same request (TODOs 1-3).
Part C   Add a system prompt to set the response style (TODOs 4-5).

"""

from server_utils import chat, get_client, print_messages, print_separator

client = get_client()


# ── Part A: Unstructured (Zero-Shot) Prompt ───────────────────────────────────
# This section is complete. Run it, read the response, then move on.

print_separator("Part A – Unstructured Prompt")

unstructured_messages = [
    {
        "role": "user",
        "content": (
            "Explain what a Python list comprehension is, "
            "give an example that filters even numbers from a list, "
            "and list two common mistakes beginners make."
        ),
    }
]

# print_messages(unstructured_messages)  # ← always inspect what you send!
# response_a = chat(client, unstructured_messages)
# print(response_a)


# ── Part B: Structured Prompt with XML Tags ───────────────────────────────────
# Use XML tags to structure the same request more precisely.
# Named sections help the model parse and prioritise your intent.

print_separator("Part B – Structured Prompt with XML Tags")

# TODO 1: Fill in the three XML sections below.
# Use the same topic as Part A but make each section specific.
#
#   <topic>   – the Python concept to explain
#   <example> – what the code example should demonstrate
#   <focus>   – two or three specific points you want covered in the answer
#
# Tip: XML tag names are arbitrary — choose names that make sense to a
# human reader and the model will understand them too.

structured_content = """\
<request>
    <topic>
        Python list comprehensions
    </topic>
    <example>
        Filter even numbers from a list
    </example>
    <focus>
        Syntax overview and two common beginner mistakes
    </focus>
</request>"""

# TODO 2: Build the messages list.
# Use structured_content as the content of a "user" message.
#
# Reminder: messages is a list of dicts with keys "role" and "content".
# "role" is one of "system", "user", or "assistant".

structured_messages = [
    # TODO: add the user message dict here
    {
        "role": "user",
        "content": structured_content,
    }
]

# TODO 3: Call chat() with structured_messages, store the result, print it.
# Compare the output with response_a above.
# Always call print_messages() before chat() to see the full prompt.

# print_messages(structured_messages)
# response_b = chat(client, structured_messages)
# print(response_b)


# ── Part C: Adding a System Prompt ────────────────────────────────────────────
# A system prompt lets you define a persona and global rules for every
# response in the conversation without repeating yourself each time.

print_separator("Part C – Adding a System Prompt")

# TODO 4: Write an XML-structured system prompt that defines:
#   <persona>     – who the LLM should be
#   <style>       – tone and formatting rules
#   <constraints> – length or content limits
#
# Example persona: "experienced Python tutor who always shows code first"

system_content = """\
<request>
    <persona>You are a master python developer and teacher</persona>
    <style>You follow the PEP 8 style guide</style>
    <constraints>Format your response in json</constraints>
</request>
"""

# TODO 5: Build a messages list that puts the system prompt FIRST (role="system"),
# followed by the structured user message from Part B.
# Call chat() and print the result.
#
# Reflection: How did the system prompt change the answer compared to Part B?

# The system message must precede the user message so it governs the reply.
messages_c = [
    {"role": "system", "content": system_content},
    {"role": "user", "content": structured_content}
]
print_messages(messages_c)
response_c = chat(client, messages_c)
print(response_c)


# ── Reflection Questions ──────────────────────────────────────────────────────
print_separator("Reflection Questions")
print(
    "1. How did XML structure change the format and depth of the response?\n"
    "2. What happens if you use inconsistent or missing closing tags?\n"
    "3. When would you NOT bother with XML structure?\n"
    "4. How does the system prompt interact with the user message?\n"
)
|
||||||
91
Prompting Exercise/ex01_xml_prompting_solution.py
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
"""
Exercise 1 – SOLUTION – Basic XML Structured Prompting
=======================================================
AISE501 · Prompting in Coding · Spring Semester 2026
"""

from server_utils import chat, get_client, print_messages, print_separator

client = get_client()
# Shared sampling temperature for the structured calls below.
temperature_value=0.3


# ── Part A: Unstructured (Zero-Shot) Prompt ───────────────────────────────────
print_separator("Part A – Unstructured Prompt")

unstructured_messages = [
    {
        "role": "user",
        "content": (
            "Explain what a Python list comprehension is, "
            "give an example that filters even numbers from a list, "
            "and list two common mistakes beginners make."
        ),
    }
]

print_messages(unstructured_messages)
response_a = chat(client, unstructured_messages)
print(response_a)


# ── Part B: Structured Prompt with XML Tags ───────────────────────────────────
print_separator("Part B – Structured Prompt with XML Tags")

structured_content = """\
<request>
    <topic>
        Python list comprehensions
    </topic>
    <example>
        A list comprehension that takes a list of integers and returns only
        the even numbers, using a conditional filter expression.
    </example>
    <focus>
        1. The general syntax: [expression for item in iterable if condition]
        2. Two common beginner mistakes when writing list comprehensions
    </focus>
</request>"""

structured_messages = [
    {"role": "user", "content": structured_content}
]

print_messages(structured_messages)
response_b = chat(client, structured_messages, temperature=temperature_value)
print(response_b)


# ── Part C: Adding a System Prompt ────────────────────────────────────────────
print_separator("Part C – Adding a System Prompt")

system_content = """\
<persona>
You are an experienced Python tutor. You teach Python to university students
who have basic programming knowledge but are new to idiomatic Python.
</persona>
<style>
Always show a working code snippet first, then explain it step by step.
Use plain language. Avoid jargon without defining it. Write python in PEP8 style
</style>
<constraints>
Keep each answer under 200 words. Use at most one code block per response.
</constraints>"""

# System prompt first so it applies to the whole conversation.
messages_c = [
    {"role": "system", "content": system_content},
    {"role": "user", "content": structured_content},
]

print_messages(messages_c)
response_c = chat(client, messages_c,temperature=temperature_value)
print(response_c)


# ── Reflection Questions ──────────────────────────────────────────────────────
print_separator("Reflection Questions")
print(
    "1. How did XML structure change the format and depth of the response?\n"
    "2. What happens if you use inconsistent or missing closing tags?\n"
    "3. When would you NOT bother with XML structure?\n"
    "4. How does the system prompt interact with the user message?\n"
)
|
||||||
151
Prompting Exercise/ex02_persona_task_data.py
Normal file
@ -0,0 +1,151 @@
|
|||||||
|
"""
Exercise 2 – Persona, Task, and Data in a Structured Prompt
============================================================
AISE501 · Prompting in Coding · Spring Semester 2026

Learning goals
--------------
* Use XML tags to separate three prompt concerns: WHO the LLM is,
  WHAT it should do, and the DATA it should work with.
* Pass a real Python file as context (RAG-style) inside a <code> tag.
* Iterate on the prompt to extract more specific information.

The file analyze_me.py contains several bugs and style issues.
You will ask the LLM to find and explain them.

Tasks
-----
Part A   Build a structured prompt with <persona>, <task>, and <code> tags
         and ask the LLM to review analyze_me.py (TODOs 1-4).
Part B   Refine the prompt to request a prioritised bug list (TODOs 5-6).
Part C   Ask for a corrected version of one specific function (TODO 7).

"""

from pathlib import Path

from server_utils import chat, get_client, print_messages, print_separator

client = get_client()

# Read the file we want the LLM to analyse
code_to_review = Path("analyze_me.py").read_text()


# ── Part A: Persona + Task + Code ─────────────────────────────────────────────
print_separator("Part A – Structured Prompt: Persona / Task / Code")

# TODO 1: Fill in the <persona> tag.
# Define a senior Python engineer who is rigorous about correctness
# and follows PEP-8 and best practices.

# TODO 2: Fill in the <task> tag.
# Ask the LLM to review the Python code and identify ALL bugs,
# listing each one with a short explanation of why it is a bug.

# TODO 3: The <code> tag already contains the file — do not change it.

# TODO 4: Build the messages list using only a user message (no system prompt yet).
# Call chat() and print the result.

prompt_a = f"""\
<persona>
You are a Python engineer who is rigorous about correctness and follows PEP-8 and best practices.
</persona>

<task>
Review the Python code and identify ALL bugs, listing each one with a short explanation of why it is a bug.
</task>

<code language="python" filename="analyze_me.py">
{code_to_review}
</code>"""

messages_a = [
    {"role": "user", "content": prompt_a}
]

# print_messages(messages_a)
# response_a = chat(client, messages_a)
# print(response_a)


# ── Part B: Refine – Ask for a Prioritised Bug List ───────────────────────────
print_separator("Part B – Refined Prompt: Prioritised Bug List")

# TODO 5: Extend the <task> from Part A to ask the LLM to:
#   - Separate bugs by severity: Critical / Medium / Style
#   - For each bug: state the line number, the problem, and a one-line fix hint
#
# Tip: add a <output_format> tag that describes exactly how you want the answer
# structured (plain text for now — we tackle real machine output in Ex 3).

# TODO 6: Build messages_b with a system prompt that reinforces the persona
# and a user message with the refined prompt.
# Call chat() and print the result.

system_b = """\
<request>
    <persona>You are a master python developer and teacher</persona>
    <style>You follow the PEP 8 style guide</style>
    <constraints>Format your response in json</constraints>
</request>
"""

# NOTE: the task text previously read "or each bug" — fixed to "For each bug"
# so the model receives the instruction the exercise intends.
prompt_b = f"""\
<persona>
You are a Python engineer who is rigorous about correctness and follows PEP-8 and best practices.
</persona>

<task>
Review the Python code and identify ALL bugs, listing each one with a short explanation of why it is a bug.
Separate bugs by severity: Critical / Medium / Style
For each bug: state the line number, the problem, and a one-line fix hint
</task>

<output_format>
...
</output_format>

<code language="python" filename="analyze_me.py">
{code_to_review}
</code>"""

messages_b = [
    {"role": "system", "content": system_b},
    {"role": "user", "content": prompt_b},
]
print_messages(messages_b)
response_b = chat(client, messages_b)
print(response_b)


# ── Part C: Request a Corrected Function ──────────────────────────────────────
print_separator("Part C – Ask for a Corrected Function")

# TODO 7: Pick one buggy function from analyze_me.py (e.g. calculate_statistics).
# Write a new user message — continuing the SAME conversation as Part B —
# that asks the LLM to rewrite that function with all bugs fixed,
# including proper type hints and a docstring.
#
# Key insight: you can reuse the model's previous response by appending it to
# the messages list as an "assistant" message, then adding a new "user" message.
# This is how multi-turn conversations work with the API.

messages_c = messages_b + [
    {"role": "assistant", "content": response_b},  # LLM's previous answer
    {"role": "user", "content": "Fix all bugs, keep the rest as it is"},
]
print_messages(messages_c)
response_c = chat(client, messages_c)
print(response_c)


# ── Reflection Questions ──────────────────────────────────────────────────────
print_separator("Reflection Questions")
print(
    "1. Did the LLM find all 7 bugs? Which did it miss?\n"
    "2. How did the <output_format> tag change the structure of the answer?\n"
    "3. What is the advantage of continuing a conversation vs. starting fresh?\n"
    "4. How would you scale this pattern to a large codebase (many files)?\n"
)
|
||||||
122
Prompting Exercise/ex02_persona_task_data_solution.py
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
"""
|
||||||
|
Exercise 2 – SOLUTION – Persona, Task, and Data in a Structured Prompt
|
||||||
|
=======================================================================
|
||||||
|
AISE501 · Prompting in Coding · Spring Semester 2026
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from server_utils import chat, get_client, print_messages, print_separator
|
||||||
|
|
||||||
|
client = get_client()
|
||||||
|
|
||||||
|
code_to_review = Path("analyze_me.py").read_text()
|
||||||
|
temperature_value=1
|
||||||
|
|
||||||
|
# ── Part A: Persona + Task + Code ─────────────────────────────────────────────
|
||||||
|
print_separator("Part A – Structured Prompt: Persona / Task / Code")
|
||||||
|
|
||||||
|
prompt_a = f"""\
|
||||||
|
<persona>
|
||||||
|
You are a senior Python engineer with 10+ years of experience.
|
||||||
|
You are rigorous about correctness, follow PEP-8 strictly, and care
|
||||||
|
deeply about defensive programming and readable code.
|
||||||
|
</persona>
|
||||||
|
|
||||||
|
<task>
|
||||||
|
Review the Python code provided below.
|
||||||
|
Identify every bug and code-quality issue you can find.
|
||||||
|
For each issue, state what is wrong and why it is a problem.
|
||||||
|
</task>
|
||||||
|
|
||||||
|
<code language="python" filename="analyze_me.py">
|
||||||
|
{code_to_review}
|
||||||
|
</code>"""
|
||||||
|
|
||||||
|
messages_a = [
|
||||||
|
{"role": "user", "content": prompt_a}
|
||||||
|
]
|
||||||
|
|
||||||
|
print_messages(messages_a)
|
||||||
|
response_a = chat(client, messages_a, temperature=temperature_value)
|
||||||
|
print(response_a)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part B: Refine – Ask for a Prioritised Bug List ───────────────────────────
|
||||||
|
print_separator("Part B – Refined Prompt: Prioritised Bug List")
|
||||||
|
|
||||||
|
system_b = """\
|
||||||
|
You are a senior Python engineer performing a thorough code review.
|
||||||
|
Be concise, precise, and always refer to line numbers when available.
|
||||||
|
"""
|
||||||
|
|
||||||
|
prompt_b = f"""\
|
||||||
|
<persona>
|
||||||
|
You are a senior Python engineer with 10+ years of experience.
|
||||||
|
You are rigorous about correctness, follow PEP-8, and care about
|
||||||
|
defensive programming and readable code.
|
||||||
|
</persona>
|
||||||
|
|
||||||
|
<task>
|
||||||
|
Review the Python code below.
|
||||||
|
Identify every bug and code-quality issue.
|
||||||
|
Classify each finding by severity:
|
||||||
|
- Critical : causes a crash or wrong result under normal use
|
||||||
|
- Medium : bad practice that will cause problems in production
|
||||||
|
- Style : violates PEP-8 or reduces readability
|
||||||
|
</task>
|
||||||
|
|
||||||
|
<output_format>
|
||||||
|
For each finding produce exactly this structure (plain text):
|
||||||
|
[SEVERITY] Line <N>: <one-sentence problem description>
|
||||||
|
Fix hint: <one-sentence suggestion>
|
||||||
|
|
||||||
|
Group findings under headings: ## Critical, ## Medium, ## Style
|
||||||
|
</output_format>
|
||||||
|
|
||||||
|
<code language="python" filename="analyze_me.py">
|
||||||
|
{code_to_review}
|
||||||
|
</code>"""
|
||||||
|
|
||||||
|
messages_b = [
|
||||||
|
{"role": "system", "content": system_b},
|
||||||
|
{"role": "user", "content": prompt_b},
|
||||||
|
]
|
||||||
|
|
||||||
|
print_messages(messages_b)
|
||||||
|
response_b = chat(client, messages_b, temperature=temperature_value)
|
||||||
|
print(response_b)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part C: Request a Corrected Function ──────────────────────────────────────
|
||||||
|
print_separator("Part C – Ask for a Corrected Function")
|
||||||
|
|
||||||
|
followup = """\
|
||||||
|
<task>
|
||||||
|
Rewrite only the `calculate_statistics` function with all bugs fixed.
|
||||||
|
Requirements:
|
||||||
|
- Handle an empty list gracefully (return None or raise ValueError with a clear message)
|
||||||
|
- Use sample variance (divide by N-1)
|
||||||
|
- Add full PEP-8 type hints
|
||||||
|
- Add a NumPy-style docstring
|
||||||
|
Return only the function code, no surrounding explanation.
|
||||||
|
</task>"""
|
||||||
|
|
||||||
|
messages_c = messages_b + [
|
||||||
|
{"role": "assistant", "content": response_b},
|
||||||
|
{"role": "user", "content": followup},
|
||||||
|
]
|
||||||
|
|
||||||
|
print_messages(messages_c)
|
||||||
|
response_c = chat(client, messages_c, temperature=temperature_value)
|
||||||
|
print(response_c)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Reflection Questions ──────────────────────────────────────────────────────
|
||||||
|
print_separator("Reflection Questions")
|
||||||
|
print(
|
||||||
|
"1. Did the LLM find all 7 bugs? Which did it miss?\n"
|
||||||
|
"2. How did the <output_format> tag change the structure of the answer?\n"
|
||||||
|
"3. What is the advantage of continuing a conversation vs. starting fresh?\n"
|
||||||
|
"4. How would you scale this pattern to a large codebase (many files)?\n"
|
||||||
|
)
|
||||||
231
Prompting Exercise/ex03_structured_output.py
Normal file
@ -0,0 +1,231 @@
|
|||||||
|
"""
|
||||||
|
Exercise 3 – Structured Input and Structured Output
|
||||||
|
====================================================
|
||||||
|
AISE501 · Prompting in Coding · Spring Semester 2026
|
||||||
|
|
||||||
|
Learning goals
|
||||||
|
--------------
|
||||||
|
* Request machine-parseable output (JSON and YAML) from the LLM.
|
||||||
|
* Parse the JSON response in Python and use it programmatically.
|
||||||
|
* Build a second prompt dynamically from the parsed data.
|
||||||
|
* Understand why structured output is essential for LLM pipelines.
|
||||||
|
|
||||||
|
Tasks
|
||||||
|
-----
|
||||||
|
Part A Ask the LLM to review analyze_me.py and return a JSON report (TODOs 1-4).
|
||||||
|
Part B Parse the JSON response and print a summary table (TODOs 5-6).
|
||||||
|
Part C Use the parsed data to build a follow-up prompt automatically (TODOs 7-8).
|
||||||
|
Part D Repeat Part A but request YAML instead of JSON (TODO 9).
|
||||||
|
|
||||||
|
Estimated time: 40-50 minutes
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from server_utils import chat, chat_json, get_client, print_messages, print_separator
|
||||||
|
|
||||||
|
client = get_client()  # OpenAI-compatible client configured in server_utils

# Buggy module under review; every prompt in this exercise embeds it verbatim.
code_to_review = Path("analyze_me.py").read_text()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part A: Structured Input → JSON Output ────────────────────────────────────
print_separator("Part A – Request JSON Output")

# TODO 1: Write a system prompt that instructs the model to ALWAYS respond
#         with valid JSON and nothing else (no markdown fences, no explanation).

system_a = """\
<request>
<persona>You are a master python tutor</persona>
<style>You follow the PEP 8 style guide</style>
<constraints>Only respond in a json format following the user provided schema</constraints>
</request>
"""

# TODO 2: Write the user prompt.
#         Use XML tags for <persona>, <task>, and <code>.
#
#         In <task>, specify the exact JSON schema you expect:
#
schema = """{
  "summary": "<one sentence overview>",
  "bugs": [
    {
      "id": 1,
      "severity": "Critical|Medium|Style",
      "line": <int or null>,
      "function": "<function name>",
      "description": "<what is wrong>",
      "fix": "<one-sentence fix hint>"
    },
    ...
  ],
  "overall_quality": "Poor|Fair|Good|Excellent"
}"""
#
# Tip: paste the schema directly inside a <schema> tag in your prompt.

# Fix: the leftover exercise-template lines ("TODO: Write your structured
# prompt here…") were still inside the f-string and were being sent to the
# model verbatim; they are removed so only the intended prompt remains.
prompt_a = f"""\
<persona>
You are a Python engineer who is rigorous about correctness and follows PEP-8 and best practices.
</persona>

<task>
Review the Python code and identify ALL bugs.
Explain all the bugs you found using the schema provided.
</task>

<schema>
{schema}
</schema>

<code language="python" filename="analyze_me.py">
{code_to_review}
</code>"""

messages_a = [
    # TODO 3: build the messages list (system + user)
    {"role": "system", "content": system_a},
    {"role": "user", "content": prompt_a},
]

# TODO 4: call chat_json() and store the raw response string in raw_json_a.
#         chat_json() adds response_format={"type": "json_object"} so the
#         server guarantees the output is parseable by json.loads().
print_messages(messages_a)
raw_json_a = chat_json(client, messages_a)
print("Raw response:")
print(raw_json_a)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part B: Parse the JSON and Display a Summary ──────────────────────────────
print_separator("Part B – Parse JSON and Print Summary")

# TODO 5: Parse raw_json_a with json.loads().
#         Handle the case where the model returned malformed JSON
#         (wrap in try/except and print a helpful error message).

# Fix: TODO 5 explicitly requires handling malformed JSON; the bare
# json.loads() call crashed the whole script with a raw traceback.
try:
    report = json.loads(raw_json_a)
except json.JSONDecodeError as exc:
    print(f"ERROR: the model returned malformed JSON ({exc}). Raw response:")
    print(raw_json_a)
    raise SystemExit(1)

# TODO 6: Print a formatted summary table like this:
#
#   Overall quality : Fair
#   Summary         : ...
#
#   ID | Severity | Line | Function              | Description
#   ---+----------+------+-----------------------+---------------------------
#   1  | Critical | 12   | calculate_statistics  | ZeroDivisionError on ...
#   2  | ...
#
# Hint: use f-strings and ljust() / rjust() for alignment.

# .get() guards: LLM output may omit top-level keys despite the schema.
print(f"Overall quality : {report.get('overall_quality', 'unknown')}")
print(f"Summary : {report.get('summary', '')}\n")

bugs = report.get("bugs", [])
if bugs:
    headers = {
        "id": "ID",
        "severity": "Severity",
        "line": "Line",
        "function": "Function",
        "description": "Description",
    }

    # Compute column widths; str()/.get() guard against missing or null fields
    # (b[key] raised KeyError and len() failed on non-strings).
    widths = {
        key: max(len(headers[key]), *(len(str(b.get(key, ""))) for b in bugs))
        for key in headers
    }

    # Header row
    print(
        f"{headers['id'].ljust(widths['id'])} | "
        f"{headers['severity'].ljust(widths['severity'])} | "
        f"{headers['line'].ljust(widths['line'])} | "
        f"{headers['function'].ljust(widths['function'])} | "
        f"{headers['description']}"
    )

    # Separator row
    print(
        f"{'-' * widths['id']}-+-"
        f"{'-' * widths['severity']}-+-"
        f"{'-' * widths['line']}-+-"
        f"{'-' * widths['function']}-+-"
        f"{'-' * widths['description']}"
    )

    # Data rows — every field coerced to str so None/ints format cleanly.
    for bug in bugs:
        print(
            f"{str(bug.get('id', '')).ljust(widths['id'])} | "
            f"{str(bug.get('severity', '')).ljust(widths['severity'])} | "
            f"{str(bug.get('line', '')).ljust(widths['line'])} | "
            f"{str(bug.get('function', '')).ljust(widths['function'])} | "
            f"{bug.get('description', '')}"
        )
|
||||||
|
|
||||||
|
# ── Part C: Use the Parsed Data to Build a Follow-Up Prompt ──────────────────
print_separator("Part C – Dynamic Follow-Up Prompt from Parsed Data")

# TODO 7: Select all bugs with severity "Critical" from the parsed report.
#         Build a new user prompt that:
#         - Lists each critical bug by ID and description
#         - Asks the LLM to provide the corrected code for each one
#         - Requests the output as a JSON OBJECT (not a bare array, because
#           response_format=json_object requires an object at the top level):
#           {"fixes": [{"bug_id": 1, "fixed_code": "..."}, ...]}
#
#         Tip: wrap the schema in a {"fixes": [...]} object so chat_json() works.

# Keep only the findings the report marked Critical; these drive the follow-up.
critical_bugs = [b for b in report["bugs"] if b["severity"] == "Critical"]

# Placeholder text — replace per TODO 7 with a prompt built from critical_bugs.
followup_prompt = """\
TODO: Build the follow-up prompt dynamically using the critical_bugs list.
Loop over critical_bugs to embed each bug's description in the prompt.
"""

# TODO 8: Continue the conversation (multi-turn) by appending the previous
#         response and the new prompt, then call chat_json() and parse the result.
#         Because the schema is {"fixes": [...]}, extract the list with ["fixes"].

# Reference shape for TODO 8 (left commented out for the student to complete):
# messages_c = messages_a + [
#     {"role": "assistant", "content": raw_json_a},
#     {"role": "user", "content": followup_prompt},
# ]
# print_messages(messages_c)
# raw_json_c = chat_json(client, messages_c)
# fixes = json.loads(raw_json_c)["fixes"]
# for fix in fixes:
#     print(f"\n--- Fix for bug {fix['bug_id']} ---")
#     print(fix["fixed_code"])
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part D: Request YAML Instead of JSON ─────────────────────────────────────
print_separator("Part D – YAML Output")

# TODO 9: Repeat Part A but ask for YAML output instead of JSON.
#         Install PyYAML if needed: pip install pyyaml
#         Parse the response with yaml.safe_load() and print the result.
#
#         Question: Which format do you prefer for human-readable reports? For
#         machine-to-machine pipelines?

# import yaml
# ...


# ── Reflection Questions ──────────────────────────────────────────────────────
print_separator("Reflection Questions")
# Printed unconditionally so the questions close out every run of the script.
print(
    "1. What can go wrong when asking an LLM to return JSON?\n"
    "2. How did the <schema> tag influence the output structure?\n"
    "3. Why is structured output important for building LLM pipelines?\n"
    "4. When would you use JSON vs. YAML vs. plain text?\n"
)
|
||||||
188
Prompting Exercise/ex03_structured_output_solution.py
Normal file
@ -0,0 +1,188 @@
|
|||||||
|
"""
|
||||||
|
Exercise 3 – SOLUTION – Structured Input and Structured Output
|
||||||
|
==============================================================
|
||||||
|
AISE501 · Prompting in Coding · Spring Semester 2026
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import yaml # pip install pyyaml
|
||||||
|
|
||||||
|
from server_utils import chat, chat_json, get_client, print_messages, print_separator
|
||||||
|
|
||||||
|
client = get_client()  # OpenAI-compatible client configured in server_utils

# Buggy module under review; embedded verbatim in every prompt below.
code_to_review = Path("analyze_me.py").read_text()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part A: Structured Input → JSON Output ────────────────────────────────────
print_separator("Part A – Request JSON Output")

# System prompt: a hard JSON-only contract (no fences, no prose) so the raw
# response can be fed straight into json.loads().
system_a = """\
You are a code-review assistant. You ALWAYS respond with valid JSON and
nothing else — no markdown code fences, no introductory text, no trailing
commentary. Your entire response must be parseable by json.loads().
"""

# User prompt: persona/task/schema/code as XML-tagged sections.
# NOTE: doubled braces {{ }} keep the literal JSON braces out of f-string
# substitution — only {code_to_review} is interpolated.
prompt_a = f"""\
<persona>
You are a senior Python engineer performing a thorough, structured code review.
</persona>

<task>
Review the Python code below and return your findings as JSON.
Follow the schema defined in <schema> exactly.
</task>

<schema>
{{
  "summary": "<one-sentence overview of the code quality>",
  "bugs": [
    {{
      "id": 1,
      "severity": "Critical|Medium|Style",
      "line": <integer line number or null if not applicable>,
      "function": "<name of the affected function>",
      "description": "<what is wrong and why it matters>",
      "fix": "<one-sentence fix hint>"
    }}
  ],
  "overall_quality": "Poor|Fair|Good|Excellent"
}}
</schema>

<code language="python" filename="analyze_me.py">
{code_to_review}
</code>"""

messages_a = [
    {"role": "system", "content": system_a},
    {"role": "user", "content": prompt_a},
]

print_messages(messages_a)
raw_json_a = chat_json(client, messages_a)  # response_format=json_object → always valid JSON
print("Raw response:")
print(raw_json_a)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part B: Parse the JSON and Display a Summary ──────────────────────────────
print_separator("Part B – Parse JSON and Print Summary")

# chat_json() enforces response_format=json_object, so this parse is safe.
report = json.loads(raw_json_a)

print(f"Overall quality : {report['overall_quality']}")
print(f"Summary : {report['summary']}\n")

# Fixed column widths; description is truncated so each finding stays one row.
col_w = [4, 10, 6, 24, 45]
header = (
    f"{'ID':<{col_w[0]}} | {'Severity':<{col_w[1]}} | {'Line':<{col_w[2]}} | "
    f"{'Function':<{col_w[3]}} | {'Description':<{col_w[4]}}"
)
print(header)
print("-" * len(header))

for bug in report.get("bugs", []):
    # Fix: the model may emit null or non-string field values; a format spec
    # like :<24 raises TypeError on None, and slicing None fails too, so every
    # field is coerced to str (and missing keys default via .get()).
    line_str = str(bug.get("line")) if bug.get("line") is not None else "—"
    print(
        f"{str(bug.get('id', '')):<{col_w[0]}} | "
        f"{str(bug.get('severity') or ''):<{col_w[1]}} | "
        f"{line_str:<{col_w[2]}} | "
        f"{str(bug.get('function') or ''):<{col_w[3]}} | "
        f"{str(bug.get('description') or '')[:col_w[4]]}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part C: Use the Parsed Data to Build a Follow-Up Prompt ──────────────────
print_separator("Part C – Dynamic Follow-Up Prompt from Parsed Data")

critical_bugs = [b for b in report["bugs"] if b["severity"] == "Critical"]

if not critical_bugs:
    print("No critical bugs found — nothing to fix.")
else:
    # Build one bullet line per critical finding to embed in the prompt.
    bullet_lines = []
    for b in critical_bugs:
        bullet_lines.append(f' - Bug {b["id"]} (line {b["line"]}): {b["description"]}')
    bug_list_text = "\n".join(bullet_lines)

    followup_prompt = f"""\
<task>
The following critical bugs were found in analyze_me.py:

{bug_list_text}

For each bug, provide the corrected Python code snippet (the full function
is fine). Return your answer as a JSON object with this schema:
{{
  "fixes": [
    {{"bug_id": <int>, "fixed_code": "<corrected Python code as a string>"}}
  ]
}}
No markdown, no explanation — only the JSON object.
</task>"""

    # Multi-turn continuation: Part A exchange + model's answer + new request.
    # The whole follow-up stays inside this else-branch so nothing references
    # bug_list_text (or hits the server) when there are no critical bugs.
    messages_c = messages_a + [
        {"role": "assistant", "content": raw_json_a},
        {"role": "user", "content": followup_prompt},
    ]

    print_messages(messages_c)
    raw_json_c = chat_json(client, messages_c)

    # Fix: .get() guard — the model may ignore the schema and omit "fixes",
    # which previously raised a bare KeyError.
    fixes = json.loads(raw_json_c).get("fixes", [])
    for fix in fixes:
        print(f"\n--- Fix for bug {fix['bug_id']} ---")
        print(fix["fixed_code"])
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part D: Request YAML Instead of JSON ─────────────────────────────────────
print_separator("Part D – YAML Output")

system_d = """\
You are a code-review assistant. You ALWAYS respond with valid YAML and
nothing else — no markdown fences, no introductory text.
"""

prompt_d = f"""\
<persona>
You are a senior Python engineer performing a structured code review.
</persona>

<task>
Review the code below and return your findings as YAML.
Use the same fields as before: summary, bugs (with id/severity/line/
function/description/fix), and overall_quality.
</task>

<code language="python" filename="analyze_me.py">
{code_to_review}
</code>"""

messages_d = [
    {"role": "system", "content": system_d},
    {"role": "user", "content": prompt_d},
]

print_messages(messages_d)
# Low temperature: there is no response_format enforcement for YAML, so keep
# the output as deterministic as possible.
raw_yaml = chat(client, messages_d, temperature=0.2)

try:
    yaml_report = yaml.safe_load(raw_yaml)
except yaml.YAMLError as e:
    print(f"ERROR: malformed YAML: {e}")
    print(raw_yaml)
else:
    # Fix: safe_load() can return a plain string or None when the model
    # answers in prose; calling .get() on that raised AttributeError, which
    # the yaml.YAMLError handler above does not catch.
    if isinstance(yaml_report, dict):
        print(f"Parsed YAML – overall quality: {yaml_report.get('overall_quality')}")
        print(f"Number of bugs found: {len(yaml_report.get('bugs', []))}")
    else:
        print("ERROR: YAML did not parse to a mapping. Raw response:")
        print(raw_yaml)


# ── Reflection Questions ──────────────────────────────────────────────────────
print_separator("Reflection Questions")
print(
    "1. What can go wrong when asking an LLM to return JSON?\n"
    "2. How did the <schema> tag influence the output structure?\n"
    "3. Why is structured output important for building LLM pipelines?\n"
    "4. When would you use JSON vs. YAML vs. plain text?\n"
)
|
||||||
300
Prompting Exercise/ex04_cot_pipeline.py
Normal file
@ -0,0 +1,300 @@
|
|||||||
|
"""
|
||||||
|
Exercise 4 – Build Your Own Chain-of-Thought Pipeline
|
||||||
|
======================================================
|
||||||
|
AISE501 · Prompting in Coding · Spring Semester 2026
|
||||||
|
|
||||||
|
Learning goals
|
||||||
|
--------------
|
||||||
|
* Understand that reasoning models (o1, DeepSeek-R1, Qwen3 think mode)
|
||||||
|
generate a hidden "plan" before giving the final answer.
|
||||||
|
* Replicate this behaviour manually using multiple LLM calls:
|
||||||
|
Call 1 (Planning) – structured input → structured JSON plan
|
||||||
|
Calls 2…N (Execution) – iterate step-by-step, validating each step
|
||||||
|
* See why explicit reasoning steps improve answer quality for complex tasks.
|
||||||
|
|
||||||
|
Background
|
||||||
|
----------
|
||||||
|
When you disable Qwen3's built-in thinking mode (as we do in server_utils),
|
||||||
|
you get fast, direct answers — but no explicit reasoning.
|
||||||
|
In this exercise you rebuild that reasoning step yourself, step by step,
|
||||||
|
so you can inspect and control the thinking process.
|
||||||
|
|
||||||
|
The problem
|
||||||
|
-----------
|
||||||
|
Given the buggy analyze_me.py from earlier exercises, design and implement
|
||||||
|
a corrected, production-ready version of the full module.
|
||||||
|
|
||||||
|
Tasks
|
||||||
|
-----
|
||||||
|
Part A Planning phase: structured input → JSON reasoning plan (TODOs 1-5).
|
||||||
|
Part B Iterative execution: apply each plan step one at a time,
|
||||||
|
validating syntax after each step (TODOs 6-10).
|
||||||
|
Part C Reflection — compare with and without CoT (TODO 11).
|
||||||
|
|
||||||
|
Estimated time: 50-60 minutes
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from server_utils import (
|
||||||
|
chat, chat_json, get_client, print_messages, print_separator,
|
||||||
|
strip_code_fences,
|
||||||
|
)
|
||||||
|
|
||||||
|
client = get_client()
|
||||||
|
|
||||||
|
code_to_fix = Path("analyze_me.py").read_text()
|
||||||
|
|
||||||
|
# ── The Problem Statement ─────────────────────────────────────────────────────
|
||||||
|
# We will use this description in both phases so we define it once.
|
||||||
|
|
||||||
|
PROBLEM = """\
|
||||||
|
Rewrite the Python module analyze_me.py so that it is correct,
|
||||||
|
robust, and production-ready.
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
1. calculate_statistics() must handle empty lists without crashing.
|
||||||
|
2. Use sample variance (divide by N-1).
|
||||||
|
3. process_data() must use a context manager and handle non-numeric lines.
|
||||||
|
4. normalize() must fix the operator-precedence bug and raise ValueError
|
||||||
|
for unknown methods.
|
||||||
|
5. All functions must have PEP-484 type hints and NumPy-style docstrings.
|
||||||
|
6. The module must pass basic sanity checks when run as __main__.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part A: Planning Phase ────────────────────────────────────────────────────
|
||||||
|
print_separator("Part A – Planning Phase (CoT Step 1)")
|
||||||
|
|
||||||
|
# The goal of this phase is NOT to write the code — it is to produce a
|
||||||
|
# structured plan: what steps are needed and in what order?
|
||||||
|
|
||||||
|
# TODO 1: Write a system prompt that instructs the model to act as a
|
||||||
|
# "software architect" whose job is ONLY to produce a plan,
|
||||||
|
# never to write the final code.
|
||||||
|
# IMPORTANT: explicitly forbid code snippets in all fields —
|
||||||
|
# use plain English only. This prevents unescaped quotes from
|
||||||
|
# breaking the JSON output.
|
||||||
|
# Enforce JSON-only output.
|
||||||
|
|
||||||
|
system_plan = """\
|
||||||
|
TODO: Write a system prompt for the planning phase.
|
||||||
|
The model should only reason and plan, not write code.
|
||||||
|
Enforce JSON-only output.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# TODO 2: Write the planning user prompt using XML tags:
|
||||||
|
# <problem> – embed the PROBLEM string
|
||||||
|
# <code> – embed the buggy code_to_fix
|
||||||
|
# <task> – ask for a step-by-step plan
|
||||||
|
# <schema> – specify the exact JSON schema for the plan:
|
||||||
|
#
|
||||||
|
# {
|
||||||
|
# "goal": "<one sentence goal>",
|
||||||
|
# "steps": [
|
||||||
|
# {
|
||||||
|
# "step_id": 1,
|
||||||
|
# "title": "<short title>",
|
||||||
|
# "reasoning": "<why this step is needed>",
|
||||||
|
# "action": "<what to do in this step — plain English, no code>",
|
||||||
|
# "depends_on": [] // list of step_ids this step depends on
|
||||||
|
# },
|
||||||
|
# ...
|
||||||
|
# ]
|
||||||
|
# }
|
||||||
|
|
||||||
|
prompt_plan = f"""\
|
||||||
|
TODO: Write the planning prompt here.
|
||||||
|
Use <problem>, <code>, <task>, and <schema> tags.
|
||||||
|
|
||||||
|
<problem>
|
||||||
|
{PROBLEM}
|
||||||
|
</problem>
|
||||||
|
|
||||||
|
<code language="python" filename="analyze_me.py">
|
||||||
|
{code_to_fix}
|
||||||
|
</code>"""
|
||||||
|
|
||||||
|
# TODO 3: Build messages_plan (system + user) and call chat_json().
|
||||||
|
# Use chat_json() (not chat()) so the server enforces valid JSON via
|
||||||
|
# response_format={"type": "json_object"}.
|
||||||
|
# Use max_tokens=4096 — the plan can be long and would get cut off
|
||||||
|
# with the default 2048, producing truncated (unparseable) JSON.
|
||||||
|
|
||||||
|
messages_plan = [
|
||||||
|
# TODO: add system and user messages
|
||||||
|
]
|
||||||
|
|
||||||
|
# print_messages(messages_plan)
|
||||||
|
# raw_plan = chat_json(client, messages_plan, max_tokens=4096)
|
||||||
|
# print("Raw plan JSON:")
|
||||||
|
# print(raw_plan)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO 4: Parse raw_plan with json.loads().
|
||||||
|
# Print each step in a readable format:
|
||||||
|
# Step 1 – <title>
|
||||||
|
# Reasoning : <reasoning>
|
||||||
|
# Action : <action>
|
||||||
|
|
||||||
|
# plan = json.loads(raw_plan)
|
||||||
|
# print(f"\nGoal: {plan['goal']}\n")
|
||||||
|
# for step in plan["steps"]:
|
||||||
|
# print(f"Step {step['step_id']} – {step['title']}")
|
||||||
|
# print(f" Reasoning : {step['reasoning']}")
|
||||||
|
# print(f" Action : {step['action']}\n")
|
||||||
|
|
||||||
|
|
||||||
|
# TODO 5: (Optional) Inspect the plan critically.
|
||||||
|
# Does the order of steps make sense?
|
||||||
|
# Are any steps missing?
|
||||||
|
# You can edit the plan dict before passing it to the execution phase.
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part B: Iterative Execution Phase ────────────────────────────────────────
|
||||||
|
print_separator("Part B – Iterative Execution Phase (CoT Step 2)")
|
||||||
|
|
||||||
|
# KEY INSIGHT: Instead of dumping the entire plan into one big prompt
|
||||||
|
# (which would just be another one-shot), we iterate through each step
|
||||||
|
# individually. After every step we:
|
||||||
|
# 1. Feed the model only the CURRENT step + the accumulated code so far
|
||||||
|
# 2. Validate the output (syntax check via py_compile)
|
||||||
|
# 3. Use the validated output as input for the next step
|
||||||
|
#
|
||||||
|
# This mirrors how a real developer works: implement one change, verify it
|
||||||
|
# compiles, then move on. The model always works with CONCRETE code from
|
||||||
|
# the previous step rather than an abstract plan of what it intends to write.
|
||||||
|
|
||||||
|
# TODO 6: Write a system prompt for the execution phase.
|
||||||
|
# The model should act as a developer who receives the current
|
||||||
|
# state of a module plus a single step to implement.
|
||||||
|
# It should apply ONLY that step and return the full updated module.
|
||||||
|
|
||||||
|
system_exec = """\
|
||||||
|
TODO: Write a system prompt for the step-by-step execution phase.
|
||||||
|
The model should apply ONE step at a time.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# TODO 7: Complete the validate_syntax() function below.
|
||||||
|
# It should write code to a temp file and run py_compile on it.
|
||||||
|
# Return (True, "") if syntax is valid, (False, error_message) otherwise.
|
||||||
|
|
||||||
|
def validate_syntax(code: str) -> tuple[bool, str]:
    """Check whether *code* is syntactically valid Python.

    Writes the code to a temporary file and compiles it with py_compile,
    mirroring what the interpreter does before executing a module.

    Parameters
    ----------
    code : str
        Python source to validate.

    Returns
    -------
    tuple[bool, str]
        (True, "") when the syntax is valid, otherwise (False, error_message).
    """
    import py_compile  # local import: keeps the module's top-level imports untouched

    tmp = Path("_tmp_validate.py")
    pyc = Path("_tmp_validate.pyc")
    try:
        tmp.write_text(code, encoding="utf-8")
        # doraise=True turns compilation problems into PyCompileError;
        # an explicit cfile avoids littering __pycache__.
        py_compile.compile(str(tmp), cfile=str(pyc), doraise=True)
        return True, ""
    except py_compile.PyCompileError as exc:
        return False, str(exc)
    finally:
        # Always clean up the scratch files, even when compilation fails.
        tmp.unlink(missing_ok=True)
        pyc.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO 8: Implement the step-by-step execution loop.
|
||||||
|
# Start with current_code = code_to_fix (the original buggy code).
|
||||||
|
# For each step in plan["steps"]:
|
||||||
|
# a) Build a prompt with <current_code>, <step>, and <task> tags
|
||||||
|
# b) Call chat() with the prompt
|
||||||
|
# c) Strip code fences from the response
|
||||||
|
# d) Validate syntax using validate_syntax()
|
||||||
|
# e) If valid: update current_code
|
||||||
|
# f) If invalid: retry ONCE with error feedback
|
||||||
|
# g) Print the code after each step
|
||||||
|
|
||||||
|
# current_code = code_to_fix
|
||||||
|
#
|
||||||
|
# for step in plan["steps"]:
|
||||||
|
# step_id = step["step_id"]
|
||||||
|
# print_separator(f"Executing Step {step_id} – {step['title']}")
|
||||||
|
#
|
||||||
|
# prompt_step = f"""\
|
||||||
|
# TODO: Build the per-step prompt here.
|
||||||
|
# Include <current_code>, <step>, and <task> tags.
|
||||||
|
# Tell the model to apply ONLY this step."""
|
||||||
|
#
|
||||||
|
# messages_step = [
|
||||||
|
# {"role": "system", "content": system_exec},
|
||||||
|
# {"role": "user", "content": prompt_step},
|
||||||
|
# ]
|
||||||
|
#
|
||||||
|
# print_messages(messages_step)
|
||||||
|
# raw_response = chat(client, messages_step, temperature=0.2, max_tokens=4096)
|
||||||
|
# step_code = strip_code_fences(raw_response)
|
||||||
|
#
|
||||||
|
# # Validate syntax
|
||||||
|
# ok, error_msg = validate_syntax(step_code)
|
||||||
|
# if ok:
|
||||||
|
# print(f" [PASS] Step {step_id} – syntax OK")
|
||||||
|
# current_code = step_code
|
||||||
|
# else:
|
||||||
|
# print(f" [FAIL] Step {step_id} – syntax error: {error_msg}")
|
||||||
|
# # TODO: retry with error feedback (see TODO 9)
|
||||||
|
#
|
||||||
|
# print(f"\n--- Code after Step {step_id} ---")
|
||||||
|
# print(current_code)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO 9: Implement the retry logic for syntax errors.
|
||||||
|
# When a step produces invalid syntax:
|
||||||
|
# a) Build a retry prompt with the <error> and the broken <code>
|
||||||
|
# b) Ask the model to fix the syntax error
|
||||||
|
# c) Validate again
|
||||||
|
# d) If still broken, keep the last valid code and continue
|
||||||
|
|
||||||
|
|
||||||
|
# TODO 10: Save the final result and run it as a validation.
|
||||||
|
# - Save current_code to "analyze_me_fixed.py"
|
||||||
|
# - Run it with subprocess and print the output
|
||||||
|
|
||||||
|
# Path("analyze_me_fixed.py").write_text(current_code)
|
||||||
|
# print("\nSaved iterative CoT result to analyze_me_fixed.py")
|
||||||
|
#
|
||||||
|
# result = subprocess.run(
|
||||||
|
# [sys.executable, "analyze_me_fixed.py"],
|
||||||
|
# capture_output=True, text=True,
|
||||||
|
# )
|
||||||
|
# print("STDOUT:", result.stdout)
|
||||||
|
# if result.stderr:
|
||||||
|
# print("STDERR:", result.stderr)
|
||||||
|
# print(f"Exit code: {result.returncode}")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part C: Compare With and Without CoT ─────────────────────────────────────
|
||||||
|
print_separator("Part C – Baseline: Direct Prompt Without CoT")
|
||||||
|
|
||||||
|
# TODO 11: Send the same problem to the model in a SINGLE prompt with NO plan.
|
||||||
|
# Compare this response with the iterative CoT version.
|
||||||
|
|
||||||
|
direct_prompt = f"""\
|
||||||
|
TODO: Write a direct, single-shot prompt asking the model to rewrite
|
||||||
|
analyze_me.py according to the PROBLEM requirements.
|
||||||
|
No plan, no iteration — just ask directly.
|
||||||
|
|
||||||
|
<problem>
|
||||||
|
{PROBLEM}
|
||||||
|
</problem>
|
||||||
|
|
||||||
|
<code language="python" filename="analyze_me.py">
|
||||||
|
{code_to_fix}
|
||||||
|
</code>"""
|
||||||
|
|
||||||
|
# messages_direct = [{"role": "user", "content": direct_prompt}]
|
||||||
|
# print_messages(messages_direct)
|
||||||
|
# direct_response = chat(client, messages_direct, temperature=0.3, max_tokens=4096)
|
||||||
|
# print(direct_response)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Reflection Questions ──────────────────────────────────────────────────────
|
||||||
|
print_separator("Reflection Questions")
|
||||||
|
print(
|
||||||
|
"1. How did the iterative CoT output differ from the direct single-shot?\n"
|
||||||
|
"2. Did the validation step catch any syntax errors? How were they fixed?\n"
|
||||||
|
"3. What would happen if you gave the model a deliberately wrong plan?\n"
|
||||||
|
"4. How does this manual CoT pipeline relate to built-in thinking modes\n"
|
||||||
|
" in models like o1, DeepSeek-R1, and Qwen3 with think mode enabled?\n"
|
||||||
|
"5. What are the trade-offs of step-by-step iteration vs. one-shot?\n"
|
||||||
|
" (Think: latency, cost, error isolation, debuggability)\n"
|
||||||
|
"6. How could you extend the validation step beyond syntax checking?\n"
|
||||||
|
" (Hint: unit tests, type checking, linting)\n"
|
||||||
|
)
|
||||||
279
Prompting Exercise/ex04_cot_pipeline_solution.py
Normal file
@ -0,0 +1,279 @@
|
|||||||
|
"""
|
||||||
|
Exercise 4 – SOLUTION – Build Your Own Chain-of-Thought Pipeline
|
||||||
|
================================================================
|
||||||
|
AISE501 · Prompting in Coding · Spring Semester 2026
|
||||||
|
"""
|
||||||
|
|
||||||
|
import ast
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from server_utils import (
|
||||||
|
chat, chat_json, get_client, print_messages, print_separator,
|
||||||
|
strip_code_fences,
|
||||||
|
)
|
||||||
|
|
||||||
|
client = get_client()
|
||||||
|
|
||||||
|
code_to_fix = Path("analyze_me.py").read_text()
|
||||||
|
|
||||||
|
PROBLEM = """\
|
||||||
|
Rewrite the Python module analyze_me.py so that it is correct,
|
||||||
|
robust, and production-ready.
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
1. calculate_statistics() must handle empty lists without crashing.
|
||||||
|
2. Use sample variance (divide by N-1).
|
||||||
|
3. process_data() must use a context manager and handle non-numeric lines.
|
||||||
|
4. normalize() must fix the operator-precedence bug and raise ValueError
|
||||||
|
for unknown methods.
|
||||||
|
5. All functions must have PEP-484 type hints and NumPy-style docstrings.
|
||||||
|
6. The module must pass basic sanity checks when run as __main__.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part A: Planning Phase ────────────────────────────────────────────────────
|
||||||
|
print_separator("Part A – Planning Phase (CoT Step 1)")
|
||||||
|
|
||||||
|
system_plan = """\
|
||||||
|
You are a software architect. Your ONLY job right now is to produce a
|
||||||
|
structured reasoning plan. You must NOT write any Python code or code
|
||||||
|
snippets anywhere in your response — not in action fields, not in
|
||||||
|
reasoning fields, nowhere. Use plain English descriptions only.
|
||||||
|
Respond with valid JSON only (no markdown fences, no extra text).
|
||||||
|
"""
|
||||||
|
|
||||||
|
prompt_plan = f"""\
|
||||||
|
<problem>
|
||||||
|
{PROBLEM}
|
||||||
|
</problem>
|
||||||
|
|
||||||
|
<code language="python" filename="analyze_me.py">
|
||||||
|
{code_to_fix}
|
||||||
|
</code>
|
||||||
|
|
||||||
|
<task>
|
||||||
|
Analyse the problem and the buggy code above.
|
||||||
|
Produce a step-by-step plan that a developer can follow to implement
|
||||||
|
the corrected module. Each step must be atomic and self-contained.
|
||||||
|
</task>
|
||||||
|
|
||||||
|
<schema>
|
||||||
|
{{
|
||||||
|
"goal": "<one-sentence goal>",
|
||||||
|
"steps": [
|
||||||
|
{{
|
||||||
|
"step_id": 1,
|
||||||
|
"title": "<short title>",
|
||||||
|
"reasoning": "<why this step is necessary>",
|
||||||
|
"action": "<concrete action to take — plain English only, no code>",
|
||||||
|
"depends_on": []
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}}
|
||||||
|
</schema>"""
|
||||||
|
|
||||||
|
messages_plan = [
|
||||||
|
{"role": "system", "content": system_plan},
|
||||||
|
{"role": "user", "content": prompt_plan},
|
||||||
|
]
|
||||||
|
|
||||||
|
print_messages(messages_plan)
|
||||||
|
raw_plan = chat_json(client, messages_plan, max_tokens=4096)
|
||||||
|
print("Raw plan JSON:")
|
||||||
|
print(raw_plan)
|
||||||
|
|
||||||
|
plan = json.loads(raw_plan)
|
||||||
|
|
||||||
|
print(f"\nGoal: {plan['goal']}\n")
|
||||||
|
for step in plan["steps"]:
|
||||||
|
print(f"Step {step['step_id']} – {step['title']}")
|
||||||
|
print(f" Reasoning : {step['reasoning']}")
|
||||||
|
print(f" Action : {step['action']}")
|
||||||
|
deps = step.get("depends_on", [])
|
||||||
|
if deps:
|
||||||
|
print(f" Depends on: steps {deps}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part B: Iterative Execution Phase ────────────────────────────────────────
|
||||||
|
print_separator("Part B – Iterative Execution Phase (CoT Step 2)")
|
||||||
|
|
||||||
|
# Instead of dumping the entire plan into a single prompt, we iterate through
|
||||||
|
# each step individually. After every step we:
|
||||||
|
# 1. Feed the model only the CURRENT step + the accumulated code so far
|
||||||
|
# 2. Validate the output (syntax check via py_compile)
|
||||||
|
# 3. Use the validated output as input for the next step
|
||||||
|
#
|
||||||
|
# This mirrors how a real developer works: implement one change, verify it
|
||||||
|
# compiles, then move on. It also means the model always works with CONCRETE
|
||||||
|
# code from the previous step rather than an abstract plan of what it intends
|
||||||
|
# to write.
|
||||||
|
|
||||||
|
system_exec = """\
|
||||||
|
You are a senior Python developer. You receive the current state of a
|
||||||
|
Python module together with a single step to implement. Apply ONLY the
|
||||||
|
requested change. Return the complete updated module — no explanations
|
||||||
|
outside the code block.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def validate_syntax_ast(code: str) -> tuple[bool, str]:
    """Check *code* for syntactic validity by parsing it with ast.parse.

    Returns (True, "") for valid source, otherwise (False, error_text).
    """
    try:
        ast.parse(code)
    except SyntaxError as err:
        return False, str(err)
    return True, ""
|
||||||
|
|
||||||
|
def validate_syntax(code: str) -> tuple[bool, str]:
    """Check whether *code* is syntactically valid Python.

    Writes the code to a temporary file and compiles it with py_compile,
    as the exercise requires. The previous placeholder always returned
    (True, ""), which silently disabled the pipeline's validation step.

    Parameters
    ----------
    code : str
        Python source to validate.

    Returns
    -------
    tuple[bool, str]
        (True, "") when the syntax is valid, otherwise (False, error_message).
    """
    import py_compile  # local import: keeps the module's top-level imports untouched

    tmp = Path("_tmp_validate.py")
    pyc = Path("_tmp_validate.pyc")
    try:
        tmp.write_text(code, encoding="utf-8")
        # doraise=True turns compilation problems into PyCompileError;
        # an explicit cfile avoids littering __pycache__.
        py_compile.compile(str(tmp), cfile=str(pyc), doraise=True)
        return True, ""
    except py_compile.PyCompileError as exc:
        return False, str(exc)
    finally:
        # Always clean up the scratch files, even when compilation fails.
        tmp.unlink(missing_ok=True)
        pyc.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
current_code = code_to_fix # start with the original buggy code
|
||||||
|
|
||||||
|
for step in plan["steps"]:
|
||||||
|
step_id = step["step_id"]
|
||||||
|
print_separator(f"Executing Step {step_id} – {step['title']}")
|
||||||
|
|
||||||
|
prompt_step = f"""\
|
||||||
|
<current_code>
|
||||||
|
{current_code}
|
||||||
|
</current_code>
|
||||||
|
|
||||||
|
<step>
|
||||||
|
Step {step_id}: {step['title']}
|
||||||
|
Action: {step['action']}
|
||||||
|
Reasoning: {step['reasoning']}
|
||||||
|
</step>
|
||||||
|
|
||||||
|
<task>
|
||||||
|
Apply ONLY this single step to the current code above.
|
||||||
|
Do not skip ahead to other steps.
|
||||||
|
Mark your change with a comment: # Step {step_id} – {step['title']}
|
||||||
|
Return the complete updated Python module.
|
||||||
|
Do not include any explanation outside the code.
|
||||||
|
</task>"""
|
||||||
|
|
||||||
|
messages_step = [
|
||||||
|
{"role": "system", "content": system_exec},
|
||||||
|
{"role": "user", "content": prompt_step},
|
||||||
|
]
|
||||||
|
|
||||||
|
print_messages(messages_step)
|
||||||
|
raw_response = chat(client, messages_step, temperature=0.2, max_tokens=4096)
|
||||||
|
step_code = strip_code_fences(raw_response)
|
||||||
|
|
||||||
|
# ── Validate: syntax check before moving on ──
|
||||||
|
ok, error_msg = validate_syntax(step_code)
|
||||||
|
if ok:
|
||||||
|
print(f" [PASS] Step {step_id} – syntax OK")
|
||||||
|
current_code = step_code
|
||||||
|
else:
|
||||||
|
print(f" [FAIL] Step {step_id} – syntax error:\n{error_msg}")
|
||||||
|
print(" Retrying with error feedback...")
|
||||||
|
|
||||||
|
# Give the model one chance to fix its own syntax error
|
||||||
|
retry_prompt = f"""\
|
||||||
|
The code you returned has a syntax error:
|
||||||
|
|
||||||
|
<error>
|
||||||
|
{error_msg}
|
||||||
|
</error>
|
||||||
|
|
||||||
|
<code>
|
||||||
|
{step_code}
|
||||||
|
</code>
|
||||||
|
|
||||||
|
<task>
|
||||||
|
Fix the syntax error and return the complete corrected module.
|
||||||
|
Do not include any explanation outside the code.
|
||||||
|
</task>"""
|
||||||
|
|
||||||
|
messages_retry = [
|
||||||
|
{"role": "system", "content": system_exec},
|
||||||
|
{"role": "user", "content": retry_prompt},
|
||||||
|
]
|
||||||
|
|
||||||
|
print_messages(messages_retry)
|
||||||
|
retry_response = chat(client, messages_retry, temperature=0.1, max_tokens=4096)
|
||||||
|
retry_code = strip_code_fences(retry_response)
|
||||||
|
|
||||||
|
ok2, error_msg2 = validate_syntax(retry_code)
|
||||||
|
if ok2:
|
||||||
|
print(f" [PASS] Step {step_id} – retry syntax OK")
|
||||||
|
current_code = retry_code
|
||||||
|
else:
|
||||||
|
print(f" [FAIL] Step {step_id} – retry still has errors: {error_msg2}")
|
||||||
|
print(" Continuing with last valid code.")
|
||||||
|
|
||||||
|
print(f"\n--- Code after Step {step_id} ---")
|
||||||
|
print(current_code)
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Save final result
|
||||||
|
Path("analyze_me_fixed.py").write_text(current_code)
|
||||||
|
print("\nSaved iterative CoT result to analyze_me_fixed.py")
|
||||||
|
|
||||||
|
# Final validation: run the module
|
||||||
|
print_separator("Final Validation – Running analyze_me_fixed.py")
|
||||||
|
result = subprocess.run(
|
||||||
|
[sys.executable, "analyze_me_fixed.py"],
|
||||||
|
capture_output=True, text=True,
|
||||||
|
)
|
||||||
|
print("STDOUT:", result.stdout)
|
||||||
|
if result.stderr:
|
||||||
|
print("STDERR:", result.stderr)
|
||||||
|
print(f"Exit code: {result.returncode}")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Part C: Baseline – Direct Prompt Without CoT ─────────────────────────────
|
||||||
|
print_separator("Part C – Baseline: Direct Prompt Without CoT")
|
||||||
|
|
||||||
|
direct_prompt = f"""\
|
||||||
|
<problem>
|
||||||
|
{PROBLEM}
|
||||||
|
</problem>
|
||||||
|
|
||||||
|
<code language="python" filename="analyze_me.py">
|
||||||
|
{code_to_fix}
|
||||||
|
</code>
|
||||||
|
|
||||||
|
<task>
|
||||||
|
Rewrite the module so that it satisfies all requirements in <problem>.
|
||||||
|
Return only the corrected Python code.
|
||||||
|
</task>"""
|
||||||
|
|
||||||
|
messages_direct = [{"role": "user", "content": direct_prompt}]
|
||||||
|
print_messages(messages_direct)
|
||||||
|
direct_response = chat(client, messages_direct, temperature=0.3, max_tokens=4096)
|
||||||
|
print(direct_response)
|
||||||
|
|
||||||
|
Path("analyze_me_direct.py").write_text(strip_code_fences(direct_response))
|
||||||
|
print("\nSaved direct-prompt result to analyze_me_direct.py")
|
||||||
|
|
||||||
|
print(
|
||||||
|
"\nCompare analyze_me_fixed.py (CoT) with analyze_me_direct.py (direct).\n"
|
||||||
|
"Which is more complete? Which follows the requirements more closely?"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Reflection Questions ──────────────────────────────────────────────────────
|
||||||
|
print_separator("Reflection Questions")
|
||||||
|
print(
|
||||||
|
"1. How did the iterative CoT output differ from the direct single-shot?\n"
|
||||||
|
"2. Did the validation step catch any syntax errors? How were they fixed?\n"
|
||||||
|
"3. What would happen if you gave the model a deliberately wrong plan?\n"
|
||||||
|
"4. How does this manual CoT pipeline relate to built-in thinking modes\n"
|
||||||
|
" in models like o1, DeepSeek-R1, and Qwen3 with think mode enabled?\n"
|
||||||
|
"5. What are the trade-offs of step-by-step iteration vs. one-shot?\n"
|
||||||
|
" (Think: latency, cost, error isolation, debuggability)\n"
|
||||||
|
"6. How could you extend the validation step beyond syntax checking?\n"
|
||||||
|
" (Hint: unit tests, type checking, linting)\n"
|
||||||
|
)
|
||||||
BIN
Prompting Exercise/prompting_exercises.pdf
Normal file
215
Prompting Exercise/server_utils.py
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
"""
|
||||||
|
server_utils.py – Shared utilities for AISE501 Prompting Exercises
|
||||||
|
======================================================================
|
||||||
|
Connects to the vLLM inference server at silicon.fhgr.ch via the
|
||||||
|
OpenAI-compatible API.
|
||||||
|
|
||||||
|
This file is complete — no TODOs here.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
# ── Server configuration ──────────────────────────────────────────────────────
|
||||||
|
HOST = "silicon.fhgr.ch"
|
||||||
|
PORT = 7080
|
||||||
|
API_KEY = "EMPTY"
|
||||||
|
MODEL = "qwen3.5-35b-a3b" # model ID served on silicon.fhgr.ch
|
||||||
|
|
||||||
|
|
||||||
|
def get_client() -> OpenAI:
    """Build and return an OpenAI-compatible client for the vLLM server."""
    return OpenAI(base_url=f"http://{HOST}:{PORT}/v1", api_key=API_KEY)
|
||||||
|
|
||||||
|
|
||||||
|
def list_models(client: OpenAI) -> list[str]:
    """Return the IDs of every model the server advertises."""
    available = client.models.list().data
    return [entry.id for entry in available]
|
||||||
|
|
||||||
|
|
||||||
|
def chat(
    client: OpenAI,
    messages: list[dict],
    model: str = MODEL,
    temperature: float = 0.2,
    max_tokens: int = 2048,
) -> str:
    """
    Send chat messages to the LLM and return the reply text.

    Qwen3's built-in chain-of-thought "think" mode is switched off via
    ``extra_body`` so replies arrive directly rather than wrapped in
    <think>…</think> blocks.

    Parameters
    ----------
    client : OpenAI client returned by get_client()
    messages : List of {"role": ..., "content": ...} dicts
    model : Model ID (default: module-level MODEL constant)
    temperature : Sampling temperature (0 = deterministic, 1 = creative)
    max_tokens : Maximum number of tokens in the response
    """
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    return completion.choices[0].message.content
|
||||||
|
|
||||||
|
|
||||||
|
def chat_json(
    client: OpenAI,
    messages: list[dict],
    model: str = MODEL,
    temperature: float = 0.2,
    max_tokens: int = 2048,
) -> str:
    """
    Like chat(), but constrains the model to emit syntactically valid JSON
    via response_format={"type": "json_object"}.

    The server restricts token sampling so the output is always parseable
    by json.loads() — no post-processing needed. Reach for this whenever
    structured JSON output is required (Exercises 3 and 4).

    Parameters are the same as chat(); temperature defaults to 0.2 because
    deterministic output is usually preferable for structured data.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        response_format={"type": "json_object"},
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    return completion.choices[0].message.content
|
||||||
|
|
||||||
|
|
||||||
|
def _repair_json_strings(text: str) -> str:
|
||||||
|
"""
|
||||||
|
Replace unescaped control characters (newline, tab, carriage return)
|
||||||
|
inside JSON string values with their proper escape sequences.
|
||||||
|
|
||||||
|
LLMs frequently emit literal newlines inside long string values, which
|
||||||
|
is invalid JSON. This function fixes that without touching structural
|
||||||
|
whitespace outside strings.
|
||||||
|
"""
|
||||||
|
result: list[str] = []
|
||||||
|
in_string = False
|
||||||
|
escape = False
|
||||||
|
_escapes = {'\n': '\\n', '\r': '\\r', '\t': '\\t'}
|
||||||
|
for ch in text:
|
||||||
|
if escape:
|
||||||
|
result.append(ch)
|
||||||
|
escape = False
|
||||||
|
continue
|
||||||
|
if ch == '\\' and in_string:
|
||||||
|
result.append(ch)
|
||||||
|
escape = True
|
||||||
|
continue
|
||||||
|
if ch == '"':
|
||||||
|
in_string = not in_string
|
||||||
|
result.append(ch)
|
||||||
|
continue
|
||||||
|
if in_string and ch in _escapes:
|
||||||
|
result.append(_escapes[ch])
|
||||||
|
continue
|
||||||
|
result.append(ch)
|
||||||
|
return ''.join(result)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_json(text: str) -> str:
    """
    Extract and repair a JSON object or array from an LLM response that may
    contain prose, markdown code fences, or unescaped control characters.

    Strategy:
    1. Strip markdown ```json ... ``` or ``` ... ``` fences.
    2. Otherwise locate the first '{' or '[' and cut to its matching
       closing bracket (string- and escape-aware).
    3. Repair unescaped newlines/tabs inside string values.

    Falls back to the original text so json.loads can raise a meaningful
    error with context.
    """
    import re

    stripped = text.strip()
    # 1. Markdown fences
    unfenced = re.sub(r"```(?:json)?\s*([\s\S]*?)\s*```", r"\1", stripped)
    if unfenced != stripped:
        return _repair_json_strings(unfenced.strip())

    # 2. Bracket matching: '{...}' is attempted first; '[...]' only when
    #    no '{' occurs anywhere in the text.
    candidate = text
    for opener, closer in (("{", "}"), ("[", "]")):
        start = text.find(opener)
        if start == -1:
            continue
        depth = 0
        in_string = False
        escaped = False
        for pos in range(start, len(text)):
            ch = text[pos]
            if escaped:
                escaped = False
            elif in_string and ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = not in_string
            elif not in_string:
                if ch == opener:
                    depth += 1
                elif ch == closer:
                    depth -= 1
                    if depth == 0:
                        candidate = text[start:pos + 1]
                        break
        break  # only the first container type that occurs is tried

    # 3. Repair unescaped control characters inside string values
    return _repair_json_strings(candidate)
|
||||||
|
|
||||||
|
|
||||||
|
def strip_code_fences(text: str) -> str:
    """Remove markdown code fences (```python ... ```) from LLM output.

    LLMs often wrap code in fences even when told not to. Run this before
    writing LLM-generated code to a .py file so it is directly executable.
    """
    import re

    body = text.strip()
    opening = re.compile(r"^```\w*\n?")
    closing = re.compile(r"\n?```\s*$")
    body = closing.sub("", opening.sub("", body))
    return body.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def print_messages(messages: list[dict]) -> None:
    """Print the full messages list before it is sent to the LLM.

    Use before chat() or chat_json() to inspect the exact prompt hierarchy
    (system + user + assistant turns) that the model receives — the primary
    debugging and learning tool for prompt engineering.
    """
    width = 64
    bar = "═" * width
    print("\n" + bar)
    print(" PROMPT SENT TO LLM")
    print(bar)
    for message in messages:
        role = message["role"].upper()
        dash_count = max(0, width - len(role) - 6)
        print(f"\n── [{role}] " + "─" * dash_count)
        print(message["content"])
    print("\n" + bar)
|
||||||
|
|
||||||
|
|
||||||
|
def print_separator(title: str = "") -> None:
    """Print a horizontal separator, optionally labelled with *title*."""
    line = "─" * 64
    print("\n" + line)
    if title:
        print(f" {title}")
    print(line)
|
||||||
23
Prompting Exercise/test_connection.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
"""
|
||||||
|
test_connection.py – Verify the vLLM server connection
|
||||||
|
=========================================================
|
||||||
|
Run this script from the prompting_exercises/ directory before starting
|
||||||
|
the exercises:
|
||||||
|
|
||||||
|
python test_connection.py
|
||||||
|
|
||||||
|
Expected output:
|
||||||
|
Models available: ['qwen3.5-35b-a3b']
|
||||||
|
Connection OK.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from server_utils import get_client, list_models
|
||||||
|
|
||||||
|
# Connect to the server and report which models it advertises.
client = get_client()
models = list_models(client)
print("Models available: " + str(models))

# A non-empty model list is the success criterion for the smoke test.
if models:
    print("Connection OK.")
else:
    print("WARNING: no models returned – check server address and port.")
||||||
BIN
code_embeddings_pca.png
Normal file
|
After Width: | Height: | Size: 107 KiB |
BIN
code_embeddings_tsne.png
Normal file
|
After Width: | Height: | Size: 104 KiB |
BIN
pca_denoising_analysis.png
Normal file
|
After Width: | Height: | Size: 398 KiB |
72
Übung: Clean Code/Student Grade Calculator.py
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
"""
|
||||||
|
Bad example
|
||||||
|
"""
|
||||||
|
|
||||||
|
def calc(l):
    """Return a (letter_grade, average) tuple for the list of scores *l*."""
    average = sum(l) / len(l)
    if average >= 90:
        letter = "A"
    elif average >= 80:
        letter = "B"
    elif average >= 70:
        letter = "C"
    elif average >= 60:
        letter = "D"
    else:
        letter = "F"
    return letter, average
|
||||||
|
|
||||||
|
def doeverything(n, s1, s2, s3, s4, s5):
    """Print a grade report for student *n* from five scores.

    Returns the (grade, average) tuple produced by calc().
    """
    print("Processing student :" + n)
    scores = [s1, s2, s3, s4, s5]
    result = calc(scores)
    grade, average = result
    print("Average :" + str(average))
    print("Grade :" + grade)
    # 60 is the pass mark used throughout this example.
    status = "PASSED" if average >= 60 else "FAILED"
    print("Status : " + status)
    return result
|
||||||
|
|
||||||
|
# main program
|
||||||
|
x = "John"
|
||||||
|
doeverything (x,85,90,78,92,88)
|
||||||
|
print ("---")
|
||||||
|
y = "Jane"
|
||||||
|
doeverything (y,55,60,45,50,58)
|
||||||
|
print ("---")
|
||||||
|
z = "Bob"
|
||||||
|
doeverything (z,70,75,80,72,78)
|
||||||
|
|
||||||
|
"""
|
||||||
|
[x] Naming conventions (variables, functions, classes)
|
||||||
|
[x] Code structure and indentation
|
||||||
|
[x] Magic numbers and constants
|
||||||
|
[x] Function length and single responsibility
|
||||||
|
[ ] DRY principle (Don’t Repeat Yourself)
|
||||||
|
[x] Comments and documentation
|
||||||
|
[x] Error handling
|
||||||
|
[x] Whitespace and formatting
|
||||||
|
[ ] Mutable default arguments
|
||||||
|
"""
|
||||||
|
|
||||||
|
"""
|
||||||
|
good example
|
||||||
|
"""
|
||||||
|
|
||||||
|
def calculate_avg(points: list[int]) -> float:
    """Return the arithmetic mean of *points*."""
    total = 0
    for point in points:
        total += point
    return total / len(points)
|
||||||
|
|
||||||
|
def calculate_grade(point_avg: float) -> str:
    """Return the letter grade for an average score.

    Boundaries: >=90 A, >=80 B, >=70 C, >=60 D, otherwise F.

    Bug fixed: the previous version built a dict whose KEYS were lambda
    objects and then called ``grade_dict.get(point_avg)``. The numeric
    average is never equal to a lambda key, so the lookup always returned
    None. The lambdas were never even called. We now scan ordered
    (threshold, grade) pairs instead.

    Parameters
    ----------
    point_avg : float
        The average score, typically from calculate_avg().

    Returns
    -------
    str
        The letter grade "A" through "F".
    """
    grade_boundaries = [
        (90, "A"),
        (80, "B"),
        (70, "C"),
        (60, "D"),
    ]
    for threshold, grade in grade_boundaries:
        if point_avg >= threshold:
            return grade
    return "F"
||||||