# AISE501_CLASS/Prompting Exercise/ex05b_coding_agent.py
# 2026-05-03 20:27:09 +02:00
#
# 590 lines
# 26 KiB
# Python

"""
Exercise 5b -- Build a Basic AI Coding Agent (Guided Version)
==============================================================
AISE501 . Prompting in Coding . Spring Semester 2026
This is a GUIDED version of Exercise 5 with more scaffolding.
It teaches the same concepts but reduces boilerplate so you can
focus on the key insight: how an LLM uses tools.
The key insight
---------------
An LLM cannot run code or read files by itself. But we can give it
"superpowers" through a simple trick:
1. TELL the LLM (via the system prompt) what tools exist.
2. ASK the LLM to respond with JSON saying which tool to call.
3. PARSE the JSON, call the real Python function, and
4. FEED the result back into the conversation as a new message.
This is how ALL AI coding agents work (Claude Code, Cursor, Copilot).
The LLM never actually "runs" code — it just asks us to run it!
What is already provided
------------------------
To let you focus on the interesting parts, the following are PRE-BUILT:
- All 7 tool functions (Part A) — read_file, grep_search, etc.
- The tool dispatcher (Part B) — maps tool names to functions.
- Helper functions: truncate_result, trim_messages, ask_human.
What you need to build (the interesting parts)
-----------------------------------------------
Part C The SYSTEM PROMPT that teaches the LLM about its tools (TODOs 1-2).
Part D The AGENT LOOP that connects the LLM to the tools (TODOs 3-6).
Part E The INTERACTIVE CHAT interface (TODOs 7-8).
Think of it like wiring a robot:
- Part A+B are the robot's HANDS (already built).
- Part C is the robot's INSTRUCTION MANUAL (you write it).
- Part D is the robot's BRAIN LOOP (you wire it).
- Part E is the ON SWITCH (you connect it).
The conversation flow
---------------------
Here is exactly what happens in one iteration of the agent loop:
┌─────────────────────────────────────────────────────────────┐
│  messages = [                                               │
│    {"role": "system", "content": "<system prompt>"},        │
│    {"role": "user",   "content": "Fix the bug in app.py"},  │
│  ]                                                          │
└──────────────────────────┬──────────────────────────────────┘
                 ┌─────────▼─────────┐
                 │  LLM generates    │
                 │  JSON response    │
                 └─────────┬─────────┘
            ┌──────────────▼──────────────┐
            │ {"thought": "I should...",  │
            │  "tool": "read_file",       │
            │  "arguments": {             │
            │    "path": "app.py"         │
            │  }}                         │
            └──────────────┬──────────────┘
                 ┌─────────▼─────────┐
                 │  You parse JSON,  │
                 │  call read_file() │
                 └─────────┬─────────┘
            ┌──────────────▼──────────────────────┐
            │ Append to messages:                 │
            │ {"role":"assistant", "content":..}  │
            │ {"role":"user", "content":          │
            │   "<tool_result>file contents       │
            │   </tool_result>"}                  │
            └──────────────┬──────────────────────┘
                 ┌─────────▼─────────┐
                 │  Next iteration:  │
                 │  LLM sees result, │
                 │  picks next tool  │
                 └───────────────────┘
"""
import ast
import json
import subprocess
import sys
from pathlib import Path
from server_utils import (
chat, chat_json, get_client, print_messages, print_separator,
strip_code_fences,
)
# Client handle returned by server_utils.get_client(); it is passed to
# chat_json() for every request the agent makes.
client = get_client()

# ── Agent Configuration ──────────────────────────────────────────────────────
# Directory next to this script that all tool functions operate in.
# It is created on import if it does not exist yet.
WORKSPACE = Path(__file__).parent.joinpath("workspace")
WORKSPACE.mkdir(exist_ok=True)

MAX_ITERATIONS = 50  # upper bound on agent-loop iterations per task
MAX_RESULT_LENGTH = 8_000  # tool results longer than this many characters are truncated
MAX_HISTORY_CHARS = 60_000  # total message characters above which old history is trimmed
# ═══════════════════════════════════════════════════════════════════════════════
# PART A -- TOOL FUNCTIONS (pre-built)
# ═══════════════════════════════════════════════════════════════════════════════
#
# These are the tools the agent can use. Each is a normal Python function.
# The LLM will never call these directly — it will OUTPUT JSON saying
# "please call read_file with path='app.py'", and OUR CODE will call it.
def read_file(path: str) -> str:
    """Read a .py or .txt file from the workspace and return its contents.

    Args:
        path: File path, interpreted relative to the workspace directory.

    Returns:
        The file's text (decoded as UTF-8), or a message starting with
        ``ERROR:`` when the path leaves the workspace, does not name an
        existing regular file, or has an unsupported extension.
    """
    root = WORKSPACE.resolve()
    target = (root / path).resolve()
    # Compare path components rather than string prefixes: a sibling directory
    # such as "workspace_old" shares the string prefix of the workspace path
    # but is NOT inside it.
    try:
        target.relative_to(root)
    except ValueError:
        return "ERROR: path is outside the workspace."
    # is_file() (not exists()) so that a directory is reported as "not found"
    # instead of failing later inside read_text().
    if not target.is_file():
        return f"ERROR: file '{path}' not found."
    if target.suffix not in (".py", ".txt"):
        return f"ERROR: can only read .py and .txt files, got '{target.suffix}'."
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    return target.read_text(encoding="utf-8")
def grep_search(pattern: str, file_glob: str = "*.py") -> str:
    """Search workspace files for lines containing a literal substring.

    Args:
        pattern: Text to look for. This is a plain substring test
            (``pattern in line``), not a regular expression.
        file_glob: Glob, relative to the workspace, selecting the files to
            scan. Only .py and .txt files are ever searched.

    Returns:
        One ``<relative path>:<line number>: <line>`` entry per matching line,
        joined by newlines, or a "No matches ..." message.
    """
    root = WORKSPACE.resolve()
    matches = []
    for filepath in sorted(WORKSPACE.glob(file_glob)):
        if filepath.suffix not in (".py", ".txt") or not filepath.is_file():
            continue
        # A glob containing ".." (or a symlink) can yield files that live
        # outside the workspace; skip anything that resolves elsewhere.
        try:
            filepath.resolve().relative_to(root)
        except ValueError:
            continue
        try:
            text = filepath.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            # Best effort: unreadable or non-UTF-8 files are skipped, not fatal.
            continue
        rel = filepath.relative_to(WORKSPACE)
        matches.extend(
            f"{rel}:{lineno}: {line}"
            for lineno, line in enumerate(text.splitlines(), 1)
            if pattern in line
        )
    if not matches:
        return f"No matches for '{pattern}' in {file_glob}."
    return "\n".join(matches)
def list_files(file_glob: str = "*") -> str:
    """List files in the workspace matching the glob pattern.

    Args:
        file_glob: Glob, relative to the workspace directory.

    Returns:
        The matching regular files as workspace-relative paths, one per line
        in sorted order, or a "No files matching ..." message.
    """
    root = WORKSPACE.resolve()
    found = []
    for candidate in sorted(WORKSPACE.glob(file_glob)):
        if not candidate.is_file():
            continue
        # A glob containing ".." (or a symlink) can point outside the
        # workspace; never reveal such files to the agent.
        try:
            candidate.resolve().relative_to(root)
        except ValueError:
            continue
        found.append(str(candidate.relative_to(WORKSPACE)))
    if not found:
        return f"No files matching '{file_glob}' in workspace."
    return "\n".join(found)
def write_file(path: str, content: str) -> str:
    """Write content to a .py or .txt file in the workspace.

    Missing parent directories are created. An existing file is overwritten.

    Args:
        path: Destination path, interpreted relative to the workspace.
        content: Complete new text of the file (written as UTF-8).

    Returns:
        ``OK: wrote <n> chars to <path>.`` on success, or a message starting
        with ``ERROR:`` when the path leaves the workspace or has an
        unsupported extension.
    """
    root = WORKSPACE.resolve()
    target = (root / path).resolve()
    # Compare path components rather than string prefixes: a sibling directory
    # such as "workspace_old" shares the string prefix of the workspace path
    # but is NOT inside it.
    try:
        target.relative_to(root)
    except ValueError:
        return "ERROR: path is outside the workspace."
    if target.suffix not in (".py", ".txt"):
        return f"ERROR: can only write .py and .txt files, got '{target.suffix}'."
    target.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    target.write_text(content, encoding="utf-8")
    return f"OK: wrote {len(content)} chars to {path}."
def run_python(path: str) -> str:
    """Execute a Python file in the workspace and return stdout + stderr.

    The script runs with the current interpreter, with the workspace as its
    working directory, and is given at most 30 seconds.

    Args:
        path: Script path, interpreted relative to the workspace.

    Returns:
        A report with optional ``STDOUT:`` and ``STDERR:`` sections followed
        by ``Exit code: <n>``, or a message starting with ``ERROR:`` when the
        path leaves the workspace, is not an existing file, or the script
        times out.
    """
    timeout_seconds = 30
    root = WORKSPACE.resolve()
    target = (root / path).resolve()
    # Compare path components rather than string prefixes: a sibling directory
    # such as "workspace_old" shares the string prefix of the workspace path
    # but is NOT inside it.
    try:
        target.relative_to(root)
    except ValueError:
        return "ERROR: path is outside the workspace."
    if not target.is_file():
        return f"ERROR: file '{path}' not found."
    try:
        result = subprocess.run(
            [sys.executable, str(target)],
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            cwd=str(root),
        )
    except subprocess.TimeoutExpired:
        # subprocess.run() kills the child before re-raising, so nothing is
        # left running; report the timeout in the same style as other errors.
        return f"ERROR: '{path}' did not finish within {timeout_seconds} seconds and was stopped."
    # Build the report section by section so that STDERR always starts on its
    # own line, even when stdout does not end with a newline.
    sections = []
    if result.stdout:
        sections.append(f"STDOUT:\n{result.stdout.rstrip()}")
    if result.stderr:
        sections.append(f"STDERR:\n{result.stderr.rstrip()}")
    sections.append(f"Exit code: {result.returncode}")
    return "\n".join(sections)
def validate_python(path: str) -> str:
    """Check whether a Python file has valid syntax using ast.parse.

    The file is only parsed, never executed.

    Args:
        path: File path, interpreted relative to the workspace.

    Returns:
        ``OK: syntax is valid.``, a ``SYNTAX ERROR: ...`` description, or a
        message starting with ``ERROR:`` when the path leaves the workspace
        or is not an existing file.
    """
    root = WORKSPACE.resolve()
    target = (root / path).resolve()
    # Compare path components rather than string prefixes: a sibling directory
    # such as "workspace_old" shares the string prefix of the workspace path
    # but is NOT inside it.
    try:
        target.relative_to(root)
    except ValueError:
        return "ERROR: path is outside the workspace."
    if not target.is_file():
        return f"ERROR: file '{path}' not found."
    try:
        # Parse the raw bytes so the parser applies Python's own source
        # decoding rules (UTF-8 default, BOM, coding cookie) instead of the
        # platform's locale encoding. The filename makes messages readable.
        ast.parse(target.read_bytes(), filename=str(path))
    except (SyntaxError, ValueError) as e:
        # ValueError: some Python versions reject null bytes this way.
        return f"SYNTAX ERROR: {e}"
    return "OK: syntax is valid."
def done(summary: str) -> str:
    """Return the completion message for a finished task.

    This tool does no work itself; it only prefixes the agent's summary with
    the ``DONE: `` marker.

    Args:
        summary: The agent's description of what it accomplished.

    Returns:
        The summary prefixed with ``DONE: ``.
    """
    return "DONE: {}".format(summary)
# ═══════════════════════════════════════════════════════════════════════════════
# PART B -- TOOL DISPATCHER (pre-built)
# ═══════════════════════════════════════════════════════════════════════════════
#
# This is the bridge between the LLM's JSON output and Python function calls.
#
# When the LLM says: {"tool": "read_file", "arguments": {"path": "app.py"}}
# The dispatcher does: TOOL_FUNCTIONS["read_file"](path="app.py")
#
# The **arguments syntax means "unpack the dict as keyword arguments":
# {"path": "app.py"} → read_file(path="app.py")
# Registry mapping each tool name (exactly as the LLM must spell it in the
# "tool" field of its JSON reply) to the Python function that implements it.
# Every tool is registered under its own function name; the order here is
# the order shown in the "unknown tool" error message.
TOOL_FUNCTIONS = {
    tool.__name__: tool
    for tool in (
        read_file,
        grep_search,
        list_files,
        write_file,
        run_python,
        validate_python,
        done,
    )
}
def dispatch_tool(tool_name: str, arguments: dict) -> str:
    """Run the tool registered under ``tool_name`` with keyword arguments.

    Failures never propagate to the caller: an unknown tool name, a call that
    does not fit the tool's signature, or any exception raised by the tool is
    turned into an ``ERROR ...`` string so it can be fed back to the LLM.

    Example:
        dispatch_tool("read_file", {"path": "app.py"})
        → calls read_file(path="app.py")
        → returns the file contents as a string

    Args:
        tool_name: Key into TOOL_FUNCTIONS.
        arguments: Mapping unpacked as keyword arguments for the tool.

    Returns:
        The tool's return value, or an error description.
    """
    handler = TOOL_FUNCTIONS.get(tool_name)
    if handler is None:
        return f"ERROR: unknown tool '{tool_name}'. Available: {list(TOOL_FUNCTIONS.keys())}"
    try:
        outcome = handler(**arguments)
    except TypeError as exc:
        # Typically wrong, missing or unexpected argument names from the LLM.
        return f"ERROR calling {tool_name}: {exc}"
    except Exception as exc:
        # Boundary handler: any tool failure becomes text for the LLM.
        return f"ERROR in {tool_name}: {type(exc).__name__}: {exc}"
    return outcome
# ═══════════════════════════════════════════════════════════════════════════════
# PART C -- SYSTEM PROMPT (TODOs 1-2)
# ═══════════════════════════════════════════════════════════════════════════════
#
# The system prompt is the MOST IMPORTANT part of the agent. It is the only
# way the LLM knows what tools it has and how to use them.
#
# Think about it: the LLM is just a text model. It has no built-in ability
# to read files or run code. The system prompt is where we TELL it:
# "You have these tools. When you want to use one, output this JSON format.
# I (the code) will parse your JSON, run the tool, and give you the result."
#
# The LLM then "plays along" — it outputs JSON that LOOKS LIKE a tool call,
# and our agent loop code makes it ACTUALLY happen.
# TODO 1: Complete the TOOL_DESCRIPTIONS string below.
# This text will be embedded in the system prompt inside a <tools> section.
# The LLM needs to know:
# - The name of each tool (must match the keys in TOOL_FUNCTIONS above!)
# - What arguments each tool takes
# - What each tool does
#
# Four tools are already described for you as examples.
# Add the missing three: write_file, run_python, validate_python.
#
# Follow the same format:
# - tool_name({"param": "<description>"}): What the tool does.
# One line per tool, in the form
#   - tool_name({"param": "<description>"}): What the tool does.
# This text is embedded verbatim in the system prompt. The names must match
# the keys of TOOL_FUNCTIONS, and each argument example must be well-formed
# JSON-like text, because the LLM copies this shape when it calls a tool.
TOOL_DESCRIPTIONS = """\
- read_file({"path": "<relative path>"}): Read a .py or .txt file from the workspace.
- grep_search({"pattern": "<text>", "file_glob": "<glob, default='*.py'>"}): Search for a pattern in files.
- list_files({"file_glob": "<glob, default='*'>"}): List files matching the pattern.
- done({"summary": "<what you accomplished>"}): Signal that you are finished.
- write_file({"path": "<relative path>", "content": "<new content>"}): Create or overwrite a .py or .txt file in the workspace with the given content.
- run_python({"path": "<relative path>"}): Run a Python file from the workspace and return its stdout, stderr and exit code.
- validate_python({"path": "<relative path>"}): Check a Python file for syntax errors without running it.
"""
# TODO 2: Complete the system prompt.
# The structure is provided — fill in the <workflow> and <rules> sections.
#
# For <workflow>, describe these steps:
# 1. PLAN: Think about what steps are needed. List them in "thought".
# 2. ACT: Choose ONE tool to call.
# 3. OBSERVE: Analyse the tool's output carefully.
# 4. REPLAN: If the result was unexpected, revise your plan.
# 5. REPEAT: Go back to ACT if more work is needed.
# 6. DONE: Call the "done" tool when the task is complete.
#
# For <rules>, include at least:
# - Always plan before acting.
# - Call exactly ONE tool per response.
# - After writing code, always validate and run it.
# - If an error occurs, try to fix it (up to 3 retries).
# - Stay within the workspace directory.
# - When finished, call the "done" tool.
#
# IMPORTANT: The JSON example uses {{ and }} because this is an f-string.
# In an f-string, {{ produces a literal { in the output.
# So {{"thought": "..."}} becomes {"thought": "..."} when printed.
# System prompt sent as the first message of every conversation.
# NOTE: this is an f-string, so a literal brace is written as {{ or }}.
# Writing {{{{ would render as "{{" and show the LLM invalid JSON examples.
SYSTEM_PROMPT = f"""\
You are a coding agent that helps users with Python programming tasks.
You work inside a workspace directory and have access to tools.
<tools>
Available tools:
{TOOL_DESCRIPTIONS}
</tools>
<workflow>
1. PLAN: Think about what steps are needed. List them in "thought".
2. ACT: Choose ONE tool to call.
3. OBSERVE: Analyse the tool's output carefully.
4. REPLAN: If the result was unexpected, revise your plan.
5. REPEAT: Go back to ACT if more work is needed.
6. DONE: Call the "done" tool when the task is complete.
</workflow>
<response_format>
You MUST respond with a JSON object every time. The format is:
{{
"thought": "<your reasoning about what to do next>",
"tool": "<tool name from the list above>",
"arguments": {{ <arguments for the tool> }}
}}
Example — to read a file:
{{
"thought": "I need to read app.py to understand the code.",
"tool": "read_file",
"arguments": {{"path": "app.py"}}
}}
Example — to signal completion:
{{
"thought": "I have fixed all the bugs and verified the code runs.",
"tool": "done",
"arguments": {{"summary": "Fixed 3 bugs in app.py and verified all tests pass."}}
}}
</response_format>
<rules>
- Always plan before acting.
- Call exactly ONE tool per response.
- After writing code, always validate and run it.
- If an error occurs, try to fix it (up to 3 retries).
- Stay within the workspace directory.
- When finished, call the "done" tool.
</rules>
"""
# ═══════════════════════════════════════════════════════════════════════════════
# PART D -- AGENT LOOP (TODOs 3-6)
# ═══════════════════════════════════════════════════════════════════════════════
#
# This is where everything comes together. The agent loop:
#
# 1. Sends messages to the LLM (including the system prompt with tools).
# 2. The LLM responds with JSON like: {"tool": "read_file", "arguments": {"path": "app.py"}}
# 3. We parse that JSON and call the real Python function.
# 4. We put the result back into the conversation as a new message.
# 5. We send the updated conversation to the LLM again.
# 6. The LLM sees the result and decides what to do next.
# 7. Repeat until the LLM calls "done" or we hit the iteration limit.
def truncate_result(result: str, max_length: "int | None" = None) -> str:
    """Shorten an over-long tool result, keeping its beginning and end.

    Args:
        result: Text returned by a tool.
        max_length: Longest result passed through unchanged. Defaults to the
            module-level MAX_RESULT_LENGTH when omitted.

    Returns:
        ``result`` itself if it fits. Otherwise the first and last
        ``max_length // 2`` characters, joined by a notice stating the
        original length. The notice is added on top of the kept text, so the
        returned string is somewhat longer than ``max_length``.
    """
    limit = MAX_RESULT_LENGTH if max_length is None else max_length
    if len(result) <= limit:
        return result
    half = max(limit, 0) // 2
    notice = f"\n\n... [TRUNCATED — {len(result)} chars total, showing first and last {half}] ...\n\n"
    # Slice the tail from an explicit start index: result[-half:] would
    # return the WHOLE string when half is 0.
    return result[:half] + notice + result[len(result) - half:]
def trim_messages(messages: list, max_chars: "int | None" = None) -> list:
    """Drop the oldest tool exchanges once the history grows too large.

    The first two messages (system prompt and original task) are always kept.
    Messages after them are discarded oldest-first until the total content
    length fits the limit, and a reminder of the original task is inserted
    where the history was cut.

    Args:
        messages: Chat messages, each a dict with a string "content".
        max_chars: Largest total content length left untouched. Defaults to
            the module-level MAX_HISTORY_CHARS when omitted.

    Returns:
        ``messages`` itself when it already fits; otherwise a new list. The
        input list is never modified. The inserted reminder is not counted
        against the limit.
    """
    limit = MAX_HISTORY_CHARS if max_chars is None else max_chars
    sizes = [len(m["content"]) for m in messages]
    total = sum(sizes)
    if total <= limit:
        return messages
    head = messages[:2]
    original_task = messages[1]["content"] if len(messages) > 1 else ""
    # Advance a cut index and keep a running total, instead of popping from
    # the front and re-summing everything on each step.
    cut = 2
    while cut < len(messages) and total > limit:
        total -= sizes[cut]
        cut += 1
    tail = messages[cut:]
    reminder = {
        "role": "user",
        "content": (
            "<system_note>Earlier conversation history was trimmed. "
            f"REMINDER — your original task was:\n{original_task}\n"
            "Continue from where you left off.</system_note>"
        ),
    }
    return head + [reminder] + tail
def ask_human() -> str:
    """Prompt the user to approve, comment on, or abort the pending action.

    Returns:
        The stripped text the user typed (empty when they just pressed
        Enter), or ``"stop"`` when input ends or the user hits Ctrl+C.
    """
    prompt = "\n [Enter]=continue, or type a comment (stop to abort): "
    try:
        answer = input(prompt)
    except (EOFError, KeyboardInterrupt):
        return "stop"
    return answer.strip()
def _parse_action(raw: str) -> tuple:
    """Decode one LLM reply into ``(thought, tool_name, arguments)``.

    Raises:
        ValueError: If the reply is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass), is not a JSON object, has no
            usable "tool" name, or has "arguments" that are not an object.
    """
    action = json.loads(raw)
    if not isinstance(action, dict):
        raise ValueError("the reply must be a single JSON object")
    tool_name = action.get("tool")
    if not isinstance(tool_name, str) or not tool_name:
        raise ValueError('the "tool" field is missing or is not a string')
    arguments = action.get("arguments", {})
    if not isinstance(arguments, dict):
        raise ValueError('the "arguments" field must be a JSON object')
    return str(action.get("thought", "")), tool_name, arguments


def agent_loop(user_task: str) -> None:
    """Run the agent loop: plan -> user review -> act -> observe -> repeat.

    Each iteration asks the LLM for its next action as JSON, shows that
    action to the user for approval, runs the chosen tool, and appends the
    tool's output to the conversation so the LLM can decide what to do next.

    The loop ends when the LLM calls "done" and the user approves, when the
    user answers "stop", or after MAX_ITERATIONS iterations.

    Args:
        user_task: The task description typed by the user.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_task},
    ]

    for iteration in range(1, MAX_ITERATIONS + 1):
        print_separator(f"Agent Iteration {iteration}")
        messages = trim_messages(messages)

        # Ask the LLM for its next action.
        raw = chat_json(client, messages, temperature=0.2, max_tokens=4096)
        try:
            thought, tool_name, arguments = _parse_action(raw)
        except ValueError as exc:
            # Malformed reply: tell the LLM what was wrong and let it retry.
            # Retries count as iterations, so they cannot loop forever.
            print(f"  [invalid response from the LLM: {exc}]")
            messages.append({"role": "assistant", "content": raw})
            messages.append({
                "role": "user",
                "content": (
                    f"Your last response could not be used ({exc}). "
                    'Respond with exactly one JSON object containing "thought", '
                    '"tool" and "arguments".'
                ),
            })
            continue

        # Show the plan BEFORE asking for approval, so the user knows what
        # they are approving.
        print(f"  Thought:   {thought}")
        print(f"  Tool:      {tool_name}")
        print(f"  Arguments: {json.dumps(arguments, indent=2, ensure_ascii=False)}")

        finished = tool_name == "done"
        summary = arguments.get("summary", "(no summary provided)")
        if finished:
            print(f"\n  Summary: {summary}")

        # Human-in-the-loop review.
        human = ask_human()
        if human.lower() == "stop":
            # return (not break), so the "max iterations" banner is not shown.
            print("  Agent stopped by user.")
            return
        if human:
            # The user objected: do NOT run the tool; feed the comment back.
            messages.append({"role": "assistant", "content": raw})
            messages.append({
                "role": "user",
                "content": (
                    f"<human_message>{human}</human_message>\n"
                    "Please revise your plan based on this feedback."
                ),
            })
            continue

        if finished:
            print_separator("Agent finished")
            print(f"  {summary}")
            return

        # Execute the approved tool and feed the (possibly truncated) result
        # back. It goes in a "user" message wrapped in <tool_result> tags so
        # the LLM can tell it apart from human input.
        result = truncate_result(dispatch_tool(tool_name, arguments))
        messages.append({"role": "assistant", "content": raw})
        messages.append({
            "role": "user",
            "content": f'<tool_result tool="{tool_name}">\n{result}\n</tool_result>',
        })
        print(f"  Result:\n{result}")

    print_separator("Agent stopped (max iterations reached)")
# ═══════════════════════════════════════════════════════════════════════════════
# PART E -- INTERACTIVE CHAT (TODOs 7-8)
# ═══════════════════════════════════════════════════════════════════════════════
# TODO 7: Implement the input loop.
# - Read input with: user_input = input("You> ").strip()
# - Handle EOFError and KeyboardInterrupt (Ctrl+C)
# - Skip empty input
# - Exit on "quit" or "exit"
# - Otherwise call agent_loop(user_input)
def interactive_chat():
    """Run an interactive chat loop where the user gives tasks to the agent.

    Prints a banner and the files currently in the workspace, then reads
    tasks from standard input and runs agent_loop() on each one. The function
    returns (instead of terminating the interpreter) when the user types
    "quit" or "exit" in any letter case, presses Ctrl+C, or input ends.
    """
    print_separator("AI Coding Agent -- Interactive Mode")
    print("Type your task and press Enter. Type 'quit' or 'exit' to stop.")
    print(f"Workspace: {WORKSPACE.resolve()}\n")

    # Show what files are in the workspace
    files = [f for f in sorted(WORKSPACE.glob("*")) if f.is_file()]
    if files:
        print("Files in workspace:")
        for f in files:
            print(f" {f.name}")
    else:
        print("Workspace is empty.")
    print()

    try:
        while True:
            user_input = input("You> ").strip()
            if not user_input:
                continue
            if user_input.lower() in ("quit", "exit"):
                return
            agent_loop(user_input)
    except (KeyboardInterrupt, EOFError):
        # End the line the interrupted prompt was on, then leave quietly.
        print()
        return
# ═══════════════════════════════════════════════════════════════════════════════
# MAIN
# ═══════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Seed the workspace with the sample file the exercise works on, unless a
    # copy is already there (an existing copy may contain the agent's edits
    # and must not be overwritten).
    source = Path(__file__).parent / "analyze_me.py"
    dest = WORKSPACE / "analyze_me.py"
    if not dest.exists():
        if source.is_file():
            dest.write_bytes(source.read_bytes())
            print(f"Copied {source.name} into the workspace.")
        else:
            print(f"Note: {source.name} was not found next to this script; nothing was copied.")
    interactive_chat()