"""
|
||
Exercise 4 – Build Your Own Chain-of-Thought Pipeline
|
||
======================================================
|
||
AISE501 · Prompting in Coding · Spring Semester 2026
|
||
|
||
Learning goals
|
||
--------------
|
||
* Understand that reasoning models (o1, DeepSeek-R1, Qwen3 think mode)
|
||
generate a hidden "plan" before giving the final answer.
|
||
* Replicate this behaviour manually using multiple LLM calls:
|
||
Call 1 (Planning) – structured input → structured JSON plan
|
||
Calls 2…N (Execution) – iterate step-by-step, validating each step
|
||
* See why explicit reasoning steps improve answer quality for complex tasks.
|
||
|
||
Background
|
||
----------
|
||
When you disable Qwen3's built-in thinking mode (as we do in server_utils),
|
||
you get fast, direct answers — but no explicit reasoning.
|
||
In this exercise you rebuild that reasoning step yourself, step by step,
|
||
so you can inspect and control the thinking process.
|
||
|
||
The problem
|
||
-----------
|
||
Given the buggy analyze_me.py from earlier exercises, design and implement
|
||
a corrected, production-ready version of the full module.
|
||
|
||
Tasks
|
||
-----
|
||
Part A Planning phase: structured input → JSON reasoning plan (TODOs 1-5).
|
||
Part B Iterative execution: apply each plan step one at a time,
|
||
validating syntax after each step (TODOs 6-10).
|
||
Part C Reflection — compare with and without CoT (TODO 11).
|
||
|
||
Estimated time: 50-60 minutes
|
||
"""
|
||
|
||
import json
import subprocess
import sys
from pathlib import Path

from server_utils import (
    chat, chat_json, get_client, print_messages, print_separator,
    strip_code_fences,
)

# Shared OpenAI-compatible client for all LLM calls in this exercise.
client = get_client()

# The buggy module from earlier exercises that we will plan for and fix.
code_to_fix = Path("analyze_me.py").read_text()

# ── The Problem Statement ─────────────────────────────────────────────────────
# We will use this description in both phases so we define it once.

PROBLEM = """\
Rewrite the Python module analyze_me.py so that it is correct,
robust, and production-ready.

Requirements:
1. calculate_statistics() must handle empty lists without crashing.
2. Use sample variance (divide by N-1).
3. process_data() must use a context manager and handle non-numeric lines.
4. normalize() must fix the operator-precedence bug and raise ValueError
   for unknown methods.
5. All functions must have PEP-484 type hints and NumPy-style docstrings.
6. The module must pass basic sanity checks when run as __main__.
"""

# ── Part A: Planning Phase ────────────────────────────────────────────────────
print_separator("Part A – Planning Phase (CoT Step 1)")

# The goal of this phase is NOT to write the code — it is to produce a
# structured plan: what steps are needed and in what order?

# TODO 1: Write a system prompt that instructs the model to act as a
#         "software architect" whose job is ONLY to produce a plan,
#         never to write the final code.
#         IMPORTANT: explicitly forbid code snippets in all fields —
#         use plain English only. This prevents unescaped quotes from
#         breaking the JSON output.
#         Enforce JSON-only output.

system_plan = """\
TODO: Write a system prompt for the planning phase.
The model should only reason and plan, not write code.
Enforce JSON-only output.
"""

# TODO 2: Write the planning user prompt using XML tags:
#         <problem> – embed the PROBLEM string
#         <code>    – embed the buggy code_to_fix
#         <task>    – ask for a step-by-step plan
#         <schema>  – specify the exact JSON schema for the plan:
#
#         {
#           "goal": "<one sentence goal>",
#           "steps": [
#             {
#               "step_id": 1,
#               "title": "<short title>",
#               "reasoning": "<why this step is needed>",
#               "action": "<what to do in this step — plain English, no code>",
#               "depends_on": []  // list of step_ids this step depends on
#             },
#             ...
#           ]
#         }

prompt_plan = f"""\
TODO: Write the planning prompt here.
Use <problem>, <code>, <task>, and <schema> tags.

<problem>
{PROBLEM}
</problem>

<code language="python" filename="analyze_me.py">
{code_to_fix}
</code>"""

# TODO 3: Build messages_plan (system + user) and call chat_json().
#         Use chat_json() (not chat()) so the server enforces valid JSON via
#         response_format={"type": "json_object"}.
#         Use max_tokens=4096 — the plan can be long and would get cut off
#         with the default 2048, producing truncated (unparseable) JSON.

messages_plan = [
    # TODO: add system and user messages
]

# print_messages(messages_plan)
# raw_plan = chat_json(client, messages_plan, max_tokens=4096)
# print("Raw plan JSON:")
# print(raw_plan)


# TODO 4: Parse raw_plan with json.loads().
#         Print each step in a readable format:
#           Step 1 – <title>
#             Reasoning : <reasoning>
#             Action    : <action>

# plan = json.loads(raw_plan)
# print(f"\nGoal: {plan['goal']}\n")
# for step in plan["steps"]:
#     print(f"Step {step['step_id']} – {step['title']}")
#     print(f"  Reasoning : {step['reasoning']}")
#     print(f"  Action    : {step['action']}\n")


# TODO 5: (Optional) Inspect the plan critically.
#         Does the order of steps make sense?
#         Are any steps missing?
#         You can edit the plan dict before passing it to the execution phase.

# ── Part B: Iterative Execution Phase ────────────────────────────────────────
print_separator("Part B – Iterative Execution Phase (CoT Step 2)")

# KEY INSIGHT: Instead of dumping the entire plan into one big prompt
# (which would just be another one-shot), we iterate through each step
# individually. After every step we:
#   1. Feed the model only the CURRENT step + the accumulated code so far
#   2. Validate the output (syntax check via py_compile)
#   3. Use the validated output as input for the next step
#
# This mirrors how a real developer works: implement one change, verify it
# compiles, then move on. The model always works with CONCRETE code from
# the previous step rather than an abstract plan of what it intends to write.

# TODO 6: Write a system prompt for the execution phase.
#         The model should act as a developer who receives the current
#         state of a module plus a single step to implement.
#         It should apply ONLY that step and return the full updated module.

system_exec = """\
TODO: Write a system prompt for the step-by-step execution phase.
The model should apply ONE step at a time.
"""

# TODO 7 (solved): validate_syntax() writes the code to a temp file and
#         runs py_compile on it, returning (True, "") for valid syntax
#         or (False, error_message) otherwise.

def validate_syntax(code: str) -> tuple[bool, str]:
    """Check *code* for syntax errors by compiling it with py_compile.

    Parameters
    ----------
    code : str
        Python source text to validate.

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when the code compiles cleanly, otherwise
        ``(False, error_message)`` with the compiler's diagnostic.
    """
    import py_compile  # local import: only this helper needs it

    tmp = Path("_tmp_validate.py")
    cfile = Path("_tmp_validate.pyc")
    try:
        tmp.write_text(code)
        # doraise=True raises PyCompileError instead of printing to stderr;
        # an explicit cfile keeps the bytecode out of __pycache__ so cleanup
        # is a single unlink.
        py_compile.compile(str(tmp), cfile=str(cfile), doraise=True)
        return True, ""
    except py_compile.PyCompileError as exc:
        return False, str(exc)
    finally:
        # Always remove the scratch files, even when compilation fails.
        tmp.unlink(missing_ok=True)
        cfile.unlink(missing_ok=True)

# TODO 8: Implement the step-by-step execution loop.
#         Start with current_code = code_to_fix (the original buggy code).
#         For each step in plan["steps"]:
#           a) Build a prompt with <current_code>, <step>, and <task> tags
#           b) Call chat() with the prompt
#           c) Strip code fences from the response
#           d) Validate syntax using validate_syntax()
#           e) If valid: update current_code
#           f) If invalid: retry ONCE with error feedback
#           g) Print the code after each step

# current_code = code_to_fix
#
# for step in plan["steps"]:
#     step_id = step["step_id"]
#     print_separator(f"Executing Step {step_id} – {step['title']}")
#
#     prompt_step = f"""\
# TODO: Build the per-step prompt here.
# Include <current_code>, <step>, and <task> tags.
# Tell the model to apply ONLY this step."""
#
#     messages_step = [
#         {"role": "system", "content": system_exec},
#         {"role": "user", "content": prompt_step},
#     ]
#
#     print_messages(messages_step)
#     raw_response = chat(client, messages_step, temperature=0.2, max_tokens=4096)
#     step_code = strip_code_fences(raw_response)
#
#     # Validate syntax
#     ok, error_msg = validate_syntax(step_code)
#     if ok:
#         print(f"  [PASS] Step {step_id} – syntax OK")
#         current_code = step_code
#     else:
#         print(f"  [FAIL] Step {step_id} – syntax error: {error_msg}")
#         # TODO: retry with error feedback (see TODO 9)
#
#     print(f"\n--- Code after Step {step_id} ---")
#     print(current_code)


# TODO 9: Implement the retry logic for syntax errors.
#         When a step produces invalid syntax:
#           a) Build a retry prompt with the <error> and the broken <code>
#           b) Ask the model to fix the syntax error
#           c) Validate again
#           d) If still broken, keep the last valid code and continue


# TODO 10: Save the final result and run it as a validation.
#          - Save current_code to "analyze_me_fixed.py"
#          - Run it with subprocess and print the output

# Path("analyze_me_fixed.py").write_text(current_code)
# print("\nSaved iterative CoT result to analyze_me_fixed.py")
#
# result = subprocess.run(
#     [sys.executable, "analyze_me_fixed.py"],
#     capture_output=True, text=True,
# )
# print("STDOUT:", result.stdout)
# if result.stderr:
#     print("STDERR:", result.stderr)
# print(f"Exit code: {result.returncode}")

# ── Part C: Compare With and Without CoT ─────────────────────────────────────
print_separator("Part C – Baseline: Direct Prompt Without CoT")

# TODO 11: Send the same problem to the model in a SINGLE prompt with NO plan.
#          Compare this response with the iterative CoT version.

direct_prompt = f"""\
TODO: Write a direct, single-shot prompt asking the model to rewrite
analyze_me.py according to the PROBLEM requirements.
No plan, no iteration — just ask directly.

<problem>
{PROBLEM}
</problem>

<code language="python" filename="analyze_me.py">
{code_to_fix}
</code>"""

# messages_direct = [{"role": "user", "content": direct_prompt}]
# print_messages(messages_direct)
# direct_response = chat(client, messages_direct, temperature=0.3, max_tokens=4096)
# print(direct_response)


# ── Reflection Questions ──────────────────────────────────────────────────────
print_separator("Reflection Questions")
print(
    "1. How did the iterative CoT output differ from the direct single-shot?\n"
    "2. Did the validation step catch any syntax errors? How were they fixed?\n"
    "3. What would happen if you gave the model a deliberately wrong plan?\n"
    "4. How does this manual CoT pipeline relate to built-in thinking modes\n"
    "   in models like o1, DeepSeek-R1, and Qwen3 with think mode enabled?\n"
    "5. What are the trade-offs of step-by-step iteration vs. one-shot?\n"
    "   (Think: latency, cost, error isolation, debuggability)\n"
    "6. How could you extend the validation step beyond syntax checking?\n"
    "   (Hint: unit tests, type checking, linting)\n"
)