# AISE1_CLASS/Prompting Exercise/ex04_cot_pipeline_solution.py
# NOTE: repository-viewer chrome (line/size counts, "ambiguous Unicode"
# warning) removed so this file parses as Python.

"""
Exercise 4 SOLUTION Build Your Own Chain-of-Thought Pipeline
================================================================
AISE501 · Prompting in Coding · Spring Semester 2026
"""
import ast
import json
import subprocess
import sys
from pathlib import Path
from server_utils import (
chat, chat_json, get_client, print_messages, print_separator,
strip_code_fences,
)
# Shared LLM client used by every chat/chat_json call in this script.
client = get_client()
# The buggy module under repair; must sit next to this script on disk.
code_to_fix = Path("analyze_me.py").read_text()
# Task specification handed to the model verbatim in Parts A and C.
PROBLEM = """\
Rewrite the Python module analyze_me.py so that it is correct,
robust, and production-ready.
Requirements:
1. calculate_statistics() must handle empty lists without crashing.
2. Use sample variance (divide by N-1).
3. process_data() must use a context manager and handle non-numeric lines.
4. normalize() must fix the operator-precedence bug and raise ValueError
for unknown methods.
5. All functions must have PEP-484 type hints and NumPy-style docstrings.
6. The module must pass basic sanity checks when run as __main__.
"""
# ── Part A: Planning Phase ────────────────────────────────────────────────────
print_separator("Part A Planning Phase (CoT Step 1)")

# System prompt: force a pure-English, JSON-only plan with no code in it.
system_plan = """\
You are a software architect. Your ONLY job right now is to produce a
structured reasoning plan. You must NOT write any Python code or code
snippets anywhere in your response — not in action fields, not in
reasoning fields, nowhere. Use plain English descriptions only.
Respond with valid JSON only (no markdown fences, no extra text).
"""

# User prompt: problem statement + buggy code + explicit JSON schema.
prompt_plan = f"""\
<problem>
{PROBLEM}
</problem>
<code language="python" filename="analyze_me.py">
{code_to_fix}
</code>
<task>
Analyse the problem and the buggy code above.
Produce a step-by-step plan that a developer can follow to implement
the corrected module. Each step must be atomic and self-contained.
</task>
<schema>
{{
"goal": "<one-sentence goal>",
"steps": [
{{
"step_id": 1,
"title": "<short title>",
"reasoning": "<why this step is necessary>",
"action": "<concrete action to take — plain English only, no code>",
"depends_on": []
}}
]
}}
</schema>"""

messages_plan = [
    {"role": "system", "content": system_plan},
    {"role": "user", "content": prompt_plan},
]
print_messages(messages_plan)

raw_plan = chat_json(client, messages_plan, max_tokens=4096)
print("Raw plan JSON:")
print(raw_plan)

# Parse the structured plan; Part B iterates over plan["steps"] below.
plan = json.loads(raw_plan)
print(f"\nGoal: {plan['goal']}\n")
for plan_step in plan["steps"]:
    print(f"Step {plan_step['step_id']} {plan_step['title']}")
    print(f" Reasoning : {plan_step['reasoning']}")
    print(f" Action : {plan_step['action']}")
    deps = plan_step.get("depends_on", [])
    if deps:
        print(f" Depends on: steps {deps}")
    print()
# ── Part B: Iterative Execution Phase ────────────────────────────────────────
print_separator("Part B Iterative Execution Phase (CoT Step 2)")
# Instead of dumping the entire plan into a single prompt, we iterate through
# each step individually. After every step we:
# 1. Feed the model only the CURRENT step + the accumulated code so far
# 2. Validate the output (syntax check via py_compile)
# 3. Use the validated output as input for the next step
#
# This mirrors how a real developer works: implement one change, verify it
# compiles, then move on. It also means the model always works with CONCRETE
# code from the previous step rather than an abstract plan of what it intends
# to write.
# System prompt for the step executor: one change at a time, code-only output.
system_exec = """\
You are a senior Python developer. You receive the current state of a
Python module together with a single step to implement. Apply ONLY the
requested change. Return the complete updated module — no explanations
outside the code block.
"""
def validate_syntax_ast(code: str) -> tuple[bool, str]:
    """Check whether *code* is syntactically valid Python via ``ast.parse``.

    Parameters
    ----------
    code : str
        Source text to parse.

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when the code parses, otherwise
        ``(False, <error message>)``.
    """
    try:
        ast.parse(code)
    except SyntaxError as exc:
        return False, str(exc)
    return True, ""
def validate_syntax(code: str) -> tuple[bool, str]:
    """Syntax-check *code* by byte-compiling it with ``py_compile``.

    The code is written to a scratch file and compiled in a separate
    interpreter process (``python -m py_compile``), which exercises the
    same compilation path the module will hit when actually imported.

    Parameters
    ----------
    code : str
        Source text to check.

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when compilation succeeds, otherwise
        ``(False, <compiler stderr>)``.
    """
    tmp = Path("_tmp_validate.py")
    try:
        tmp.write_text(code)
        result = subprocess.run(
            [sys.executable, "-m", "py_compile", str(tmp)],
            capture_output=True, text=True,
        )
        if result.returncode == 0:
            return True, ""
        return False, result.stderr.strip()
    finally:
        # Always remove the scratch file, even if compilation blew up.
        tmp.unlink(missing_ok=True)
current_code = code_to_fix  # start with the original buggy code

for step in plan["steps"]:
    step_id = step["step_id"]
    print_separator(f"Executing Step {step_id} {step['title']}")

    # One step at a time: the model sees the latest code plus this step only.
    prompt_step = f"""\
<current_code>
{current_code}
</current_code>
<step>
Step {step_id}: {step['title']}
Action: {step['action']}
Reasoning: {step['reasoning']}
</step>
<task>
Apply ONLY this single step to the current code above.
Do not skip ahead to other steps.
Mark your change with a comment: # Step {step_id} {step['title']}
Return the complete updated Python module.
Do not include any explanation outside the code.
</task>"""
    messages_step = [
        {"role": "system", "content": system_exec},
        {"role": "user", "content": prompt_step},
    ]
    print_messages(messages_step)

    raw_response = chat(client, messages_step, temperature=0.2, max_tokens=4096)
    step_code = strip_code_fences(raw_response)

    # Gate: the step's output must at least compile before we accept it.
    ok, error_msg = validate_syntax(step_code)
    if not ok:
        print(f" [FAIL] Step {step_id} syntax error:\n{error_msg}")
        print(" Retrying with error feedback...")
        # Give the model one chance to fix its own syntax error.
        retry_prompt = f"""\
The code you returned has a syntax error:
<error>
{error_msg}
</error>
<code>
{step_code}
</code>
<task>
Fix the syntax error and return the complete corrected module.
Do not include any explanation outside the code.
</task>"""
        messages_retry = [
            {"role": "system", "content": system_exec},
            {"role": "user", "content": retry_prompt},
        ]
        print_messages(messages_retry)
        retry_response = chat(client, messages_retry, temperature=0.1, max_tokens=4096)
        retry_code = strip_code_fences(retry_response)
        ok2, error_msg2 = validate_syntax(retry_code)
        if ok2:
            print(f" [PASS] Step {step_id} retry syntax OK")
            current_code = retry_code
        else:
            # Still broken after one retry: keep the last known-good code.
            print(f" [FAIL] Step {step_id} retry still has errors: {error_msg2}")
            print(" Continuing with last valid code.")
    else:
        print(f" [PASS] Step {step_id} syntax OK")
        current_code = step_code

    print(f"\n--- Code after Step {step_id} ---")
    print(current_code)
    print()
# Persist the final CoT result to disk.
Path("analyze_me_fixed.py").write_text(current_code)
print("\nSaved iterative CoT result to analyze_me_fixed.py")

# Final validation: execute the repaired module in a fresh interpreter
# and surface its stdout/stderr/exit code.
print_separator("Final Validation Running analyze_me_fixed.py")
result = subprocess.run(
    [sys.executable, "analyze_me_fixed.py"],
    capture_output=True,
    text=True,
)
print("STDOUT:", result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)
print(f"Exit code: {result.returncode}")
# ── Part C: Baseline Direct Prompt Without CoT ─────────────────────────────
print_separator("Part C Baseline: Direct Prompt Without CoT")
# One-shot baseline: same problem and code, but no plan and no per-step
# validation loop — used to compare against the iterative CoT result.
direct_prompt = f"""\
<problem>
{PROBLEM}
</problem>
<code language="python" filename="analyze_me.py">
{code_to_fix}
</code>
<task>
Rewrite the module so that it satisfies all requirements in <problem>.
Return only the corrected Python code.
</task>"""
messages_direct = [{"role": "user", "content": direct_prompt}]
print_messages(messages_direct)
# Note: no retry or syntax gate here, unlike the Part B loop.
direct_response = chat(client, messages_direct, temperature=0.3, max_tokens=4096)
print(direct_response)
Path("analyze_me_direct.py").write_text(strip_code_fences(direct_response))
print("\nSaved direct-prompt result to analyze_me_direct.py")
print(
"\nCompare analyze_me_fixed.py (CoT) with analyze_me_direct.py (direct).\n"
"Which is more complete? Which follows the requirements more closely?"
)
# ── Reflection Questions ──────────────────────────────────────────────────────
print_separator("Reflection Questions")
# Discussion prompts for students; printed, not graded programmatically.
print(
"1. How did the iterative CoT output differ from the direct single-shot?\n"
"2. Did the validation step catch any syntax errors? How were they fixed?\n"
"3. What would happen if you gave the model a deliberately wrong plan?\n"
"4. How does this manual CoT pipeline relate to built-in thinking modes\n"
" in models like o1, DeepSeek-R1, and Qwen3 with think mode enabled?\n"
"5. What are the trade-offs of step-by-step iteration vs. one-shot?\n"
" (Think: latency, cost, error isolation, debuggability)\n"
"6. How could you extend the validation step beyond syntax checking?\n"
" (Hint: unit tests, type checking, linting)\n"
)