280 lines
9.0 KiB
Python
280 lines
9.0 KiB
Python
"""
|
||
Exercise 4 – SOLUTION – Build Your Own Chain-of-Thought Pipeline
|
||
================================================================
|
||
AISE501 · Prompting in Coding · Spring Semester 2026
|
||
"""
|
||
|
||
import ast
|
||
import json
|
||
import subprocess
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
from server_utils import (
|
||
chat, chat_json, get_client, print_messages, print_separator,
|
||
strip_code_fences,
|
||
)
|
||
|
||
client = get_client()
|
||
|
||
code_to_fix = Path("analyze_me.py").read_text()
|
||
|
||
PROBLEM = """\
|
||
Rewrite the Python module analyze_me.py so that it is correct,
|
||
robust, and production-ready.
|
||
|
||
Requirements:
|
||
1. calculate_statistics() must handle empty lists without crashing.
|
||
2. Use sample variance (divide by N-1).
|
||
3. process_data() must use a context manager and handle non-numeric lines.
|
||
4. normalize() must fix the operator-precedence bug and raise ValueError
|
||
for unknown methods.
|
||
5. All functions must have PEP-484 type hints and NumPy-style docstrings.
|
||
6. The module must pass basic sanity checks when run as __main__.
|
||
"""
|
||
|
||
|
||
# ── Part A: Planning Phase ────────────────────────────────────────────────────
print_separator("Part A – Planning Phase (CoT Step 1)")

# System prompt: force a prose-only JSON plan — no code allowed yet, so the
# model must reason about WHAT to change before writing anything.
system_plan = """\
You are a software architect. Your ONLY job right now is to produce a
structured reasoning plan. You must NOT write any Python code or code
snippets anywhere in your response — not in action fields, not in
reasoning fields, nowhere. Use plain English descriptions only.
Respond with valid JSON only (no markdown fences, no extra text).
"""

# User prompt: problem statement + buggy code + an explicit JSON schema.
# Double braces {{ }} escape literal braces inside the f-string.
prompt_plan = f"""\
<problem>
{PROBLEM}
</problem>

<code language="python" filename="analyze_me.py">
{code_to_fix}
</code>

<task>
Analyse the problem and the buggy code above.
Produce a step-by-step plan that a developer can follow to implement
the corrected module. Each step must be atomic and self-contained.
</task>

<schema>
{{
"goal": "<one-sentence goal>",
"steps": [
{{
"step_id": 1,
"title": "<short title>",
"reasoning": "<why this step is necessary>",
"action": "<concrete action to take — plain English only, no code>",
"depends_on": []
}}
]
}}
</schema>"""

messages_plan = [
    {"role": "system", "content": system_plan},
    {"role": "user", "content": prompt_plan},
]

print_messages(messages_plan)
# chat_json is expected to return a strict-JSON string (no fences).
raw_plan = chat_json(client, messages_plan, max_tokens=4096)
print("Raw plan JSON:")
print(raw_plan)

# Parse the plan; raises json.JSONDecodeError if the model violated the
# JSON-only instruction.
plan = json.loads(raw_plan)

# Pretty-print the plan so the student can inspect each step.
print(f"\nGoal: {plan['goal']}\n")
for step in plan["steps"]:
    print(f"Step {step['step_id']} – {step['title']}")
    print(f" Reasoning : {step['reasoning']}")
    print(f" Action : {step['action']}")
    # depends_on is optional in practice, hence .get with a default.
    deps = step.get("depends_on", [])
    if deps:
        print(f" Depends on: steps {deps}")
    print()
|
||
|
||
|
||
# ── Part B: Iterative Execution Phase ────────────────────────────────────────
print_separator("Part B – Iterative Execution Phase (CoT Step 2)")

# Instead of dumping the entire plan into a single prompt, we iterate through
# each step individually. After every step we:
# 1. Feed the model only the CURRENT step + the accumulated code so far
# 2. Validate the output (syntax check via py_compile)
# 3. Use the validated output as input for the next step
#
# This mirrors how a real developer works: implement one change, verify it
# compiles, then move on. It also means the model always works with CONCRETE
# code from the previous step rather than an abstract plan of what it intends
# to write.

# System prompt for the execution phase: apply exactly one step, return the
# whole module so the next iteration has a complete input.
system_exec = """\
You are a senior Python developer. You receive the current state of a
Python module together with a single step to implement. Apply ONLY the
requested change. Return the complete updated module — no explanations
outside the code block.
"""
|
||
|
||
|
||
def validate_syntax_ast(code: str) -> tuple[bool, str]:
    """Check whether *code* is syntactically valid Python via ``ast.parse``.

    Parameters
    ----------
    code : str
        Source text to verify.

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when the source parses cleanly, otherwise
        ``(False, <error message>)``.
    """
    try:
        ast.parse(code)
    except SyntaxError as exc:
        return False, str(exc)
    return True, ""
|
||
|
||
def validate_syntax(code: str) -> tuple[bool, str]:
    """Write *code* to a temp file and run py_compile to check its syntax.

    The original version was an unfinished placeholder that always returned
    ``(True, "")`` (and unlinked a file it never wrote), so the Part B
    validation loop silently accepted broken model output. This version
    actually performs the check.

    Parameters
    ----------
    code : str
        Source text to verify.

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when the source compiles, otherwise
        ``(False, <compiler stderr>)``.
    """
    tmp = Path("_tmp_validate.py")
    try:
        tmp.write_text(code)
        # Compile in a child interpreter so a pathological input cannot
        # affect this process; py_compile exits non-zero on a SyntaxError.
        result = subprocess.run(
            [sys.executable, "-m", "py_compile", str(tmp)],
            capture_output=True, text=True,
        )
        if result.returncode == 0:
            return True, ""
        return False, result.stderr.strip()
    finally:
        # Always remove the scratch file, even if write/compile raised.
        tmp.unlink(missing_ok=True)
|
||
|
||
|
||
# Start from the original buggy module; each step rewrites it incrementally.
current_code = code_to_fix  # start with the original buggy code

for step in plan["steps"]:
    step_id = step["step_id"]
    print_separator(f"Executing Step {step_id} – {step['title']}")

    # One prompt per plan step: the CURRENT module state plus exactly one
    # step's title/action/reasoning from the Part A plan.
    prompt_step = f"""\
<current_code>
{current_code}
</current_code>

<step>
Step {step_id}: {step['title']}
Action: {step['action']}
Reasoning: {step['reasoning']}
</step>

<task>
Apply ONLY this single step to the current code above.
Do not skip ahead to other steps.
Mark your change with a comment: # Step {step_id} – {step['title']}
Return the complete updated Python module.
Do not include any explanation outside the code.
</task>"""

    messages_step = [
        {"role": "system", "content": system_exec},
        {"role": "user", "content": prompt_step},
    ]

    print_messages(messages_step)
    # Low temperature: we want a faithful edit, not a creative rewrite.
    raw_response = chat(client, messages_step, temperature=0.2, max_tokens=4096)
    step_code = strip_code_fences(raw_response)

    # ── Validate: syntax check before moving on ──
    ok, error_msg = validate_syntax(step_code)
    if ok:
        print(f" [PASS] Step {step_id} – syntax OK")
        # Only a syntactically valid result becomes the next step's input.
        current_code = step_code
    else:
        print(f" [FAIL] Step {step_id} – syntax error:\n{error_msg}")
        print(" Retrying with error feedback...")

        # Give the model one chance to fix its own syntax error
        retry_prompt = f"""\
The code you returned has a syntax error:

<error>
{error_msg}
</error>

<code>
{step_code}
</code>

<task>
Fix the syntax error and return the complete corrected module.
Do not include any explanation outside the code.
</task>"""

        messages_retry = [
            {"role": "system", "content": system_exec},
            {"role": "user", "content": retry_prompt},
        ]

        print_messages(messages_retry)
        # Even lower temperature for the repair attempt.
        retry_response = chat(client, messages_retry, temperature=0.1, max_tokens=4096)
        retry_code = strip_code_fences(retry_response)

        # Accept the retry only if it now parses; otherwise keep the last
        # known-good code so later steps still receive valid input.
        ok2, error_msg2 = validate_syntax(retry_code)
        if ok2:
            print(f" [PASS] Step {step_id} – retry syntax OK")
            current_code = retry_code
        else:
            print(f" [FAIL] Step {step_id} – retry still has errors: {error_msg2}")
            print(" Continuing with last valid code.")

    print(f"\n--- Code after Step {step_id} ---")
    print(current_code)
    print()
|
||
|
||
# Save final result
Path("analyze_me_fixed.py").write_text(current_code)
print("\nSaved iterative CoT result to analyze_me_fixed.py")

# Final validation: run the module
print_separator("Final Validation – Running analyze_me_fixed.py")
# Execute the fixed module in a child interpreter; its __main__ sanity
# checks (requirement 6) determine the exit code reported below.
result = subprocess.run(
    [sys.executable, "analyze_me_fixed.py"],
    capture_output=True, text=True,
)
print("STDOUT:", result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)
print(f"Exit code: {result.returncode}")
|
||
|
||
|
||
# ── Part C: Baseline – Direct Prompt Without CoT ─────────────────────────────
print_separator("Part C – Baseline: Direct Prompt Without CoT")

# Single-shot baseline: same problem + code, but no plan, no iteration and
# no validation loop — used to compare against the CoT pipeline's output.
direct_prompt = f"""\
<problem>
{PROBLEM}
</problem>

<code language="python" filename="analyze_me.py">
{code_to_fix}
</code>

<task>
Rewrite the module so that it satisfies all requirements in <problem>.
Return only the corrected Python code.
</task>"""

messages_direct = [{"role": "user", "content": direct_prompt}]
print_messages(messages_direct)
direct_response = chat(client, messages_direct, temperature=0.3, max_tokens=4096)
print(direct_response)

# Persist the baseline next to the CoT result for side-by-side comparison.
Path("analyze_me_direct.py").write_text(strip_code_fences(direct_response))
print("\nSaved direct-prompt result to analyze_me_direct.py")

print(
    "\nCompare analyze_me_fixed.py (CoT) with analyze_me_direct.py (direct).\n"
    "Which is more complete? Which follows the requirements more closely?"
)
|
||
|
||
|
||
# ── Reflection Questions ──────────────────────────────────────────────────────
# Discussion prompts for the student; printed only, not evaluated.
print_separator("Reflection Questions")
print(
    "1. How did the iterative CoT output differ from the direct single-shot?\n"
    "2. Did the validation step catch any syntax errors? How were they fixed?\n"
    "3. What would happen if you gave the model a deliberately wrong plan?\n"
    "4. How does this manual CoT pipeline relate to built-in thinking modes\n"
    " in models like o1, DeepSeek-R1, and Qwen3 with think mode enabled?\n"
    "5. What are the trade-offs of step-by-step iteration vs. one-shot?\n"
    " (Think: latency, cost, error isolation, debuggability)\n"
    "6. How could you extend the validation step beyond syntax checking?\n"
    " (Hint: unit tests, type checking, linting)\n"
)
|