"""
Exercise 4 – SOLUTION – Build Your Own Chain-of-Thought Pipeline
================================================================
AISE501 · Prompting in Coding · Spring Semester 2026
"""
import ast
import json
import py_compile
import subprocess
import sys
from pathlib import Path

from server_utils import (
    chat, chat_json, get_client, print_messages, print_separator,
    strip_code_fences,
)
# Shared OpenAI-compatible client reused by every phase below.
client = get_client()
# The buggy module under repair; its full text is embedded verbatim in each prompt.
# assumes analyze_me.py sits in the current working directory — run from the exercise folder.
code_to_fix = Path("analyze_me.py").read_text()
# Task specification shown to the model in both the CoT pipeline (Parts A/B)
# and the single-shot baseline (Part C), so the two runs are comparable.
PROBLEM = """\
Rewrite the Python module analyze_me.py so that it is correct,
robust, and production-ready.
Requirements:
1. calculate_statistics() must handle empty lists without crashing.
2. Use sample variance (divide by N-1).
3. process_data() must use a context manager and handle non-numeric lines.
4. normalize() must fix the operator-precedence bug and raise ValueError
for unknown methods.
5. All functions must have PEP-484 type hints and NumPy-style docstrings.
6. The module must pass basic sanity checks when run as __main__.
"""
# ── Part A: Planning Phase ────────────────────────────────────────────────────
print_separator("Part A – Planning Phase (CoT Step 1)")
# System prompt: constrain the model to pure planning — JSON only, no code —
# so Part B can parse the plan mechanically and execute it step by step.
system_plan = """\
You are a software architect. Your ONLY job right now is to produce a
structured reasoning plan. You must NOT write any Python code or code
snippets anywhere in your response — not in action fields, not in
reasoning fields, nowhere. Use plain English descriptions only.
Respond with valid JSON only (no markdown fences, no extra text).
"""
# User prompt: problem statement + buggy source + the JSON schema the plan
# must follow. Doubled braces escape literal braces inside the f-string.
prompt_plan = f"""\
{PROBLEM}
{code_to_fix}
Analyse the problem and the buggy code above.
Produce a step-by-step plan that a developer can follow to implement
the corrected module. Each step must be atomic and self-contained.
{{
"goal": "",
"steps": [
{{
"step_id": 1,
"title": "",
"reasoning": "",
"action": "",
"depends_on": []
}}
]
}}
"""
messages_plan = [
    {"role": "system", "content": system_plan},
    {"role": "user", "content": prompt_plan},
]
print_messages(messages_plan)
# chat_json requests a bare JSON document from the model.
raw_plan = chat_json(client, messages_plan, max_tokens=4096)
print("Raw plan JSON:")
print(raw_plan)
# NOTE(review): json.loads raises if the model returns malformed JSON —
# presumably chat_json enforces the format; confirm against server_utils.
plan = json.loads(raw_plan)
print(f"\nGoal: {plan['goal']}\n")
# Pretty-print the plan so the reader can follow Part B's execution.
for step in plan["steps"]:
    print(f"Step {step['step_id']} – {step['title']}")
    print(f" Reasoning : {step['reasoning']}")
    print(f" Action : {step['action']}")
    # depends_on is optional in the schema; default to "no dependencies".
    deps = step.get("depends_on", [])
    if deps:
        print(f" Depends on: steps {deps}")
    print()
# ── Part B: Iterative Execution Phase ────────────────────────────────────────
print_separator("Part B – Iterative Execution Phase (CoT Step 2)")
# Instead of dumping the entire plan into a single prompt, we iterate through
# each step individually. After every step we:
#   1. Feed the model only the CURRENT step + the accumulated code so far
#   2. Validate the output (syntax check via py_compile)
#   3. Use the validated output as input for the next step
#
# This mirrors how a real developer works: implement one change, verify it
# compiles, then move on. It also means the model always works with CONCRETE
# code from the previous step rather than an abstract plan of what it intends
# to write.
#
# System prompt for the per-step executor: apply exactly one change and
# return the whole module, so every iteration yields a complete artifact
# that can be syntax-checked and fed into the next iteration.
system_exec = """\
You are a senior Python developer. You receive the current state of a
Python module together with a single step to implement. Apply ONLY the
requested change. Return the complete updated module — no explanations
outside the code block.
"""
def validate_syntax_ast(code: str) -> tuple[bool, str]:
    """Check whether *code* is syntactically valid Python via ``ast.parse``.

    Parameters
    ----------
    code : str
        Python source text to check.

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when the source parses cleanly, otherwise
        ``(False, <error message>)`` describing the syntax error.
    """
    try:
        ast.parse(code)
    except SyntaxError as exc:
        return False, str(exc)
    return True, ""
def validate_syntax(code: str) -> tuple[bool, str]:
    """Write *code* to a temp file and byte-compile it to check syntax.

    Fixes the TODO placeholder that unconditionally returned ``(True, "")``,
    which made the pipeline's validation step a no-op.

    Parameters
    ----------
    code : str
        Python source text to check.

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when the source compiles, otherwise
        ``(False, <error message>)`` describing the failure.
    """
    tmp = Path("_tmp_validate.py")
    pyc = Path("_tmp_validate.pyc")
    try:
        tmp.write_text(code)
        # doraise=True makes py_compile raise instead of printing to stderr;
        # an explicit cfile keeps the bytecode out of __pycache__ so we can
        # clean it up deterministically below.
        py_compile.compile(str(tmp), cfile=str(pyc), doraise=True)
        return True, ""
    except py_compile.PyCompileError as exc:
        return False, str(exc)
    finally:
        # Always remove both artifacts, even on failure.
        tmp.unlink(missing_ok=True)
        pyc.unlink(missing_ok=True)
current_code = code_to_fix  # start with the original buggy code
for step in plan["steps"]:
    step_id = step["step_id"]
    print_separator(f"Executing Step {step_id} – {step['title']}")
    # Per-step prompt: the CURRENT accumulated code plus exactly one plan
    # step, so the model edits concrete code rather than an abstract plan.
    prompt_step = f"""\
{current_code}
Step {step_id}: {step['title']}
Action: {step['action']}
Reasoning: {step['reasoning']}
Apply ONLY this single step to the current code above.
Do not skip ahead to other steps.
Mark your change with a comment: # Step {step_id} – {step['title']}
Return the complete updated Python module.
Do not include any explanation outside the code.
"""
    messages_step = [
        {"role": "system", "content": system_exec},
        {"role": "user", "content": prompt_step},
    ]
    print_messages(messages_step)
    # Low temperature: we want a faithful mechanical edit, not creativity.
    raw_response = chat(client, messages_step, temperature=0.2, max_tokens=4096)
    step_code = strip_code_fences(raw_response)
    # ── Validate: syntax check before moving on ──
    ok, error_msg = validate_syntax(step_code)
    if ok:
        print(f" [PASS] Step {step_id} – syntax OK")
        # Accept the step's output as the new baseline for the next step.
        current_code = step_code
    else:
        print(f" [FAIL] Step {step_id} – syntax error:\n{error_msg}")
        print(" Retrying with error feedback...")
        # Give the model one chance to fix its own syntax error
        retry_prompt = f"""\
The code you returned has a syntax error:
{error_msg}
{step_code}
Fix the syntax error and return the complete corrected module.
Do not include any explanation outside the code.
"""
        messages_retry = [
            {"role": "system", "content": system_exec},
            {"role": "user", "content": retry_prompt},
        ]
        print_messages(messages_retry)
        # Even lower temperature for the repair attempt.
        retry_response = chat(client, messages_retry, temperature=0.1, max_tokens=4096)
        retry_code = strip_code_fences(retry_response)
        ok2, error_msg2 = validate_syntax(retry_code)
        if ok2:
            print(f" [PASS] Step {step_id} – retry syntax OK")
            current_code = retry_code
        else:
            # Step is dropped: current_code keeps the last valid state, so a
            # single bad step cannot poison the rest of the pipeline.
            print(f" [FAIL] Step {step_id} – retry still has errors: {error_msg2}")
            print(" Continuing with last valid code.")
    print(f"\n--- Code after Step {step_id} ---")
    print(current_code)
    print()
# Save final result
Path("analyze_me_fixed.py").write_text(current_code)
print("\nSaved iterative CoT result to analyze_me_fixed.py")
# Final validation: run the module
print_separator("Final Validation – Running analyze_me_fixed.py")
# Execute the repaired module in a fresh interpreter so its __main__ sanity
# checks run; sys.executable guarantees the same Python as this script.
result = subprocess.run(
    [sys.executable, "analyze_me_fixed.py"],
    capture_output=True, text=True,
)
print("STDOUT:", result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)
print(f"Exit code: {result.returncode}")
# ── Part C: Baseline – Direct Prompt Without CoT ─────────────────────────────
print_separator("Part C – Baseline: Direct Prompt Without CoT")
# Single-shot baseline: same problem + same buggy code, but no plan and no
# per-step loop, for comparison against the CoT pipeline's output.
# Fix: the prompt previously ended "satisfies all requirements in ." — a
# dangling reference left behind by an edit; point it at the problem
# statement explicitly so the model knows which requirements are meant.
direct_prompt = f"""\
{PROBLEM}
{code_to_fix}
Rewrite the module so that it satisfies all requirements in the problem
statement above.
Return only the corrected Python code.
"""
messages_direct = [{"role": "user", "content": direct_prompt}]
print_messages(messages_direct)
direct_response = chat(client, messages_direct, temperature=0.3, max_tokens=4096)
print(direct_response)
Path("analyze_me_direct.py").write_text(strip_code_fences(direct_response))
print("\nSaved direct-prompt result to analyze_me_direct.py")
print(
    "\nCompare analyze_me_fixed.py (CoT) with analyze_me_direct.py (direct).\n"
    "Which is more complete? Which follows the requirements more closely?"
)
# ── Reflection Questions ──────────────────────────────────────────────────────
print_separator("Reflection Questions")
# Discussion prompts for students; printed for reading, not graded.
print(
    "1. How did the iterative CoT output differ from the direct single-shot?\n"
    "2. Did the validation step catch any syntax errors? How were they fixed?\n"
    "3. What would happen if you gave the model a deliberately wrong plan?\n"
    "4. How does this manual CoT pipeline relate to built-in thinking modes\n"
    " in models like o1, DeepSeek-R1, and Qwen3 with think mode enabled?\n"
    "5. What are the trade-offs of step-by-step iteration vs. one-shot?\n"
    " (Think: latency, cost, error isolation, debuggability)\n"
    "6. How could you extend the validation step beyond syntax checking?\n"
    " (Hint: unit tests, type checking, linting)\n"
)