# AISE1_CLASS/Prompting Exercise/ex04_cot_pipeline_solution.py
# NOTE: repository-viewer chrome (line/size counts, "ambiguous Unicode"
# warning) removed so this file parses as Python.

"""
Exercise 4 SOLUTION Build Your Own Chain-of-Thought Pipeline
================================================================
AISE501 · Prompting in Coding · Spring Semester 2026
"""
import ast
import json
import subprocess
import sys
from pathlib import Path
from server_utils import (
chat, chat_json, get_client, print_messages, print_separator,
strip_code_fences,
)
# Shared LLM client used by every chat/chat_json call in this script.
client = get_client()
# The buggy module under repair; must sit next to this script on disk.
code_to_fix = Path("analyze_me.py").read_text()
# Task specification handed to the model verbatim in Parts A and C.
PROBLEM = """\
Rewrite the Python module analyze_me.py so that it is correct,
robust, and production-ready.
Requirements:
1. calculate_statistics() must handle empty lists without crashing.
2. Use sample variance (divide by N-1).
3. process_data() must use a context manager and handle non-numeric lines.
4. normalize() must fix the operator-precedence bug and raise ValueError
for unknown methods.
5. All functions must have PEP-484 type hints and NumPy-style docstrings.
6. The module must pass basic sanity checks when run as __main__.
"""
# ── Part A: Planning Phase ────────────────────────────────────────────────────
print_separator("Part A Planning Phase (CoT Step 1)")

# System prompt: force a pure-English, JSON-only plan with no code in it.
system_plan = """\
You are a software architect. Your ONLY job right now is to produce a
structured reasoning plan. You must NOT write any Python code or code
snippets anywhere in your response — not in action fields, not in
reasoning fields, nowhere. Use plain English descriptions only.
Respond with valid JSON only (no markdown fences, no extra text).
"""

# User prompt: problem statement + buggy code + explicit JSON schema.
prompt_plan = f"""\
<problem>
{PROBLEM}
</problem>
<code language="python" filename="analyze_me.py">
{code_to_fix}
</code>
<task>
Analyse the problem and the buggy code above.
Produce a step-by-step plan that a developer can follow to implement
the corrected module. Each step must be atomic and self-contained.
</task>
<schema>
{{
"goal": "<one-sentence goal>",
"steps": [
{{
"step_id": 1,
"title": "<short title>",
"reasoning": "<why this step is necessary>",
"action": "<concrete action to take — plain English only, no code>",
"depends_on": []
}}
]
}}
</schema>"""

messages_plan = [
    {"role": "system", "content": system_plan},
    {"role": "user", "content": prompt_plan},
]
print_messages(messages_plan)

raw_plan = chat_json(client, messages_plan, max_tokens=4096)
print("Raw plan JSON:")
print(raw_plan)

# Parse the structured plan; Part B iterates over plan["steps"] below.
plan = json.loads(raw_plan)
print(f"\nGoal: {plan['goal']}\n")
for plan_step in plan["steps"]:
    print(f"Step {plan_step['step_id']} {plan_step['title']}")
    print(f" Reasoning : {plan_step['reasoning']}")
    print(f" Action : {plan_step['action']}")
    deps = plan_step.get("depends_on", [])
    if deps:
        print(f" Depends on: steps {deps}")
    print()
# ── Part B: Iterative Execution Phase ────────────────────────────────────────
print_separator("Part B Iterative Execution Phase (CoT Step 2)")
# Instead of dumping the entire plan into a single prompt, we iterate through
# each step individually. After every step we:
# 1. Feed the model only the CURRENT step + the accumulated code so far
# 2. Validate the output (syntax check via py_compile)
# 3. Use the validated output as input for the next step
#
# This mirrors how a real developer works: implement one change, verify it
# compiles, then move on. It also means the model always works with CONCRETE
# code from the previous step rather than an abstract plan of what it intends
# to write.
# System prompt for the step executor: one change at a time, code-only output.
system_exec = """\
You are a senior Python developer. You receive the current state of a
Python module together with a single step to implement. Apply ONLY the
requested change. Return the complete updated module — no explanations
outside the code block.
"""
def validate_syntax_ast(code: str) -> tuple[bool, str]:
    """Check whether *code* is syntactically valid Python via ``ast.parse``.

    Parameters
    ----------
    code : str
        Source text to parse.

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when the code parses, otherwise
        ``(False, <error message>)``.
    """
    try:
        ast.parse(code)
    except SyntaxError as exc:
        return False, str(exc)
    return True, ""
def validate_syntax(code: str) -> tuple[bool, str]:
    """Syntax-check *code* by byte-compiling it with ``py_compile``.

    The code is written to a scratch file and compiled in a separate
    interpreter process (``python -m py_compile``), which exercises the
    same compilation path the module will hit when actually imported.

    Parameters
    ----------
    code : str
        Source text to check.

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when compilation succeeds, otherwise
        ``(False, <compiler stderr>)``.
    """
    tmp = Path("_tmp_validate.py")
    try:
        tmp.write_text(code)
        result = subprocess.run(
            [sys.executable, "-m", "py_compile", str(tmp)],
            capture_output=True, text=True,
        )
        if result.returncode == 0:
            return True, ""
        return False, result.stderr.strip()
    finally:
        # Always remove the scratch file, even if compilation blew up.
        tmp.unlink(missing_ok=True)
current_code = code_to_fix  # start with the original buggy code

for step in plan["steps"]:
    step_id = step["step_id"]
    print_separator(f"Executing Step {step_id} {step['title']}")

    # One step at a time: the model sees the latest code plus this step only.
    prompt_step = f"""\
<current_code>
{current_code}
</current_code>
<step>
Step {step_id}: {step['title']}
Action: {step['action']}
Reasoning: {step['reasoning']}
</step>
<task>
Apply ONLY this single step to the current code above.
Do not skip ahead to other steps.
Mark your change with a comment: # Step {step_id} {step['title']}
Return the complete updated Python module.
Do not include any explanation outside the code.
</task>"""
    messages_step = [
        {"role": "system", "content": system_exec},
        {"role": "user", "content": prompt_step},
    ]
    print_messages(messages_step)

    raw_response = chat(client, messages_step, temperature=0.2, max_tokens=4096)
    step_code = strip_code_fences(raw_response)

    # Gate: the step's output must at least compile before we accept it.
    ok, error_msg = validate_syntax(step_code)
    if not ok:
        print(f" [FAIL] Step {step_id} syntax error:\n{error_msg}")
        print(" Retrying with error feedback...")
        # Give the model one chance to fix its own syntax error.
        retry_prompt = f"""\
The code you returned has a syntax error:
<error>
{error_msg}
</error>
<code>
{step_code}
</code>
<task>
Fix the syntax error and return the complete corrected module.
Do not include any explanation outside the code.
</task>"""
        messages_retry = [
            {"role": "system", "content": system_exec},
            {"role": "user", "content": retry_prompt},
        ]
        print_messages(messages_retry)
        retry_response = chat(client, messages_retry, temperature=0.1, max_tokens=4096)
        retry_code = strip_code_fences(retry_response)
        ok2, error_msg2 = validate_syntax(retry_code)
        if ok2:
            print(f" [PASS] Step {step_id} retry syntax OK")
            current_code = retry_code
        else:
            # Still broken after one retry: keep the last known-good code.
            print(f" [FAIL] Step {step_id} retry still has errors: {error_msg2}")
            print(" Continuing with last valid code.")
    else:
        print(f" [PASS] Step {step_id} syntax OK")
        current_code = step_code

    print(f"\n--- Code after Step {step_id} ---")
    print(current_code)
    print()
# Persist the final CoT result to disk.
Path("analyze_me_fixed.py").write_text(current_code)
print("\nSaved iterative CoT result to analyze_me_fixed.py")

# Final validation: execute the repaired module in a fresh interpreter
# and surface its stdout/stderr/exit code.
print_separator("Final Validation Running analyze_me_fixed.py")
result = subprocess.run(
    [sys.executable, "analyze_me_fixed.py"],
    capture_output=True,
    text=True,
)
print("STDOUT:", result.stdout)
if result.stderr:
    print("STDERR:", result.stderr)
print(f"Exit code: {result.returncode}")
# ── Part C: Baseline Direct Prompt Without CoT ─────────────────────────────
print_separator("Part C Baseline: Direct Prompt Without CoT")
# One-shot baseline: same problem and code, but no plan and no per-step
# validation loop — used to compare against the iterative CoT result.
direct_prompt = f"""\
<problem>
{PROBLEM}
</problem>
<code language="python" filename="analyze_me.py">
{code_to_fix}
</code>
<task>
Rewrite the module so that it satisfies all requirements in <problem>.
Return only the corrected Python code.
</task>"""
messages_direct = [{"role": "user", "content": direct_prompt}]
print_messages(messages_direct)
# Note: no retry or syntax gate here, unlike the Part B loop.
direct_response = chat(client, messages_direct, temperature=0.3, max_tokens=4096)
print(direct_response)
Path("analyze_me_direct.py").write_text(strip_code_fences(direct_response))
print("\nSaved direct-prompt result to analyze_me_direct.py")
print(
"\nCompare analyze_me_fixed.py (CoT) with analyze_me_direct.py (direct).\n"
"Which is more complete? Which follows the requirements more closely?"
)
# ── Reflection Questions ──────────────────────────────────────────────────────
print_separator("Reflection Questions")
# Discussion prompts for students; printed, not graded programmatically.
print(
"1. How did the iterative CoT output differ from the direct single-shot?\n"
"2. Did the validation step catch any syntax errors? How were they fixed?\n"
"3. What would happen if you gave the model a deliberately wrong plan?\n"
"4. How does this manual CoT pipeline relate to built-in thinking modes\n"
" in models like o1, DeepSeek-R1, and Qwen3 with think mode enabled?\n"
"5. What are the trade-offs of step-by-step iteration vs. one-shot?\n"
" (Think: latency, cost, error isolation, debuggability)\n"
"6. How could you extend the validation step beyond syntax checking?\n"
" (Hint: unit tests, type checking, linting)\n"
)