# AISE1_CLASS/Prompting Exercise/ex04_cot_pipeline.py

"""
Exercise 4: Build Your Own Chain-of-Thought Pipeline
====================================================
AISE501 · Prompting in Coding · Spring Semester 2026
Learning goals
--------------
* Understand that reasoning models (o1, DeepSeek-R1, Qwen3 in thinking mode)
  generate a hidden "plan" before giving the final answer.
* Replicate this behaviour manually using multiple LLM calls:
    Call 1 (Planning):     structured input → structured JSON plan
    Calls 2…N (Execution): iterate step-by-step, validating each step
* See why explicit reasoning steps improve answer quality for complex tasks.
Background
----------
When you disable Qwen3's built-in thinking mode (as we do in server_utils),
you get fast, direct answers — but no explicit reasoning.
In this exercise you rebuild that reasoning step yourself, step by step,
so you can inspect and control the thinking process.
The problem
-----------
Given the buggy analyze_me.py from earlier exercises, design and implement
a corrected, production-ready version of the full module.
Tasks
-----
Part A  Planning phase: structured input → JSON reasoning plan (TODOs 1-5).
Part B  Iterative execution: apply each plan step one at a time,
        validating syntax after each step (TODOs 6-10).
Part C  Reflection: compare with and without CoT (TODO 11).
Estimated time: 50-60 minutes
"""
import json
import subprocess
import sys
from pathlib import Path
from server_utils import (
    chat, chat_json, get_client, print_messages, print_separator,
    strip_code_fences,
)
client = get_client()
code_to_fix = Path("analyze_me.py").read_text()
# ── The Problem Statement ─────────────────────────────────────────────────────
# We will use this description in both phases so we define it once.
PROBLEM = """\
Rewrite the Python module analyze_me.py so that it is correct,
robust, and production-ready.
Requirements:
1. calculate_statistics() must handle empty lists without crashing.
2. Use sample variance (divide by N-1).
3. process_data() must use a context manager and handle non-numeric lines.
4. normalize() must fix the operator-precedence bug and raise ValueError
   for unknown methods.
5. All functions must have PEP-484 type hints and NumPy-style docstrings.
6. The module must pass basic sanity checks when run as __main__.
"""
# ── Part A: Planning Phase ────────────────────────────────────────────────────
print_separator("Part A: Planning Phase (CoT Step 1)")
# The goal of this phase is NOT to write the code — it is to produce a
# structured plan: what steps are needed and in what order?
# TODO 1: Write a system prompt that instructs the model to act as a
#   "software architect" whose job is ONLY to produce a plan,
#   never to write the final code.
#   IMPORTANT: explicitly forbid code snippets in all fields; require
#   plain English only. This prevents unescaped quotes from breaking
#   the JSON output.
#   Enforce JSON-only output.
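# Quick illustration (not part of the exercise) of why code snippets in
# JSON fields are risky: a model that emits a raw snippet containing
# quotes inside a JSON string produces output json.loads() cannot parse.
_bad_plan = '{"action": "add print("done") at the end"}'
try:
    json.loads(_bad_plan)
except json.JSONDecodeError as exc:
    print(f"Unescaped quotes break the plan JSON: {exc}")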
system_plan = """\
TODO: Write a system prompt for the planning phase.
The model should only reason and plan, not write code.
Enforce JSON-only output.
"""
# TODO 2: Write the planning user prompt using XML tags:
#   <problem>  embed the PROBLEM string
#   <code>     embed the buggy code_to_fix
#   <task>     ask for a step-by-step plan
#   <schema>   specify the exact JSON schema for the plan:
#
#   {
#     "goal": "<one sentence goal>",
#     "steps": [
#       {
#         "step_id": 1,
#         "title": "<short title>",
#         "reasoning": "<why this step is needed>",
#         "action": "<what to do in this step; plain English, no code>",
#         "depends_on": []   // list of step_ids this step depends on
#       },
#       ...
#     ]
#   }
prompt_plan = f"""\
TODO: Write the planning prompt here.
Use <problem>, <code>, <task>, and <schema> tags.
<problem>
{PROBLEM}
</problem>
<code language="python" filename="analyze_me.py">
{code_to_fix}
</code>"""
# TODO 3: Build messages_plan (system + user) and call chat_json().
#   Use chat_json() (not chat()) so the server enforces valid JSON via
#   response_format={"type": "json_object"}.
#   Use max_tokens=4096: the plan can be long and would get cut off
#   with the default 2048, producing truncated (unparseable) JSON.
messages_plan = [
    # TODO: add system and user messages
]
# print_messages(messages_plan)
# raw_plan = chat_json(client, messages_plan, max_tokens=4096)
# print("Raw plan JSON:")
# print(raw_plan)
# TODO 4: Parse raw_plan with json.loads().
#   Print each step in a readable format:
#     Step 1: <title>
#       Reasoning : <reasoning>
#       Action    : <action>
# plan = json.loads(raw_plan)
# print(f"\nGoal: {plan['goal']}\n")
# for step in plan["steps"]:
#     print(f"Step {step['step_id']}: {step['title']}")
#     print(f"  Reasoning : {step['reasoning']}")
#     print(f"  Action    : {step['action']}\n")
# TODO 5: (Optional) Inspect the plan critically.
#   Does the order of steps make sense?
#   Are any steps missing?
#   You can edit the plan dict before passing it to the execution phase.
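# Optional helper (a sketch, not required by the exercise): a structural
# sanity check you could run on the parsed plan before executing it.
# The keys checked below are the ones requested by the schema in TODO 2.
def check_plan(plan: dict) -> list[str]:
    """Return a list of problems found in a plan dict (empty list = OK)."""
    problems = []
    if "goal" not in plan or "steps" not in plan:
        return ["plan must contain 'goal' and 'steps'"]
    seen_ids = set()
    for step in plan["steps"]:
        sid = step.get("step_id")
        for key in ("step_id", "title", "reasoning", "action"):
            if key not in step:
                problems.append(f"step {sid}: missing key '{key}'")
        # Dependencies must reference steps that appeared earlier.
        for dep in step.get("depends_on", []):
            if dep not in seen_ids:
                problems.append(f"step {sid}: depends on unknown/later step {dep}")
        seen_ids.add(sid)
    return problems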
# ── Part B: Iterative Execution Phase ────────────────────────────────────────
print_separator("Part B: Iterative Execution Phase (CoT Step 2)")
# KEY INSIGHT: Instead of dumping the entire plan into one big prompt
# (which would just be another one-shot), we iterate through each step
# individually. After every step we:
#   1. Feed the model only the CURRENT step + the accumulated code so far
#   2. Validate the output (syntax check via py_compile)
#   3. Use the validated output as input for the next step
#
# This mirrors how a real developer works: implement one change, verify it
# compiles, then move on. The model always works with CONCRETE code from
# the previous step rather than an abstract plan of what it intends to write.
# TODO 6: Write a system prompt for the execution phase.
#   The model should act as a developer who receives the current
#   state of a module plus a single step to implement.
#   It should apply ONLY that step and return the full updated module.
system_exec = """\
TODO: Write a system prompt for the step-by-step execution phase.
The model should apply ONE step at a time.
"""
# TODO 7: Complete the validate_syntax() function below.
#   It should write code to a temp file and run py_compile on it.
#   Return (True, "") if syntax is valid, (False, error_message) otherwise.
def validate_syntax(code: str) -> tuple[bool, str]:
    """Write code to a temp file and run py_compile to check syntax."""
    tmp = Path("_tmp_validate.py")
    # TODO: write code to tmp, run py_compile, clean up, return result
    tmp.unlink(missing_ok=True)
    return True, ""  # placeholder
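# Demonstration (a sketch, separate from the TODO above) of the
# py_compile behaviour validate_syntax() relies on: passing doraise=True
# makes py_compile raise PyCompileError on invalid syntax instead of
# printing to stderr. The helper name _check_demo is illustrative only.
import py_compile

def _check_demo(code: str) -> tuple[bool, str]:
    demo = Path("_tmp_demo.py")
    demo.write_text(code)
    try:
        py_compile.compile(str(demo), doraise=True)
        return True, ""
    except py_compile.PyCompileError as exc:
        return False, str(exc)
    finally:
        demo.unlink(missing_ok=True)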
# TODO 8: Implement the step-by-step execution loop.
#   Start with current_code = code_to_fix (the original buggy code).
#   For each step in plan["steps"]:
#     a) Build a prompt with <current_code>, <step>, and <task> tags
#     b) Call chat() with the prompt
#     c) Strip code fences from the response
#     d) Validate syntax using validate_syntax()
#     e) If valid: update current_code
#     f) If invalid: retry ONCE with error feedback
#     g) Print the code after each step
# current_code = code_to_fix
#
# for step in plan["steps"]:
#     step_id = step["step_id"]
#     print_separator(f"Executing Step {step_id}: {step['title']}")
#
#     prompt_step = f"""\
# TODO: Build the per-step prompt here.
# Include <current_code>, <step>, and <task> tags.
# Tell the model to apply ONLY this step."""
#
#     messages_step = [
#         {"role": "system", "content": system_exec},
#         {"role": "user", "content": prompt_step},
#     ]
#
#     print_messages(messages_step)
#     raw_response = chat(client, messages_step, temperature=0.2, max_tokens=4096)
#     step_code = strip_code_fences(raw_response)
#
#     # Validate syntax
#     ok, error_msg = validate_syntax(step_code)
#     if ok:
#         print(f"  [PASS] Step {step_id}: syntax OK")
#         current_code = step_code
#     else:
#         print(f"  [FAIL] Step {step_id}: syntax error: {error_msg}")
#         # TODO: retry with error feedback (see TODO 9)
#
#     print(f"\n--- Code after Step {step_id} ---")
#     print(current_code)
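# Note: strip_code_fences() comes from the course's server_utils module
# and its exact behaviour isn't shown here. A minimal equivalent (an
# assumption, for illustration only) removes a leading and trailing
# ``` fence line if present:
def _strip_fences_demo(text: str) -> str:
    lines = text.strip().splitlines()
    if lines and lines[0].startswith("```"):
        lines = lines[1:]
    if lines and lines[-1].startswith("```"):
        lines = lines[:-1]
    return "\n".join(lines)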
# TODO 9: Implement the retry logic for syntax errors.
#   When a step produces invalid syntax:
#     a) Build a retry prompt with the <error> and the broken <code>
#     b) Ask the model to fix the syntax error
#     c) Validate again
#     d) If still broken, keep the last valid code and continue
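# Sketch of the validate-then-retry control flow from TODO 9, written
# against a generic `generate` callable so it can be inspected without a
# live model. In the exercise, `generate` would wrap chat() with a retry
# prompt containing the <error> and the broken <code>; the function name
# and signature here are illustrative, not part of server_utils.
def apply_with_retry(candidate, last_good, generate, validate):
    """Return candidate if valid, else one regenerated attempt, else last_good."""
    ok, error_msg = validate(candidate)
    if ok:
        return candidate
    retry = generate(candidate, error_msg)  # single retry with error feedback
    ok, _ = validate(retry)
    return retry if ok else last_good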
# TODO 10: Save the final result and run it as a validation.
#   - Save current_code to "analyze_me_fixed.py"
#   - Run it with subprocess and print the output
# Path("analyze_me_fixed.py").write_text(current_code)
# print("\nSaved iterative CoT result to analyze_me_fixed.py")
#
# result = subprocess.run(
#     [sys.executable, "analyze_me_fixed.py"],
#     capture_output=True, text=True,
# )
# print("STDOUT:", result.stdout)
# if result.stderr:
# print("STDERR:", result.stderr)
# print(f"Exit code: {result.returncode}")
# ── Part C: Compare With and Without CoT ─────────────────────────────────────
print_separator("Part C: Baseline (Direct Prompt Without CoT)")
# TODO 11: Send the same problem to the model in a SINGLE prompt with NO plan.
#   Compare this response with the iterative CoT version.
direct_prompt = f"""\
TODO: Write a direct, single-shot prompt asking the model to rewrite
analyze_me.py according to the PROBLEM requirements.
No plan, no iteration — just ask directly.
<problem>
{PROBLEM}
</problem>
<code language="python" filename="analyze_me.py">
{code_to_fix}
</code>"""
# messages_direct = [{"role": "user", "content": direct_prompt}]
# print_messages(messages_direct)
# direct_response = chat(client, messages_direct, temperature=0.3, max_tokens=4096)
# print(direct_response)
# ── Reflection Questions ──────────────────────────────────────────────────────
print_separator("Reflection Questions")
print(
    "1. How did the iterative CoT output differ from the direct single-shot?\n"
    "2. Did the validation step catch any syntax errors? How were they fixed?\n"
    "3. What would happen if you gave the model a deliberately wrong plan?\n"
    "4. How does this manual CoT pipeline relate to built-in thinking modes\n"
    "   in models like o1, DeepSeek-R1, and Qwen3 with think mode enabled?\n"
    "5. What are the trade-offs of step-by-step iteration vs. one-shot?\n"
    "   (Think: latency, cost, error isolation, debuggability)\n"
    "6. How could you extend the validation step beyond syntax checking?\n"
    "   (Hint: unit tests, type checking, linting)\n"
)