""" Exercise 4 – Build Your Own Chain-of-Thought Pipeline ====================================================== AISE501 · Prompting in Coding · Spring Semester 2026 Learning goals -------------- * Understand that reasoning models (o1, DeepSeek-R1, Qwen3 think mode) generate a hidden "plan" before giving the final answer. * Replicate this behaviour manually using multiple LLM calls: Call 1 (Planning) – structured input → structured JSON plan Calls 2…N (Execution) – iterate step-by-step, validating each step * See why explicit reasoning steps improve answer quality for complex tasks. Background ---------- When you disable Qwen3's built-in thinking mode (as we do in server_utils), you get fast, direct answers — but no explicit reasoning. In this exercise you rebuild that reasoning step yourself, step by step, so you can inspect and control the thinking process. The problem ----------- Given the buggy analyze_me.py from earlier exercises, design and implement a corrected, production-ready version of the full module. Tasks ----- Part A Planning phase: structured input → JSON reasoning plan (TODOs 1-5). Part B Iterative execution: apply each plan step one at a time, validating syntax after each step (TODOs 6-10). Part C Reflection — compare with and without CoT (TODO 11). Estimated time: 50-60 minutes """ import json import subprocess import sys from pathlib import Path from server_utils import ( chat, chat_json, get_client, print_messages, print_separator, strip_code_fences, ) client = get_client() code_to_fix = Path("analyze_me.py").read_text() # ── The Problem Statement ───────────────────────────────────────────────────── # We will use this description in both phases so we define it once. PROBLEM = """\ Rewrite the Python module analyze_me.py so that it is correct, robust, and production-ready. Requirements: 1. calculate_statistics() must handle empty lists without crashing. 2. Use sample variance (divide by N-1). 3. 
process_data() must use a context manager and handle non-numeric lines. 4. normalize() must fix the operator-precedence bug and raise ValueError for unknown methods. 5. All functions must have PEP-484 type hints and NumPy-style docstrings. 6. The module must pass basic sanity checks when run as __main__. """ # ── Part A: Planning Phase ──────────────────────────────────────────────────── print_separator("Part A – Planning Phase (CoT Step 1)") # The goal of this phase is NOT to write the code — it is to produce a # structured plan: what steps are needed and in what order? # TODO 1: Write a system prompt that instructs the model to act as a # "software architect" whose job is ONLY to produce a plan, # never to write the final code. # IMPORTANT: explicitly forbid code snippets in all fields — # use plain English only. This prevents unescaped quotes from # breaking the JSON output. # Enforce JSON-only output. system_plan = """\ TODO: Write a system prompt for the planning phase. The model should only reason and plan, not write code. Enforce JSON-only output. """ # TODO 2: Write the planning user prompt using XML tags: # – embed the PROBLEM string # – embed the buggy code_to_fix # – ask for a step-by-step plan # – specify the exact JSON schema for the plan: # # { # "goal": "", # "steps": [ # { # "step_id": 1, # "title": "", # "reasoning": "", # "action": "", # "depends_on": [] // list of step_ids this step depends on # }, # ... # ] # } prompt_plan = f"""\ TODO: Write the planning prompt here. Use , , , and tags. {PROBLEM} {code_to_fix} """ # TODO 3: Build messages_plan (system + user) and call chat_json(). # Use chat_json() (not chat()) so the server enforces valid JSON via # response_format={"type": "json_object"}. # Use max_tokens=4096 — the plan can be long and would get cut off # with the default 2048, producing truncated (unparseable) JSON. 
# Messages for the planning call; annotated so the intended shape
# (OpenAI-style role/content dicts) is explicit for students.
messages_plan: list[dict[str, str]] = [
    # TODO: add system and user messages
]

# print_messages(messages_plan)
# raw_plan = chat_json(client, messages_plan, max_tokens=4096)
# print("Raw plan JSON:")
# print(raw_plan)

# TODO 4: Parse raw_plan with json.loads().
#         Print each step in a readable format:
#             Step 1 – <title>
#               Reasoning : <reasoning>
#               Action    : <action>

# plan = json.loads(raw_plan)
# print(f"\nGoal: {plan['goal']}\n")
# for step in plan["steps"]:
#     print(f"Step {step['step_id']} – {step['title']}")
#     print(f"  Reasoning : {step['reasoning']}")
#     print(f"  Action    : {step['action']}\n")

# TODO 5: (Optional) Inspect the plan critically.
#         Does the order of steps make sense?
#         Are any steps missing?
#         You can edit the plan dict before passing it to the execution phase.

# ── Part B: Iterative Execution Phase ────────────────────────────────────────
print_separator("Part B – Iterative Execution Phase (CoT Step 2)")

# KEY INSIGHT: Instead of dumping the entire plan into one big prompt
# (which would just be another one-shot), we iterate through each step
# individually. After every step we:
#   1. Feed the model only the CURRENT step + the accumulated code so far
#   2. Validate the output (syntax check via py_compile)
#   3. Use the validated output as input for the next step
#
# This mirrors how a real developer works: implement one change, verify it
# compiles, then move on. The model always works with CONCRETE code from
# the previous step rather than an abstract plan of what it intends to write.

# TODO 6: Write a system prompt for the execution phase.
#         The model should act as a developer who receives the current
#         state of a module plus a single step to implement.
#         It should apply ONLY that step and return the full updated module.

system_exec = """\
TODO: Write a system prompt for the step-by-step execution phase.
The model should apply ONE step at a time.
"""

# TODO 7: Complete the validate_syntax() function below.
#         It should write code to a temp file and run py_compile on it.
# Return (True, "") if syntax is valid, (False, error_message) otherwise. def validate_syntax(code: str) -> tuple[bool, str]: """Write code to a temp file and run py_compile to check syntax.""" tmp = Path("_tmp_validate.py") # TODO: write code to tmp, run py_compile, clean up, return result tmp.unlink(missing_ok=True) return True, "" # placeholder # TODO 8: Implement the step-by-step execution loop. # Start with current_code = code_to_fix (the original buggy code). # For each step in plan["steps"]: # a) Build a prompt with <current_code>, <step>, and <task> tags # b) Call chat() with the prompt # c) Strip code fences from the response # d) Validate syntax using validate_syntax() # e) If valid: update current_code # f) If invalid: retry ONCE with error feedback # g) Print the code after each step # current_code = code_to_fix # # for step in plan["steps"]: # step_id = step["step_id"] # print_separator(f"Executing Step {step_id} – {step['title']}") # # prompt_step = f"""\ # TODO: Build the per-step prompt here. # Include <current_code>, <step>, and <task> tags. # Tell the model to apply ONLY this step.""" # # messages_step = [ # {"role": "system", "content": system_exec}, # {"role": "user", "content": prompt_step}, # ] # # print_messages(messages_step) # raw_response = chat(client, messages_step, temperature=0.2, max_tokens=4096) # step_code = strip_code_fences(raw_response) # # # Validate syntax # ok, error_msg = validate_syntax(step_code) # if ok: # print(f" [PASS] Step {step_id} – syntax OK") # current_code = step_code # else: # print(f" [FAIL] Step {step_id} – syntax error: {error_msg}") # # TODO: retry with error feedback (see TODO 9) # # print(f"\n--- Code after Step {step_id} ---") # print(current_code) # TODO 9: Implement the retry logic for syntax errors. 
#         When a step produces invalid syntax:
#           a) Build a retry prompt with the <error> and the broken <code>
#           b) Ask the model to fix the syntax error
#           c) Validate again
#           d) If still broken, keep the last valid code and continue

# TODO 10: Save the final result and run it as a validation.
#          - Save current_code to "analyze_me_fixed.py"
#          - Run it with subprocess and print the output

# Path("analyze_me_fixed.py").write_text(current_code)
# print("\nSaved iterative CoT result to analyze_me_fixed.py")
#
# result = subprocess.run(
#     [sys.executable, "analyze_me_fixed.py"],
#     capture_output=True, text=True,
# )
# print("STDOUT:", result.stdout)
# if result.stderr:
#     print("STDERR:", result.stderr)
# print(f"Exit code: {result.returncode}")

# ── Part C: Compare With and Without CoT ─────────────────────────────────────
print_separator("Part C – Baseline: Direct Prompt Without CoT")

# TODO 11: The control condition — one single-shot prompt, no plan, no
#          per-step validation. Run it and contrast the output with the
#          iterative pipeline from Part B.
direct_prompt = f"""\
TODO: Write a direct, single-shot prompt asking the model to rewrite
analyze_me.py according to the PROBLEM requirements.
No plan, no iteration — just ask directly.

<problem>
{PROBLEM}
</problem>

<code language="python" filename="analyze_me.py">
{code_to_fix}
</code>"""

# messages_direct = [{"role": "user", "content": direct_prompt}]
# print_messages(messages_direct)
# direct_response = chat(client, messages_direct, temperature=0.3, max_tokens=4096)
# print(direct_response)

# ── Reflection Questions ──────────────────────────────────────────────────────
print_separator("Reflection Questions")

# Discussion prompts for the write-up; collected in one string so they can
# be reused (e.g. dumped to a report file) without touching the print call.
reflection_questions = (
    "1. How did the iterative CoT output differ from the direct single-shot?\n"
    "2. Did the validation step catch any syntax errors? How were they fixed?\n"
    "3. What would happen if you gave the model a deliberately wrong plan?\n"
    "4. How does this manual CoT pipeline relate to built-in thinking modes\n"
    "   in models like o1, DeepSeek-R1, and Qwen3 with think mode enabled?\n"
    "5. What are the trade-offs of step-by-step iteration vs. one-shot?\n"
    "   (Think: latency, cost, error isolation, debuggability)\n"
    "6. How could you extend the validation step beyond syntax checking?\n"
    "   (Hint: unit tests, type checking, linting)\n"
)
print(reflection_questions)