"""Exercise 4 – SOLUTION – Build Your Own Chain-of-Thought Pipeline
================================================================
AISE501 · Prompting in Coding · Spring Semester 2026
"""

import ast
import json
import py_compile
import subprocess
import sys
from pathlib import Path

from server_utils import (
    chat,
    chat_json,
    get_client,
    print_messages,
    print_separator,
    strip_code_fences,
)

client = get_client()
# Explicit encoding so behaviour does not depend on the platform default
# (Windows would otherwise decode with cp1252 and may garble the file).
code_to_fix = Path("analyze_me.py").read_text(encoding="utf-8")

PROBLEM = """\
Rewrite the Python module analyze_me.py so that it is correct, robust, and
production-ready. Requirements:
1. calculate_statistics() must handle empty lists without crashing.
2. Use sample variance (divide by N-1).
3. process_data() must use a context manager and handle non-numeric lines.
4. normalize() must fix the operator-precedence bug and raise ValueError
   for unknown methods.
5. All functions must have PEP-484 type hints and NumPy-style docstrings.
6. The module must pass basic sanity checks when run as __main__.
"""

# ── Part A: Planning Phase ────────────────────────────────────────────────────
print_separator("Part A – Planning Phase (CoT Step 1)")

system_plan = """\
You are a software architect. Your ONLY job right now is to produce a
structured reasoning plan. You must NOT write any Python code or code
snippets anywhere in your response — not in action fields, not in
reasoning fields, nowhere. Use plain English descriptions only.
Respond with valid JSON only (no markdown fences, no extra text).
"""

prompt_plan = f"""\
{PROBLEM}

{code_to_fix}

Analyse the problem and the buggy code above. Produce a step-by-step plan
that a developer can follow to implement the corrected module.
Each step must be atomic and self-contained.

{{
  "goal": "",
  "steps": [
    {{
      "step_id": 1,
      "title": "",
      "reasoning": "",
      "action": "",
      "depends_on": []
    }}
  ]
}}
"""

messages_plan = [
    {"role": "system", "content": system_plan},
    {"role": "user", "content": prompt_plan},
]
print_messages(messages_plan)

raw_plan = chat_json(client, messages_plan, max_tokens=4096)
print("Raw plan JSON:")
print(raw_plan)

plan = json.loads(raw_plan)

print(f"\nGoal: {plan['goal']}\n")
for step in plan["steps"]:
    print(f"Step {step['step_id']} – {step['title']}")
    print(f"  Reasoning : {step['reasoning']}")
    print(f"  Action    : {step['action']}")
    deps = step.get("depends_on", [])
    if deps:
        print(f"  Depends on: steps {deps}")
    print()

# ── Part B: Iterative Execution Phase ────────────────────────────────────────
print_separator("Part B – Iterative Execution Phase (CoT Step 2)")

# Instead of dumping the entire plan into a single prompt, we iterate through
# each step individually. After every step we:
#   1. Feed the model only the CURRENT step + the accumulated code so far
#   2. Validate the output (syntax check via py_compile)
#   3. Use the validated output as input for the next step
#
# This mirrors how a real developer works: implement one change, verify it
# compiles, then move on. It also means the model always works with CONCRETE
# code from the previous step rather than an abstract plan of what it intends
# to write.

system_exec = """\
You are a senior Python developer. You receive the current state of a
Python module together with a single step to implement. Apply ONLY the
requested change. Return the complete updated module — no explanations
outside the code block.
"""


def validate_syntax_ast(code: str) -> tuple[bool, str]:
    """Use ast.parse to check whether *code* is syntactically valid Python.

    Parameters
    ----------
    code : str
        Python source to check.

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when the code parses, otherwise
        ``(False, <error message>)``.
    """
    try:
        ast.parse(code)
        return True, ""
    except SyntaxError as e:
        return False, str(e)


def validate_syntax(code: str) -> tuple[bool, str]:
    """Write *code* to a temp file and run py_compile to check its syntax.

    Unlike ``validate_syntax_ast`` this exercises the real byte-compilation
    path — the same check ``python -m py_compile`` performs.

    Parameters
    ----------
    code : str
        Python source to check.

    Returns
    -------
    tuple[bool, str]
        ``(True, "")`` when the code compiles, otherwise
        ``(False, <error message>)``.
    """
    tmp = Path("_tmp_validate.py")
    try:
        tmp.write_text(code, encoding="utf-8")
        # doraise=True turns compilation problems into PyCompileError instead
        # of printing them to stderr and returning None.
        py_compile.compile(str(tmp), doraise=True)
        return True, ""
    except py_compile.PyCompileError as e:
        return False, str(e)
    finally:
        # Always remove the scratch file, even when compilation fails.
        tmp.unlink(missing_ok=True)


current_code = code_to_fix  # start with the original buggy code

for step in plan["steps"]:
    step_id = step["step_id"]
    print_separator(f"Executing Step {step_id} – {step['title']}")

    prompt_step = f"""\
{current_code}

Step {step_id}: {step['title']}
Action: {step['action']}
Reasoning: {step['reasoning']}

Apply ONLY this single step to the current code above.
Do not skip ahead to other steps.
Mark your change with a comment: # Step {step_id} – {step['title']}
Return the complete updated Python module.
Do not include any explanation outside the code.
"""

    messages_step = [
        {"role": "system", "content": system_exec},
        {"role": "user", "content": prompt_step},
    ]
    print_messages(messages_step)

    raw_response = chat(client, messages_step, temperature=0.2, max_tokens=4096)
    step_code = strip_code_fences(raw_response)

    # ── Validate: syntax check before moving on ──
    ok, error_msg = validate_syntax(step_code)
    if ok:
        print(f"  [PASS] Step {step_id} – syntax OK")
        current_code = step_code
    else:
        print(f"  [FAIL] Step {step_id} – syntax error:\n{error_msg}")
        print("  Retrying with error feedback...")

        # Give the model one chance to fix its own syntax error
        retry_prompt = f"""\
The code you returned has a syntax error:

{error_msg}

{step_code}

Fix the syntax error and return the complete corrected module.
Do not include any explanation outside the code.
"""
        messages_retry = [
            {"role": "system", "content": system_exec},
            {"role": "user", "content": retry_prompt},
        ]
        print_messages(messages_retry)

        retry_response = chat(client, messages_retry, temperature=0.1, max_tokens=4096)
        retry_code = strip_code_fences(retry_response)

        ok2, error_msg2 = validate_syntax(retry_code)
        if ok2:
            print(f"  [PASS] Step {step_id} – retry syntax OK")
            current_code = retry_code
        else:
            print(f"  [FAIL] Step {step_id} – retry still has errors: {error_msg2}")
            print("  Continuing with last valid code.")

    print(f"\n--- Code after Step {step_id} ---")
    print(current_code)
    print()

# Save final result
Path("analyze_me_fixed.py").write_text(current_code, encoding="utf-8")
print("\nSaved iterative CoT result to analyze_me_fixed.py")

# Final validation: run the module
print_separator("Final Validation – Running analyze_me_fixed.py")
try:
    result = subprocess.run(
        [sys.executable, "analyze_me_fixed.py"],
        capture_output=True,
        text=True,
        timeout=60,  # don't hang the exercise if the generated module loops forever
    )
    print("STDOUT:", result.stdout)
    if result.stderr:
        print("STDERR:", result.stderr)
    print(f"Exit code: {result.returncode}")
except subprocess.TimeoutExpired:
    print("[FAIL] analyze_me_fixed.py did not finish within 60 seconds.")

# ── Part C: Baseline – Direct Prompt Without CoT ─────────────────────────────
print_separator("Part C – Baseline: Direct Prompt Without CoT")

direct_prompt = f"""\
{PROBLEM}

{code_to_fix}

Rewrite the module so that it satisfies all requirements in the problem
statement above. Return only the corrected Python code.
"""
messages_direct = [{"role": "user", "content": direct_prompt}]
print_messages(messages_direct)

direct_response = chat(client, messages_direct, temperature=0.3, max_tokens=4096)
print(direct_response)

Path("analyze_me_direct.py").write_text(
    strip_code_fences(direct_response), encoding="utf-8"
)
print("\nSaved direct-prompt result to analyze_me_direct.py")

print(
    "\nCompare analyze_me_fixed.py (CoT) with analyze_me_direct.py (direct).\n"
    "Which is more complete? Which follows the requirements more closely?"
)

# ── Reflection Questions ──────────────────────────────────────────────────────
print_separator("Reflection Questions")
print(
    "1. How did the iterative CoT output differ from the direct single-shot?\n"
    "2. Did the validation step catch any syntax errors? How were they fixed?\n"
    "3. What would happen if you gave the model a deliberately wrong plan?\n"
    "4. How does this manual CoT pipeline relate to built-in thinking modes\n"
    "   in models like o1, DeepSeek-R1, and Qwen3 with think mode enabled?\n"
    "5. What are the trade-offs of step-by-step iteration vs. one-shot?\n"
    "   (Think: latency, cost, error isolation, debuggability)\n"
    "6. How could you extend the validation step beyond syntax checking?\n"
    "   (Hint: unit tests, type checking, linting)\n"
)