#!/usr/bin/env python3
"""
Baseline scorer for the SEC prospectus -> RDF triple PoC.

Compares a system's predicted triples against the N-CEN gold graph and reports
triple-level precision / recall / F1 (the metric used in the thesis).

Two intended uses:

1. NON-MODEL baseline -- the gold itself is from N-CEN; this script also runs a
   trivial string-match baseline that scans the prospectus prose for each gold
   entity label and emits the edge if the object's name appears in the text.
   This gives a "no-LLM" lower bound: how many edges are even surfaceable by
   naive string matching, and how the prose supports each relation type.

2. STRONG-MODEL baseline -- point --pred at a JSONL of model predictions
   (one obj per line: {"cik": ..., "triples": [{"s","p","o"}, ...]}) to score
   GPT-4 / Opus extractions against the same gold.

Triple matching is on (subject_type, predicate, normalized_object_label) so
that IRI slug differences do not cause false negatives; object org names are
normalized (lowercased, legal suffixes stripped) before comparison.

Usage:
    python score_baseline.py stringmatch   # no-model baseline over the PoC samples
    python score_baseline.py model --pred preds.jsonl
"""
import argparse
import json
import re
from collections import defaultdict
from pathlib import Path

SAMPLES = Path("data/rdf_poc/samples.jsonl")
GOLD = Path("data/rdf_poc/gold_graphs.jsonl")

# Legal-form suffixes (and the article "the") removed from names before matching.
_SUFFIX = re.compile(
    r"\b(llc|l\.l\.c|inc|inc\.|incorporated|corp|corporation|company|co|co\.|"
    r"ltd|limited|lp|l\.p|llp|na|n\.a|trust|the)\b", re.I)


def norm(name: str) -> str:
    """Return *name* lowercased, with legal suffixes and punctuation removed.

    ``None`` or an empty string normalizes to ``""``. Runs of non-alphanumeric
    characters collapse to a single space and the result is stripped.
    """
    lowered = (name or "").lower()
    without_suffix = _SUFFIX.sub(" ", lowered)
    alnum_only = re.sub(r"[^a-z0-9]+", " ", without_suffix)
    return re.sub(r"\s+", " ", alnum_only).strip()


def triple_key(t, entities):
    """Match key: (subject_type, predicate, normalized object label).

    The subject type and object label are looked up in *entities* by the
    triple's ``"s"`` / ``"o"`` identifiers. An unknown subject yields an empty
    type; an unknown object falls back to the raw ``t["o"]`` value as its label.
    """
    s_type = entities.get(t["s"], {}).get("type", "")
    o_label = entities.get(t["o"], {}).get("label", t["o"])
    return (s_type, t["p"], norm(o_label))


def _prf(tp, n_pred, n_gold):
    """Return (precision, recall, F1) as floats; an empty denominator gives 0.0."""
    prec = tp / n_pred if n_pred else 0.0
    rec = tp / n_gold if n_gold else 0.0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0
    return prec, rec, f1


def score(gold_triples, pred_triples, entities):
    """Score predictions against gold on de-duplicated match keys.

    Returns:
        ``(tp, n_pred, n_gold, precision, recall, f1)`` where the counts are
        sizes of the *sets* of match keys, so duplicate triples count once.
    """
    gold_keys = {triple_key(t, entities) for t in gold_triples}
    pred_keys = {triple_key(t, entities) for t in pred_triples}
    tp = len(gold_keys & pred_keys)
    prec, rec, f1 = _prf(tp, len(pred_keys), len(gold_keys))
    return tp, len(pred_keys), len(gold_keys), prec, rec, f1


def stringmatch_predict(sample):
    """No-model baseline: emit a gold edge iff the object label appears in the prose.

    Entity labels are read from ``sample["_entities"]`` (attached by
    ``load_samples_with_entities``), falling back to ``sample["entities"]`` and
    then to an empty map, so a sample without attached entities yields no
    predictions instead of raising ``KeyError``.

    The check is a plain substring test on normalized text, so a label can
    also match inside a longer word.
    """
    text_norm = norm(sample["input_text"])
    entities = sample.get("_entities") or sample.get("entities") or {}
    preds = []
    for t in sample["target_triples"]:
        label_norm = norm(entities.get(t["o"], {}).get("label", ""))
        # Labels that normalize to "" (missing, or suffix-only) never match.
        if label_norm and label_norm in text_norm:
            preds.append(t)
    return preds


def _read_jsonl(path):
    """Yield one parsed JSON object per non-blank line of *path*."""
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            if line.strip():
                yield json.loads(line)


def load_samples_with_entities():
    """Load the PoC samples and attach each one's gold entity map.

    Reads ``GOLD`` and ``SAMPLES`` (JSONL), joins them on ``"cik"``, and stores
    the gold record's ``"entities"`` dict on each sample under ``"_entities"``
    (an empty dict when the CIK has no gold record). If a CIK occurs more than
    once in the gold file, the last record wins.
    """
    gold = {rec["cik"]: rec for rec in _read_jsonl(GOLD)}
    samples = []
    for sample in _read_jsonl(SAMPLES):
        sample["_entities"] = gold.get(sample["cik"], {}).get("entities", {})
        samples.append(sample)
    return samples


def run_stringmatch():
    """Run the string-match baseline and print per-sample, micro and per-relation results."""
    samples = load_samples_with_entities()
    agg = defaultdict(lambda: [0, 0, 0])  # predicate -> [tp, npred, ngold]
    micro = [0, 0, 0]  # [tp, npred, ngold] summed over samples
    print(f"{'CIK':<12}{'trust':<34}{'P':>6}{'R':>6}{'F1':>6}   (string-match, no model)")
    print("-" * 76)
    for s in samples:
        ents = s["_entities"]
        gold_t = s["target_triples"]
        pred_t = stringmatch_predict(s)
        tp, np_, ng, prec, rec, f1 = score(gold_t, pred_t, ents)
        micro[0] += tp
        micro[1] += np_
        micro[2] += ng
        # Per-relation tallies count raw triples, not de-duplicated keys.
        for t in gold_t:
            agg[t["p"]][2] += 1
        gold_keys = {triple_key(t, ents) for t in gold_t}
        for t in pred_t:
            agg[t["p"]][1] += 1
            if triple_key(t, ents) in gold_keys:
                agg[t["p"]][0] += 1
        trust = (s.get("trust_name") or "")[:32]
        print(f"{s['cik']:<12}{trust:<34}{prec:>6.2f}{rec:>6.2f}{f1:>6.2f}")
    tp, np_, ng = micro
    P, R, F = _prf(tp, np_, ng)
    print("-" * 76)
    print(f"MICRO over {len(samples)} samples:  P={P:.3f}  R={R:.3f}  F1={F:.3f}  (tp={tp}, pred={np_}, gold={ng})")
    print("\nPer-relation recall of the no-model string-match baseline:")
    print(f"  {'relation':<16}{'recall':>8}{'gold':>8}   (how prose-grounded each edge type is)")
    # Most frequent gold relations first.
    for p, (t, npd, ngd) in sorted(agg.items(), key=lambda x: -x[1][2]):
        r = t / ngd if ngd else 0
        print(f"  {p:<16}{r:>8.2f}{ngd:>8}")


def run_model(pred_path):
    """Score a JSONL file of model predictions and print micro P / R / F1.

    Each non-blank line of *pred_path* is an object with a ``"cik"`` and an
    optional ``"triples"`` list. Samples without a prediction record are scored
    as having predicted nothing; if a CIK occurs more than once, the last
    record wins.
    """
    samples = load_samples_with_entities()
    preds = {}
    for rec in _read_jsonl(pred_path):
        # Treat a missing or null "triples" field as an empty prediction.
        preds[rec["cik"]] = rec.get("triples") or []
    micro = [0, 0, 0]
    for s in samples:
        ents = s["_entities"]
        tp, np_, ng, *_ = score(s["target_triples"], preds.get(s["cik"], []), ents)
        micro[0] += tp
        micro[1] += np_
        micro[2] += ng
    tp, np_, ng = micro
    P, R, F = _prf(tp, np_, ng)
    print(f"MODEL baseline:  P={P:.3f}  R={R:.3f}  F1={F:.3f}  (tp={tp}, pred={np_}, gold={ng})")


def main():
    """Parse the command line and dispatch to the selected baseline."""
    ap = argparse.ArgumentParser()
    sub = ap.add_subparsers(dest="cmd")
    sub.add_parser("stringmatch")
    m = sub.add_parser("model")
    m.add_argument("--pred", required=True)
    args = ap.parse_args()
    if args.cmd == "stringmatch":
        run_stringmatch()
    elif args.cmd == "model":
        run_model(args.pred)
    else:
        # No subcommand given.
        ap.print_help()


if __name__ == "__main__":
    main()