fund_rfid_data/score_baseline.py

#!/usr/bin/env python3
"""
Baseline scorer for the SEC prospectus -> RDF triple PoC.

Compares a system's predicted triples against the N-CEN gold graph and reports
triple-level precision / recall / F1 (the metric used in the thesis).

Two intended uses:
  1. NON-MODEL baseline -- the gold itself is from N-CEN; this script also runs a
     trivial string-match baseline that scans the prospectus prose for each gold
     entity label and emits the edge if the object's name appears in the text.
     This gives a "no-LLM" lower bound: how many edges are even surfaceable by
     naive string matching, and how the prose supports each relation type.
  2. STRONG-MODEL baseline -- point --pred at a JSONL of model predictions
     (one obj per line: {"cik": ..., "triples": [{"s","p","o"}, ...]}) to score
     GPT-4 / Opus extractions against the same gold.

Triple matching is on (subject_type, predicate, normalized_object_label) so that
IRI slug differences do not cause false negatives; object org names are normalized
(lowercased, legal suffixes stripped) before comparison.

Usage:
  python score_baseline.py stringmatch            # no-model baseline over the PoC samples
  python score_baseline.py model --pred preds.jsonl
"""

import argparse
import json
import re
from collections import defaultdict
from pathlib import Path

# Input files, given relative to the current working directory.
# SAMPLES: JSONL of PoC samples (one JSON object per line); each record is
# looked up by its "cik" and carries the "target_triples" that get scored.
SAMPLES = Path("data/rdf_poc/samples.jsonl")
# GOLD: JSONL of gold graphs, also keyed by "cik"; its "entities" map is what
# load_samples_with_entities() attaches to each sample as "_entities".
GOLD = Path("data/rdf_poc/gold_graphs.jsonl")

# Whole-word tokens dropped from names before comparison: legal-form suffixes
# plus the generic words "trust" and "the". Matching is case-insensitive.
_SUFFIX = re.compile(
    r"\b(llc|l\.l\.c|inc|inc\.|incorporated|corp|corporation|company|co|co\.|"
    r"ltd|limited|lp|l\.p|llp|na|n\.a|trust|the)\b",
    flags=re.IGNORECASE,
)


def norm(name: str) -> str:
    """Return a canonical comparison form of an organisation name or text.

    The value is lowercased, every token matched by ``_SUFFIX`` is removed,
    each run of characters outside ``a-z0-9`` becomes a separator, and the
    remaining tokens are joined with single spaces. A falsy ``name``
    (``None``, ``""``) yields ``""``.
    """
    lowered = name.lower() if name else ""
    # Suffixes are stripped before punctuation so dotted forms such as
    # "l.l.c" and "n.a" are still recognisable by the pattern.
    without_suffixes = _SUFFIX.sub(" ", lowered)
    alnum_only = re.sub(r"[^a-z0-9]+", " ", without_suffixes)
    # split()/join collapses the separators and trims both ends in one step.
    return " ".join(alnum_only.split())


def triple_key(t, entities):
    """Build the comparison key for one triple.

    Args:
        t: Mapping with ``"s"``, ``"p"`` and ``"o"`` entries.
        entities: Mapping from node id to a record that may carry ``"type"``
            and ``"label"`` entries.

    Returns:
        ``(subject_type, predicate, normalized_object_label)``. A subject
        that is missing from ``entities`` contributes ``""`` as its type; an
        object that is missing (or has no label) falls back to the raw
        ``t["o"]`` value, which is then normalized like any other label.
    """
    subject_record = entities.get(t["s"], {})
    object_record = entities.get(t["o"], {})
    object_label = object_record.get("label", t["o"])
    return (subject_record.get("type", ""), t["p"], norm(object_label))


def score(gold_triples, pred_triples, entities):
    """Compute triple-level precision / recall / F1 for one sample.

    Both triple lists are reduced to sets of ``triple_key`` values, so
    duplicates (after normalization) count once.

    Returns:
        ``(tp, n_pred, n_gold, precision, recall, f1)`` where the counts are
        over distinct keys and each ratio is ``0.0`` when its denominator is
        zero.
    """
    def keys_of(triples):
        return {triple_key(triple, entities) for triple in triples}

    gold_keys = keys_of(gold_triples)
    pred_keys = keys_of(pred_triples)
    n_gold = len(gold_keys)
    n_pred = len(pred_keys)
    true_pos = len(gold_keys.intersection(pred_keys))

    precision = true_pos / n_pred if n_pred else 0.0
    recall = true_pos / n_gold if n_gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return true_pos, n_pred, n_gold, precision, recall, f1


def stringmatch_predict(sample):
    """No-model baseline: emit a gold edge iff the object label appears in the prose.

    Args:
        sample: Mapping with ``"input_text"`` (the prose), ``"target_triples"``
            (gold triples with ``"s"``/``"p"``/``"o"``) and an entity map
            under ``"_entities"`` (as attached by
            ``load_samples_with_entities``) or, failing that, ``"entities"``.

    Returns:
        The sub-list of ``sample["target_triples"]``, in original order, whose
        object label occurs in the normalized prose. Because every prediction
        is itself a gold triple, only recall is informative for this baseline.
    """
    # Prefer the map attached by the loader; tolerate samples that carry
    # their own "entities" or none at all instead of raising KeyError.
    entities = sample.get("_entities") or sample.get("entities") or {}

    # norm() emits single-space-separated tokens, so padding both sides with
    # a space restricts hits to whole-token runs. A raw substring test would
    # let a short label such as "ubs" match inside "clubs".
    haystack = f" {norm(sample['input_text'])} "

    found = {}  # object id -> whether its label occurs in the prose
    preds = []
    for t in sample["target_triples"]:
        obj = t["o"]
        if obj not in found:
            needle = norm(entities.get(obj, {}).get("label", ""))
            # A label that is empty, or that normalizes to nothing (only
            # suffix words), can never count as a match.
            found[obj] = bool(needle) and f" {needle} " in haystack
        if found[obj]:
            preds.append(t)
    return preds


def _iter_jsonl(path):
    """Yield one parsed JSON object per non-blank line of the file at ``path``."""
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            if line.strip():
                yield json.loads(line)


def load_samples_with_entities(samples_path=None, gold_path=None):
    """Load the PoC samples and attach each one's gold entity map.

    Args:
        samples_path: JSONL file of samples; defaults to the module-level
            ``SAMPLES`` path.
        gold_path: JSONL file of gold graphs; defaults to the module-level
            ``GOLD`` path.

    Returns:
        The sample records in file order. Each gains an ``"_entities"`` key
        holding the ``"entities"`` map of the gold record with the same
        ``"cik"``, or ``{}`` when there is no such record or it has no
        entities.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``"cik"``.
    """
    samples_path = SAMPLES if samples_path is None else samples_path
    gold_path = GOLD if gold_path is None else gold_path

    # When a cik appears more than once in the gold file, the last record wins.
    gold_by_cik = {record["cik"]: record for record in _iter_jsonl(gold_path)}

    out = []
    for sample in _iter_jsonl(samples_path):
        gold_record = gold_by_cik.get(sample["cik"], {})
        # "or {}" also covers an explicit null, so callers can always .get().
        sample["_entities"] = gold_record.get("entities") or {}
        out.append(sample)
    return out


def run_stringmatch():
    """Run the no-model string-match baseline and print its report.

    Prints one precision / recall / F1 row per sample, then the micro-averaged
    totals over all samples, then recall per predicate (ordered by descending
    gold count).
    """
    samples = load_samples_with_entities()
    # predicate -> [true positives, predicted count, gold count]
    per_relation = defaultdict(lambda: [0, 0, 0])
    total_tp = 0
    total_pred = 0
    total_gold = 0
    rule = "-" * 76

    print(f"{'CIK':<12}{'trust':<34}{'P':>6}{'R':>6}{'F1':>6}  (string-match, no model)")
    print(rule)
    for sample in samples:
        entities = sample["_entities"]
        gold_triples = sample["target_triples"]
        predicted = stringmatch_predict(sample)

        tp, n_pred, n_gold, prec, rec, f1 = score(gold_triples, predicted, entities)
        total_tp += tp
        total_pred += n_pred
        total_gold += n_gold

        # Per-relation tallies count individual triples, not distinct keys.
        for triple in gold_triples:
            per_relation[triple["p"]][2] += 1
        gold_keys = {triple_key(triple, entities) for triple in gold_triples}
        for triple in predicted:
            per_relation[triple["p"]][1] += 1
            if triple_key(triple, entities) in gold_keys:
                per_relation[triple["p"]][0] += 1

        print(f"{sample['cik']:<12}{sample['trust_name'][:32]:<34}{prec:>6.2f}{rec:>6.2f}{f1:>6.2f}")

    micro_p = total_tp / total_pred if total_pred else 0
    micro_r = total_tp / total_gold if total_gold else 0
    micro_f = 2 * micro_p * micro_r / (micro_p + micro_r) if (micro_p + micro_r) else 0
    print(rule)
    print(f"MICRO over {len(samples)} samples:  P={micro_p:.3f}  R={micro_r:.3f}  F1={micro_f:.3f}  (tp={total_tp}, pred={total_pred}, gold={total_gold})")

    print("\nPer-relation recall of the no-model string-match baseline:")
    print(f"  {'relation':<16}{'recall':>8}{'gold':>8}  (how prose-grounded each edge type is)")
    by_gold_count_desc = sorted(per_relation.items(), key=lambda item: -item[1][2])
    for predicate, counts in by_gold_count_desc:
        hits, gold_count = counts[0], counts[2]
        recall = hits / gold_count if gold_count else 0
        print(f"  {predicate:<16}{recall:>8.2f}{gold_count:>8}")


def run_model(pred_path):
    """Score a JSONL file of model predictions and print micro P / R / F1.

    Args:
        pred_path: Path to a JSONL file with one record per line, each shaped
            like ``{"cik": ..., "triples": [{"s": ..., "p": ..., "o": ...}]}``.
            Blank lines are skipped; a missing or null ``"triples"`` counts as
            no predictions. If a cik appears more than once, the last record
            wins.

    Samples without a prediction record are scored against an empty
    prediction list. Prediction records whose cik matches no sample cannot be
    scored; they are reported on stderr, since a cik type or format mismatch
    between the two files would otherwise silently produce all-zero scores.
    """
    import sys  # local import: only needed for the stderr warning below

    samples = load_samples_with_entities()

    preds = {}
    with open(pred_path, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            record = json.loads(line)
            preds[record["cik"]] = record.get("triples") or []

    total_tp = 0
    total_pred = 0
    total_gold = 0
    for sample in samples:
        tp, n_pred, n_gold, *_ = score(
            sample["target_triples"], preds.get(sample["cik"], []), sample["_entities"]
        )
        total_tp += tp
        total_pred += n_pred
        total_gold += n_gold

    unmatched = preds.keys() - {sample["cik"] for sample in samples}
    if unmatched:
        print(
            f"warning: {len(unmatched)} prediction record(s) have a cik not "
            "present in the samples and were ignored",
            file=sys.stderr,
        )

    P = total_tp / total_pred if total_pred else 0
    R = total_tp / total_gold if total_gold else 0
    F = 2 * P * R / (P + R) if (P + R) else 0
    print(f"MODEL baseline:  P={P:.3f}  R={R:.3f}  F1={F:.3f}  (tp={total_tp}, pred={total_pred}, gold={total_gold})")


def main():
    """Parse the command line and dispatch to the selected baseline.

    Sub-commands:
        stringmatch: run the no-model string-match baseline.
        model --pred PATH: score the predictions stored at PATH.

    With no sub-command, the help text is printed.
    """
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd")
    commands.add_parser("stringmatch")
    model_cmd = commands.add_parser("model")
    model_cmd.add_argument("--pred", required=True)
    args = parser.parse_args()

    if args.cmd == "stringmatch":
        run_stringmatch()
        return
    if args.cmd == "model":
        run_model(args.pred)
        return
    parser.print_help()


if __name__ == "__main__":
    main()