fund_rfid_data/score_baseline.py
Florian Herzog 1993658fb2 Add SEC fund prospectus -> RDF triple dataset pipeline
Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.

- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
  all books per trust), samples (per-fund segmentation, marker + plain
  serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 10:31:35 +02:00

156 lines
5.6 KiB
Python

#!/usr/bin/env python3
"""
Baseline scorer for the SEC prospectus -> RDF triple PoC.
Compares a system's predicted triples against the N-CEN gold graph and reports
triple-level precision / recall / F1 (the metric used in the thesis).
Two intended uses:
1. NON-MODEL baseline -- the gold itself is from N-CEN; this script also runs a
trivial string-match baseline that scans the prospectus prose for each gold
entity label and emits the edge if the object's name appears in the text.
This gives a "no-LLM" lower bound: how many edges are even surfaceable by
naive string matching, and how the prose supports each relation type.
2. STRONG-MODEL baseline -- point --pred at a JSONL file of model predictions
(one JSON object per line: {"cik": ..., "triples": [{"s": ..., "p": ..., "o": ...}, ...]})
to score GPT-4 / Opus extractions against the same gold.
Triple matching is on (subject_type, predicate, normalized_object_label) so that
IRI slug differences do not cause false negatives; object org names are normalized
(lowercased, legal suffixes stripped) before comparison.
Usage:
python score_baseline.py stringmatch # no-model baseline over the PoC samples
python score_baseline.py model --pred preds.jsonl
"""
import argparse
import json
import re
from collections import defaultdict
from pathlib import Path
# Input files of the PoC. Both are relative paths, so they resolve against the
# current working directory when opened.
_POC_DIR = Path("data") / "rdf_poc"
SAMPLES = _POC_DIR / "samples.jsonl"  # JSONL of samples; each line carries a "cik"
GOLD = _POC_DIR / "gold_graphs.jsonl"  # JSONL of gold graphs; each line carries "cik" and "entities"
# Legal-form suffixes ("LLC", "Inc", "N.A", ...) plus the words "trust" and
# "the"; every whole-word occurrence is blanked out of a name by norm().
_SUFFIX = re.compile(
    r"\b(llc|l\.l\.c|inc|inc\.|incorporated|corp|corporation|company|co|co\.|"
    r"ltd|limited|lp|l\.p|llp|na|n\.a|trust|the)\b",
    re.I,
)


def norm(name: str) -> str:
    """Return a canonical form of *name* for label comparison.

    The name is lower-cased, whole-word matches of ``_SUFFIX`` are replaced by
    a space, every run of characters outside ``a-z0-9`` becomes a single
    space, and the result is whitespace-collapsed and stripped. A falsy
    *name* (``None`` or ``""``) yields the empty string.
    """
    lowered = name.lower() if name else ""
    without_suffixes = _SUFFIX.sub(" ", lowered)
    alnum_only = re.sub(r"[^a-z0-9]+", " ", without_suffixes)
    collapsed = re.sub(r"\s+", " ", alnum_only)
    return collapsed.strip()
def triple_key(t, entities):
    """Build the comparison key of triple *t*.

    The key is ``(subject type, predicate, normalized object label)``.
    The subject type is looked up in *entities* under ``t["s"]`` and falls
    back to ``""`` when the subject or its "type" is missing. The object label
    is looked up under ``t["o"]`` and falls back to the raw ``t["o"]`` value
    when the object or its "label" is missing; it is then passed through
    ``norm``.
    """
    subject_type = entities.get(t["s"], {}).get("type", "")
    object_entry = entities.get(t["o"], {})
    object_label = object_entry.get("label", t["o"])
    predicate = t["p"]
    return (subject_type, predicate, norm(object_label))
def score(gold_triples, pred_triples, entities):
    """Compute triple-level precision, recall and F1 of predictions vs. gold.

    Both triple lists are reduced to sets of ``triple_key`` values first, so
    duplicates (after normalization) count once.

    Returns:
        ``(tp, n_pred, n_gold, precision, recall, f1)`` where the counts refer
        to the de-duplicated key sets. Precision is 0.0 when there are no
        predicted keys, recall is 0.0 when there are no gold keys, and F1 is
        0.0 when precision + recall is zero.
    """
    gold_keys = set()
    for triple in gold_triples:
        gold_keys.add(triple_key(triple, entities))
    pred_keys = set()
    for triple in pred_triples:
        pred_keys.add(triple_key(triple, entities))

    n_hit = len(gold_keys.intersection(pred_keys))
    n_pred = len(pred_keys)
    n_gold = len(gold_keys)

    precision = n_hit / n_pred if n_pred else 0.0
    recall = n_hit / n_gold if n_gold else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return n_hit, n_pred, n_gold, precision, recall, f1
def stringmatch_predict(sample):
    """No-model baseline: emit a gold edge iff the object label appears in the prose.

    Args:
        sample: dict with "input_text" (the prose), "target_triples" (the gold
            triples, each with an "o" key) and an entity map under
            "_entities". If "_entities" is absent or None, the sample's own
            "entities" map is used instead, and an empty map if that is
            missing too.

    Returns:
        The triples of ``sample["target_triples"]``, in their original order,
        whose object has a label that is non-empty after ``norm`` and occurs
        as a substring of the normalized prose.
    """
    text_norm = norm(sample["input_text"])

    # Previously the fallback map was computed but never used, and the loop
    # indexed sample["_entities"] directly, raising KeyError without it.
    ents = sample.get("_entities")
    if ents is None:
        ents = sample.get("entities") or {}

    preds = []
    for t in sample["target_triples"]:
        # Normalize once; an unknown object or a label made only of stripped
        # suffix words normalizes to "" and is skipped.
        label_norm = norm(ents.get(t["o"], {}).get("label", ""))
        if label_norm and label_norm in text_norm:
            preds.append(t)
    return preds
def load_samples_with_entities(samples_path=None, gold_path=None):
    """Load the samples and attach each one's gold entity map.

    Args:
        samples_path: JSONL file of samples, one object with a "cik" per line.
            Defaults to the module-level ``SAMPLES`` path.
        gold_path: JSONL file of gold graphs, one object with a "cik" per
            line. Defaults to the module-level ``GOLD`` path.

    Returns:
        The list of sample dicts in file order. Each gets an "_entities" key
        holding the "entities" map of the gold record with the same "cik",
        or ``{}`` when there is no such record or it has no "entities".
        If several gold records share a "cik", the last one wins.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no "cik".
    """
    samples_path = SAMPLES if samples_path is None else samples_path
    gold_path = GOLD if gold_path is None else gold_path

    gold = {}
    # Context managers close the files deterministically; the bare open()
    # calls used before left both handles to the garbage collector.
    with open(gold_path, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            record = json.loads(line)  # parse each line once, not twice
            gold[record["cik"]] = record

    out = []
    with open(samples_path, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            sample = json.loads(line)
            sample["_entities"] = gold.get(sample["cik"], {}).get("entities", {})
            out.append(sample)
    return out
def run_stringmatch():
    """Run the no-model string-match baseline and print its report.

    Prints one precision/recall/F1 row per sample, then the micro-averaged
    totals over all samples, then the per-relation recall table sorted by
    descending number of gold edges.
    """
    TP, NPRED, NGOLD = range(3)  # slots of each per-relation counter list
    rule = "-" * 76

    samples = load_samples_with_entities()
    by_relation = defaultdict(lambda: [0, 0, 0])
    totals = [0, 0, 0]  # running [tp, pred, gold] over all samples

    print(f"{'CIK':<12}{'trust':<34}{'P':>6}{'R':>6}{'F1':>6} (string-match, no model)")
    print(rule)
    for s in samples:
        entities = s["_entities"]
        gold_edges = s["target_triples"]
        predicted_edges = stringmatch_predict(s)

        *counts, prec, rec, f1 = score(gold_edges, predicted_edges, entities)
        totals = [running + new for running, new in zip(totals, counts)]

        # Per-relation tallies count raw triples, not de-duplicated keys.
        for edge in gold_edges:
            by_relation[edge["p"]][NGOLD] += 1
        gold_keys = {triple_key(edge, entities) for edge in gold_edges}
        for edge in predicted_edges:
            slots = by_relation[edge["p"]]
            slots[NPRED] += 1
            if triple_key(edge, entities) in gold_keys:
                slots[TP] += 1

        print(f"{s['cik']:<12}{s['trust_name'][:32]:<34}{prec:>6.2f}{rec:>6.2f}{f1:>6.2f}")

    tp, np_, ng = totals
    P = tp / np_ if np_ else 0
    R = tp / ng if ng else 0
    F = 2 * P * R / (P + R) if (P + R) else 0
    print(rule)
    print(f"MICRO over {len(samples)} samples: P={P:.3f} R={R:.3f} F1={F:.3f} (tp={tp}, pred={np_}, gold={ng})")

    print("\nPer-relation recall of the no-model string-match baseline:")
    print(f" {'relation':<16}{'recall':>8}{'gold':>8} (how prose-grounded each edge type is)")
    ranked = sorted(by_relation.items(), key=lambda item: -item[1][NGOLD])
    for p, (hits, _npred, ngd) in ranked:
        r = hits / ngd if ngd else 0
        print(f" {p:<16}{r:>8.2f}{ngd:>8}")
def run_model(pred_path):
    """Score a JSONL file of model predictions against gold and print micro P/R/F1.

    Args:
        pred_path: path to a JSONL file. Each non-blank line is an object with
            a "cik" and an optional "triples" list; a missing or null
            "triples" counts as no predictions.

    Predictions are keyed by "cik"; if several lines share a "cik", the last
    one wins. Every sample is scored against the prediction list of its own
    "cik" (an empty list if there is none), so samples that share a "cik" are
    all scored against the same predictions.

    Raises:
        OSError: if *pred_path* cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a prediction record has no "cik".
    """
    samples = load_samples_with_entities()

    preds = {}
    # The context manager closes the file; the bare open() used before never did.
    with open(pred_path, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            record = json.loads(line)
            # `or []` also covers an explicit `"triples": null`.
            preds[record["cik"]] = record.get("triples") or []

    tp = np_ = ng = 0
    for s in samples:
        hits, n_pred, n_gold, *_ = score(
            s["target_triples"], preds.get(s["cik"], []), s["_entities"]
        )
        tp += hits
        np_ += n_pred
        ng += n_gold

    P = tp / np_ if np_ else 0
    R = tp / ng if ng else 0
    F = 2 * P * R / (P + R) if (P + R) else 0
    print(f"MODEL baseline: P={P:.3f} R={R:.3f} F1={F:.3f} (tp={tp}, pred={np_}, gold={ng})")
def main():
    """Parse the command line and run the requested baseline.

    Sub-commands:
        stringmatch          run the no-model string-match baseline
        model --pred PATH    score the model predictions in PATH

    Without a sub-command the help text is printed.
    """
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd")
    commands.add_parser("stringmatch")
    model_cmd = commands.add_parser("model")
    model_cmd.add_argument("--pred", required=True)

    args = parser.parse_args()
    if args.cmd == "stringmatch":
        run_stringmatch()
        return
    if args.cmd == "model":
        run_model(args.pred)
        return
    parser.print_help()


if __name__ == "__main__":
    main()