Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.
- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
all books per trust), samples (per-fund segmentation, marker + plain
serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
156 lines
5.6 KiB
Python
156 lines
5.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Baseline scorer for the SEC prospectus -> RDF triple PoC.
|
|
|
|
Compares a system's predicted triples against the N-CEN gold graph and reports
|
|
triple-level precision / recall / F1 (the metric used in the thesis).
|
|
|
|
Two intended uses:
|
|
1. NON-MODEL baseline -- the gold itself is from N-CEN; this script also runs a
|
|
trivial string-match baseline that scans the prospectus prose for each gold
|
|
entity label and emits the edge if the object's name appears in the text.
|
|
This gives a "no-LLM" lower bound: how many edges are even surfaceable by
|
|
naive string matching, and how the prose supports each relation type.
|
|
2. STRONG-MODEL baseline -- point --pred at a JSONL of model predictions
|
|
(one obj per line: {"cik": ..., "triples": [{"s","p","o"}, ...]}) to score
|
|
GPT-4 / Opus extractions against the same gold.
|
|
|
|
Triple matching is on (subject_type, predicate, normalized_object_label) so that
|
|
IRI slug differences do not cause false negatives; object org names are normalized
|
|
(lowercased, legal suffixes stripped) before comparison.
|
|
|
|
Usage:
|
|
python score_baseline.py stringmatch # no-model baseline over the PoC samples
|
|
python score_baseline.py model --pred preds.jsonl
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
# Input artefacts of the PoC; both paths are relative to the working directory.
SAMPLES = Path("data/rdf_poc/samples.jsonl")
GOLD = Path("data/rdf_poc/gold_graphs.jsonl")

# Tokens that norm() blanks out of organisation names before comparison.
# They are matched case-insensitively as whole words anywhere in the string,
# not only at its end.
_SUFFIX_TOKENS = (
    r"llc", r"l\.l\.c", r"inc", r"inc\.", r"incorporated", r"corp",
    r"corporation", r"company", r"co", r"co\.",
    r"ltd", r"limited", r"lp", r"l\.p", r"llp", r"na", r"n\.a", r"trust",
    r"the",
)
_SUFFIX = re.compile(r"\b(" + "|".join(_SUFFIX_TOKENS) + r")\b", re.I)
|
|
|
|
|
|
def norm(name: str) -> str:
    """Return the canonical comparison form of an organisation name.

    The value is lowercased, every match of ``_SUFFIX`` is replaced by a
    space, every run of characters outside ``a-z0-9`` is replaced by a space,
    whitespace runs are collapsed, and the ends are trimmed. A falsy *name*
    (``None`` or ``""``) yields ``""``.
    """
    text = (name or "").lower()
    # Applied in this order: suffix tokens, non-alphanumerics, whitespace runs.
    for pattern in (_SUFFIX, r"[^a-z0-9]+", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()
|
|
|
|
|
|
def triple_key(t, entities):
    """Build the match key (subject_type, predicate, normalized object label).

    Args:
        t: a triple dict with ``"s"``, ``"p"`` and ``"o"`` keys.
        entities: mapping from entity id to a dict that may carry ``"type"``
            and ``"label"``.

    Returns:
        A 3-tuple. The subject type is ``""`` when the subject is not in
        *entities* or has no ``"type"``; the object is normalized from its
        entity label, or from the raw ``t["o"]`` value when no label is known.
    """
    subject_info = entities.get(t["s"], {})
    subject_type = subject_info.get("type", "")
    object_info = entities.get(t["o"], {})
    # Objects without a known label are compared by their raw value.
    object_label = object_info.get("label", t["o"])
    return subject_type, t["p"], norm(object_label)
|
|
|
|
|
|
def score(gold_triples, pred_triples, entities):
    """Compute triple-level precision / recall / F1 for one sample.

    Both triple lists are reduced to sets of ``triple_key`` values, so
    triples that share a key count once.

    Returns:
        ``(tp, n_pred, n_gold, precision, recall, f1)`` where the counts refer
        to distinct keys. Any ratio whose denominator is zero is ``0.0``.
    """

    def keys_of(triples):
        # Distinct match keys of a list of triples.
        return set(triple_key(triple, entities) for triple in triples)

    gold_keys = keys_of(gold_triples)
    pred_keys = keys_of(pred_triples)

    n_gold = len(gold_keys)
    n_pred = len(pred_keys)
    hits = len(gold_keys.intersection(pred_keys))

    precision = hits / n_pred if n_pred else 0.0
    recall = hits / n_gold if n_gold else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    return hits, n_pred, n_gold, precision, recall, f1
|
|
|
|
|
|
def stringmatch_predict(sample):
    """No-model baseline: emit a gold edge iff its object label appears in the prose.

    Args:
        sample: one sample dict. Reads ``"input_text"`` (the prose),
            ``"target_triples"`` (gold triples with an ``"o"`` key) and the
            entity map, taken from ``"_entities"`` (attached by
            ``load_samples_with_entities``) or, failing that, ``"entities"``.

    Returns:
        The triples of ``target_triples``, in their original order, whose
        object has a non-empty normalized label that occurs in the normalized
        prose as a whole-token sequence. Triples whose object has no label
        are never emitted.
    """
    # samples.jsonl doesn't carry entities itself; the loader attaches the
    # gold map under "_entities". Fall back gracefully when it is absent.
    entities = sample.get("_entities") or sample.get("entities") or {}

    # norm() returns single-space-separated alphanumeric tokens, so padding
    # both sides with a space restricts hits to whole tokens: "ubs" must not
    # be found inside "clubs".
    haystack = f" {norm(sample['input_text'])} "

    preds = []
    for t in sample["target_triples"]:
        label = norm(entities.get(t["o"], {}).get("label", ""))
        if label and f" {label} " in haystack:
            preds.append(t)
    return preds
|
|
|
|
|
|
def load_samples_with_entities():
    """Load the PoC samples and attach each one's gold entity map.

    Reads ``GOLD`` and ``SAMPLES`` as UTF-8 JSON Lines files; blank lines are
    skipped. Gold records are indexed by ``"cik"`` (a later record with the
    same ``cik`` replaces an earlier one).

    Returns:
        The sample dicts in file order, each with an added ``"_entities"``
        key holding the ``"entities"`` mapping of the gold record that has
        the same ``"cik"``, or ``{}`` when there is no such record or it has
        no ``"entities"``.
    """
    gold = {}
    with open(GOLD, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            record = json.loads(line)  # parse each line once
            gold[record["cik"]] = record

    out = []
    with open(SAMPLES, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            s = json.loads(line)
            s["_entities"] = gold.get(s["cik"], {}).get("entities", {})
            out.append(s)
    return out
|
|
|
|
|
|
def run_stringmatch():
    """Run the no-model string-match baseline and print its report.

    Prints one precision / recall / F1 row per sample, then the micro-average
    over all samples, then the per-relation recall sorted by descending gold
    count. Per-relation counts are taken over the raw triple lists, whereas
    the per-sample and micro figures come from ``score`` (distinct keys).
    """
    samples = load_samples_with_entities()
    per_relation = defaultdict(lambda: [0, 0, 0])  # predicate -> [tp, npred, ngold]
    total_tp = total_pred = total_gold = 0
    rule = "-" * 76

    print(f"{'CIK':<12}{'trust':<34}{'P':>6}{'R':>6}{'F1':>6} (string-match, no model)")
    print(rule)

    for sample in samples:
        entities = sample["_entities"]
        gold_triples = sample["target_triples"]
        predicted = stringmatch_predict(sample)

        tp, n_pred, n_gold, prec, rec, f1 = score(gold_triples, predicted, entities)
        total_tp += tp
        total_pred += n_pred
        total_gold += n_gold

        for triple in gold_triples:
            per_relation[triple["p"]][2] += 1
        gold_keys = {triple_key(triple, entities) for triple in gold_triples}
        for triple in predicted:
            counts = per_relation[triple["p"]]
            counts[1] += 1
            if triple_key(triple, entities) in gold_keys:
                counts[0] += 1

        cik = sample['cik']
        trust = sample['trust_name'][:32]
        print(f"{cik:<12}{trust:<34}{prec:>6.2f}{rec:>6.2f}{f1:>6.2f}")

    micro_p = total_tp / total_pred if total_pred else 0
    micro_r = total_tp / total_gold if total_gold else 0
    if micro_p + micro_r:
        micro_f = 2 * micro_p * micro_r / (micro_p + micro_r)
    else:
        micro_f = 0

    print(rule)
    print(f"MICRO over {len(samples)} samples: P={micro_p:.3f} R={micro_r:.3f} F1={micro_f:.3f} (tp={total_tp}, pred={total_pred}, gold={total_gold})")

    print("\nPer-relation recall of the no-model string-match baseline:")
    print(f" {'relation':<16}{'recall':>8}{'gold':>8} (how prose-grounded each edge type is)")
    # Most frequent relations first; ties keep first-seen order.
    ranked = sorted(per_relation.items(), key=lambda item: -item[1][2])
    for relation, (hits, _n_pred, n_gold) in ranked:
        recall = hits / n_gold if n_gold else 0
        print(f" {relation:<16}{recall:>8.2f}{n_gold:>8}")
|
|
|
|
|
|
def run_model(pred_path):
    """Score a JSONL file of model predictions against the gold triples.

    Args:
        pred_path: path to a UTF-8 JSON Lines file with one object per line,
            ``{"cik": ..., "triples": [{"s": ..., "p": ..., "o": ...}, ...]}``.
            Blank lines are skipped; a missing or null ``"triples"`` counts as
            no predictions. If a ``cik`` occurs on several lines, the last
            line wins.

    Prints the micro-averaged precision / recall / F1 over all samples. A
    sample whose ``cik`` has no prediction line is scored against an empty
    prediction list.
    """
    samples = load_samples_with_entities()

    preds = {}
    with open(pred_path, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            record = json.loads(line)
            preds[record["cik"]] = record.get("triples") or []

    # NOTE(review): predictions are joined to samples on the raw "cik" value,
    # so a str/int mismatch between the two files silently scores zero.
    tp = np_ = ng = 0
    for s in samples:
        s_tp, s_np, s_ng, *_ = score(
            s["target_triples"], preds.get(s["cik"], []), s["_entities"]
        )
        tp += s_tp
        np_ += s_np
        ng += s_ng

    P = tp / np_ if np_ else 0
    R = tp / ng if ng else 0
    F = 2 * P * R / (P + R) if (P + R) else 0
    print(f"MODEL baseline: P={P:.3f} R={R:.3f} F1={F:.3f} (tp={tp}, pred={np_}, gold={ng})")
|
|
|
|
|
|
def main():
    """Parse the command line and run the selected baseline.

    Sub-commands: ``stringmatch`` (no arguments) and ``model`` (requires
    ``--pred``). With no sub-command the parser's help text is printed.
    """
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd")
    commands.add_parser("stringmatch")
    model_parser = commands.add_parser("model")
    model_parser.add_argument("--pred", required=True)
    args = parser.parse_args()

    if args.cmd == "stringmatch":
        run_stringmatch()
        return
    if args.cmd == "model":
        run_model(args.pred)
        return
    # No sub-command was given.
    parser.print_help()


if __name__ == "__main__":
    main()
|