fund_rfid_data/build_rdf_dataset.py
Florian Herzog 1993658fb2 Add SEC fund prospectus -> RDF triple dataset pipeline
Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.

- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
  all books per trust), samples (per-fund segmentation, marker + plain
  serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 10:31:35 +02:00

738 lines
32 KiB
Python

#!/usr/bin/env python3
"""
SEC Fund Prospectus -> RDF Triple dataset builder (proof of concept).
Produces text->triples samples where:
- INPUT = the prospectus prose for a fund family (long natural-language text),
- OUTPUT = a graph of entity->entity RDF triples (NOT flat literal attributes),
- GOLD = N-CEN structured filings (service providers) + the prospectus structure.
The graph edges are genuine relationships between entities:
Fund seriesOf Trust
Fund advisedBy InvestmentAdviser
Fund subAdvisedBy SubAdviser
Fund transferAgent TransferAgent
Fund custodian Custodian
Fund administrator Administrator
Trust underwrittenBy Distributor
Fund tracksIndex Index (flag-derived; index name from prose)
Gold for advisedBy/transferAgent/custodian/administrator/underwrittenBy comes
directly from N-CEN (no model). The prospectus prose is fetched from EDGAR and
serves as the model input.
Stages:
gold -- build the per-trust gold graph (one per registrant) from the local N-CEN flat files
fetch -- download prospectus prose (485BPOS/485APOS, 497/497K as fallback) for the selected N-CEN registrants
samples -- join prose + gold into text->triple training samples
split -- partition the samples into train/val/test by trust (CIK)
all -- run gold, fetch, samples, split in order
Usage:
python build_rdf_dataset.py gold --ncen data/ncen/2025q3
python build_rdf_dataset.py fetch --limit 25
python build_rdf_dataset.py samples
python build_rdf_dataset.py all --limit 25
"""
import argparse
import csv
import gzip
import json
import logging
import re
import sys
import time
import urllib.request
from collections import defaultdict
from pathlib import Path
# Module-wide logger; handlers and level are configured in main().
log = logging.getLogger("rdf")

# Headers attached to every request issued by _get().
HEADERS = {
    "User-Agent": "FundDataResearch/1.0 research@university.edu",
    "Accept-Encoding": "gzip, deflate",
}
EDGAR_RATE = 0.15  # seconds between SEC requests (well under 10/s limit)
_last = 0.0  # time.time() of the previous request; maintained by _get()

# Output locations, all rooted at OUT_DIR.
OUT_DIR = Path("data/rdf_poc")
GOLD_PATH = OUT_DIR.joinpath("gold_graphs.jsonl")
PROSE_DIR = OUT_DIR.joinpath("prose")
SAMPLES_PATH = OUT_DIR.joinpath("samples.jsonl")

# RDF marker tokens from the thesis (Section 5.2)
T_START = "<triple_start>"
T_PRED = "<predicate_marker>"
T_OBJ = "<object_marker>"
T_END = "<triple_end>"
# --------------------------------------------------------------------------- #
# helpers
# --------------------------------------------------------------------------- #
def _get(url: str, timeout: int = 120) -> bytes:
    """Download *url* with the shared SEC headers and return the body bytes.

    Calls are throttled through the module-level ``_last`` timestamp so that
    consecutive requests are at least ``EDGAR_RATE`` seconds apart. The body
    is decompressed when the server answers with gzip (by header or by magic
    bytes) or deflate, both of which HEADERS advertises.

    Args:
        url: absolute URL to fetch.
        timeout: socket timeout in seconds, passed to ``urlopen``.

    Returns:
        The (decompressed) response body.

    Raises:
        Whatever ``urllib.request.urlopen`` / ``read`` raise; nothing is
        swallowed here.
    """
    import zlib  # local import: only needed for the rare deflate response

    global _last
    wait = EDGAR_RATE - (time.time() - _last)
    if wait > 0:
        time.sleep(wait)
    req = urllib.request.Request(url, headers=HEADERS)
    try:
        # The context manager closes the connection even when read() fails.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = resp.read()
            encoding = (resp.headers.get("Content-Encoding") or "").strip().lower()
    finally:
        # Failed attempts count as requests too, so an error burst stays throttled.
        _last = time.time()
    if encoding == "gzip" or data[:2] == b"\x1f\x8b":
        data = gzip.decompress(data)
    elif encoding == "deflate":
        try:
            data = zlib.decompress(data)  # zlib-wrapped deflate
        except zlib.error:
            data = zlib.decompress(data, -zlib.MAX_WBITS)  # raw deflate stream
    return data
def _read_tsv(path: Path) -> list[dict]:
if not path.exists():
log.warning("missing %s", path)
return []
# csv.field_size_limit because some SEC fields are huge
csv.field_size_limit(min(sys.maxsize, 2**31 - 1))
with open(path, "r", encoding="utf-8", errors="replace", newline="") as f:
return list(csv.DictReader(f, delimiter="\t"))
def _slug(s: str) -> str:
"""Make an IRI-safe local name from an entity string."""
s = re.sub(r"[^A-Za-z0-9]+", "_", (s or "").strip())
return s.strip("_") or "x"
def html_to_text(raw: str) -> str:
    """Reduce an HTML document to plain text.

    Steps: drop <script>/<style> blocks, replace every remaining tag with a
    space, decode character references, then normalise whitespace.

    Character references are decoded (``&amp;`` -> ``&``, ``&#8217;`` and
    ``&#x2019;`` -> the actual character) instead of being blanked, so names
    such as "Growth & Income Fund" survive intact for later name matching.
    Paragraph breaks are kept as one blank line; longer runs of blank lines
    are collapsed to a single blank line.

    Args:
        raw: HTML (or already plain) text.

    Returns:
        The stripped plain-text rendering.
    """
    from html import unescape  # local import keeps the block self-contained

    raw = re.sub(r"(?is)<script.*?</script>", " ", raw)
    raw = re.sub(r"(?is)<style.*?</style>", " ", raw)
    txt = re.sub(r"(?s)<[^>]+>", " ", raw)
    # Decode only after tag stripping so that "&lt;b&gt;" stays literal text.
    txt = unescape(txt)
    txt = txt.replace("\r\n", "\n").replace("\r", "\n")
    # Any horizontal whitespace (incl. the NBSP produced by &nbsp;) -> one space.
    txt = re.sub(r"[^\S\n]+", " ", txt)
    # Trim the space around each newline but keep the newlines themselves;
    # using \s here would swallow blank lines and erase paragraph breaks.
    txt = re.sub(r" ?\n ?", "\n", txt)
    return re.sub(r"\n{3,}", "\n\n", txt).strip()
# --------------------------------------------------------------------------- #
# stage: gold -- build per-registrant (trust) gold graph from N-CEN
# --------------------------------------------------------------------------- #
def build_gold(ncen_dir: Path, custodian_scope: str = "primary"):
    """Build one gold graph per N-CEN filing (trust) and write them to GOLD_PATH.

    Reads REGISTRANT, FUND_REPORTED_INFO, ADVISER, TRANSFER_AGENT, CUSTODIAN,
    ADMIN and PRINCIPAL_UNDERWRITER ``.tsv`` files from *ncen_dir*, writes one
    JSON line per filing that yields at least one triple, and prints an
    edge-type histogram plus totals.

    Args:
        ncen_dir: directory containing the N-CEN flat files.
        custodian_scope: "primary" keeps only custodian rows whose
            IS_SUB_CUSTODIAN is not "Y"; "none" drops the custodian relation;
            any other value (e.g. "all") keeps every custodian row.
    """
    d = Path(ncen_dir)
    registrant = {r["ACCESSION_NUMBER"]: r for r in _read_tsv(d / "REGISTRANT.tsv")}

    # FUND_REPORTED_INFO rows grouped by the filing they belong to.
    funds_by_acc = defaultdict(list)
    for r in _read_tsv(d / "FUND_REPORTED_INFO.tsv"):
        funds_by_acc[r["ACCESSION_NUMBER"]].append(r)

    # provider tables keyed by FUND_ID
    def by_fund(fname, name_col, lei_col):
        out = defaultdict(list)
        for r in _read_tsv(d / fname):
            fid = r.get("FUND_ID", "")
            nm = (r.get(name_col, "") or "").strip()
            if fid and nm:
                out[fid].append({"name": nm, "lei": (r.get(lei_col, "") or "").strip(),
                                 "affiliated": r.get("IS_AFFILIATED", ""),
                                 "type": r.get("ADVISER_TYPE", ""),
                                 "is_sub_custodian": r.get("IS_SUB_CUSTODIAN", "")})
        return out

    advisers = by_fund("ADVISER.tsv", "ADVISER_NAME", "ADVISER_LEI")
    tagents = by_fund("TRANSFER_AGENT.tsv", "TRANSFERAGENT_NAME", "TRANSFERAGENT_LEI")
    custodians = by_fund("CUSTODIAN.tsv", "CUSTODIAN_NAME", "CUSTODIAN_LEI")
    admins = by_fund("ADMIN.tsv", "ADMIN_NAME", "ADMIN_LEI")

    # Custodian scoping. Foreign sub-custodians (IS_SUB_CUSTODIAN=Y, ~88% of rows)
    # appear ONLY in N-CEN, never in the prospectus prose, and dominate the edge
    # count (66% of all edges). They are therefore unextractable noise for a
    # text->triples task. Default keeps the PRIMARY custodian only.
    #   "primary" -> only IS_SUB_CUSTODIAN != Y (one prose-grounded edge per fund)
    #   "all"     -> every custodian row (legacy behaviour)
    #   "none"    -> drop the custodian relation entirely
    if custodian_scope == "primary":
        custodians = {fid: [c for c in cs
                            if (c.get("is_sub_custodian", "") or "").upper() != "Y"]
                      for fid, cs in custodians.items()}
    elif custodian_scope == "none":
        custodians = {}

    # underwriter (distributor) keyed by ACCESSION (trust level)
    underwriters = defaultdict(list)
    for r in _read_tsv(d / "PRINCIPAL_UNDERWRITER.tsv"):
        acc = r.get("ACCESSION_NUMBER", "")
        nm = (r.get("UNDERWRITER_NAME", "") or "").strip()
        if acc and nm:
            underwriters[acc].append({"name": nm,
                                      "lei": (r.get("UNDERWRITER_LEI", "") or "").strip()})

    # Single-role provider tables as (table, entity type, predicate), in the
    # order their edges are emitted for each fund (advisers are handled first,
    # separately, because they split into adviser / sub-adviser).
    provider_roles = (
        (tagents, "TransferAgent", "transferAgent"),
        (custodians, "Custodian", "custodian"),
        (admins, "Administrator", "administrator"),
    )

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    n_graphs = 0
    n_edges = 0
    hist = defaultdict(int)  # predicate -> count, tallied while writing
    with open(GOLD_PATH, "w", encoding="utf-8") as out:
        for acc, reg in registrant.items():
            cik = (reg.get("CIK", "") or "").strip().zfill(10)
            trust_name = (reg.get("REGISTRANT_NAME", "") or "").strip()
            if not trust_name:
                continue
            trust_iri = "trust:" + _slug(trust_name)
            triples = []
            entities = {trust_iri: {"type": "Trust", "label": trust_name,
                                    "lei": (reg.get("LEI", "") or "").strip()}}
            # NOTE(review): every `entities[iri] = ...` below overwrites an
            # earlier entry for the same IRI, so an organisation serving several
            # roles keeps only the type of the role written last.

            # trust-level distributor edges
            for u in underwriters.get(acc, []):
                d_iri = "org:" + _slug(u["name"])
                entities[d_iri] = {"type": "Distributor", "label": u["name"], "lei": u["lei"]}
                triples.append((trust_iri, "underwrittenBy", d_iri))

            for fr in funds_by_acc.get(acc, []):
                fid = fr["FUND_ID"]
                fname = (fr.get("FUND_NAME", "") or "").strip()
                if not fname:
                    continue
                f_iri = "fund:" + _slug(fname)
                entities[f_iri] = {"type": "Fund", "label": fname,
                                   "series_id": (fr.get("SERIES_ID", "") or "").strip(),
                                   "lei": (fr.get("LEI", "") or "").strip()}
                triples.append((f_iri, "seriesOf", trust_iri))
                if (fr.get("IS_INDEX", "") or "").upper() == "Y":
                    entities[f_iri]["is_index"] = True
                if (fr.get("IS_ETF", "") or "").upper() == "Y":
                    entities[f_iri]["is_etf"] = True

                for a in advisers.get(fid, []):
                    o = "org:" + _slug(a["name"])
                    is_sub = (a.get("type", "") or "").lower().startswith("sub")
                    entities[o] = {"type": "SubAdviser" if is_sub else "InvestmentAdviser",
                                   "label": a["name"], "lei": a["lei"]}
                    triples.append((f_iri, "subAdvisedBy" if is_sub else "advisedBy", o))

                for table, etype, predicate in provider_roles:
                    for a in table.get(fid, []):
                        o = "org:" + _slug(a["name"])
                        entities[o] = {"type": etype, "label": a["name"], "lei": a["lei"]}
                        triples.append((f_iri, predicate, o))

            # dedupe triples
            triples = sorted(set(triples))
            if not triples:
                continue
            rec = {
                "accession": acc, "cik": cik,
                "trust_name": trust_name, "trust_iri": trust_iri,
                "n_funds": len(funds_by_acc.get(acc, [])),
                "entities": entities,
                "triples": [{"s": s, "p": p, "o": o} for s, p, o in triples],
            }
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")
            n_graphs += 1
            n_edges += len(triples)
            # Tally here instead of re-reading and re-parsing the file afterwards.
            for _s, p, _o in triples:
                hist[p] += 1

    log.info("gold: %d trust graphs, %d entity->entity edges -> %s",
             n_graphs, n_edges, GOLD_PATH)

    # quick edge-type histogram
    print("\nEdge-type histogram (entity->entity edges):")
    for p, c in sorted(hist.items(), key=lambda x: -x[1]):
        print(f" {p:16s} {c:>7,}")
    print(f"\nTotal: {n_graphs:,} trust graphs, {n_edges:,} edges")
# --------------------------------------------------------------------------- #
# stage: fetch -- prospectus prose for selected registrants
# --------------------------------------------------------------------------- #
# Full statutory prospectuses (cover ALL funds of a book) vs. short supplements.
# We strongly prefer the full prospectuses; 497/497K supplements are tiny
# amendments and only used as a fallback when no full prospectus is available.
# Form types treated as full statutory prospectuses (preferred source).
FULL_PROSPECTUS_FORMS = (
    "485BPOS",
    "485APOS",
)
# Supplement form types, consulted only when no full prospectus was found.
SUPPLEMENT_FORMS = (
    "497",
    "497K",
)
def _prospectus_filings(cik: str, max_filings: int):
    """Return up to *max_filings* recent prospectus filings for a CIK.

    Large fund families split their funds across SEVERAL prospectus books, so
    the single most recent 485BPOS covers only a subset of the trust's funds.
    The most recent FULL prospectuses (FULL_PROSPECTUS_FORMS) are therefore
    collected first, in feed order; SUPPLEMENT_FORMS are used only when no
    full prospectus is present in the feed.

    Args:
        cik: CIK as it appears in the submissions URL.
        max_filings: maximum number of filings to return.

    Returns:
        A list of ``(accession_without_dashes, primary_document)`` tuples.
    """
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    j = json.loads(_get(url).decode("utf-8", "replace"))
    # `or {}` / `or []` also cover keys that are present but null.
    recent = (j.get("filings") or {}).get("recent") or {}
    forms = recent.get("form") or []
    accs = recent.get("accessionNumber") or []
    docs = recent.get("primaryDocument") or []

    def collect(form_set):
        seen, out = set(), []
        # zip() stops at the shortest column, so a ragged feed cannot raise
        # IndexError the way positional indexing would.
        for fm, dashed, doc in zip(forms, accs, docs):  # SEC feed is newest-first
            if fm not in form_set or not doc:
                continue
            acc = dashed.replace("-", "")
            if acc not in seen:
                seen.add(acc)
                out.append((acc, doc))
        return out

    chosen = collect(FULL_PROSPECTUS_FORMS)[:max_filings]
    if not chosen:  # fallback: this trust filed only supplements recently
        chosen = collect(SUPPLEMENT_FORMS)[:max_filings]
    return chosen
def fetch_prose(limit: int, max_filings: int = 8):
    """Fetch recent prospectus filings per trust and concatenate them per CIK.

    Trusts are taken from GOLD_PATH, richest graph (most triples) first. For
    each one, up to *max_filings* filings are downloaded, converted to text
    and joined with a form-feed separator into ``PROSE_DIR/<cik>.txt``. A
    trust whose prose file already exists is counted but not re-downloaded.
    Filings whose text is shorter than 2000 characters are discarded.

    Args:
        limit: number of distinct trusts to cover (cached ones included).
        max_filings: maximum filings to download and concatenate per trust.
    """
    if not GOLD_PATH.exists():
        log.error("run `gold` first")
        return
    PROSE_DIR.mkdir(parents=True, exist_ok=True)
    with open(GOLD_PATH, encoding="utf-8") as fh:
        rows = [json.loads(line) for line in fh if line.strip()]
    # prefer trusts with the most edges (richer graphs) for the PoC slice
    rows.sort(key=lambda r: -len(r["triples"]))
    done = 0
    tot_filings = 0
    seen_ciks = set()
    for r in rows:
        if done >= limit:
            break
        cik = r["cik"]
        # One CIK can appear on several gold lines (one per filing); handle it
        # once so it neither counts twice toward `limit` nor is fetched twice.
        if cik in seen_ciks:
            continue
        seen_ciks.add(cik)
        out = PROSE_DIR / f"{cik}.txt"
        if out.exists():
            done += 1
            continue
        try:
            filings = _prospectus_filings(cik, max_filings)
            if not filings:
                log.info("no prospectus filing for CIK %s (%s)", cik, r["trust_name"])
                continue
            parts = []
            for acc, doc in filings:
                # Best effort per filing: one broken document must not lose the trust.
                try:
                    url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc}/{doc}"
                    raw = _get(url).decode("utf-8", "replace")
                    txt = html_to_text(raw)
                    if len(txt) >= 2000:
                        parts.append(txt)
                except Exception as e:
                    log.debug(" filing %s failed: %s", acc, e)
            if not parts:
                log.info("no usable prospectus text for CIK %s, skipping", cik)
                continue
            combined = ("\n\n\f\n\n").join(parts)  # form-feed separates books
            out.write_text(combined, encoding="utf-8")
            tot_filings += len(parts)
            log.info("[%d/%d] %s %d filings prose=%d chars %s",
                     done + 1, limit, cik, len(parts), len(combined),
                     r["trust_name"][:40])
            done += 1
        except Exception as e:
            log.warning("fetch failed for CIK %s: %s", cik, e)
    log.info("fetched %d filings across %d trusts -> %s",
             tot_filings, done, PROSE_DIR)
# --------------------------------------------------------------------------- #
# stage: samples -- join prose + gold into text->triple samples
# --------------------------------------------------------------------------- #
def serialize_triples(triples, entities) -> str:
    """Serialize triples in the marker format, one line per subject.

    Each line is ``T_START <subject>`` followed by one
    ``T_PRED <predicate> T_OBJ <object>`` group per edge and a closing
    ``T_END``. Subjects and objects are shown by their entity label, falling
    back to the IRI when the entity (or its label) is unknown. Subjects appear
    in first-seen order.
    """
    def label_of(iri):
        return entities.get(iri, {}).get("label", iri)

    grouped = {}
    for triple in triples:
        grouped.setdefault(triple["s"], []).append((triple["p"], triple["o"]))

    lines = []
    for subject, pred_objs in grouped.items():
        tokens = [f"{T_START} {label_of(subject)}"]
        tokens.extend(
            f"{T_PRED} {pred} {T_OBJ} {label_of(obj)}" for pred, obj in pred_objs
        )
        tokens.append(T_END)
        lines.append(" ".join(tokens))
    return "\n".join(lines)
def serialize_triples_plain(triples, entities) -> str:
    """Serialize triples in a plain Turtle-like form without special tokens.

    One statement per subject: ``<subject> <pred> <obj> , <obj> ; <pred> ... .``
    Objects sharing a predicate are joined with " , " and predicate groups
    with " ; ". Labels replace IRIs where the entity has one. Subjects,
    predicates and objects all keep first-seen order.
    """
    def label_of(iri):
        return entities.get(iri, {}).get("label", iri)

    by_subject = {}
    for triple in triples:
        pred_map = by_subject.setdefault(triple["s"], {})
        pred_map.setdefault(triple["p"], []).append(label_of(triple["o"]))

    statements = []
    for subject, pred_map in by_subject.items():
        clauses = [f"{pred} " + " , ".join(objs) for pred, objs in pred_map.items()]
        statements.append(f"{label_of(subject)} " + " ; ".join(clauses) + " .")
    return "\n".join(statements)
def ontology_schema(triples, entities) -> dict:
    """Infer the meta-schema ``subject type -> predicate -> [object types]``.

    Types come from each entity's "type" field; an IRI that is missing from
    *entities* (or has no type) is reported as "Thing". Object-type lists are
    sorted (per thesis 5.3).
    """
    def type_of(iri):
        return entities.get(iri, {}).get("type", "Thing")

    collected = {}
    for triple in triples:
        subj_type = type_of(triple["s"])
        obj_type = type_of(triple["o"])
        collected.setdefault(subj_type, {}).setdefault(triple["p"], set()).add(obj_type)

    result = {}
    for subj_type, preds in collected.items():
        result[subj_type] = {pred: sorted(obj_types) for pred, obj_types in preds.items()}
    return result
# --------------------------------------------------------------------------- #
# per-fund segmentation
# --------------------------------------------------------------------------- #
def _name_variants(fund_name: str):
"""Generate prospectus-heading variants of an N-CEN fund name.
N-CEN names and prospectus headings often differ in the legal-form suffix
(Fund vs ETF vs Portfolio) and in spacing/punctuation, so we match on a set
of normalized variants rather than the exact string.
"""
base = fund_name.strip()
stems = {base}
# swap the trailing legal-form word
for suf in (" Fund", " ETF", " Portfolio", " Trust"):
if base.endswith(suf):
root = base[: -len(suf)]
for alt in (" Fund", " ETF", " Portfolio", ""):
stems.add(root + alt)
stems.add(root)
break
# also the bare root with no suffix
return {s.strip() for s in stems if len(s.strip()) >= 6}
# A fund section opens with the fund name immediately followed by a summary
# heading. Filers use several styles right after the name, so we accept any of:
# - "Fund Summary" / "Summary Section" (mutual funds)
# - "Investment Objective" / "Principal Investment Strateg" (objective heading)
# - "The Fund seeks ..." (ETF objective sentence)
# - "Class/Ticker:" or a ticker block / "(formerly ...)" (ETF & multi-class
# summary headers, e.g. JPMorgan, Invesco ETF)
# The MIN_SEGMENT_CHARS guard below is the real safety net: any anchor that
# resolves to a collapsed (too-short) segment is discarded, so a slightly broad
# anchor set cannot reintroduce the spurious-cross-reference problem.
# Pattern for the text that must directly follow a fund-name hit for the hit
# to count as a section heading: at most five whitespace characters, then one
# of the heading phrases. re.I makes the phrases case-insensitive.
# "Principal Investment Strateg" is presumably left truncated so that it covers
# both "Strategy" and "Strategies" -- TODO confirm.
_SECTION_ANCHOR = re.compile(
r"\s{0,5}("
r"Fund Summary|Summary Section|Investment Objective|Principal Investment Strateg"
r"|The Fund seeks|Class/Ticker|Ticker(s)?:|\(formerly"
r")", re.I)
# Segments shorter than this many characters are discarded by _segment_prose.
MIN_SEGMENT_CHARS = 1500 # a real fund summary is at least this long
def _heading_positions(text: str, fund_names):
    """Return all ``(offset, fund_name)`` anchored heading candidates, sorted.

    A candidate is a position where a variant of a fund name (see
    ``_name_variants``) is immediately followed, within the next 45
    characters, by a match of ``_SECTION_ANCHOR``. A fund may have several
    candidates (its book may be concatenated more than once, or it appears in
    multiple books); all are kept so segmentation can choose the one that
    yields a non-trivial segment.

    Name matches that lie strictly inside a longer anchored name match are
    dropped. Without this, the root "Growth" of a fund called "Growth Fund"
    would also match inside the heading "Alpha Growth Fund Summary", start a
    second candidate six characters into that heading, collapse the "Alpha
    Growth Fund" segment to a few characters and hand its section to the wrong
    fund. Matches with an identical span are all kept.

    Args:
        text: the (possibly multi-book) prospectus text.
        fund_names: iterable of fund names to locate.

    Returns:
        A list of ``(offset, fund_name)`` tuples sorted by offset, then name.
    """
    spans = []  # (start, end, fund_name) for every anchored name match
    for fn in fund_names:
        for v in _name_variants(fn):
            for m in re.finditer(re.escape(v), text):
                if _SECTION_ANCHOR.match(text[m.end():m.end() + 45]):
                    spans.append((m.start(), m.end(), fn))

    # Group by start offset with the widest span first inside each group.
    spans.sort(key=lambda sp: (sp[0], -sp[1], sp[2]))

    cands = []
    reach = -1          # furthest end among spans that start strictly earlier
    group_start = None  # start offset of the group being processed
    group_end = -1      # widest end inside the current group
    for start, end, fn in spans:
        if start != group_start:
            reach = max(reach, group_end)
            group_start, group_end = start, end
        # Nested in an earlier-starting span, or shorter than a same-start span.
        if end <= reach or end < group_end:
            continue
        cands.append((start, fn))
    return cands
def _segment_prose(text: str, fund_names):
    """Cut the prose into per-fund segments at anchored section headings.

    Every heading candidate opens a segment that runs up to the next candidate
    (of any fund), or to the end of the text for the last one. Segments
    shorter than MIN_SEGMENT_CHARS are ignored; among the remaining segments
    of a fund, the longest one wins (the earliest on ties).

    Returns:
        ``{fund_name: segment_text}`` for every fund with a usable section;
        an empty dict when no heading candidate exists.
    """
    headings = _heading_positions(text, fund_names)
    if not headings:
        return {}
    # Segment i ends where heading i+1 starts; the final one ends at EOF.
    stops = [offset for offset, _ in headings[1:]] + [len(text)]
    chosen = {}
    for (start, fund), stop in zip(headings, stops):
        segment = text[start:stop]
        if len(segment) < MIN_SEGMENT_CHARS:
            continue
        current = chosen.get(fund)
        if current is None or len(segment) > len(current):
            chosen[fund] = segment
    return chosen
def build_samples(per_fund: bool = True):
    """Join prose and gold graphs into text->triple samples at SAMPLES_PATH.

    With ``per_fund=True`` the work is delegated to
    ``_build_samples_per_fund``. Otherwise one sample per trust is written:
    the whole prose file as input and the whole trust graph as target. These
    whole-trust records carry the same keys as the whole-trust fallback
    record of the per-fund builder (``sample_id``, ``fund``, ``segmented``,
    ``target_serialized_plain``), so both files can be read by the same code.

    Args:
        per_fund: segment the prose per fund (default) or emit one sample per
            trust.
    """
    if not GOLD_PATH.exists():
        log.error("run `gold` first")
        return
    if per_fund:
        return _build_samples_per_fund()

    # Parse every gold line once; a later line for the same CIK replaces an earlier one.
    gold = {}
    with open(GOLD_PATH, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            rec = json.loads(line)
            gold[rec["cik"]] = rec

    n = 0
    tot_text = tot_json = 0
    with open(SAMPLES_PATH, "w", encoding="utf-8") as out:
        for prose_file in sorted(PROSE_DIR.glob("*.txt")):
            cik = prose_file.stem
            g = gold.get(cik)
            if not g:
                continue
            text = prose_file.read_text(encoding="utf-8")
            triples, ents = g["triples"], g["entities"]
            target = serialize_triples(triples, ents)
            schema = ontology_schema(triples, ents)
            sample = {
                "sample_id": f"{cik}:ALL",
                "cik": cik,
                "trust_name": g["trust_name"],
                "fund": None,
                "segmented": False,
                "input_text": text,
                "ontology": schema,
                "target_triples": triples,
                "target_serialized": target,
                "target_serialized_plain": serialize_triples_plain(triples, ents),
                "stats": {
                    "input_chars": len(text),
                    "target_chars": len(target),
                    "n_triples": len(triples),
                    "n_entities": len(ents),
                    "text_to_json_ratio": round(len(text) / max(1, len(target)), 1),
                },
            }
            out.write(json.dumps(sample, ensure_ascii=False) + "\n")
            n += 1
            tot_text += len(text)
            tot_json += len(target)
    log.info("wrote %d samples -> %s", n, SAMPLES_PATH)
    if n:
        print(f"\n{n} samples")
        print(f" mean input : {tot_text//n:>8,} chars")
        print(f" mean target : {tot_json//n:>8,} chars")
        print(f" mean ratio : {tot_text/max(1,tot_json):>8.1f} : 1 (text : json)")
def _build_samples_per_fund():
    """One sample per fund: the fund's prospectus section -> the fund's subgraph.

    For each trust, the prose is segmented into per-fund sections. Each fund's
    target is its own edges (advisedBy, custodian, ...) including its seriesOf
    edge, plus the trust-level edges such as underwrittenBy (shared, but a
    true fact about that fund's trust). Funds whose section cannot be located
    are skipped and show up in the printed coverage figure; trusts where no
    section at all can be located fall back to a single whole-trust sample so
    no data is silently dropped.
    """
    import statistics as _st

    # Parse every gold line once; a later line for the same CIK replaces an earlier one.
    gold = {}
    with open(GOLD_PATH, encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue
            parsed = json.loads(line)
            gold[parsed["cik"]] = parsed

    def make_record(head: dict, text: str, triples: list, ents: dict) -> dict:
        """Complete *head* (identity keys) with the fields shared by all samples."""
        target = serialize_triples(triples, ents)
        return {
            **head,
            "input_text": text, "ontology": ontology_schema(triples, ents),
            "target_triples": triples,
            "target_serialized": target,
            "target_serialized_plain": serialize_triples_plain(triples, ents),
            "stats": {"input_chars": len(text), "target_chars": len(target),
                      "n_triples": len(triples), "n_entities": len(ents),
                      "text_to_json_ratio": round(len(text) / max(1, len(target)), 1)},
        }

    n = 0
    n_funds_total = n_funds_located = n_fallback_trusts = 0
    tot_text = tot_json = 0
    ratios = []
    with open(SAMPLES_PATH, "w", encoding="utf-8") as out:

        def emit(rec: dict) -> None:
            """Write one record and update the running statistics."""
            nonlocal n, tot_text, tot_json
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")
            n += 1
            tot_text += rec["stats"]["input_chars"]
            tot_json += rec["stats"]["target_chars"]
            ratios.append(rec["stats"]["text_to_json_ratio"])

        for prose_file in sorted(PROSE_DIR.glob("*.txt")):
            cik = prose_file.stem
            g = gold.get(cik)
            if not g:
                continue
            text = prose_file.read_text(encoding="utf-8")
            ents = g["entities"]
            # group triples by subject fund IRI; collect trust-level edges
            fund_iris = {e_iri for e_iri, e in ents.items() if e["type"] == "Fund"}
            trust_iri = g["trust_iri"]
            by_fund = defaultdict(list)
            trust_edges = []
            for t in g["triples"]:
                if t["s"] in fund_iris:
                    by_fund[t["s"]].append(t)
                elif t["s"] == trust_iri:
                    trust_edges.append(t)  # e.g. underwrittenBy
            fund_label = {iri: ents[iri]["label"] for iri in fund_iris}
            n_funds_total += len(fund_iris)
            segs = _segment_prose(text, list(fund_label.values()))
            label_to_iri = {v: k for k, v in fund_label.items()}

            if not segs:  # whole-trust fallback (no section located)
                n_fallback_trusts += 1
                head = {"sample_id": f"{cik}:ALL", "cik": cik,
                        "trust_name": g["trust_name"],
                        "fund": None, "segmented": False}
                emit(make_record(head, text, g["triples"], ents))
                continue

            for label, seg in segs.items():
                f_iri = label_to_iri.get(label)
                if not f_iri:
                    continue
                triples = list(by_fund.get(f_iri, [])) + list(trust_edges)
                if not triples:
                    continue
                n_funds_located += 1
                # restrict entities to those referenced by this fund's triples
                ref = {trust_iri}
                for t in triples:
                    ref.add(t["s"])
                    ref.add(t["o"])
                sub_ents = {k: ents[k] for k in ref if k in ents}
                head = {
                    "sample_id": f"{cik}:{ents[f_iri].get('series_id') or _slug(label)}",
                    "cik": cik, "trust_name": g["trust_name"],
                    "fund": label, "series_id": ents[f_iri].get("series_id", ""),
                    "segmented": True,
                }
                emit(make_record(head, seg, triples, sub_ents))

    log.info("wrote %d per-fund samples -> %s", n, SAMPLES_PATH)
    if n:
        cov = n_funds_located / max(1, n_funds_total)
        print(f"\n{n} samples ({n_funds_located}/{n_funds_total} funds located, "
              f"coverage {cov:.0%}; {n_fallback_trusts} trusts fell back to whole-doc)")
        print(f" mean input : {tot_text//n:>8,} chars")
        print(f" mean target : {tot_json//n:>8,} chars")
        print(f" median ratio: {_st.median(ratios):>8.1f} : 1 (text : json)")
        print(f" mean ratio : {tot_text/max(1,tot_json):>8.1f} : 1")
# --------------------------------------------------------------------------- #
# stage: split -- trust-level train/val/test split
# --------------------------------------------------------------------------- #
def _bucket(cik: str) -> float:
"""Deterministic [0,1) value per trust CIK (stable, no RNG state)."""
import hashlib
h = hashlib.sha256(cik.encode()).hexdigest()
return int(h[:8], 16) / 0xFFFFFFFF
def build_split(val_frac: float = 0.10, test_frac: float = 0.10):
    """Split samples.jsonl into train/val/test JSONL, partitioned by TRUST (CIK).

    Splitting by trust (not by fund) prevents leakage: two funds of the same
    trust share advisers, distributors and custodians, so allowing them into
    different splits would let the model memorise trust-specific entities. The
    assignment is a deterministic hash of the CIK, so the split is reproducible
    and stable as new samples are added.

    Args:
        val_frac: share of the hash range assigned to the validation split.
        test_frac: share of the hash range assigned to the test split; the
            remainder goes to train.
    """
    if not SAMPLES_PATH.exists():
        log.error("run `samples` first")
        return
    # Read inside a context manager so the handle is closed deterministically.
    with open(SAMPLES_PATH, encoding="utf-8") as fh:
        rows = [json.loads(line) for line in fh if line.strip()]

    out = {"train": [], "val": [], "test": []}
    for r in rows:
        b = _bucket(r["cik"])
        if b < test_frac:
            split = "test"
        elif b < test_frac + val_frac:
            split = "val"
        else:
            split = "train"
        out[split].append(r)

    for name, recs in out.items():
        p = OUT_DIR / f"{name}.jsonl"
        with open(p, "w", encoding="utf-8") as f:
            for r in recs:
                f.write(json.dumps(r, ensure_ascii=False) + "\n")

    cik_sets = {s: {r["cik"] for r in recs} for s, recs in out.items()}
    print("\nTrust-level split (deterministic by CIK):")
    for s in ("train", "val", "test"):
        print(f" {s:5s}: {len(out[s]):>5,} samples from {len(cik_sets[s]):>4} trusts")
    total = sum(len(v) for v in out.values())
    print(f" total: {total:,} samples -> {OUT_DIR}/{{train,val,test}}.jsonl")

    # leakage check
    overlap = (cik_sets["train"] & cik_sets["val"]) | \
              (cik_sets["train"] & cik_sets["test"]) | \
              (cik_sets["val"] & cik_sets["test"])
    print(f" trust overlap across splits: {len(overlap)} (should be 0)")
# --------------------------------------------------------------------------- #
def main():
    """Command-line entry point: parse the sub-command and run its stage(s).

    Sub-commands: gold, fetch, samples, split, all. With no sub-command the
    help text is printed.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd")

    gold_cmd = commands.add_parser("gold")
    gold_cmd.add_argument("--ncen", default="data/ncen/2025q3")
    gold_cmd.add_argument(
        "--custodian-scope",
        choices=["primary", "all", "none"],
        default="primary",
        help="primary=only IS_SUB_CUSTODIAN!=Y (default); all=every row; none=drop",
    )

    fetch_cmd = commands.add_parser("fetch")
    fetch_cmd.add_argument("--limit", type=int, default=25)
    fetch_cmd.add_argument(
        "--max-filings",
        type=int,
        default=8,
        help="max prospectus filings to fetch & concatenate per trust",
    )

    samples_cmd = commands.add_parser("samples")
    samples_cmd.add_argument(
        "--whole-trust",
        action="store_true",
        help="one sample per trust instead of per-fund segmentation",
    )

    split_cmd = commands.add_parser("split")
    split_cmd.add_argument("--val-frac", type=float, default=0.10)
    split_cmd.add_argument("--test-frac", type=float, default=0.10)

    all_cmd = commands.add_parser("all")
    all_cmd.add_argument("--ncen", default="data/ncen/2025q3")
    all_cmd.add_argument("--limit", type=int, default=25)
    all_cmd.add_argument("--max-filings", type=int, default=8)
    all_cmd.add_argument(
        "--custodian-scope", choices=["primary", "all", "none"], default="primary"
    )
    all_cmd.add_argument("--whole-trust", action="store_true")
    all_cmd.add_argument("--val-frac", type=float, default=0.10)
    all_cmd.add_argument("--test-frac", type=float, default=0.10)

    opts = parser.parse_args()
    command = opts.cmd
    if command not in ("gold", "fetch", "samples", "split", "all"):
        parser.print_help()
        return

    # "all" runs every stage; any other command runs exactly its own stage.
    everything = command == "all"
    if everything or command == "gold":
        build_gold(Path(opts.ncen), custodian_scope=opts.custodian_scope)
    if everything or command == "fetch":
        fetch_prose(opts.limit, max_filings=opts.max_filings)
    if everything or command == "samples":
        build_samples(per_fund=not opts.whole_trust)
    if everything or command == "split":
        build_split(val_frac=opts.val_frac, test_frac=opts.test_frac)


if __name__ == "__main__":
    main()