Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.
- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
all books per trust), samples (per-fund segmentation, marker + plain
serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
738 lines
32 KiB
Python
738 lines
32 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
SEC Fund Prospectus -> RDF Triple dataset builder (proof of concept).

Produces text->triples samples where:
  - INPUT  = the prospectus prose for a fund family (long natural-language text),
  - OUTPUT = a graph of entity->entity RDF triples (NOT flat literal attributes),
  - GOLD   = N-CEN structured filings (service providers) + the prospectus structure.

The graph edges are genuine relationships between entities:
    Fund   seriesOf        Trust
    Fund   advisedBy       InvestmentAdviser
    Fund   subAdvisedBy    SubAdviser
    Fund   transferAgent   TransferAgent
    Fund   custodian       Custodian
    Fund   administrator   Administrator
    Trust  underwrittenBy  Distributor
    Fund   tracksIndex     Index (flag-derived; index name from prose)

Gold for advisedBy/transferAgent/custodian/administrator/underwrittenBy comes
directly from N-CEN (no model). The prospectus prose is fetched from EDGAR and
serves as the model input.

Stages:
    gold    -- build the per-fund gold graph from the local N-CEN flat files
    fetch   -- download prospectus prose (485BPOS) for the selected N-CEN registrants
    samples -- join prose + gold into text->triple training samples
    split   -- partition the samples into train/val/test by trust (CIK)
    all     -- run gold, fetch, samples, split in order

Usage:
    python build_rdf_dataset.py gold --ncen data/ncen/2025q3
    python build_rdf_dataset.py fetch --limit 25
    python build_rdf_dataset.py samples
    python build_rdf_dataset.py split
    python build_rdf_dataset.py all --limit 25
|
|
"""
|
|
|
|
import argparse
|
|
import csv
|
|
import gzip
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
import time
|
|
import urllib.request
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
# Module-wide logger; main() installs the handler/format via logging.basicConfig.
log = logging.getLogger("rdf")

# Headers attached to every request issued by _get().
HEADERS = {
    "User-Agent": "FundDataResearch/1.0 research@university.edu",
    "Accept-Encoding": "gzip, deflate",
}
EDGAR_RATE = 0.15  # seconds between SEC requests (well under 10/s limit)
_last = 0.0  # time of the previous request; read and updated by _get()

# Output layout: everything lives under OUT_DIR.
OUT_DIR = Path("data/rdf_poc")
GOLD_PATH = OUT_DIR / "gold_graphs.jsonl"
PROSE_DIR = OUT_DIR / "prose"
SAMPLES_PATH = OUT_DIR / "samples.jsonl"

# RDF marker tokens from the thesis (Section 5.2)
T_START = "<triple_start>"
T_PRED = "<predicate_marker>"
T_OBJ = "<object_marker>"
T_END = "<triple_end>"
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# helpers
|
|
# --------------------------------------------------------------------------- #
|
|
def _get(url: str, timeout: int = 120) -> bytes:
    """Fetch *url* with the shared request headers and return the decoded body.

    Calls are spaced at least ``EDGAR_RATE`` seconds apart using the
    module-level ``_last`` timestamp. The body is transparently decompressed
    when the server answers with gzip or deflate, the two encodings that
    ``HEADERS`` advertises.

    Args:
        url: absolute URL to download.
        timeout: socket timeout in seconds passed to ``urlopen``.

    Returns:
        The response body as bytes, after any content decoding.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (e.g. ``URLError``,
        ``HTTPError``); callers in this file catch ``Exception``.
    """
    import zlib  # local import: only needed for deflate-encoded responses

    global _last
    # A monotonic clock keeps the spacing correct across wall-clock jumps.
    wait = EDGAR_RATE - (time.monotonic() - _last)
    if wait > 0:
        time.sleep(wait)
    req = urllib.request.Request(url, headers=HEADERS)
    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = resp.read()
            encoding = (resp.headers.get("Content-Encoding") or "").strip().lower()
    finally:
        # Failed requests count against the rate limit too.
        _last = time.monotonic()
    if encoding == "gzip" or data[:2] == b"\x1f\x8b":
        data = gzip.decompress(data)
    elif encoding == "deflate":
        try:
            data = zlib.decompress(data)  # zlib-wrapped deflate
        except zlib.error:
            data = zlib.decompress(data, -zlib.MAX_WBITS)  # raw deflate stream
    return data
|
|
|
|
|
|
def _read_tsv(path: Path) -> list[dict]:
    """Load a tab-separated file into a list of row dicts keyed by header name.

    A missing file is reported with a warning and produces an empty list.
    Bytes that are not valid UTF-8 are replaced rather than raising.
    """
    if path.exists():
        # Raise the csv per-field cap because some SEC fields are huge.
        csv.field_size_limit(min(sys.maxsize, 2**31 - 1))
        with open(path, "r", encoding="utf-8", errors="replace", newline="") as handle:
            reader = csv.DictReader(handle, delimiter="\t")
            return [row for row in reader]
    log.warning("missing %s", path)
    return []
|
|
|
|
|
|
def _slug(s: str) -> str:
    """Turn an entity string into an IRI-safe local name.

    Every run of characters outside ``[A-Za-z0-9]`` becomes a single
    underscore and leading/trailing underscores are dropped. An empty or
    missing input, or one with no alphanumerics at all, yields ``"x"``.
    """
    text = s.strip() if s else ""
    collapsed = re.sub(r"[^A-Za-z0-9]+", "_", text)
    trimmed = collapsed.strip("_")
    return trimmed if trimmed else "x"
|
|
|
|
|
|
def html_to_text(raw: str) -> str:
    """Strip markup from an HTML document and return normalized plain text.

    Steps, in order:
      1. drop ``<script>`` and ``<style>`` elements including their content;
      2. replace every remaining tag with a space;
      3. decode character references (``&amp;`` -> ``&``, ``&#8217;`` -> the
         right single quote, ``&#x26;`` -> ``&``) instead of blanking them, so
         entity names containing ``&`` or apostrophes survive intact;
      4. turn non-breaking spaces into plain spaces, collapse runs of
         spaces/tabs, and reduce every whitespace run that contains a newline
         to a single newline.

    Args:
        raw: HTML source text.

    Returns:
        The stripped text with surrounding whitespace removed.
    """
    import html  # local import: stdlib character-reference decoder

    raw = re.sub(r"(?is)<script.*?</script>", " ", raw)
    raw = re.sub(r"(?is)<style.*?</style>", " ", raw)
    txt = re.sub(r"(?s)<[^>]+>", " ", raw)
    # Decode only after tag removal so that an escaped "&lt;b&gt;" is kept as
    # literal text rather than being stripped as a tag.
    txt = html.unescape(txt)
    txt = txt.replace("\xa0", " ")  # &nbsp; decodes to U+00A0, which [ \t] misses
    txt = re.sub(r"[ \t]+", " ", txt)
    txt = re.sub(r"\s*\n\s*", "\n", txt)
    return re.sub(r"\n{3,}", "\n\n", txt).strip()
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# stage: gold -- build per-registrant (trust) gold graph from N-CEN
|
|
# --------------------------------------------------------------------------- #
|
|
def build_gold(ncen_dir: Path, custodian_scope: str = "primary"):
    """Build one gold graph per N-CEN registrant and write them to GOLD_PATH.

    Reads the N-CEN flat files in *ncen_dir* (REGISTRANT, FUND_REPORTED_INFO,
    ADVISER, TRANSFER_AGENT, CUSTODIAN, ADMIN, PRINCIPAL_UNDERWRITER), joins
    them per filing (ACCESSION_NUMBER) and per fund (FUND_ID), and emits one
    JSON line per registrant that has at least one edge. Each record carries
    the accession, zero-padded CIK, trust name/IRI, fund count, an ``entities``
    map (IRI -> type/label/lei...) and a sorted, de-duplicated ``triples`` list.
    A per-predicate histogram and totals are printed at the end.

    Args:
        ncen_dir: directory containing the N-CEN ``*.tsv`` files.
        custodian_scope: ``"primary"`` keeps only custodian rows whose
            IS_SUB_CUSTODIAN is not ``Y``; ``"all"`` keeps every custodian row;
            ``"none"`` drops the custodian relation entirely.

    Returns:
        None. Output goes to GOLD_PATH, the log and stdout.
    """
    d = Path(ncen_dir)
    registrant = {r["ACCESSION_NUMBER"]: r for r in _read_tsv(d / "REGISTRANT.tsv")}

    # FUND_REPORTED_INFO rows grouped by the filing that reported them
    funds_by_acc = defaultdict(list)
    for r in _read_tsv(d / "FUND_REPORTED_INFO.tsv"):
        funds_by_acc[r["ACCESSION_NUMBER"]].append(r)

    def by_fund(fname, name_col, lei_col):
        """Group the named rows of one provider table by FUND_ID."""
        out = defaultdict(list)
        for r in _read_tsv(d / fname):
            fid = r.get("FUND_ID", "")
            nm = (r.get(name_col, "") or "").strip()
            if fid and nm:
                out[fid].append({
                    "name": nm,
                    "lei": (r.get(lei_col, "") or "").strip(),
                    "affiliated": r.get("IS_AFFILIATED", ""),
                    "type": r.get("ADVISER_TYPE", ""),
                    "is_sub_custodian": r.get("IS_SUB_CUSTODIAN", ""),
                })
        return out

    advisers = by_fund("ADVISER.tsv", "ADVISER_NAME", "ADVISER_LEI")
    tagents = by_fund("TRANSFER_AGENT.tsv", "TRANSFERAGENT_NAME", "TRANSFERAGENT_LEI")
    custodians = by_fund("CUSTODIAN.tsv", "CUSTODIAN_NAME", "CUSTODIAN_LEI")
    admins = by_fund("ADMIN.tsv", "ADMIN_NAME", "ADMIN_LEI")

    # Custodian scoping. Per the original author's note, foreign sub-custodians
    # (IS_SUB_CUSTODIAN=Y) appear only in N-CEN and not in the prospectus prose,
    # so they are unextractable noise for a text->triples task. The default
    # keeps the PRIMARY custodian only.
    #   "primary" -> only IS_SUB_CUSTODIAN != Y
    #   "all"     -> every custodian row (legacy behaviour)
    #   "none"    -> drop the custodian relation entirely
    if custodian_scope == "primary":
        custodians = {
            fid: [c for c in cs
                  if (c.get("is_sub_custodian", "") or "").upper() != "Y"]
            for fid, cs in custodians.items()
        }
    elif custodian_scope == "none":
        custodians = {}

    # underwriter (distributor) keyed by ACCESSION (trust level)
    underwriters = defaultdict(list)
    for r in _read_tsv(d / "PRINCIPAL_UNDERWRITER.tsv"):
        acc = r.get("ACCESSION_NUMBER", "")
        nm = (r.get("UNDERWRITER_NAME", "") or "").strip()
        if acc and nm:
            underwriters[acc].append(
                {"name": nm, "lei": (r.get("UNDERWRITER_LEI", "") or "").strip()})

    # (provider table, entity type, predicate), applied in this order per fund
    # right after the adviser edges.
    simple_roles = (
        (tagents, "TransferAgent", "transferAgent"),
        (custodians, "Custodian", "custodian"),
        (admins, "Administrator", "administrator"),
    )

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    n_graphs = 0
    n_edges = 0
    hist = defaultdict(int)  # predicate -> edge count, filled while writing
    with open(GOLD_PATH, "w", encoding="utf-8") as out:
        for acc, reg in registrant.items():
            cik = (reg.get("CIK", "") or "").strip().zfill(10)
            trust_name = (reg.get("REGISTRANT_NAME", "") or "").strip()
            if not trust_name:
                continue
            trust_iri = "trust:" + _slug(trust_name)

            triples = []
            entities = {trust_iri: {"type": "Trust", "label": trust_name,
                                    "lei": (reg.get("LEI", "") or "").strip()}}

            # NOTE(review): organisations are keyed by "org:" + slug(name), so
            # one that fills several roles keeps only the type assigned last.

            # trust-level distributor edges
            for u in underwriters.get(acc, []):
                d_iri = "org:" + _slug(u["name"])
                entities[d_iri] = {"type": "Distributor", "label": u["name"],
                                   "lei": u["lei"]}
                triples.append((trust_iri, "underwrittenBy", d_iri))

            for fr in funds_by_acc.get(acc, []):
                fid = fr["FUND_ID"]
                fname = (fr.get("FUND_NAME", "") or "").strip()
                if not fname:
                    continue
                f_iri = "fund:" + _slug(fname)
                entities[f_iri] = {
                    "type": "Fund", "label": fname,
                    "series_id": (fr.get("SERIES_ID", "") or "").strip(),
                    "lei": (fr.get("LEI", "") or "").strip(),
                }
                triples.append((f_iri, "seriesOf", trust_iri))

                if (fr.get("IS_INDEX", "") or "").upper() == "Y":
                    entities[f_iri]["is_index"] = True
                if (fr.get("IS_ETF", "") or "").upper() == "Y":
                    entities[f_iri]["is_etf"] = True

                # Advisers split into adviser / sub-adviser on ADVISER_TYPE.
                for a in advisers.get(fid, []):
                    o = "org:" + _slug(a["name"])
                    is_sub = (a.get("type", "") or "").lower().startswith("sub")
                    entities[o] = {
                        "type": "SubAdviser" if is_sub else "InvestmentAdviser",
                        "label": a["name"], "lei": a["lei"],
                    }
                    triples.append(
                        (f_iri, "subAdvisedBy" if is_sub else "advisedBy", o))

                for table, ent_type, predicate in simple_roles:
                    for a in table.get(fid, []):
                        o = "org:" + _slug(a["name"])
                        entities[o] = {"type": ent_type, "label": a["name"],
                                       "lei": a["lei"]}
                        triples.append((f_iri, predicate, o))

            # dedupe triples
            triples = sorted(set(triples))
            if not triples:
                continue
            rec = {
                "accession": acc, "cik": cik,
                "trust_name": trust_name, "trust_iri": trust_iri,
                "n_funds": len(funds_by_acc.get(acc, [])),
                "entities": entities,
                "triples": [{"s": s, "p": p, "o": o} for s, p, o in triples],
            }
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")
            n_graphs += 1
            n_edges += len(triples)
            for _, p, _ in triples:
                hist[p] += 1

    log.info("gold: %d trust graphs, %d entity->entity edges -> %s",
             n_graphs, n_edges, GOLD_PATH)
    # quick edge-type histogram
    print("\nEdge-type histogram (entity->entity edges):")
    for p, c in sorted(hist.items(), key=lambda x: -x[1]):
        print(f" {p:16s} {c:>7,}")
    print(f"\nTotal: {n_graphs:,} trust graphs, {n_edges:,} edges")
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# stage: fetch -- prospectus prose for selected registrants
|
|
# --------------------------------------------------------------------------- #
|
|
# Two tiers of EDGAR form types. Full statutory prospectuses cover ALL funds
# of a book and are strongly preferred; 497/497K supplements are tiny
# amendments and serve only as a fallback when no full prospectus is available.
FULL_PROSPECTUS_FORMS = (
    "485BPOS",
    "485APOS",
)
SUPPLEMENT_FORMS = (
    "497",
    "497K",
)
|
|
|
|
|
|
def _prospectus_filings(cik: str, max_filings: int):
    """Return up to *max_filings* recent prospectus filings for a CIK.

    Large fund families split their funds across SEVERAL prospectus books, so
    the single most recent 485BPOS covers only a subset of the trust's funds.
    The most recent FULL prospectuses (FULL_PROSPECTUS_FORMS) are therefore
    collected first, in feed order, and SUPPLEMENT_FORMS are used only when no
    full prospectus exists.

    Args:
        cik: CIK string, interpolated as-is into the submissions URL (callers
            in this file pass the 10-digit zero-padded form).
        max_filings: upper bound on returned filings; values <= 0 yield [].

    Returns:
        A list of ``(accession_without_dashes, primary_document)`` tuples.
    """
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    j = json.loads(_get(url).decode("utf-8", "replace"))
    recent = j.get("filings", {}).get("recent", {})
    forms = recent.get("form", [])
    accs = recent.get("accessionNumber", [])
    docs = recent.get("primaryDocument", [])
    # A negative bound would otherwise slice from the end of the list.
    limit = max(0, max_filings)

    def collect(form_set):
        """Pick distinct filings of the given forms, in feed order, up to limit."""
        seen, out = set(), []
        # zip() stops at the shortest column, so a ragged feed cannot raise
        # IndexError the way positional indexing did.
        for fm, raw_acc, doc in zip(forms, accs, docs):  # SEC feed is newest-first
            if len(out) >= limit:
                break
            if fm in form_set and doc:
                acc = raw_acc.replace("-", "")
                if acc not in seen:
                    seen.add(acc)
                    out.append((acc, doc))
        return out

    chosen = collect(FULL_PROSPECTUS_FORMS)
    if not chosen:  # fallback: this trust filed only supplements recently
        chosen = collect(SUPPLEMENT_FORMS)
    return chosen
|
|
|
|
|
|
def fetch_prose(limit: int, max_filings: int = 8):
    """Fetch recent prospectus filings per trust and concatenate them per CIK.

    Trusts are taken from GOLD_PATH, richest graph first. For each trust, up
    to *max_filings* prospectus filings are downloaded, converted to text and
    joined with a form-feed separator into ``PROSE_DIR/<cik>.txt``. Filings
    whose extracted text is shorter than 2000 characters are ignored. A trust
    whose prose file already exists is counted as done without re-downloading.

    Args:
        limit: number of distinct trusts to have prose for before stopping.
        max_filings: maximum filings fetched and concatenated per trust.

    Returns:
        None. Failures for a single filing or trust are logged and skipped.
    """
    if not GOLD_PATH.exists():
        log.error("run `gold` first")
        return
    PROSE_DIR.mkdir(parents=True, exist_ok=True)
    with open(GOLD_PATH, encoding="utf-8") as f:
        rows = [json.loads(line) for line in f if line.strip()]
    # prefer trusts with the most edges (richer graphs) for the PoC slice
    rows.sort(key=lambda r: -len(r["triples"]))
    done = 0
    tot_filings = 0
    seen_ciks = set()  # gold may hold several rows per CIK; count each trust once
    for r in rows:
        if done >= limit:
            break
        cik = r["cik"]
        if cik in seen_ciks:
            continue
        seen_ciks.add(cik)
        out = PROSE_DIR / f"{cik}.txt"
        if out.exists():
            done += 1
            continue
        try:
            filings = _prospectus_filings(cik, max_filings)
            if not filings:
                log.info("no prospectus filing for CIK %s (%s)", cik, r["trust_name"])
                continue
            parts = []
            for acc, doc in filings:
                # Best effort per filing: one bad document must not lose the trust.
                try:
                    url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc}/{doc}"
                    raw = _get(url).decode("utf-8", "replace")
                    txt = html_to_text(raw)
                    if len(txt) >= 2000:
                        parts.append(txt)
                except Exception as e:
                    log.debug(" filing %s failed: %s", acc, e)
            if not parts:
                log.info("no usable prospectus text for CIK %s, skipping", cik)
                continue
            combined = "\n\n\f\n\n".join(parts)  # form-feed separates books
            out.write_text(combined, encoding="utf-8")
            tot_filings += len(parts)
            log.info("[%d/%d] %s %d filings prose=%d chars %s",
                     done + 1, limit, cik, len(parts), len(combined),
                     r["trust_name"][:40])
            done += 1
        except Exception as e:
            # Top-level boundary for one trust: log and move on to the next.
            log.warning("fetch failed for CIK %s: %s", cik, e)
    log.info("fetched %d filings across %d trusts -> %s",
             tot_filings, done, PROSE_DIR)
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# stage: samples -- join prose + gold into text->triple samples
|
|
# --------------------------------------------------------------------------- #
|
|
def serialize_triples(triples, entities) -> str:
    """Render triples in the thesis's <triple_start> marker format, one line per subject.

    Marker form (Models 2/4 with grammar-terminal tokens in the vocabulary).
    Subjects appear in order of first occurrence; each line is
    ``T_START subject (T_PRED predicate T_OBJ object)* T_END``. IRIs are
    replaced by the entity's ``label`` when one is known, otherwise the IRI
    itself is emitted.
    """
    def label_of(iri):
        return entities.get(iri, {}).get("label", iri)

    grouped = {}
    for triple in triples:
        grouped.setdefault(triple["s"], []).append((triple["p"], triple["o"]))

    lines = []
    for subject, pairs in grouped.items():
        tokens = [f"{T_START} {label_of(subject)}"]
        tokens.extend(
            f"{T_PRED} {pred} {T_OBJ} {label_of(obj)}" for pred, obj in pairs)
        tokens.append(T_END)
        lines.append(" ".join(tokens))
    return "\n".join(lines)
|
|
|
|
|
|
def serialize_triples_plain(triples, entities) -> str:
    """Render the SAME triples in a plain Turtle-like form with NO special tokens.

    For Models 1/3 (decoder-only / encoder-decoder without added
    grammar-terminal tokens). Each subject gets one line: predicates are
    separated by ' ; ', multiple objects of one predicate by ' , ', and the
    line ends with ' .'. Subjects, predicates and objects keep their order of
    first occurrence; IRIs are shown by ``label`` when the entity is known.
    """
    def label_of(iri):
        return entities.get(iri, {}).get("label", iri)

    # subject IRI -> predicate -> object labels, all in first-seen order
    tree = {}
    for triple in triples:
        objects = tree.setdefault(triple["s"], {}).setdefault(triple["p"], [])
        objects.append(label_of(triple["o"]))

    lines = []
    for subject, predicates in tree.items():
        clauses = [f"{pred} " + " , ".join(objs) for pred, objs in predicates.items()]
        lines.append(f"{label_of(subject)} " + " ; ".join(clauses) + " .")
    return "\n".join(lines)
|
|
|
|
|
|
def ontology_schema(triples, entities) -> dict:
    """Inferred meta-schema (subject type -> predicate -> object types), per thesis 5.3.

    Entity types come from each entity's ``type`` field; an IRI without an
    entry (or without a type) is treated as ``"Thing"``. Object types are
    returned as sorted lists.
    """
    def type_of(iri):
        return entities.get(iri, {}).get("type", "Thing")

    collected = {}
    for triple in triples:
        subject_type = type_of(triple["s"])
        object_type = type_of(triple["o"])
        by_predicate = collected.setdefault(subject_type, {})
        by_predicate.setdefault(triple["p"], set()).add(object_type)

    schema = {}
    for subject_type, by_predicate in collected.items():
        schema[subject_type] = {
            pred: sorted(types) for pred, types in by_predicate.items()}
    return schema
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# per-fund segmentation
|
|
# --------------------------------------------------------------------------- #
|
|
def _name_variants(fund_name: str):
    """Generate prospectus-heading variants of an N-CEN fund name.

    N-CEN names and prospectus headings often differ in the trailing
    legal-form word, so when the name ends in " Fund", " ETF", " Portfolio"
    or " Trust" that word is swapped for " Fund", " ETF", " Portfolio" and for
    nothing at all (the bare root). The stripped original name is always a
    candidate. Variants shorter than 6 characters after stripping are dropped.

    Returns:
        A set of variant strings (possibly empty).
    """
    name = fund_name.strip()
    variants = {name}
    recognised = (" Fund", " ETF", " Portfolio", " Trust")
    # Only the first matching legal-form suffix is swapped.
    suffix = next((form for form in recognised if name.endswith(form)), None)
    if suffix is not None:
        stem = name[: len(name) - len(suffix)]
        variants.update(stem + form for form in (" Fund", " ETF", " Portfolio", ""))
    return {v.strip() for v in variants if len(v.strip()) >= 6}
|
|
|
|
|
|
# A fund section opens with the fund name immediately followed by a summary
# heading. Filers use several styles right after the name; after at most five
# whitespace characters the anchor accepts any of (case-insensitive):
#   - "Fund Summary" / "Summary Section"                      (mutual funds)
#   - "Investment Objective" / "Principal Investment Strateg" (objective heading)
#   - "The Fund seeks"                                        (ETF objective sentence)
#   - "Class/Ticker", "Ticker:" / "Tickers:", "(formerly"     (ETF & multi-class
#     summary headers, e.g. JPMorgan, Invesco ETF)
# MIN_SEGMENT_CHARS is the real safety net: any anchor that resolves to a
# collapsed (too-short) segment is discarded, so a slightly broad anchor set
# cannot reintroduce the spurious-cross-reference problem.
_SECTION_ANCHOR = re.compile(
    r"\s{0,5}("
    + "|".join((
        r"Fund Summary",
        r"Summary Section",
        r"Investment Objective",
        r"Principal Investment Strateg",
        r"The Fund seeks",
        r"Class/Ticker",
        r"Ticker(s)?:",
        r"\(formerly",
    ))
    + r")",
    re.IGNORECASE,
)
MIN_SEGMENT_CHARS = 1500  # a real fund summary is at least this long
|
|
|
|
|
|
def _heading_positions(text: str, fund_names):
    """Return all (offset, fund_name) anchored heading candidates in the text.

    A candidate is a position where a variant of a fund name (see
    ``_name_variants``) is immediately followed by a section anchor
    (``_SECTION_ANCHOR``, checked against the next 45 characters). A fund may
    have several candidates (its book may be concatenated more than once, or
    it appears in multiple books); all are kept so segmentation can choose the
    one that yields a non-trivial segment.

    Two kinds of spurious candidates are removed:
      * exact duplicates of the same (offset, fund), produced when two
        variants of one name both anchor at the same position;
      * a match lying strictly inside a longer anchored name match, e.g.
        "Growth Fund" inside the heading "Alpha Growth Fund". Left in place it
        would start a few characters after the real heading, collapse the
        longer fund's segment to almost nothing and take over its section.

    Args:
        text: the (possibly multi-book) prose of one trust.
        fund_names: iterable of N-CEN fund names to look for.

    Returns:
        A sorted list of unique ``(offset, fund_name)`` tuples.
    """
    spans = set()  # (start, end, fund_name) of every anchored name match
    for fn in fund_names:
        for v in _name_variants(fn):
            for m in re.finditer(re.escape(v), text):
                if _SECTION_ANCHOR.match(text[m.end():m.end() + 45]):
                    spans.add((m.start(), m.end(), fn))

    kept = set()
    # Sweep left to right, longest first at equal start. `cover_*` is the span
    # reaching furthest right so far; anything ending at or before its end,
    # and shorter than it, is nested inside it.
    cover_start = cover_end = -1
    for start, end, fn in sorted(spans, key=lambda s: (s[0], -s[1], s[2])):
        if end <= cover_end and (end - start) < (cover_end - cover_start):
            continue  # nested inside a longer heading match
        if end > cover_end:
            cover_start, cover_end = start, end
        kept.add((start, fn))
    return sorted(kept)
|
|
|
|
|
|
def _segment_prose(text: str, fund_names):
    """Split prose into per-fund segments using anchored section headings.

    Every heading candidate from ``_heading_positions`` opens a segment that
    runs up to the next candidate (of any fund), or to the end of the text for
    the last one. Segments shorter than MIN_SEGMENT_CHARS are discarded; among
    the remaining ones each fund keeps its longest (the earliest on ties).

    Returns:
        ``{fund_name: segment_text}`` for funds with a usable section; empty
        when no heading candidate exists.
    """
    headings = _heading_positions(text, fund_names)
    if not headings:
        return {}

    starts = [start for start, _ in headings]
    stops = starts[1:] + [len(text)]  # each segment ends where the next begins

    longest = {}  # fund -> best segment seen so far
    for (start, fund), stop in zip(headings, stops):
        section = text[start:stop]
        if len(section) < MIN_SEGMENT_CHARS:
            continue
        current = longest.get(fund)
        if current is None or len(section) > len(current):
            longest[fund] = section
    return longest
|
|
|
|
|
|
def build_samples(per_fund: bool = True):
    """Join prose and gold graphs into text->triples samples at SAMPLES_PATH.

    With ``per_fund=True`` (default) the work is delegated to
    ``_build_samples_per_fund``. Otherwise one sample per trust is written:
    the whole prose file of the trust as input and the trust's complete gold
    graph as target. Whole-trust records use the same identifying and target
    keys as the whole-trust fallback records of the per-fund path
    (``sample_id``, ``fund``, ``segmented``, ``target_serialized_plain``), so
    both modes can be read with the same code.

    Args:
        per_fund: segment the prose per fund instead of one sample per trust.

    Returns:
        Whatever ``_build_samples_per_fund`` returns in per-fund mode,
        otherwise None. Summary statistics are printed to stdout.
    """
    if not GOLD_PATH.exists():
        log.error("run `gold` first")
        return
    if per_fund:
        return _build_samples_per_fund()

    # Parse each gold line once; a later line with the same CIK wins.
    gold = {}
    with open(GOLD_PATH, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                g = json.loads(line)
                gold[g["cik"]] = g

    n = 0
    tot_text = tot_json = 0
    with open(SAMPLES_PATH, "w", encoding="utf-8") as out:
        for prose_file in sorted(PROSE_DIR.glob("*.txt")):
            cik = prose_file.stem
            g = gold.get(cik)
            if not g:
                continue
            text = prose_file.read_text(encoding="utf-8")
            triples, ents = g["triples"], g["entities"]
            target = serialize_triples(triples, ents)
            sample = {
                "sample_id": f"{cik}:ALL",
                "cik": cik,
                "trust_name": g["trust_name"],
                "fund": None,
                "segmented": False,
                "input_text": text,
                "ontology": ontology_schema(triples, ents),
                "target_triples": triples,
                "target_serialized": target,
                "target_serialized_plain": serialize_triples_plain(triples, ents),
                "stats": {
                    "input_chars": len(text),
                    "target_chars": len(target),
                    "n_triples": len(triples),
                    "n_entities": len(ents),
                    "text_to_json_ratio": round(len(text) / max(1, len(target)), 1),
                },
            }
            out.write(json.dumps(sample, ensure_ascii=False) + "\n")
            n += 1
            tot_text += len(text)
            tot_json += len(target)
    log.info("wrote %d samples -> %s", n, SAMPLES_PATH)
    if n:
        print(f"\n{n} samples")
        print(f" mean input : {tot_text//n:>8,} chars")
        print(f" mean target : {tot_json//n:>8,} chars")
        print(f" mean ratio : {tot_text/max(1,tot_json):>8.1f} : 1 (text : json)")
|
|
|
|
|
|
def _sample_record(head: dict, text: str, triples, ents) -> dict:
    """Complete a sample record: *head* keys first, then input, targets and stats."""
    target = serialize_triples(triples, ents)
    rec = dict(head)
    rec.update({
        "input_text": text,
        "ontology": ontology_schema(triples, ents),
        "target_triples": triples,
        "target_serialized": target,
        "target_serialized_plain": serialize_triples_plain(triples, ents),
        "stats": {
            "input_chars": len(text),
            "target_chars": len(target),
            "n_triples": len(triples),
            "n_entities": len(ents),
            "text_to_json_ratio": round(len(text) / max(1, len(target)), 1),
        },
    })
    return rec


def _build_samples_per_fund():
    """One sample per fund: the fund's prospectus section -> the fund's subgraph.

    For each trust with a prose file, the prose is segmented into per-fund
    sections. Each located fund's target is its own edges (advisedBy,
    custodian, ...), including its seriesOf edge, plus the trust-level edges
    such as underwrittenBy (shared, but a true fact about that fund's trust).
    Funds whose section cannot be located are skipped; they show up only in
    the coverage figure. Trusts where nothing can be located fall back to a
    single whole-trust sample so no data is silently dropped.

    Returns:
        None. Records go to SAMPLES_PATH; summary statistics are printed.
    """
    import statistics as _st

    # Parse each gold line once; a later line with the same CIK wins.
    gold = {}
    with open(GOLD_PATH, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                g = json.loads(line)
                gold[g["cik"]] = g

    n = 0
    n_funds_total = n_funds_located = n_fallback_trusts = 0
    tot_text = tot_json = 0
    ratios = []
    with open(SAMPLES_PATH, "w", encoding="utf-8") as out:
        for prose_file in sorted(PROSE_DIR.glob("*.txt")):
            cik = prose_file.stem
            g = gold.get(cik)
            if not g:
                continue
            text = prose_file.read_text(encoding="utf-8")
            ents = g["entities"]
            # group triples by subject fund IRI; collect trust-level edges
            fund_iris = {e_iri for e_iri, e in ents.items() if e["type"] == "Fund"}
            trust_iri = g["trust_iri"]
            by_fund = defaultdict(list)
            trust_edges = []
            for t in g["triples"]:
                if t["s"] in fund_iris:
                    by_fund[t["s"]].append(t)
                elif t["s"] == trust_iri:
                    trust_edges.append(t)  # e.g. underwrittenBy

            fund_label = {iri: ents[iri]["label"] for iri in fund_iris}
            n_funds_total += len(fund_iris)
            segs = _segment_prose(text, list(fund_label.values()))
            label_to_iri = {v: k for k, v in fund_label.items()}

            records = []
            if not segs:  # whole-trust fallback (no section located)
                n_fallback_trusts += 1
                head = {"sample_id": f"{cik}:ALL", "cik": cik,
                        "trust_name": g["trust_name"],
                        "fund": None, "segmented": False}
                records.append(_sample_record(head, text, g["triples"], ents))
            else:
                for label, seg in segs.items():
                    f_iri = label_to_iri.get(label)
                    if not f_iri:
                        continue
                    triples = list(by_fund.get(f_iri, [])) + list(trust_edges)
                    if not triples:
                        continue
                    n_funds_located += 1
                    # restrict entities to those referenced by this fund's triples
                    ref = {trust_iri}
                    for t in triples:
                        ref.add(t["s"])
                        ref.add(t["o"])
                    sub_ents = {k: ents[k] for k in ref if k in ents}
                    head = {
                        "sample_id": f"{cik}:{ents[f_iri].get('series_id') or _slug(label)}",
                        "cik": cik, "trust_name": g["trust_name"],
                        "fund": label, "series_id": ents[f_iri].get("series_id", ""),
                        "segmented": True,
                    }
                    records.append(_sample_record(head, seg, triples, sub_ents))

            for rec in records:
                out.write(json.dumps(rec, ensure_ascii=False) + "\n")
                n += 1
                tot_text += rec["stats"]["input_chars"]
                tot_json += rec["stats"]["target_chars"]
                ratios.append(rec["stats"]["text_to_json_ratio"])

    log.info("wrote %d per-fund samples -> %s", n, SAMPLES_PATH)
    if n:
        cov = n_funds_located / max(1, n_funds_total)
        print(f"\n{n} samples ({n_funds_located}/{n_funds_total} funds located, "
              f"coverage {cov:.0%}; {n_fallback_trusts} trusts fell back to whole-doc)")
        print(f" mean input : {tot_text//n:>8,} chars")
        print(f" mean target : {tot_json//n:>8,} chars")
        print(f" median ratio: {_st.median(ratios):>8.1f} : 1 (text : json)")
        print(f" mean ratio : {tot_text/max(1,tot_json):>8.1f} : 1")
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
# stage: split -- trust-level train/val/test split
|
|
# --------------------------------------------------------------------------- #
|
|
def _bucket(cik: str) -> float:
    """Map a trust CIK to a deterministic value in [0, 1] (stable, no RNG state).

    The value is the first 32 bits of the SHA-256 digest of the CIK string,
    divided by ``0xFFFFFFFF``; the upper bound 1.0 is therefore reachable when
    all 32 bits are set.
    """
    from hashlib import sha256

    digest = sha256(cik.encode()).hexdigest()
    leading_32_bits = int(digest[:8], 16)
    return leading_32_bits / 0xFFFFFFFF
|
|
|
|
|
|
def build_split(val_frac: float = 0.10, test_frac: float = 0.10):
    """Split samples.jsonl into train/val/test JSONL, partitioned by TRUST (CIK).

    Splitting by trust (not by fund) prevents leakage: two funds of the same
    trust share advisers, distributors and custodians, so allowing them into
    different splits would let the model memorise trust-specific entities. The
    assignment depends only on ``_bucket(cik)``, so it is reproducible and
    stable as new samples are added: buckets below ``test_frac`` go to test,
    the next ``val_frac`` to val, everything else to train.

    Args:
        val_frac: fraction of the bucket range assigned to validation.
        test_frac: fraction of the bucket range assigned to test.

    Returns:
        None. Writes ``OUT_DIR/{train,val,test}.jsonl`` and prints a summary.
    """
    if not SAMPLES_PATH.exists():
        log.error("run `samples` first")
        return
    if val_frac < 0 or test_frac < 0 or val_frac + test_frac > 1:
        # Not fatal: the thresholds still partition the data, but at least
        # one split will not have the size the caller asked for.
        log.warning("unusual split fractions: val=%s test=%s", val_frac, test_frac)
    with open(SAMPLES_PATH, encoding="utf-8") as f:
        rows = [json.loads(line) for line in f if line.strip()]

    out = {"train": [], "val": [], "test": []}
    for r in rows:
        b = _bucket(r["cik"])
        if b < test_frac:
            split = "test"
        elif b < test_frac + val_frac:
            split = "val"
        else:
            split = "train"
        out[split].append(r)

    for name, recs in out.items():
        p = OUT_DIR / f"{name}.jsonl"
        with open(p, "w", encoding="utf-8") as f:
            for r in recs:
                f.write(json.dumps(r, ensure_ascii=False) + "\n")

    cik_sets = {s: {r["cik"] for r in recs} for s, recs in out.items()}
    print("\nTrust-level split (deterministic by CIK):")
    for s in ("train", "val", "test"):
        print(f" {s:5s}: {len(out[s]):>5,} samples from {len(cik_sets[s]):>4} trusts")
    total = sum(len(v) for v in out.values())
    print(f" total: {total:,} samples -> {OUT_DIR}/{{train,val,test}}.jsonl")
    # leakage check
    overlap = (cik_sets["train"] & cik_sets["val"]) | \
              (cik_sets["train"] & cik_sets["test"]) | \
              (cik_sets["val"] & cik_sets["test"])
    print(f" trust overlap across splits: {len(overlap)} (should be 0)")
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
|
|
def main():
    """Command-line entry point: parse the sub-command and run its stage(s).

    Sub-commands are ``gold``, ``fetch``, ``samples``, ``split`` and ``all``
    (the four stages in that order). Without a sub-command the help text is
    printed.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")

    parser = argparse.ArgumentParser()
    commands = parser.add_subparsers(dest="cmd")
    scopes = ["primary", "all", "none"]

    gold_cmd = commands.add_parser("gold")
    gold_cmd.add_argument("--ncen", default="data/ncen/2025q3")
    gold_cmd.add_argument(
        "--custodian-scope", choices=scopes, default="primary",
        help="primary=only IS_SUB_CUSTODIAN!=Y (default); all=every row; none=drop")

    fetch_cmd = commands.add_parser("fetch")
    fetch_cmd.add_argument("--limit", type=int, default=25)
    fetch_cmd.add_argument(
        "--max-filings", type=int, default=8,
        help="max prospectus filings to fetch & concatenate per trust")

    samples_cmd = commands.add_parser("samples")
    samples_cmd.add_argument(
        "--whole-trust", action="store_true",
        help="one sample per trust instead of per-fund segmentation")

    split_cmd = commands.add_parser("split")
    split_cmd.add_argument("--val-frac", type=float, default=0.10)
    split_cmd.add_argument("--test-frac", type=float, default=0.10)

    all_cmd = commands.add_parser("all")
    all_cmd.add_argument("--ncen", default="data/ncen/2025q3")
    all_cmd.add_argument("--limit", type=int, default=25)
    all_cmd.add_argument("--max-filings", type=int, default=8)
    all_cmd.add_argument("--custodian-scope", choices=scopes, default="primary")
    all_cmd.add_argument("--whole-trust", action="store_true")
    all_cmd.add_argument("--val-frac", type=float, default=0.10)
    all_cmd.add_argument("--test-frac", type=float, default=0.10)

    args = parser.parse_args()

    # One thunk per stage; each reads only the options its sub-command defines.
    def run_gold():
        build_gold(Path(args.ncen), custodian_scope=args.custodian_scope)

    def run_fetch():
        fetch_prose(args.limit, max_filings=args.max_filings)

    def run_samples():
        build_samples(per_fund=not args.whole_trust)

    def run_split():
        build_split(val_frac=args.val_frac, test_frac=args.test_frac)

    pipelines = {
        "gold": (run_gold,),
        "fetch": (run_fetch,),
        "samples": (run_samples,),
        "split": (run_split,),
        "all": (run_gold, run_fetch, run_samples, run_split),
    }
    stages = pipelines.get(args.cmd)
    if stages is None:
        parser.print_help()
        return
    for stage in stages:
        stage()


if __name__ == "__main__":
    main()
|