#!/usr/bin/env python3
"""
SEC Fund Prospectus -> RDF Triple dataset builder (proof of concept).

Produces text->triples samples where:
  - INPUT  = the prospectus prose for a fund family (long natural-language text),
  - OUTPUT = a graph of entity->entity RDF triples (NOT flat literal attributes),
  - GOLD   = N-CEN structured filings (service providers) + the prospectus structure.

The graph edges are genuine relationships between entities:

    Fund   seriesOf        Trust
    Fund   advisedBy       InvestmentAdviser
    Fund   subAdvisedBy    SubAdviser
    Fund   transferAgent   TransferAgent
    Fund   custodian       Custodian
    Fund   administrator   Administrator
    Trust  underwrittenBy  Distributor
    Fund   tracksIndex     Index   (flag-derived; index name from prose --
                                    the `gold` stage only records an
                                    `is_index` flag on the fund entity and
                                    does not emit this edge)

Gold for advisedBy/transferAgent/custodian/administrator/underwrittenBy comes
directly from N-CEN (no model). The prospectus prose is fetched from EDGAR and
serves as the model input.

Stages:
  gold    -- build the per-fund gold graph from the local N-CEN flat files
  fetch   -- download prospectus prose (485BPOS) for the selected N-CEN registrants
  samples -- join prose + gold into text->triple training samples
  split   -- partition samples.jsonl into train/val/test by trust (CIK)
  all     -- run gold, fetch, samples, split in order

Usage:
  python build_rdf_dataset.py gold --ncen data/ncen/2025q3
  python build_rdf_dataset.py fetch --limit 25
  python build_rdf_dataset.py samples
  python build_rdf_dataset.py split
  python build_rdf_dataset.py all --limit 25
"""

import argparse
import csv
import gzip
import json
import logging
import re
import sys
import time
import urllib.request
from collections import defaultdict
from pathlib import Path

log = logging.getLogger("rdf")

# Request headers sent with every EDGAR call.
HEADERS = {"User-Agent": "FundDataResearch/1.0 research@university.edu",
           "Accept-Encoding": "gzip, deflate"}
EDGAR_RATE = 0.15  # seconds between SEC requests (well under 10/s limit)
_last = 0.0        # wall-clock time of the most recent request (throttle state)

OUT_DIR = Path("data/rdf_poc")
GOLD_PATH = OUT_DIR / "gold_graphs.jsonl"
PROSE_DIR = OUT_DIR / "prose"
SAMPLES_PATH = OUT_DIR / "samples.jsonl"

# RDF marker tokens from the thesis (Section 5.2)
# NOTE(review): all four markers are empty strings here, so the marker
# serialization carries no delimiters at all. This looks like the tokens were
# lost to a tag-stripping step -- restore the real values from the thesis
# before training on `target_serialized`.
T_START, T_PRED, T_OBJ, T_END = ("", "", "", "")
# --------------------------------------------------------------------------- #
# helpers
# --------------------------------------------------------------------------- #
def _get(url: str, timeout: int = 120) -> bytes:
    """Fetch `url` with the shared EDGAR headers and return the decoded body.

    Calls are throttled so that consecutive requests are at least EDGAR_RATE
    seconds apart. The body is transparently decompressed when the server
    answers with gzip or deflate (both are advertised in HEADERS).

    Raises whatever `urllib.request.urlopen` raises (URLError, HTTPError,
    timeouts); callers decide how to handle failures.
    """
    import zlib  # local import: only needed for the rare deflate response

    global _last
    wait = EDGAR_RATE - (time.time() - _last)
    if wait > 0:
        time.sleep(wait)

    req = urllib.request.Request(url, headers=HEADERS)
    try:
        # `with` closes the connection even if read() fails.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = resp.read()
            encoding = (resp.headers.get("Content-Encoding") or "").lower()
    finally:
        # Failed requests also count against the rate limit.
        _last = time.time()

    if encoding == "gzip" or data[:2] == b"\x1f\x8b":
        data = gzip.decompress(data)
    elif encoding == "deflate":
        try:
            data = zlib.decompress(data)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            data = zlib.decompress(data, -zlib.MAX_WBITS)
    return data


def _read_tsv(path: Path) -> list[dict]:
    """Read a tab-separated file into a list of row dicts.

    A missing file is logged as a warning and yields an empty list.
    """
    if not path.exists():
        log.warning("missing %s", path)
        return []
    # csv.field_size_limit because some SEC fields are huge
    csv.field_size_limit(min(sys.maxsize, 2**31 - 1))
    with open(path, "r", encoding="utf-8", errors="replace", newline="") as f:
        return list(csv.DictReader(f, delimiter="\t"))


def _slug(s: str) -> str:
    """Make an IRI-safe local name from an entity string."""
    s = re.sub(r"[^A-Za-z0-9]+", "_", (s or "").strip())
    return s.strip("_") or "x"


def html_to_text(raw: str) -> str:
    """Reduce an HTML document to plain text.

    <script> and <style> elements are dropped with their content, remaining
    tags and character entities are replaced by a space, runs of blanks are
    collapsed, and at most one empty line is kept between paragraphs.
    """
    # The previous patterns were the empty regex "(?is)", which matches at
    # every position and therefore inserted a space between every character.
    raw = re.sub(r"(?is)<script\b.*?</script\s*>", " ", raw)
    raw = re.sub(r"(?is)<style\b.*?</style\s*>", " ", raw)
    txt = re.sub(r"(?s)<[^>]+>", " ", raw)
    txt = re.sub(r"&#\d+;", " ", txt)
    txt = re.sub(r"&[a-zA-Z]+;", " ", txt)
    txt = re.sub(r"[ \t]+", " ", txt)
    txt = re.sub(r"\s*\n\s*", "\n", txt)
    return re.sub(r"\n{3,}", "\n\n", txt).strip()


# --------------------------------------------------------------------------- #
# stage: gold -- build per-registrant (trust) gold graph from N-CEN
# --------------------------------------------------------------------------- #
def build_gold(ncen_dir: Path, custodian_scope: str = "primary"):
    """Build one gold graph per N-CEN filing and write them to GOLD_PATH.

    Reads REGISTRANT, FUND_REPORTED_INFO, ADVISER, TRANSFER_AGENT, CUSTODIAN,
    ADMIN and PRINCIPAL_UNDERWRITER .tsv files from `ncen_dir`. Each output
    JSONL record holds the trust, its funds, their service providers
    (`entities`) and the de-duplicated, sorted edges between them (`triples`).
    Filings without a registrant name or without any edge are skipped.

    custodian_scope:
        "primary" -> only custodian rows with IS_SUB_CUSTODIAN != Y
        "all"     -> every custodian row (legacy behaviour)
        "none"    -> drop the custodian relation entirely

    Prints an edge-type histogram and totals when done.
    """
    d = Path(ncen_dir)
    registrant = {r["ACCESSION_NUMBER"]: r for r in _read_tsv(d / "REGISTRANT.tsv")}

    # FUND_REPORTED_INFO rows grouped by filing (gives FUND_ID, name + flags)
    funds_by_acc = defaultdict(list)
    for r in _read_tsv(d / "FUND_REPORTED_INFO.tsv"):
        funds_by_acc[r["ACCESSION_NUMBER"]].append(r)

    # provider tables keyed by FUND_ID
    def by_fund(fname, name_col, lei_col):
        out = defaultdict(list)
        for r in _read_tsv(d / fname):
            fid = r.get("FUND_ID", "")
            nm = (r.get(name_col, "") or "").strip()
            if fid and nm:
                out[fid].append({"name": nm,
                                 "lei": (r.get(lei_col, "") or "").strip(),
                                 "affiliated": r.get("IS_AFFILIATED", ""),
                                 "type": r.get("ADVISER_TYPE", ""),
                                 "is_sub_custodian": r.get("IS_SUB_CUSTODIAN", "")})
        return out

    advisers = by_fund("ADVISER.tsv", "ADVISER_NAME", "ADVISER_LEI")
    tagents = by_fund("TRANSFER_AGENT.tsv", "TRANSFERAGENT_NAME", "TRANSFERAGENT_LEI")
    custodians = by_fund("CUSTODIAN.tsv", "CUSTODIAN_NAME", "CUSTODIAN_LEI")
    admins = by_fund("ADMIN.tsv", "ADMIN_NAME", "ADMIN_LEI")

    # Custodian scoping. Foreign sub-custodians (IS_SUB_CUSTODIAN=Y, ~88% of rows)
    # appear ONLY in N-CEN, never in the prospectus prose, and dominate the edge
    # count (66% of all edges). They are therefore unextractable noise for a
    # text->triples task. Default keeps the PRIMARY custodian only.
    #   "primary" -> only IS_SUB_CUSTODIAN != Y (one prose-grounded edge per fund)
    #   "all"     -> every custodian row (legacy behaviour)
    #   "none"    -> drop the custodian relation entirely
    if custodian_scope == "primary":
        custodians = {fid: [c for c in cs
                            if (c.get("is_sub_custodian", "") or "").upper() != "Y"]
                      for fid, cs in custodians.items()}
    elif custodian_scope == "none":
        custodians = {}

    # underwriter (distributor) keyed by ACCESSION (trust level)
    underwriters = defaultdict(list)
    for r in _read_tsv(d / "PRINCIPAL_UNDERWRITER.tsv"):
        acc = r.get("ACCESSION_NUMBER", "")
        nm = (r.get("UNDERWRITER_NAME", "") or "").strip()
        if acc and nm:
            underwriters[acc].append({"name": nm,
                                      "lei": (r.get("UNDERWRITER_LEI", "") or "").strip()})

    # (predicate, entity type, provider table) for the single-role providers
    provider_roles = (
        ("transferAgent", "TransferAgent", tagents),
        ("custodian", "Custodian", custodians),
        ("administrator", "Administrator", admins),
    )

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    n_graphs = 0
    n_edges = 0
    hist = defaultdict(int)  # edge-type histogram, filled while writing
    with open(GOLD_PATH, "w", encoding="utf-8") as out:
        for acc, reg in registrant.items():
            cik = (reg.get("CIK", "") or "").strip().zfill(10)
            trust_name = (reg.get("REGISTRANT_NAME", "") or "").strip()
            if not trust_name:
                continue
            trust_iri = "trust:" + _slug(trust_name)
            triples = []
            entities = {trust_iri: {"type": "Trust", "label": trust_name,
                                    "lei": (reg.get("LEI", "") or "").strip()}}

            # trust-level distributor edges
            for u in underwriters.get(acc, []):
                d_iri = "org:" + _slug(u["name"])
                entities[d_iri] = {"type": "Distributor", "label": u["name"],
                                   "lei": u["lei"]}
                triples.append((trust_iri, "underwrittenBy", d_iri))

            # NOTE(review): an organisation that serves several roles shares a
            # single "org:" IRI, so the role written last wins in
            # entities[...]["type"]; the edges themselves are unaffected.
            for fr in funds_by_acc.get(acc, []):
                fid = fr["FUND_ID"]
                fname = (fr.get("FUND_NAME", "") or "").strip()
                if not fname:
                    continue
                f_iri = "fund:" + _slug(fname)
                entities[f_iri] = {"type": "Fund", "label": fname,
                                   "series_id": (fr.get("SERIES_ID", "") or "").strip(),
                                   "lei": (fr.get("LEI", "") or "").strip()}
                triples.append((f_iri, "seriesOf", trust_iri))
                if (fr.get("IS_INDEX", "") or "").upper() == "Y":
                    entities[f_iri]["is_index"] = True
                if (fr.get("IS_ETF", "") or "").upper() == "Y":
                    entities[f_iri]["is_etf"] = True

                for a in advisers.get(fid, []):
                    o = "org:" + _slug(a["name"])
                    is_sub = (a.get("type", "") or "").lower().startswith("sub")
                    entities[o] = {"type": "SubAdviser" if is_sub else "InvestmentAdviser",
                                   "label": a["name"], "lei": a["lei"]}
                    triples.append((f_iri, "subAdvisedBy" if is_sub else "advisedBy", o))

                for predicate, etype, table in provider_roles:
                    for a in table.get(fid, []):
                        o = "org:" + _slug(a["name"])
                        entities[o] = {"type": etype, "label": a["name"],
                                       "lei": a["lei"]}
                        triples.append((f_iri, predicate, o))

            # dedupe triples
            triples = sorted(set(triples))
            if not triples:
                continue
            rec = {
                "accession": acc,
                "cik": cik,
                "trust_name": trust_name,
                "trust_iri": trust_iri,
                "n_funds": len(funds_by_acc.get(acc, [])),
                "entities": entities,
                "triples": [{"s": s, "p": p, "o": o} for s, p, o in triples],
            }
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")
            n_graphs += 1
            n_edges += len(triples)
            for _, p, _ in triples:
                hist[p] += 1

    log.info("gold: %d trust graphs, %d entity->entity edges -> %s",
             n_graphs, n_edges, GOLD_PATH)

    # quick edge-type histogram
    print("\nEdge-type histogram (entity->entity edges):")
    for p, c in sorted(hist.items(), key=lambda x: -x[1]):
        print(f" {p:16s} {c:>7,}")
    print(f"\nTotal: {n_graphs:,} trust graphs, {n_edges:,} edges")


# --------------------------------------------------------------------------- #
# stage: fetch -- prospectus prose for selected registrants
# --------------------------------------------------------------------------- #
# Full statutory prospectuses (cover ALL funds of a book) vs. short supplements.
# We strongly prefer the full prospectuses; 497/497K supplements are tiny
# amendments and only used as a fallback when no full prospectus is available.
FULL_PROSPECTUS_FORMS = ("485BPOS", "485APOS")
SUPPLEMENT_FORMS = ("497", "497K")


def _prospectus_filings(cik: str, max_filings: int):
    """Return up to max_filings recent prospectus filings for a CIK.

    Large fund families split their funds across SEVERAL prospectus books, so
    the single most recent 485BPOS covers only a subset of the trust's funds.
    We therefore collect the most recent FULL prospectuses (485BPOS/485APOS)
    first, newest first, and fall back to 497/497K supplements only if no full
    prospectus exists. Returns a list of (accession_nodash, primary_doc).
    """
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    j = json.loads(_get(url).decode("utf-8", "replace"))
    recent = j.get("filings", {}).get("recent", {})
    forms = recent.get("form", [])
    accs = recent.get("accessionNumber", [])
    docs = recent.get("primaryDocument", [])

    def collect(form_set):
        seen, out = set(), []
        # SEC feed is newest-first. zip() stops at the shortest array, so a
        # truncated feed cannot raise IndexError.
        for fm, acc_dashed, doc in zip(forms, accs, docs):
            if fm in form_set and doc:
                acc = acc_dashed.replace("-", "")
                if acc not in seen:
                    seen.add(acc)
                    out.append((acc, doc))
        return out

    chosen = collect(FULL_PROSPECTUS_FORMS)[:max_filings]
    if not chosen:
        # fallback: this trust filed only supplements recently
        chosen = collect(SUPPLEMENT_FORMS)[:max_filings]
    return chosen


def fetch_prose(limit: int, max_filings: int = 8):
    """Fetch ALL recent prospectus filings per trust and concatenate per CIK.

    For each selected trust, up to `max_filings` recent prospectus filings are
    downloaded and their extracted text concatenated into
    data/rdf_poc/prose/<cik>.txt, separated by a form-feed marker. This
    maximises the chance that every N-CEN fund of the trust has its prospectus
    section present, raising per-fund segmentation coverage.

    At most `limit` distinct trusts are processed; a trust whose prose file
    already exists counts toward the limit without being re-downloaded.
    """
    if not GOLD_PATH.exists():
        log.error("run `gold` first")
        return
    PROSE_DIR.mkdir(parents=True, exist_ok=True)
    with open(GOLD_PATH, encoding="utf-8") as f:
        rows = [json.loads(line) for line in f if line.strip()]
    # prefer trusts with the most edges (richer graphs) for the PoC slice
    rows.sort(key=lambda r: -len(r["triples"]))

    done = 0
    tot_filings = 0
    seen_ciks = set()  # the gold file may hold several filings for one CIK
    for r in rows:
        if done >= limit:
            break
        cik = r["cik"]
        if cik in seen_ciks:
            continue
        seen_ciks.add(cik)
        out = PROSE_DIR / f"{cik}.txt"
        if out.exists():
            done += 1
            continue
        try:
            filings = _prospectus_filings(cik, max_filings)
            if not filings:
                log.info("no prospectus filing for CIK %s (%s)", cik, r["trust_name"])
                continue
            parts = []
            for acc, doc in filings:
                # Best effort per filing: one broken document must not cost
                # the trust its remaining books.
                try:
                    url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc}/{doc}"
                    raw = _get(url).decode("utf-8", "replace")
                    txt = html_to_text(raw)
                    if len(txt) >= 2000:
                        parts.append(txt)
                except Exception as e:
                    log.debug(" filing %s failed: %s", acc, e)
            if not parts:
                log.info("no usable prospectus text for CIK %s, skipping", cik)
                continue
            combined = ("\n\n\f\n\n").join(parts)  # form-feed separates books
            out.write_text(combined, encoding="utf-8")
            tot_filings += len(parts)
            log.info("[%d/%d] %s %d filings prose=%d chars %s",
                     done + 1, limit, cik, len(parts), len(combined),
                     r["trust_name"][:40])
            done += 1
        except Exception as e:
            log.warning("fetch failed for CIK %s: %s", cik, e)
    log.info("fetched %d filings across %d trusts -> %s", tot_filings, done, PROSE_DIR)


# --------------------------------------------------------------------------- #
# stage: samples -- join prose + gold into text->triple samples
# --------------------------------------------------------------------------- #
def serialize_triples(triples, entities) -> str:
    """Render triples in the thesis's marker format, grouped by subject.

    Marker form (Models 2/4 with grammar-terminal tokens in the vocabulary).
    Entities are rendered by their label, falling back to the IRI when the
    entity is unknown. One line per subject.
    """
    bys = defaultdict(list)
    for t in triples:
        bys[t["s"]].append((t["p"], t["o"]))
    chunks = []
    for s, pos in bys.items():
        s_label = entities.get(s, {}).get("label", s)
        body = [f"{T_START} {s_label}"]
        for p, o in pos:
            o_label = entities.get(o, {}).get("label", o)
            body.append(f"{T_PRED} {p} {T_OBJ} {o_label}")
        body.append(T_END)
        chunks.append(" ".join(body))
    return "\n".join(chunks)


def serialize_triples_plain(triples, entities) -> str:
    """Render the SAME triples in a plain Turtle-like form with NO special tokens.

    For Models 1/3 (decoder-only / encoder-decoder without added
    grammar-terminal tokens). Subjects are factored out and predicate-object
    lists are separated by ';' and ',' exactly as in Turtle, so the two
    serializations encode identical content and differ only in whether the
    delimiters are dedicated tokens.
    """
    bys = defaultdict(list)
    for t in triples:
        bys[t["s"]].append((t["p"], t["o"]))
    chunks = []
    for s, pos in bys.items():
        s_label = entities.get(s, {}).get("label", s)
        by_pred = defaultdict(list)
        for p, o in pos:
            by_pred[p].append(entities.get(o, {}).get("label", o))
        preds = [f"{p} " + " , ".join(objs) for p, objs in by_pred.items()]
        chunks.append(f"{s_label} " + " ; ".join(preds) + " .")
    return "\n".join(chunks)


def ontology_schema(triples, entities) -> dict:
    """Inferred meta-schema (subject type -> predicate -> object type), per thesis 5.3.

    Entities missing from `entities` are typed "Thing". Object types are
    returned as sorted lists.
    """
    schema = defaultdict(lambda: defaultdict(set))
    for t in triples:
        st = entities.get(t["s"], {}).get("type", "Thing")
        ot = entities.get(t["o"], {}).get("type", "Thing")
        schema[st][t["p"]].add(ot)
    return {st: {p: sorted(os) for p, os in preds.items()}
            for st, preds in schema.items()}


# --------------------------------------------------------------------------- #
# per-fund segmentation
# --------------------------------------------------------------------------- #
def _name_variants(fund_name: str):
    """Generate prospectus-heading variants of an N-CEN fund name.

    N-CEN names and prospectus headings often differ in the legal-form suffix
    (Fund vs ETF vs Portfolio) and in spacing/punctuation, so we match on a set
    of normalized variants rather than the exact string. Variants shorter than
    6 characters are dropped as too ambiguous.
    """
    base = fund_name.strip()
    stems = {base}
    # swap the trailing legal-form word
    for suf in (" Fund", " ETF", " Portfolio", " Trust"):
        if base.endswith(suf):
            root = base[: -len(suf)]
            for alt in (" Fund", " ETF", " Portfolio", ""):
                stems.add(root + alt)
            stems.add(root)  # also the bare root with no suffix
            break
    return {s.strip() for s in stems if len(s.strip()) >= 6}


# A fund section opens with the fund name immediately followed by a summary
# heading. Filers use several styles right after the name, so we accept any of:
#   - "Fund Summary" / "Summary" (mutual funds)
#   - "Investment Objective" / "Principal Investment Strateg" (objective heading)
#   - "The Fund seeks ..." (ETF objective sentence)
#   - "Class/Ticker:" or a ticker block / "(formerly ...)" (ETF & multi-class
#     summary headers, e.g. JPMorgan, Invesco ETF)
# The MIN_SEGMENT_CHARS guard below is the real safety net: any anchor that
# resolves to a collapsed (too-short) segment is discarded, so a slightly broad
# anchor set cannot reintroduce the spurious-cross-reference problem.
_SECTION_ANCHOR = re.compile(
    r"\s{0,5}("
    r"Fund Summary|Summary Section|Investment Objective|Principal Investment Strateg"
    r"|The Fund seeks|Class/Ticker|Ticker(s)?:|\(formerly"
    r")", re.I)
MIN_SEGMENT_CHARS = 1500  # a real fund summary is at least this long


def _heading_positions(text: str, fund_names):
    """Return all (offset, fund_name) anchored heading candidates in the text.

    A candidate is a position where a fund name variant is immediately followed
    by a strong section anchor. A fund may have several candidates (its book
    may be concatenated more than once, or it appears in multiple books); all
    are kept so segmentation can choose the one that yields a non-trivial
    segment.

    A match that lies inside a longer match of a DIFFERENT fund's name is
    discarded: "Growth Fund" found inside the heading "Global Growth Fund"
    belongs to the longer name. Without this, the shorter-named fund would
    split the real heading and take over that section. The result is sorted
    and free of duplicates.
    """
    hits = set()  # (start, end, fund_name)
    for fn in fund_names:
        for v in _name_variants(fn):
            for m in re.finditer(re.escape(v), text):
                if _SECTION_ANCHOR.match(text[m.end():m.end() + 45]):
                    hits.add((m.start(), m.end(), fn))

    ordered = sorted(hits)
    reach = {}    # fund -> furthest match end among strictly earlier starts
    kept = set()  # (start, fund_name)
    i = 0
    while i < len(ordered):
        start = ordered[i][0]
        j = i
        while j < len(ordered) and ordered[j][0] == start:
            j += 1
        group = ordered[i:j]  # all matches beginning at this offset
        for _, end, fn in group:
            inside_earlier = any(e >= end for f, e in reach.items() if f != fn)
            inside_same_start = any(e2 > end and f2 != fn for _, e2, f2 in group)
            if not (inside_earlier or inside_same_start):
                kept.add((start, fn))
        for _, end, fn in group:
            if end > reach.get(fn, -1):
                reach[fn] = end
        i = j
    return sorted(kept)


def _segment_prose(text: str, fund_names):
    """Split prose into per-fund segments using anchored section headings.

    Algorithm:
      1. collect all anchored heading candidates across the (possibly
         multi-book) text and sort them by offset;
      2. each candidate's segment runs to the NEXT candidate heading (of any
         fund), so boundaries come only from real section starts;
      3. for each fund, choose the candidate whose segment is longest and at
         least MIN_SEGMENT_CHARS, discarding collapsed (too-short) candidates.

    Returns {fund_name: segment_text} for funds with a usable section.
    """
    cands = _heading_positions(text, fund_names)
    if not cands:
        return {}
    offsets = [c[0] for c in cands]
    best = {}  # fund -> (length, segment_text)
    for i, (off, fn) in enumerate(cands):
        end = offsets[i + 1] if i + 1 < len(offsets) else len(text)
        seg = text[off:end]
        if len(seg) < MIN_SEGMENT_CHARS:
            continue
        if fn not in best or len(seg) > best[fn][0]:
            best[fn] = (len(seg), seg)
    return {fn: seg for fn, (ln, seg) in best.items()}


def _load_gold_by_cik():
    """Load GOLD_PATH into {cik: record}; a later record for a CIK wins."""
    gold = {}
    with open(GOLD_PATH, encoding="utf-8") as f:
        for line in f:
            if line.strip():
                rec = json.loads(line)
                gold[rec["cik"]] = rec
    return gold


def _sample_stats(text, target, triples, ents):
    """Size statistics stored with every sample (key order is part of the output)."""
    return {"input_chars": len(text), "target_chars": len(target),
            "n_triples": len(triples), "n_entities": len(ents),
            "text_to_json_ratio": round(len(text) / max(1, len(target)), 1)}


def build_samples(per_fund: bool = True):
    """Join prose files with gold graphs and write SAMPLES_PATH.

    With per_fund=True (default) the work is delegated to
    `_build_samples_per_fund`. Otherwise one sample is written per trust: the
    whole prose file as input and the whole trust graph as target.
    """
    if not GOLD_PATH.exists():
        log.error("run `gold` first")
        return
    if per_fund:
        return _build_samples_per_fund()

    gold = _load_gold_by_cik()
    n = 0
    tot_text = tot_json = 0
    with open(SAMPLES_PATH, "w", encoding="utf-8") as out:
        for prose_file in sorted(PROSE_DIR.glob("*.txt")):
            cik = prose_file.stem
            g = gold.get(cik)
            if not g:
                continue
            text = prose_file.read_text(encoding="utf-8")
            triples, ents = g["triples"], g["entities"]
            target = serialize_triples(triples, ents)
            schema = ontology_schema(triples, ents)
            sample = {
                "cik": cik,
                "trust_name": g["trust_name"],
                "input_text": text,
                "ontology": schema,
                "target_triples": triples,
                "target_serialized": target,
                "stats": _sample_stats(text, target, triples, ents),
            }
            out.write(json.dumps(sample, ensure_ascii=False) + "\n")
            n += 1
            tot_text += len(text)
            tot_json += len(target)
    log.info("wrote %d samples -> %s", n, SAMPLES_PATH)
    if n:
        print(f"\n{n} samples")
        print(f" mean input : {tot_text//n:>8,} chars")
        print(f" mean target : {tot_json//n:>8,} chars")
        print(f" mean ratio : {tot_text/max(1,tot_json):>8.1f} : 1 (text : json)")


def _build_samples_per_fund():
    """One sample per fund: the fund's prospectus section -> the fund's subgraph.

    For each trust, the prose is segmented into per-fund sections. Each fund's
    target is its own edges (advisedBy, custodian, ...) plus the fund-anchored
    seriesOf edge and the trust-level underwrittenBy edge (shared, but a true
    fact about that fund's trust). Funds whose section cannot be located in the
    prose are skipped and counted; trusts where nothing can be located fall
    back to a single whole-trust sample so no data is silently dropped.
    """
    import statistics as _st

    gold = _load_gold_by_cik()
    n = 0
    n_funds_total = n_funds_located = n_fallback_trusts = 0
    tot_text = tot_json = 0
    ratios = []
    with open(SAMPLES_PATH, "w", encoding="utf-8") as out:
        for prose_file in sorted(PROSE_DIR.glob("*.txt")):
            cik = prose_file.stem
            g = gold.get(cik)
            if not g:
                continue
            text = prose_file.read_text(encoding="utf-8")
            ents = g["entities"]

            # group triples by subject fund IRI; collect trust-level edges
            fund_iris = {e_iri for e_iri, e in ents.items() if e["type"] == "Fund"}
            trust_iri = g["trust_iri"]
            by_fund = defaultdict(list)
            trust_edges = []
            for t in g["triples"]:
                if t["s"] in fund_iris:
                    by_fund[t["s"]].append(t)
                elif t["s"] == trust_iri:
                    trust_edges.append(t)  # e.g. underwrittenBy

            fund_label = {iri: ents[iri]["label"] for iri in fund_iris}
            n_funds_total += len(fund_iris)
            segs = _segment_prose(text, list(fund_label.values()))
            label_to_iri = {v: k for k, v in fund_label.items()}

            if not segs:
                # whole-trust fallback (no section located)
                n_fallback_trusts += 1
                triples = g["triples"]
                target = serialize_triples(triples, ents)
                target_plain = serialize_triples_plain(triples, ents)
                rec = {
                    "sample_id": f"{cik}:ALL",
                    "cik": cik,
                    "trust_name": g["trust_name"],
                    "fund": None,
                    "segmented": False,
                    "input_text": text,
                    "ontology": ontology_schema(triples, ents),
                    "target_triples": triples,
                    "target_serialized": target,
                    "target_serialized_plain": target_plain,
                    "stats": _sample_stats(text, target, triples, ents),
                }
                out.write(json.dumps(rec, ensure_ascii=False) + "\n")
                n += 1
                tot_text += len(text)
                tot_json += len(target)
                ratios.append(rec["stats"]["text_to_json_ratio"])
                continue

            for label, seg in segs.items():
                f_iri = label_to_iri.get(label)
                if not f_iri:
                    continue
                triples = list(by_fund.get(f_iri, [])) + list(trust_edges)
                if not triples:
                    continue
                n_funds_located += 1
                # restrict entities to those referenced by this fund's triples
                ref = {trust_iri}
                for t in triples:
                    ref.add(t["s"])
                    ref.add(t["o"])
                sub_ents = {k: ents[k] for k in ref if k in ents}
                target = serialize_triples(triples, sub_ents)
                target_plain = serialize_triples_plain(triples, sub_ents)
                rec = {
                    "sample_id": f"{cik}:{ents[f_iri].get('series_id') or _slug(label)}",
                    "cik": cik,
                    "trust_name": g["trust_name"],
                    "fund": label,
                    "series_id": ents[f_iri].get("series_id", ""),
                    "segmented": True,
                    "input_text": seg,
                    "ontology": ontology_schema(triples, sub_ents),
                    "target_triples": triples,
                    "target_serialized": target,
                    "target_serialized_plain": target_plain,
                    "stats": _sample_stats(seg, target, triples, sub_ents),
                }
                out.write(json.dumps(rec, ensure_ascii=False) + "\n")
                n += 1
                tot_text += len(seg)
                tot_json += len(target)
                ratios.append(rec["stats"]["text_to_json_ratio"])

    log.info("wrote %d per-fund samples -> %s", n, SAMPLES_PATH)
    if n:
        cov = n_funds_located / max(1, n_funds_total)
        print(f"\n{n} samples ({n_funds_located}/{n_funds_total} funds located, "
              f"coverage {cov:.0%}; {n_fallback_trusts} trusts fell back to whole-doc)")
        print(f" mean input : {tot_text//n:>8,} chars")
        print(f" mean target : {tot_json//n:>8,} chars")
        print(f" median ratio: {_st.median(ratios):>8.1f} : 1 (text : json)")
        print(f" mean ratio : {tot_text/max(1,tot_json):>8.1f} : 1")


# --------------------------------------------------------------------------- #
# stage: split -- trust-level train/val/test split
# --------------------------------------------------------------------------- #
def _bucket(cik: str) -> float:
    """Deterministic [0,1) value per trust CIK (stable, no RNG state)."""
    import hashlib
    h = hashlib.sha256(cik.encode()).hexdigest()
    # 8 hex digits give 0 .. 2**32-1; dividing by 2**32 keeps the result
    # strictly below 1.0 (dividing by 0xFFFFFFFF could return exactly 1.0).
    return int(h[:8], 16) / 0x100000000


def build_split(val_frac: float = 0.10, test_frac: float = 0.10):
    """Split samples.jsonl into train/val/test JSONL, partitioned by TRUST (CIK).

    Splitting by trust (not by fund) prevents leakage: two funds of the same
    trust share advisers, distributors and custodians, so allowing them into
    different splits would let the model memorise trust-specific entities.
    The assignment is a deterministic hash of the CIK, so the split is
    reproducible and stable as new samples are added.
    """
    if not SAMPLES_PATH.exists():
        log.error("run `samples` first")
        return
    with open(SAMPLES_PATH, encoding="utf-8") as f:
        rows = [json.loads(line) for line in f if line.strip()]

    out = {"train": [], "val": [], "test": []}
    for r in rows:
        b = _bucket(r["cik"])
        if b < test_frac:
            split = "test"
        elif b < test_frac + val_frac:
            split = "val"
        else:
            split = "train"
        out[split].append(r)

    for name, recs in out.items():
        p = OUT_DIR / f"{name}.jsonl"
        with open(p, "w", encoding="utf-8") as f:
            for r in recs:
                f.write(json.dumps(r, ensure_ascii=False) + "\n")

    cik_sets = {s: {r["cik"] for r in recs} for s, recs in out.items()}
    print("\nTrust-level split (deterministic by CIK):")
    for s in ("train", "val", "test"):
        print(f" {s:5s}: {len(out[s]):>5,} samples from {len(cik_sets[s]):>4} trusts")
    total = sum(len(v) for v in out.values())
    print(f" total: {total:,} samples -> {OUT_DIR}/{{train,val,test}}.jsonl")

    # leakage check
    overlap = (cik_sets["train"] & cik_sets["val"]) | \
              (cik_sets["train"] & cik_sets["test"]) | \
              (cik_sets["val"] & cik_sets["test"])
    print(f" trust overlap across splits: {len(overlap)} (should be 0)")


# --------------------------------------------------------------------------- #
def main():
    """Command-line entry point: gold | fetch | samples | split | all."""
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
    ap = argparse.ArgumentParser()
    sub = ap.add_subparsers(dest="cmd")

    g = sub.add_parser("gold")
    g.add_argument("--ncen", default="data/ncen/2025q3")
    g.add_argument("--custodian-scope", choices=["primary", "all", "none"],
                   default="primary",
                   help="primary=only IS_SUB_CUSTODIAN!=Y (default); all=every row; none=drop")

    f = sub.add_parser("fetch")
    f.add_argument("--limit", type=int, default=25)
    f.add_argument("--max-filings", type=int, default=8,
                   help="max prospectus filings to fetch & concatenate per trust")

    s = sub.add_parser("samples")
    s.add_argument("--whole-trust", action="store_true",
                   help="one sample per trust instead of per-fund segmentation")

    sp = sub.add_parser("split")
    sp.add_argument("--val-frac", type=float, default=0.10)
    sp.add_argument("--test-frac", type=float, default=0.10)

    a = sub.add_parser("all")
    a.add_argument("--ncen", default="data/ncen/2025q3")
    a.add_argument("--limit", type=int, default=25)
    a.add_argument("--max-filings", type=int, default=8)
    a.add_argument("--custodian-scope", choices=["primary", "all", "none"],
                   default="primary")
    a.add_argument("--whole-trust", action="store_true")
    a.add_argument("--val-frac", type=float, default=0.10)
    a.add_argument("--test-frac", type=float, default=0.10)

    args = ap.parse_args()
    if args.cmd == "gold":
        build_gold(Path(args.ncen), custodian_scope=args.custodian_scope)
    elif args.cmd == "fetch":
        fetch_prose(args.limit, max_filings=args.max_filings)
    elif args.cmd == "samples":
        build_samples(per_fund=not args.whole_trust)
    elif args.cmd == "split":
        build_split(val_frac=args.val_frac, test_frac=args.test_frac)
    elif args.cmd == "all":
        build_gold(Path(args.ncen), custodian_scope=args.custodian_scope)
        fetch_prose(args.limit, max_filings=args.max_filings)
        build_samples(per_fund=not args.whole_trust)
        build_split(val_frac=args.val_frac, test_frac=args.test_frac)
    else:
        ap.print_help()


if __name__ == "__main__":
    main()