# fund_rfid_data/build_rdf_dataset.py

#!/usr/bin/env python3
"""
SEC Fund Prospectus -> RDF Triple dataset builder (proof of concept).

Produces text->triples samples where:
  - INPUT  = the prospectus prose for a fund family (long natural-language text),
  - OUTPUT = a graph of entity->entity RDF triples (NOT flat literal attributes),
  - GOLD   = N-CEN structured filings (service providers) + the prospectus structure.

The graph edges are genuine relationships between entities:
    Fund   seriesOf       Trust
    Fund   advisedBy      InvestmentAdviser
    Fund   subAdvisedBy   SubAdviser
    Fund   transferAgent  TransferAgent
    Fund   custodian      Custodian
    Fund   administrator  Administrator
    Trust  underwrittenBy Distributor
    Fund   tracksIndex    Index           (flag-derived; index name from prose)

Gold for advisedBy/transferAgent/custodian/administrator/underwrittenBy comes
directly from N-CEN (no model). The prospectus prose is fetched from EDGAR and
serves as the model input.

Stages:
  gold     -- build the per-trust gold graph from the local N-CEN flat files
  fetch    -- download prospectus prose (485BPOS/485APOS, falling back to
              497/497K) for the selected N-CEN registrants
  samples  -- join prose + gold into text->triple training samples
  split    -- partition the samples into train/val/test by trust (CIK)
  all      -- run gold, fetch, samples, split in order

Usage:
  python build_rdf_dataset.py gold     --ncen data/ncen/2025q3
  python build_rdf_dataset.py fetch    --limit 25
  python build_rdf_dataset.py samples
  python build_rdf_dataset.py split    --val-frac 0.10 --test-frac 0.10
  python build_rdf_dataset.py all      --limit 25
"""

import argparse
import csv
import gzip
import json
import logging
import re
import sys
import time
import urllib.request
from collections import defaultdict
from pathlib import Path

# Logger shared by every stage of the pipeline.
log = logging.getLogger("rdf")

# Headers attached to every EDGAR request.
HEADERS = {
    "User-Agent": "FundDataResearch/1.0 research@university.edu",
    "Accept-Encoding": "gzip, deflate",
}
# Minimum spacing, in seconds, between SEC requests (well under 10/s limit).
EDGAR_RATE = 0.15
# Wall-clock time of the most recent EDGAR request; 0.0 means "none yet".
_last = 0.0

# Output layout: everything lives under one proof-of-concept directory.
OUT_DIR = Path("data/rdf_poc")
GOLD_PATH = OUT_DIR.joinpath("gold_graphs.jsonl")
PROSE_DIR = OUT_DIR.joinpath("prose")
SAMPLES_PATH = OUT_DIR.joinpath("samples.jsonl")

# RDF marker tokens from the thesis (Section 5.2)
T_START = "<triple_start>"
T_PRED = "<predicate_marker>"
T_OBJ = "<object_marker>"
T_END = "<triple_end>"


# --------------------------------------------------------------------------- #
# helpers
# --------------------------------------------------------------------------- #
def _get(url: str, timeout: int = 120) -> bytes:
    """Download `url` with the module HEADERS, throttled to EDGAR_RATE.

    Sleeps as needed so that consecutive calls are at least EDGAR_RATE seconds
    apart, then returns the response body with any gzip/deflate transfer
    compression removed.

    Args:
        url: absolute URL to fetch.
        timeout: socket timeout in seconds passed to ``urlopen``.

    Returns:
        The (decompressed) response body as bytes.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (e.g. ``URLError``,
        ``HTTPError``); callers are expected to handle these.
    """
    import zlib

    global _last
    wait = EDGAR_RATE - (time.time() - _last)
    if wait > 0:
        time.sleep(wait)
    req = urllib.request.Request(url, headers=HEADERS)
    try:
        # The context manager closes the connection even if read() raises.
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = resp.read()
            encoding = (resp.headers.get("Content-Encoding") or "").strip().lower()
    finally:
        # Stamp failed attempts too, so the request after an error is still
        # spaced out instead of being fired immediately.
        _last = time.time()
    if encoding == "gzip" or data[:2] == b"\x1f\x8b":
        data = gzip.decompress(data)
    elif encoding == "deflate":
        # HEADERS advertises deflate, so decode it. Servers send either a
        # zlib-wrapped stream or a raw deflate stream under this label.
        try:
            data = zlib.decompress(data)
        except zlib.error:
            data = zlib.decompress(data, -zlib.MAX_WBITS)
    return data


def _read_tsv(path: Path) -> list[dict]:
    if not path.exists():
        log.warning("missing %s", path)
        return []
    # csv.field_size_limit because some SEC fields are huge
    csv.field_size_limit(min(sys.maxsize, 2**31 - 1))
    with open(path, "r", encoding="utf-8", errors="replace", newline="") as f:
        return list(csv.DictReader(f, delimiter="\t"))


def _slug(s: str) -> str:
    """Make an IRI-safe local name from an entity string."""
    s = re.sub(r"[^A-Za-z0-9]+", "_", (s or "").strip())
    return s.strip("_") or "x"


def html_to_text(raw: str) -> str:
    """Strip HTML markup from `raw` and return whitespace-normalised text.

    Script and style elements are removed with their content, every remaining
    tag is replaced by a space, and character references are DECODED (not
    blanked) so that names such as "Brown Brothers Harriman &amp; Co." keep
    their ampersand and still match the gold entity labels. Runs of horizontal
    whitespace collapse to one space and whitespace around line breaks is
    collapsed to a single newline.

    Args:
        raw: HTML (or plain text) document.

    Returns:
        The extracted text, stripped of leading/trailing whitespace.
    """
    import html

    raw = re.sub(r"(?is)<script.*?</script>", " ", raw)
    raw = re.sub(r"(?is)<style.*?</style>", " ", raw)
    txt = re.sub(r"(?s)<[^>]+>", " ", raw)
    # Decode after tag stripping so an escaped "&lt;b&gt;" stays literal text.
    txt = html.unescape(txt)
    # Non-breaking and typographic spaces produced by entities such as &nbsp;
    # are folded into the ordinary-space collapse.
    txt = re.sub(r"[ \t\xa0\u2002\u2003\u2009]+", " ", txt)
    txt = re.sub(r"\s*\n\s*", "\n", txt)
    return re.sub(r"\n{3,}", "\n\n", txt).strip()


# --------------------------------------------------------------------------- #
# stage: gold  -- build per-registrant (trust) gold graph from N-CEN
# --------------------------------------------------------------------------- #
def build_gold(ncen_dir: Path, custodian_scope: str = "primary"):
    """Build one gold graph per N-CEN registrant row and write them to GOLD_PATH.

    Reads the N-CEN flat files REGISTRANT, FUND_REPORTED_INFO, ADVISER,
    TRANSFER_AGENT, CUSTODIAN, ADMIN and PRINCIPAL_UNDERWRITER from `ncen_dir`,
    assembles the entity->entity triples of every registrant that has a name,
    and writes one JSON record per line (accession, cik, trust name/IRI, fund
    count, entities, triples). Registrants that end up with no triples are
    skipped. An edge-type histogram and totals are printed at the end.

    Args:
        ncen_dir: directory containing the N-CEN ``*.tsv`` files.
        custodian_scope: "primary" keeps only custodian rows whose
            IS_SUB_CUSTODIAN is not "Y"; "all" keeps every custodian row;
            "none" drops the custodian relation entirely.
    """
    d = Path(ncen_dir)
    registrant = {r["ACCESSION_NUMBER"]: r for r in _read_tsv(d / "REGISTRANT.tsv")}

    # FUND_REPORTED_INFO rows grouped by the filing (accession) they belong to.
    funds_by_acc = defaultdict(list)
    for r in _read_tsv(d / "FUND_REPORTED_INFO.tsv"):
        funds_by_acc[r["ACCESSION_NUMBER"]].append(r)

    # provider tables keyed by FUND_ID
    def by_fund(fname, name_col, lei_col):
        out = defaultdict(list)
        for r in _read_tsv(d / fname):
            fid = r.get("FUND_ID", "")
            nm = (r.get(name_col, "") or "").strip()
            if fid and nm:
                out[fid].append({"name": nm, "lei": (r.get(lei_col, "") or "").strip(),
                                 "affiliated": r.get("IS_AFFILIATED", ""),
                                 "type": r.get("ADVISER_TYPE", ""),
                                 "is_sub_custodian": r.get("IS_SUB_CUSTODIAN", "")})
        return out

    advisers = by_fund("ADVISER.tsv", "ADVISER_NAME", "ADVISER_LEI")
    tagents = by_fund("TRANSFER_AGENT.tsv", "TRANSFERAGENT_NAME", "TRANSFERAGENT_LEI")
    custodians = by_fund("CUSTODIAN.tsv", "CUSTODIAN_NAME", "CUSTODIAN_LEI")
    admins = by_fund("ADMIN.tsv", "ADMIN_NAME", "ADMIN_LEI")

    # Custodian scoping. Foreign sub-custodians (IS_SUB_CUSTODIAN=Y, ~88% of rows)
    # appear ONLY in N-CEN, never in the prospectus prose, and dominate the edge
    # count (66% of all edges). They are therefore unextractable noise for a
    # text->triples task. Default keeps the PRIMARY custodian only.
    #   "primary" -> only IS_SUB_CUSTODIAN != Y  (one prose-grounded edge per fund)
    #   "all"     -> every custodian row (legacy behaviour)
    #   "none"    -> drop the custodian relation entirely
    if custodian_scope == "primary":
        custodians = {fid: [c for c in cs
                            if (c.get("is_sub_custodian", "") or "").upper() != "Y"]
                      for fid, cs in custodians.items()}
    elif custodian_scope == "none":
        custodians = {}

    # underwriter (distributor) keyed by ACCESSION (trust level)
    underwriters = defaultdict(list)
    for r in _read_tsv(d / "PRINCIPAL_UNDERWRITER.tsv"):
        acc = r.get("ACCESSION_NUMBER", "")
        nm = (r.get("UNDERWRITER_NAME", "") or "").strip()
        if acc and nm:
            underwriters[acc].append({"name": nm,
                                      "lei": (r.get("UNDERWRITER_LEI", "") or "").strip()})

    # Providers that map 1:1 onto an entity type and predicate. Advisers are
    # handled separately below because they split into adviser / sub-adviser.
    simple_roles = (
        (tagents, "TransferAgent", "transferAgent"),
        (custodians, "Custodian", "custodian"),
        (admins, "Administrator", "administrator"),
    )

    OUT_DIR.mkdir(parents=True, exist_ok=True)
    n_graphs = 0
    n_edges = 0
    hist = defaultdict(int)  # predicate -> number of written edges
    with open(GOLD_PATH, "w", encoding="utf-8") as out:
        for acc, reg in registrant.items():
            cik = (reg.get("CIK", "") or "").strip().zfill(10)
            trust_name = (reg.get("REGISTRANT_NAME", "") or "").strip()
            if not trust_name:
                continue
            trust_iri = "trust:" + _slug(trust_name)

            triples = []
            entities = {trust_iri: {"type": "Trust", "label": trust_name,
                                    "lei": (reg.get("LEI", "") or "").strip()}}

            # trust-level distributor edges
            for u in underwriters.get(acc, []):
                d_iri = "org:" + _slug(u["name"])
                entities[d_iri] = {"type": "Distributor", "label": u["name"], "lei": u["lei"]}
                triples.append((trust_iri, "underwrittenBy", d_iri))

            acc_funds = funds_by_acc.get(acc, [])
            for fr in acc_funds:
                fid = fr["FUND_ID"]
                fname = (fr.get("FUND_NAME", "") or "").strip()
                if not fname:
                    continue
                f_iri = "fund:" + _slug(fname)
                entities[f_iri] = {"type": "Fund", "label": fname,
                                   "series_id": (fr.get("SERIES_ID", "") or "").strip(),
                                   "lei": (fr.get("LEI", "") or "").strip()}
                triples.append((f_iri, "seriesOf", trust_iri))

                if (fr.get("IS_INDEX", "") or "").upper() == "Y":
                    entities[f_iri]["is_index"] = True
                if (fr.get("IS_ETF", "") or "").upper() == "Y":
                    entities[f_iri]["is_etf"] = True

                # NOTE(review): entities are keyed by the organisation IRI, so an
                # organisation that fills several roles keeps only the type of
                # the LAST role written here -- confirm this is acceptable for
                # the inferred ontology.
                for a in advisers.get(fid, []):
                    o = "org:" + _slug(a["name"])
                    is_sub = (a.get("type", "") or "").lower().startswith("sub")
                    entities[o] = {"type": "SubAdviser" if is_sub else "InvestmentAdviser",
                                   "label": a["name"], "lei": a["lei"]}
                    triples.append((f_iri, "subAdvisedBy" if is_sub else "advisedBy", o))
                for table, ent_type, predicate in simple_roles:
                    for a in table.get(fid, []):
                        o = "org:" + _slug(a["name"])
                        entities[o] = {"type": ent_type, "label": a["name"], "lei": a["lei"]}
                        triples.append((f_iri, predicate, o))

            # dedupe triples
            triples = sorted(set(triples))
            if not triples:
                continue
            rec = {
                "accession": acc, "cik": cik,
                "trust_name": trust_name, "trust_iri": trust_iri,
                "n_funds": len(acc_funds),
                "entities": entities,
                "triples": [{"s": s, "p": p, "o": o} for s, p, o in triples],
            }
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")
            n_graphs += 1
            n_edges += len(triples)
            # Count edge types as they are written instead of re-reading and
            # re-parsing the whole output file afterwards.
            for _, p, _ in triples:
                hist[p] += 1

    log.info("gold: %d trust graphs, %d entity->entity edges -> %s",
             n_graphs, n_edges, GOLD_PATH)
    # quick edge-type histogram
    print("\nEdge-type histogram (entity->entity edges):")
    for p, c in sorted(hist.items(), key=lambda x: -x[1]):
        print(f"  {p:16s} {c:>7,}")
    print(f"\nTotal: {n_graphs:,} trust graphs, {n_edges:,} edges")


# --------------------------------------------------------------------------- #
# stage: fetch  -- prospectus prose for selected registrants
# --------------------------------------------------------------------------- #
# EDGAR form types, in two tiers. Full statutory prospectuses cover ALL funds
# of a book and are strongly preferred; 497/497K supplements are tiny
# amendments used only as a fallback when no full prospectus is available.
FULL_PROSPECTUS_FORMS = (
    "485BPOS",
    "485APOS",
)
SUPPLEMENT_FORMS = (
    "497",
    "497K",
)


def _prospectus_filings(cik: str, max_filings: int):
    """Return up to max_filings recent prospectus filings for a CIK.

    Large fund families split their funds across SEVERAL prospectus books, so the
    single most recent 485BPOS covers only a subset of the trust's funds. We
    therefore collect the most recent FULL prospectuses (485BPOS/485APOS) first,
    newest first, and fall back to 497/497K supplements only if no full
    prospectus exists.

    Args:
        cik: CIK string as it appears in the submissions URL
            (callers in this file pass the 10-digit zero-padded form).
        max_filings: maximum number of filings to return.

    Returns:
        A list of (accession_nodash, primary_doc) tuples.
    """
    url = f"https://data.sec.gov/submissions/CIK{cik}.json"
    feed = json.loads(_get(url).decode("utf-8", "replace"))
    recent = feed.get("filings", {}).get("recent", {})
    # The feed stores filings as parallel arrays. zip() pairs them up and stops
    # at the shortest one, so a ragged feed cannot raise IndexError.
    rows = list(zip(recent.get("form", []),
                    recent.get("accessionNumber", []),
                    recent.get("primaryDocument", [])))

    def collect(form_set):
        seen, out = set(), []
        for form, acc_dashed, doc in rows:  # SEC feed is newest-first
            if form not in form_set or not doc:
                continue
            acc = acc_dashed.replace("-", "")
            if acc not in seen:
                seen.add(acc)
                out.append((acc, doc))
        return out

    chosen = collect(FULL_PROSPECTUS_FORMS)[:max_filings]
    if not chosen:  # fallback: this trust filed only supplements recently
        chosen = collect(SUPPLEMENT_FORMS)[:max_filings]
    return chosen


def fetch_prose(limit: int, max_filings: int = 8):
    """Fetch ALL recent prospectus filings per trust and concatenate per CIK.

    For each selected trust, up to `max_filings` recent prospectus filings are
    downloaded and their extracted text concatenated into data/rdf_poc/prose/
    <cik>.txt, separated by a form-feed marker. This maximises the chance that
    every N-CEN fund of the trust has its prospectus section present, raising
    per-fund segmentation coverage.

    Trusts are visited richest-graph first. A trust whose prose file already
    exists counts towards `limit` without being re-downloaded, so the stage can
    be resumed. Each CIK is considered once even if it appears in several gold
    rows.

    Args:
        limit: number of distinct trusts to end up with prose for.
        max_filings: maximum prospectus filings fetched and joined per trust.
    """
    if not GOLD_PATH.exists():
        log.error("run `gold` first")
        return
    PROSE_DIR.mkdir(parents=True, exist_ok=True)
    with open(GOLD_PATH, encoding="utf-8") as fh:
        rows = [json.loads(line) for line in fh]
    # prefer trusts with the most edges (richer graphs) for the PoC slice
    rows.sort(key=lambda r: -len(r["triples"]))
    done = 0
    tot_filings = 0
    seen_ciks = set()
    for r in rows:
        if done >= limit:
            break
        cik = r["cik"]
        # Gold holds one row per filing, so a CIK can repeat. Without this
        # guard a repeated CIK would count twice towards `limit` (via the
        # existing-file branch) or be queried again after a miss.
        if cik in seen_ciks:
            continue
        seen_ciks.add(cik)
        out = PROSE_DIR / f"{cik}.txt"
        if out.exists():
            done += 1
            continue
        try:
            filings = _prospectus_filings(cik, max_filings)
            if not filings:
                log.info("no prospectus filing for CIK %s (%s)", cik, r["trust_name"])
                continue
            parts = []
            for acc, doc in filings:
                # Best effort per filing: one broken document must not lose
                # the other books of the same trust.
                try:
                    url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{acc}/{doc}"
                    raw = _get(url).decode("utf-8", "replace")
                    txt = html_to_text(raw)
                    if len(txt) >= 2000:
                        parts.append(txt)
                except Exception as e:
                    log.debug("  filing %s failed: %s", acc, e)
            if not parts:
                log.info("no usable prospectus text for CIK %s, skipping", cik)
                continue
            combined = ("\n\n\f\n\n").join(parts)  # form-feed separates books
            out.write_text(combined, encoding="utf-8")
            tot_filings += len(parts)
            log.info("[%d/%d] %s  %d filings  prose=%d chars  %s",
                     done + 1, limit, cik, len(parts), len(combined),
                     r["trust_name"][:40])
            done += 1
        except Exception as e:
            log.warning("fetch failed for CIK %s: %s", cik, e)
    log.info("fetched %d filings across %d trusts -> %s",
             tot_filings, done, PROSE_DIR)


# --------------------------------------------------------------------------- #
# stage: samples  -- join prose + gold into text->triple samples
# --------------------------------------------------------------------------- #
def serialize_triples(triples, entities) -> str:
    """Serialize triples with the thesis's marker tokens, one line per subject.

    Marker form (Models 2/4 with grammar-terminal tokens in the vocabulary).
    Each line opens with the start marker and the subject label, lists every
    predicate/object pair of that subject behind the predicate and object
    markers, and closes with the end marker. Subjects and pairs keep the order
    in which they first occur in `triples`; an IRI without an entity label is
    rendered as the IRI itself.
    """
    def label_of(iri):
        return entities.get(iri, {}).get("label", iri)

    grouped = {}
    for triple in triples:
        grouped.setdefault(triple["s"], []).append((triple["p"], triple["o"]))

    lines = []
    for subject, pairs in grouped.items():
        parts = [f"{T_START} {label_of(subject)}"]
        parts.extend(f"{T_PRED} {pred} {T_OBJ} {label_of(obj)}" for pred, obj in pairs)
        parts.append(T_END)
        lines.append(" ".join(parts))
    return "\n".join(lines)


def serialize_triples_plain(triples, entities) -> str:
    """Serialize triples in a plain Turtle-like form with NO special tokens.

    For Models 1/3 (decoder-only / encoder-decoder without added grammar-terminal
    tokens). One line per subject: the subject label, then its predicates
    separated by ' ; ', each followed by its object labels separated by ' , ',
    and a closing ' .'. Subjects, predicates and objects keep first-occurrence
    order; an IRI without an entity label is rendered as the IRI itself.
    """
    def label_of(iri):
        return entities.get(iri, {}).get("label", iri)

    # subject IRI -> predicate -> object labels, all in first-seen order
    grouped = {}
    for triple in triples:
        per_pred = grouped.setdefault(triple["s"], {})
        per_pred.setdefault(triple["p"], []).append(label_of(triple["o"]))

    lines = []
    for subject, per_pred in grouped.items():
        clauses = [f"{pred} " + " , ".join(labels) for pred, labels in per_pred.items()]
        lines.append(f"{label_of(subject)} " + " ; ".join(clauses) + " .")
    return "\n".join(lines)


def ontology_schema(triples, entities) -> dict:
    """Infer the meta-schema (subject type -> predicate -> object types), per thesis 5.3.

    Entity types are looked up in `entities`; an IRI with no entry or no "type"
    counts as "Thing". Object types are returned as sorted lists.
    """
    def type_of(iri):
        return entities.get(iri, {}).get("type", "Thing")

    collected = {}
    for triple in triples:
        subj_type = type_of(triple["s"])
        obj_type = type_of(triple["o"])
        collected.setdefault(subj_type, {}).setdefault(triple["p"], set()).add(obj_type)

    result = {}
    for subj_type, preds in collected.items():
        result[subj_type] = {pred: sorted(obj_types) for pred, obj_types in preds.items()}
    return result


# --------------------------------------------------------------------------- #
# per-fund segmentation
# --------------------------------------------------------------------------- #
def _name_variants(fund_name: str):
    """Generate prospectus-heading variants of an N-CEN fund name.

    N-CEN names and prospectus headings often differ in the legal-form suffix
    (Fund vs ETF vs Portfolio) and in spacing/punctuation, so we match on a set
    of normalized variants rather than the exact string.
    """
    base = fund_name.strip()
    stems = {base}
    # swap the trailing legal-form word
    for suf in (" Fund", " ETF", " Portfolio", " Trust"):
        if base.endswith(suf):
            root = base[: -len(suf)]
            for alt in (" Fund", " ETF", " Portfolio", ""):
                stems.add(root + alt)
            stems.add(root)
            break
    # also the bare root with no suffix
    return {s.strip() for s in stems if len(s.strip()) >= 6}


# A fund section opens with the fund name immediately followed by a summary
# heading. Filers use several styles right after the name, so we accept any of:
#   - "Fund Summary" / "Summary Section" (mutual funds)
#   - "Investment Objective" / "Principal Investment Strateg" (objective heading;
#     the truncated "Strateg" covers both "Strategy" and "Strategies")
#   - "The Fund seeks ..." (ETF objective sentence)
#   - "Class/Ticker", "Ticker:" / "Tickers:" or "(formerly ..." (ETF &
#     multi-class summary headers, e.g. JPMorgan, Invesco ETF)
# The pattern is case-insensitive and tolerates up to five whitespace characters
# between the fund name and the anchor. It is meant to be applied with .match()
# to the text that directly follows a fund-name occurrence.
# The MIN_SEGMENT_CHARS guard below is the real safety net: any anchor that
# resolves to a collapsed (too-short) segment is discarded, so a slightly broad
# anchor set cannot reintroduce the spurious-cross-reference problem.
_SECTION_ANCHOR = re.compile(
    r"\s{0,5}("
    r"Fund Summary|Summary Section|Investment Objective|Principal Investment Strateg"
    r"|The Fund seeks|Class/Ticker|Ticker(s)?:|\(formerly"
    r")", re.I)
# Segments shorter than this many characters are treated as spurious matches.
MIN_SEGMENT_CHARS = 1500  # a real fund summary is at least this long


def _heading_positions(text: str, fund_names):
    """Return the sorted, de-duplicated (offset, fund_name) heading candidates.

    A candidate is a position where a fund name variant is immediately followed
    (within the next 45 characters) by a section anchor. A fund may have several
    candidates (its book may be concatenated more than once, or it appears in
    multiple books); all distinct positions are kept so segmentation can choose
    the one that yields a non-trivial segment.

    Args:
        text: the concatenated prospectus prose.
        fund_names: iterable of N-CEN fund names to look for.

    Returns:
        A list of unique (offset, fund_name) tuples sorted by offset, then name.
    """
    # A set, because two variants of one name (e.g. "X" and "X Fund") can both
    # be anchored at the same offset and would otherwise be reported twice.
    found = set()
    for fn in fund_names:
        for variant in _name_variants(fn):
            for m in re.finditer(re.escape(variant), text):
                if _SECTION_ANCHOR.match(text[m.end():m.end() + 45]):
                    found.add((m.start(), fn))
    return sorted(found)


def _segment_prose(text: str, fund_names):
    """Cut the prose into per-fund segments at anchored section headings.

    Algorithm:
      1. gather every anchored heading candidate in the (possibly multi-book)
         text, sorted by offset;
      2. let each candidate's segment run up to the NEXT candidate in that
         sorted list (of any fund), or to the end of the text for the last one;
      3. per fund, keep the longest segment among those of at least
         MIN_SEGMENT_CHARS characters; shorter (collapsed) ones are ignored.
    Returns {fund_name: segment_text} for funds with a usable section.
    """
    headings = _heading_positions(text, fund_names)
    if not headings:
        return {}
    # Stop offset of candidate i is the start offset of candidate i + 1.
    # NOTE(review): two candidates sharing one offset give the first of them an
    # empty segment -- confirm that is the intended tie-break.
    stops = [start for start, _ in headings[1:]] + [len(text)]
    longest = {}  # fund name -> longest qualifying segment seen so far
    for (start, fund), stop in zip(headings, stops):
        chunk = text[start:stop]
        if len(chunk) < MIN_SEGMENT_CHARS:
            continue
        previous = longest.get(fund)
        if previous is None or len(chunk) > len(previous):
            longest[fund] = chunk
    return dict(longest)


def build_samples(per_fund: bool = True):
    """Join fetched prose with the gold graphs and write SAMPLES_PATH.

    With `per_fund` true (the default) the work is delegated to
    `_build_samples_per_fund`. Otherwise one sample per trust is written: the
    whole prose file as input and the trust's full graph as target. Prose files
    whose CIK has no gold record are skipped. A short summary is printed.

    Whole-trust records carry the same keys as the whole-trust fallback records
    of the per-fund builder (sample_id, fund=None, segmented=False and both
    serializations), so downstream code sees a single schema.

    Args:
        per_fund: segment prose per fund (True) or emit one sample per trust.
    """
    if not GOLD_PATH.exists():
        log.error("run `gold` first")
        return
    if per_fund:
        return _build_samples_per_fund()
    # Parse each gold line once; a later record for the same CIK replaces an
    # earlier one.
    gold = {}
    with open(GOLD_PATH, encoding="utf-8") as fh:
        for line in fh:
            rec = json.loads(line)
            gold[rec["cik"]] = rec
    n = 0
    tot_text = tot_json = 0
    with open(SAMPLES_PATH, "w", encoding="utf-8") as out:
        for prose_file in sorted(PROSE_DIR.glob("*.txt")):
            cik = prose_file.stem
            g = gold.get(cik)
            if not g:
                continue
            text = prose_file.read_text(encoding="utf-8")
            triples, ents = g["triples"], g["entities"]
            target = serialize_triples(triples, ents)
            target_plain = serialize_triples_plain(triples, ents)
            schema = ontology_schema(triples, ents)
            sample = {
                "sample_id": f"{cik}:ALL",
                "cik": cik,
                "trust_name": g["trust_name"],
                "fund": None,
                "segmented": False,
                "input_text": text,
                "ontology": schema,
                "target_triples": triples,
                "target_serialized": target,
                "target_serialized_plain": target_plain,
                "stats": {
                    "input_chars": len(text),
                    "target_chars": len(target),
                    "n_triples": len(triples),
                    "n_entities": len(ents),
                    "text_to_json_ratio": round(len(text) / max(1, len(target)), 1),
                },
            }
            out.write(json.dumps(sample, ensure_ascii=False) + "\n")
            n += 1
            tot_text += len(text)
            tot_json += len(target)
    log.info("wrote %d samples -> %s", n, SAMPLES_PATH)
    if n:
        print(f"\n{n} samples")
        print(f"  mean input  : {tot_text//n:>8,} chars")
        print(f"  mean target : {tot_json//n:>8,} chars")
        print(f"  mean ratio  : {tot_text/max(1,tot_json):>8.1f} : 1  (text : json)")


def _build_samples_per_fund():
    """One sample per fund: the fund's prospectus section -> the fund's subgraph.

    For each trust, the prose is segmented into per-fund sections. Each fund's
    target is its own edges (advisedBy, custodian, ...) plus the fund-anchored
    seriesOf edge and the trust-level underwrittenBy edge (shared, but a true
    fact about that fund's trust). Funds whose section cannot be located in the
    prose are skipped (they show up in the printed coverage figure); trusts
    where nothing can be located fall back to a single whole-trust sample so no
    data is silently dropped.

    Reads GOLD_PATH and the prose files under PROSE_DIR, writes SAMPLES_PATH and
    prints a summary (sample count, coverage, mean sizes, median/mean ratio).
    """
    import statistics

    # Parse each gold line once; a later record for the same CIK replaces an
    # earlier one.
    gold = {}
    with open(GOLD_PATH, encoding="utf-8") as fh:
        for line in fh:
            g_rec = json.loads(line)
            gold[g_rec["cik"]] = g_rec
    n = 0
    n_funds_total = n_funds_located = n_fallback_trusts = 0
    tot_text = tot_json = 0
    ratios = []
    with open(SAMPLES_PATH, "w", encoding="utf-8") as out:
        for prose_file in sorted(PROSE_DIR.glob("*.txt")):
            cik = prose_file.stem
            g = gold.get(cik)
            if not g:
                continue
            text = prose_file.read_text(encoding="utf-8")
            ents = g["entities"]
            # group triples by subject fund IRI; collect trust-level edges
            fund_iris = {e_iri for e_iri, e in ents.items() if e["type"] == "Fund"}
            trust_iri = g["trust_iri"]
            by_fund = defaultdict(list)
            trust_edges = []
            for t in g["triples"]:
                if t["s"] in fund_iris:
                    by_fund[t["s"]].append(t)
                elif t["s"] == trust_iri:
                    trust_edges.append(t)  # e.g. underwrittenBy

            fund_label = {iri: ents[iri]["label"] for iri in fund_iris}
            n_funds_total += len(fund_iris)
            segs = _segment_prose(text, list(fund_label.values()))
            label_to_iri = {v: k for k, v in fund_label.items()}

            if not segs:  # whole-trust fallback (no section located)
                n_fallback_trusts += 1
                triples = g["triples"]
                target = serialize_triples(triples, ents)
                target_plain = serialize_triples_plain(triples, ents)
                rec = {
                    "sample_id": f"{cik}:ALL", "cik": cik, "trust_name": g["trust_name"],
                    "fund": None, "segmented": False,
                    "input_text": text, "ontology": ontology_schema(triples, ents),
                    "target_triples": triples,
                    "target_serialized": target,
                    "target_serialized_plain": target_plain,
                    "stats": {"input_chars": len(text), "target_chars": len(target),
                              "n_triples": len(triples), "n_entities": len(ents),
                              "text_to_json_ratio": round(len(text) / max(1, len(target)), 1)},
                }
                out.write(json.dumps(rec, ensure_ascii=False) + "\n")
                n += 1
                tot_text += len(text)
                tot_json += len(target)
                ratios.append(rec["stats"]["text_to_json_ratio"])
                continue

            for label, seg in segs.items():
                f_iri = label_to_iri.get(label)
                if not f_iri:
                    continue
                triples = list(by_fund.get(f_iri, [])) + list(trust_edges)
                if not triples:
                    continue
                n_funds_located += 1
                # restrict entities to those referenced by this fund's triples
                ref = {trust_iri}
                for t in triples:
                    ref.add(t["s"])
                    ref.add(t["o"])
                sub_ents = {k: ents[k] for k in ref if k in ents}
                target = serialize_triples(triples, sub_ents)
                target_plain = serialize_triples_plain(triples, sub_ents)
                rec = {
                    "sample_id": f"{cik}:{ents[f_iri].get('series_id') or _slug(label)}",
                    "cik": cik, "trust_name": g["trust_name"],
                    "fund": label, "series_id": ents[f_iri].get("series_id", ""),
                    "segmented": True,
                    "input_text": seg, "ontology": ontology_schema(triples, sub_ents),
                    "target_triples": triples,
                    "target_serialized": target,
                    "target_serialized_plain": target_plain,
                    "stats": {"input_chars": len(seg), "target_chars": len(target),
                              "n_triples": len(triples), "n_entities": len(sub_ents),
                              "text_to_json_ratio": round(len(seg) / max(1, len(target)), 1)},
                }
                out.write(json.dumps(rec, ensure_ascii=False) + "\n")
                n += 1
                tot_text += len(seg)
                tot_json += len(target)
                ratios.append(rec["stats"]["text_to_json_ratio"])

    log.info("wrote %d per-fund samples -> %s", n, SAMPLES_PATH)
    if n:
        cov = n_funds_located / max(1, n_funds_total)
        print(f"\n{n} samples ({n_funds_located}/{n_funds_total} funds located, "
              f"coverage {cov:.0%}; {n_fallback_trusts} trusts fell back to whole-doc)")
        print(f"  mean input  : {tot_text//n:>8,} chars")
        print(f"  mean target : {tot_json//n:>8,} chars")
        print(f"  median ratio: {statistics.median(ratios):>8.1f} : 1  (text : json)")
        print(f"  mean ratio  : {tot_text/max(1,tot_json):>8.1f} : 1")


# --------------------------------------------------------------------------- #
# stage: split  -- trust-level train/val/test split
# --------------------------------------------------------------------------- #
def _bucket(cik: str) -> float:
    """Deterministic [0,1) value per trust CIK (stable, no RNG state)."""
    import hashlib
    h = hashlib.sha256(cik.encode()).hexdigest()
    return int(h[:8], 16) / 0xFFFFFFFF


def build_split(val_frac: float = 0.10, test_frac: float = 0.10):
    """Split samples.jsonl into train/val/test JSONL, partitioned by TRUST (CIK).

    Splitting by trust (not by fund) prevents leakage: two funds of the same
    trust share advisers, distributors and custodians, so allowing them into
    different splits would let the model memorise trust-specific entities. The
    assignment is a deterministic hash of the CIK, so the split is reproducible
    and stable as new samples are added.

    Writes OUT_DIR/{train,val,test}.jsonl and prints per-split counts plus a
    trust-overlap check.

    Args:
        val_frac: share of the hash range assigned to the validation split.
        test_frac: share of the hash range assigned to the test split.

    Raises:
        ValueError: if a fraction is negative or the two fractions exceed 1.
    """
    if val_frac < 0 or test_frac < 0 or val_frac + test_frac > 1:
        # Out-of-range fractions would silently send everything to one split.
        raise ValueError(
            f"val_frac and test_frac must be >= 0 and sum to at most 1 "
            f"(got {val_frac}, {test_frac})")
    if not SAMPLES_PATH.exists():
        log.error("run `samples` first")
        return
    out = {"train": [], "val": [], "test": []}
    with open(SAMPLES_PATH, encoding="utf-8") as fh:
        for line in fh:
            r = json.loads(line)
            b = _bucket(r["cik"])
            if b < test_frac:
                split = "test"
            elif b < test_frac + val_frac:
                split = "val"
            else:
                split = "train"
            out[split].append(r)
    for name, recs in out.items():
        p = OUT_DIR / f"{name}.jsonl"
        with open(p, "w", encoding="utf-8") as f:
            for r in recs:
                f.write(json.dumps(r, ensure_ascii=False) + "\n")
    # Distinct trusts per split, used for both the report and the leakage check.
    cik_sets = {s: {r["cik"] for r in recs} for s, recs in out.items()}
    print("\nTrust-level split (deterministic by CIK):")
    for s in ("train", "val", "test"):
        print(f"  {s:5s}: {len(out[s]):>5,} samples  from {len(cik_sets[s]):>4} trusts")
    total = sum(len(v) for v in out.values())
    print(f"  total: {total:,} samples -> {OUT_DIR}/{{train,val,test}}.jsonl")
    # leakage check
    overlap = (cik_sets["train"] & cik_sets["val"]) | \
              (cik_sets["train"] & cik_sets["test"]) | \
              (cik_sets["val"] & cik_sets["test"])
    print(f"  trust overlap across splits: {len(overlap)} (should be 0)")


# --------------------------------------------------------------------------- #
def main():
    """Command-line entry point: parse the sub-command and run its stage(s).

    Sub-commands are gold, fetch, samples, split and all (which runs the four
    stages in that order). Without a sub-command the help text is printed.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
    ap = argparse.ArgumentParser()
    sub = ap.add_subparsers(dest="cmd")

    # gold
    gold_p = sub.add_parser("gold")
    gold_p.add_argument("--ncen", default="data/ncen/2025q3")
    gold_p.add_argument(
        "--custodian-scope", choices=["primary", "all", "none"],
        default="primary",
        help="primary=only IS_SUB_CUSTODIAN!=Y (default); all=every row; none=drop")

    # fetch
    fetch_p = sub.add_parser("fetch")
    fetch_p.add_argument("--limit", type=int, default=25)
    fetch_p.add_argument(
        "--max-filings", type=int, default=8,
        help="max prospectus filings to fetch & concatenate per trust")

    # samples
    samples_p = sub.add_parser("samples")
    samples_p.add_argument(
        "--whole-trust", action="store_true",
        help="one sample per trust instead of per-fund segmentation")

    # split
    split_p = sub.add_parser("split")
    split_p.add_argument("--val-frac", type=float, default=0.10)
    split_p.add_argument("--test-frac", type=float, default=0.10)

    # all: the union of the options above
    all_p = sub.add_parser("all")
    all_p.add_argument("--ncen", default="data/ncen/2025q3")
    all_p.add_argument("--limit", type=int, default=25)
    all_p.add_argument("--max-filings", type=int, default=8)
    all_p.add_argument("--custodian-scope", choices=["primary", "all", "none"], default="primary")
    all_p.add_argument("--whole-trust", action="store_true")
    all_p.add_argument("--val-frac", type=float, default=0.10)
    all_p.add_argument("--test-frac", type=float, default=0.10)

    args = ap.parse_args()
    cmd = args.cmd

    if cmd not in ("gold", "fetch", "samples", "split", "all"):
        ap.print_help()
        return

    # "all" switches every stage on; the stages always run in pipeline order.
    everything = cmd == "all"
    if everything or cmd == "gold":
        build_gold(Path(args.ncen), custodian_scope=args.custodian_scope)
    if everything or cmd == "fetch":
        fetch_prose(args.limit, max_filings=args.max_filings)
    if everything or cmd == "samples":
        build_samples(per_fund=not args.whole_trust)
    if everything or cmd == "split":
        build_split(val_frac=args.val_frac, test_frac=args.test_frac)


if __name__ == "__main__":
    main()