fund_rfid_data/sec_fund_fetcher.py
Florian Herzog 1993658fb2 Add SEC fund prospectus -> RDF triple dataset pipeline
Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.

- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
  all books per trust), samples (per-fund segmentation, marker + plain
  serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 10:31:35 +02:00

538 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
SEC Fund Prospectus & Reference Data Fetcher
Fetches prospectuses (485BPOS, 497, 497K), supplements, and amendments
for US SEC-registered investment funds from EDGAR. Pairs the legal
documents with structured reference data (series/class identifiers,
tickers; a CUSIP field exists in the schema but is not populated from
the series/class CSV) to build an LLM training dataset.
Data sources:
1. EDGAR Submissions API — filing history per CIK
2. EDGAR Full-Text Search — search filings by form type
3. EDGAR Archives — download actual filing documents
4. SEC Series/Class CSV — fund & share-class reference data
5. XBRL Risk/Return Datasets — structured prospectus extracts
"""
import csv
import io
import json
import logging
import os
import re
import time
import zipfile
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Optional
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Configure root logging at import time: INFO and above, timestamped lines.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
# Logger used by every function in this module.
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Hosts and endpoints used by the fetcher.
SEC_BASE = "https://www.sec.gov"
DATA_SEC = "https://data.sec.gov"
EFTS_SEC = "https://efts.sec.gov/LATEST/search-index"
ARCHIVES = SEC_BASE + "/Archives/edgar/data"

# Location of the series/class reference CSV (2025 edition).
SERIES_CLASS_CSV_URL = (
    f"{SEC_BASE}/files/investment/data/other/"
    "investment-company-series-class-information/"
    "investment-company-series-class-2025.csv"
)

# Form types treated as prospectus-related when scanning a filing history.
PROSPECTUS_FORM_TYPES = {"N-1A", "485APOS", "485BPOS", "497", "497K"}

# Headers attached to every outgoing request.
USER_AGENT = "SECFundFetcher/1.0 research@university.edu"
HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"}

# Minimum spacing between requests, in seconds:
# ~8 req/s to stay under SEC's 10/s limit.
REQUEST_INTERVAL = 0.12
# ---------------------------------------------------------------------------
# Data classes
# ---------------------------------------------------------------------------
@dataclass
class ShareClass:
    """One share class belonging to a fund series."""

    # SEC class identifier, formatted C######.
    class_id: str
    # Class name as given in the reference data.
    class_name: str
    # Ticker symbol; defaults to the empty string.
    ticker: str = ""
    # CUSIP; defaults to the empty string.
    cusip: str = ""
@dataclass
class FundSeries:
    """One fund series together with the share classes it offers."""

    # SEC series identifier, formatted S######.
    series_id: str
    # Series name as given in the reference data.
    series_name: str
    # Share classes of this series; a fresh empty list per instance.
    classes: list[ShareClass] = field(default_factory=list)
@dataclass
class FundTrust:
    """A registrant (trust) and the fund series registered under it."""

    # Central Index Key: 10-digit, zero-padded.
    cik: str
    # Registrant name.
    trust_name: str
    # Reporting file number; defaults to the empty string.
    file_number: str = ""
    # Series under this trust; a fresh empty list per instance.
    series: list[FundSeries] = field(default_factory=list)
@dataclass
class Filing:
    """Metadata for one EDGAR filing plus, optionally, its extracted text."""

    # Accession number as reported by the submissions API (with dashes).
    accession_number: str
    # Form type, e.g. "485BPOS" or "497K".
    form_type: str
    # Filing date string as reported by the submissions API.
    filing_date: str
    # File name of the filing's primary document.
    primary_document: str
    # Primary-document description; may be empty.
    description: str = ""
    # Full archive URL of the primary document; empty when unknown.
    document_url: str = ""
    # Plain text extracted from the document; empty until downloaded.
    text_content: str = ""
@dataclass
class FundDataset:
    """One record in the final dataset: trust + series + classes + filings."""

    # The registrant, including its nested series and share classes.
    trust: FundTrust
    # Prospectus-type filings selected for this trust.
    prospectus_filings: list[Filing] = field(default_factory=list)
    # Supplement-type filings selected for this trust.
    supplement_filings: list[Filing] = field(default_factory=list)
# ---------------------------------------------------------------------------
# SEC API helpers
# ---------------------------------------------------------------------------
# Timestamp of the most recent request, on the time.monotonic() clock.
_last_request_time = 0.0


def _throttled_get(url: str, **kwargs) -> requests.Response:
    """GET ``url`` with client-side rate limiting and the module's headers.

    Sleeps as needed so that consecutive calls are at least
    ``REQUEST_INTERVAL`` seconds apart, then issues ``requests.get``.

    Args:
        url: Address to fetch.
        **kwargs: Forwarded to ``requests.get``. Any ``headers`` given are
            merged with ``HEADERS`` (``HEADERS`` wins on conflicting keys);
            ``timeout`` defaults to 30 seconds when not supplied.

    Returns:
        The response, after ``raise_for_status()`` has passed.

    Raises:
        requests.HTTPError: If the response status indicates an error.
    """
    global _last_request_time

    # Use the monotonic clock: a wall-clock adjustment must never produce a
    # negative "elapsed" value and, with it, an over-long sleep.
    elapsed = time.monotonic() - _last_request_time
    if elapsed < REQUEST_INTERVAL:
        time.sleep(REQUEST_INTERVAL - elapsed)

    # Build a fresh dict instead of updating the caller's in place; this
    # also tolerates an explicit ``headers=None``.
    kwargs["headers"] = {**(kwargs.get("headers") or {}), **HEADERS}
    kwargs.setdefault("timeout", 30)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Record the attempt even when it raised, so a failing request
        # still counts towards the rate limit.
        _last_request_time = time.monotonic()

    if resp.status_code == 403:
        log.warning("403 Forbidden for %s — check User-Agent header", url)
    resp.raise_for_status()
    return resp
# ---------------------------------------------------------------------------
# 1. Load the SEC Series/Class reference data (CSV)
# ---------------------------------------------------------------------------
def load_series_class_reference(csv_url: str = SERIES_CLASS_CSV_URL) -> dict[str, FundTrust]:
    """Download the SEC Investment Company Series and Class CSV.

    Actual CSV columns (2025):
        Reporting File Number, CIK Number, Entity Name, Entity Org Type,
        Series ID, Series Name, Class ID, Class Name, Class Ticker,
        Address_1, Address_2, City, State, Zip Code

    Rows without a usable CIK are skipped. Rows without a Series ID still
    register the trust but add no series. Repeated series or class IDs
    within a trust are added only once (first occurrence wins).

    Args:
        csv_url: Location of the CSV file.

    Returns:
        A dict keyed by 10-digit zero-padded CIK -> FundTrust with nested
        series and share classes.

    Raises:
        requests.HTTPError: Propagated from the download when the server
            answers with an error status.
    """

    def cell(row: dict, column: str) -> str:
        # DictReader fills columns missing from a short row with None, so
        # a plain ``row.get(column, "")`` is not enough before ``strip()``.
        return (row.get(column) or "").strip()

    log.info("Downloading series/class reference CSV …")
    resp = _throttled_get(csv_url)
    # "utf-8-sig" decodes plain UTF-8 and additionally drops a leading BOM,
    # which would otherwise become part of the first column name.
    resp.encoding = "utf-8-sig"
    reader = csv.DictReader(io.StringIO(resp.text))

    trusts: dict[str, FundTrust] = {}
    # Lookup tables kept alongside the result so each row is handled in
    # constant time instead of rescanning the trust's series and classes.
    series_index: dict[tuple[str, str], FundSeries] = {}
    seen_classes: set[tuple[str, str, str]] = set()

    for row in reader:
        cik = cell(row, "CIK Number").zfill(10)
        if cik == "0" * 10:
            # Covers both a missing CIK and a literal zero.
            continue

        trust = trusts.get(cik)
        if trust is None:
            trust = FundTrust(
                cik=cik,
                trust_name=cell(row, "Entity Name"),
                file_number=cell(row, "Reporting File Number"),
            )
            trusts[cik] = trust

        series_id = cell(row, "Series ID")
        if not series_id:
            continue

        series = series_index.get((cik, series_id))
        if series is None:
            series = FundSeries(
                series_id=series_id,
                series_name=cell(row, "Series Name"),
            )
            trust.series.append(series)
            series_index[(cik, series_id)] = series

        class_id = cell(row, "Class ID")
        class_key = (cik, series_id, class_id)
        if class_id and class_key not in seen_classes:
            seen_classes.add(class_key)
            series.classes.append(ShareClass(
                class_id=class_id,
                class_name=cell(row, "Class Name"),
                ticker=cell(row, "Class Ticker"),
            ))

    log.info("Loaded %d investment company trusts from CSV", len(trusts))
    return trusts
# ---------------------------------------------------------------------------
# 2. Fetch filing history for a CIK via the Submissions API
# ---------------------------------------------------------------------------
def fetch_submissions(cik: str) -> dict:
    """Fetch data.sec.gov/submissions/CIK{cik}.json and return it decoded.

    Args:
        cik: Central Index Key; zero-padded to 10 digits before use.

    Returns:
        The parsed JSON body of the submissions document.
    """
    padded = cik.zfill(10)
    endpoint = f"{DATA_SEC}/submissions/CIK{padded}.json"
    log.info("Fetching submissions for CIK %s", padded)
    return _throttled_get(endpoint).json()
def extract_filings(submissions_json: dict, form_types: set[str]) -> list[Filing]:
    """Extract filings of the given form types from a submissions document.

    Reads the parallel arrays under ``filings.recent`` (accessionNumber,
    form, filingDate, primaryDocument, primaryDocDescription) and builds
    one ``Filing`` per entry whose form type is in ``form_types``, in the
    order the entries appear.

    Columns that are missing or shorter than ``accessionNumber`` are read
    as empty strings instead of raising ``IndexError``. A filing without a
    primary document gets an empty ``document_url`` rather than a URL that
    points at the accession directory.

    Args:
        submissions_json: Parsed submissions document.
        form_types: Form types to keep.

    Returns:
        The matching filings; empty when none match.
    """
    cik = str(submissions_json.get("cik", "")).zfill(10)
    recent = submissions_json.get("filings", {}).get("recent", {})

    def column(name: str) -> list:
        return recent.get(name) or []

    def at(values: list, index: int) -> str:
        # Tolerate ragged columns and null entries.
        if index < len(values):
            return values[index] or ""
        return ""

    accessions = column("accessionNumber")
    forms = column("form")
    dates = column("filingDate")
    docs = column("primaryDocument")
    descs = column("primaryDocDescription")

    filings = []
    for i, accession in enumerate(accessions):
        form = at(forms, i)
        if form not in form_types:
            continue

        primary_doc = at(docs, i)
        if primary_doc:
            # Archive paths use the CIK without leading zeros and the
            # accession number without dashes.
            folder = accession.replace("-", "")
            doc_url = f"{ARCHIVES}/{int(cik)}/{folder}/{primary_doc}"
        else:
            doc_url = ""

        filings.append(Filing(
            accession_number=accession,
            form_type=form,
            filing_date=at(dates, i),
            primary_document=primary_doc,
            description=at(descs, i),
            document_url=doc_url,
        ))
    return filings
# ---------------------------------------------------------------------------
# 3. Download and parse a filing document (HTML/XML → text)
# ---------------------------------------------------------------------------
def download_filing_text(filing: Filing, max_bytes: int = 5_000_000) -> str:
    """Download a filing document and extract plain text from HTML/XML.

    The body is streamed and reading stops once ``max_bytes`` have been
    received, so an oversized document is not fetched in full. The
    response is always closed, including on the PDF early return.

    Args:
        filing: Filing whose ``document_url`` is fetched.
        max_bytes: Upper bound on the number of body bytes that are parsed.

    Returns:
        The extracted text; a fixed placeholder string when the server
        reports a PDF; the empty string when the filing has no URL or when
        downloading/parsing fails (the failure is logged as a warning).
    """
    if not filing.document_url:
        return ""
    try:
        resp = _throttled_get(filing.document_url, stream=True)
        try:
            content_type = resp.headers.get("Content-Type", "")
            if "pdf" in content_type.lower():
                log.info("Skipping PDF document: %s", filing.document_url)
                return "[PDF — binary content not extracted]"

            # Pull the body in chunks and stop at the cap; ``resp.content``
            # would download everything before the slice is applied.
            buffer = bytearray()
            for chunk in resp.iter_content(chunk_size=65536):
                buffer += chunk
                if len(buffer) >= max_bytes:
                    break
            raw = bytes(buffer[:max_bytes])
        finally:
            # With stream=True the connection stays checked out until the
            # body is fully consumed or the response is closed.
            resp.close()

        # errors="replace" also covers a multi-byte character cut in half
        # by the byte cap.
        text = raw.decode("utf-8", errors="replace")
        soup = BeautifulSoup(text, "lxml")
        for tag in soup(["script", "style", "meta", "link"]):
            tag.decompose()
        plain = soup.get_text(separator="\n", strip=True)
        plain = re.sub(r"\n{3,}", "\n\n", plain)
        return plain
    except Exception as e:
        # Deliberately best-effort: one bad document must not abort a run.
        log.warning("Failed to download %s: %s", filing.document_url, e)
        return ""
# ---------------------------------------------------------------------------
# 4. Search EDGAR full-text index for filings
# ---------------------------------------------------------------------------
def search_filings(
    query: str = "",
    forms: str = "485BPOS,497,497K",
    start_date: str = "2024-01-01",
    end_date: str = "2025-12-31",
    cik: str = "",
    max_results: int = 20,
) -> list[dict]:
    """Use the EDGAR Full-Text Search API (efts.sec.gov) to find filings.

    Args:
        query: Search expression; ``*`` is sent when empty.
        forms: Comma-separated form types, sent as the ``forms`` parameter.
        start_date: Sent as ``startdt``.
        end_date: Sent as ``enddt``.
        cik: When given, folded into the query: the quoted CIK alone if
            ``query`` is empty, otherwise ``"<query> AND <cik>"``.
        max_results: Sent as the ``size`` parameter.

    Returns:
        The ``_source`` dict of every hit in the response (an empty dict
        for a hit that lacks ``_source``).

    Raises:
        requests.HTTPError: Propagated when the server answers with an
            error status.
    """
    params: dict = {
        "q": query or "*",
        "forms": forms,
        "dateRange": "custom",
        "startdt": start_date,
        "enddt": end_date,
        "from": 0,
        "size": max_results,
    }
    if cik:
        params["q"] = f'"{cik}"' if not query else f"{query} AND {cik}"

    # Separate the two dates; without a separator they run together
    # (e.g. "2024-01-012025-12-31") and the log line is unreadable.
    log.info(
        "EFTS search: forms=%s, date=%s..%s, q=%s",
        forms, start_date, end_date, params["q"],
    )
    resp = _throttled_get(EFTS_SEC, params=params)
    data = resp.json()
    hits = data.get("hits", {}).get("hits", [])
    log.info("EFTS returned %d hits", len(hits))
    return [h.get("_source", {}) for h in hits]
# ---------------------------------------------------------------------------
# 5. Build the full dataset for a list of CIKs
# ---------------------------------------------------------------------------
# Default registrants processed when no CIKs are supplied: (CIK, label).
EXAMPLE_FUNDS = [
    {"cik": cik, "name": name}
    for cik, name in (
        ("0000036405", "Vanguard Index Funds (Vanguard 500 Index Fund, VOO, VFIAX)"),
        ("0000024238", "Fidelity Contrafund (FCNTX, FCNKX)"),
        ("0001100663", "iShares Trust (IVV, iShares Core S&P 500 ETF)"),
        ("0000773757", "Columbia Funds Series Trust I"),
        ("0001795351", "T. Rowe Price Exchange-Traded Funds, Inc."),
    )
]
def build_dataset(
    ciks: list[str],
    output_dir: str = "dataset",
    max_prospectus_filings: int = 5,
    max_supplement_filings: int = 5,
    download_text: bool = True,
    reference_data: Optional[dict[str, FundTrust]] = None,
) -> list[FundDataset]:
    """Build and persist one dataset record per CIK.

    For each CIK:
      1. Merge reference data (series/class info)
      2. Fetch filing history from Submissions API
      3. Extract prospectus & supplement filings
      4. Optionally download filing text
      5. Save to ``<output_dir>/<cik>.json``

    Afterwards ``<output_dir>/manifest.json`` is written with one summary
    entry per processed CIK.

    A CIK whose submissions cannot be fetched or decoded (HTTP error,
    timeout, connection failure, invalid JSON) is logged and skipped, so
    one failure does not abort the remaining CIKs.

    Args:
        ciks: CIKs to process; each is zero-padded to 10 digits.
        output_dir: Directory for the JSON output; created when missing.
        max_prospectus_filings: Keep at most this many 485BPOS/485APOS/N-1A
            filings per CIK.
        max_supplement_filings: Keep at most this many 497/497K filings per
            CIK.
        download_text: Whether to download and attach the filing text.
        reference_data: Optional mapping of padded CIK -> FundTrust.

    Returns:
        The records that were built, in processing order.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    datasets: list[FundDataset] = []

    for cik in tqdm(ciks, desc="Processing funds"):
        cik_padded = cik.zfill(10)

        # Resolve reference data
        if reference_data and cik_padded in reference_data:
            trust = reference_data[cik_padded]
        else:
            trust = FundTrust(cik=cik_padded, trust_name="(unknown — CSV not loaded)")

        # Fetch submissions. RequestException covers HTTP errors as well as
        # timeouts and connection failures; ValueError covers a body that
        # is not valid JSON.
        try:
            subs = fetch_submissions(cik_padded)
        except (requests.RequestException, ValueError) as e:
            log.error("Could not fetch submissions for CIK %s: %s", cik_padded, e)
            continue

        # Replace the placeholder name with the one the API reports.
        api_name = subs.get("name", "")
        if api_name and "(unknown" in trust.trust_name:
            trust.trust_name = api_name

        # Extract filings
        all_filings = extract_filings(subs, PROSPECTUS_FORM_TYPES)
        log.info("CIK %s: found %d prospectus-related filings", cik_padded, len(all_filings))
        prospectus_filings = [
            f for f in all_filings if f.form_type in ("485BPOS", "485APOS", "N-1A")
        ][:max_prospectus_filings]
        supplement_filings = [
            f for f in all_filings if f.form_type in ("497", "497K")
        ][:max_supplement_filings]

        if download_text:
            for f in prospectus_filings + supplement_filings:
                f.text_content = download_filing_text(f)
                # Separator added: date and character count used to run
                # together in the log line.
                log.info(
                    " %s %s: %d chars",
                    f.form_type, f.filing_date, len(f.text_content),
                )

        ds = FundDataset(
            trust=trust,
            prospectus_filings=prospectus_filings,
            supplement_filings=supplement_filings,
        )
        datasets.append(ds)

        # Save individual fund JSON
        fund_file = out / f"{cik_padded}.json"
        with open(fund_file, "w", encoding="utf-8") as fp:
            json.dump(asdict(ds), fp, indent=2, ensure_ascii=False)
        log.info("Saved %s", fund_file)

    # Save combined manifest
    manifest = [
        {
            "cik": ds.trust.cik,
            "trust_name": ds.trust.trust_name,
            "num_series": len(ds.trust.series),
            "num_classes": sum(len(s.classes) for s in ds.trust.series),
            "num_prospectus_filings": len(ds.prospectus_filings),
            "num_supplement_filings": len(ds.supplement_filings),
        }
        for ds in datasets
    ]
    manifest_file = out / "manifest.json"
    with open(manifest_file, "w", encoding="utf-8") as fp:
        # ensure_ascii=False to match the per-fund files written above.
        json.dump(manifest, fp, indent=2, ensure_ascii=False)
    log.info("Saved manifest with %d funds → %s", len(manifest), manifest_file)
    return datasets
# ---------------------------------------------------------------------------
# 6. Download XBRL Risk/Return data (quarterly ZIP)
# ---------------------------------------------------------------------------
def download_xbrl_risk_return(
    quarter: str = "2025q2",
    output_dir: str = "dataset/xbrl_rr",
) -> Path:
    """Fetch one quarterly Risk/Return Summary ZIP from the SEC and unpack it.

    The archive is stored as ``<output_dir>/<quarter>_rr1.zip`` and its
    members are extracted into ``<output_dir>/<quarter>``.

    Args:
        quarter: Quarter label used in the file name, e.g. "2025q2".
        output_dir: Directory receiving the archive and the extracted files;
            created when missing.

    Returns:
        The directory the archive was extracted into.
    """
    archive_name = f"{quarter}_rr1.zip"
    url = (
        "https://www.sec.gov/files/dera/data/"
        "mutual-fund-prospectus-risk/return-summary-data-sets/"
        f"{archive_name}"
    )

    target_root = Path(output_dir)
    target_root.mkdir(parents=True, exist_ok=True)
    archive_path = target_root / archive_name
    extract_dir = target_root / quarter

    log.info("Downloading XBRL Risk/Return dataset: %s", url)
    response = _throttled_get(url, stream=True)
    # Write the body to disk chunk by chunk.
    with archive_path.open("wb") as sink:
        for block in response.iter_content(chunk_size=8192):
            sink.write(block)

    with zipfile.ZipFile(archive_path, "r") as bundle:
        bundle.extractall(extract_dir)
    log.info("Extracted to %s", extract_dir)
    return extract_dir
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Command-line entry point.

    Parses the options, optionally loads the series/class reference CSV,
    builds the dataset for the selected CIKs, optionally downloads one
    XBRL Risk/Return archive, and prints a per-fund summary.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description="Fetch SEC fund prospectus + reference data for LLM training."
    )
    cli.add_argument(
        "--ciks",
        nargs="*",
        help="CIK numbers to fetch (default: 5 example funds)",
    )
    cli.add_argument(
        "--output",
        default="dataset",
        help="Output directory (default: dataset)",
    )
    cli.add_argument(
        "--max-prospectus",
        type=int,
        default=3,
        help="Max prospectus filings per fund (default: 3)",
    )
    cli.add_argument(
        "--max-supplements",
        type=int,
        default=5,
        help="Max supplement filings per fund (default: 5)",
    )
    cli.add_argument(
        "--no-download-text",
        action="store_true",
        help="Skip downloading filing document text",
    )
    cli.add_argument(
        "--load-reference-csv",
        action="store_true",
        help="Download and load the full SEC series/class CSV (~15 MB)",
    )
    cli.add_argument(
        "--download-xbrl-rr",
        type=str,
        default="",
        help="Also download XBRL Risk/Return dataset for this quarter (e.g. 2025q2)",
    )
    opts = cli.parse_args()

    # Fall back to the built-in examples when no CIKs were given.
    if opts.ciks:
        selected_ciks = opts.ciks
    else:
        selected_ciks = [entry["cik"] for entry in EXAMPLE_FUNDS]

    reference = load_series_class_reference() if opts.load_reference_csv else None

    results = build_dataset(
        ciks=selected_ciks,
        output_dir=opts.output,
        max_prospectus_filings=opts.max_prospectus,
        max_supplement_filings=opts.max_supplements,
        download_text=not opts.no_download_text,
        reference_data=reference,
    )

    quarter = opts.download_xbrl_rr
    if quarter:
        download_xbrl_risk_return(quarter, f"{opts.output}/xbrl_rr")

    print(f"\nDone. Processed {len(results)} funds → ./{opts.output}/")
    print("\nSummary:")
    for record in results:
        fund = record.trust
        class_total = sum(len(series.classes) for series in fund.series)
        print(
            f" {fund.cik} | {fund.trust_name[:50]:50s} | "
            f"{len(fund.series)} series, {class_total} classes | "
            f"{len(record.prospectus_filings)} prospectus, "
            f"{len(record.supplement_filings)} supplements"
        )


if __name__ == "__main__":
    main()