""" SEC Fund Prospectus & Reference Data Fetcher Fetches prospectuses (485BPOS, 497, 497K), supplements, and amendments for US SEC-registered investment funds from EDGAR. Pairs the legal documents with structured reference data (series/class identifiers, tickers, CUSIPs) to build an LLM training dataset. Data sources: 1. EDGAR Submissions API — filing history per CIK 2. EDGAR Full-Text Search — search filings by form type 3. EDGAR Archives — download actual filing documents 4. SEC Series/Class CSV — fund & share-class reference data 5. XBRL Risk/Return Datasets — structured prospectus extracts """ import csv import io import json import logging import os import re import time import zipfile from dataclasses import dataclass, field, asdict from pathlib import Path from typing import Optional from urllib.parse import urljoin import requests from bs4 import BeautifulSoup from tqdm import tqdm logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", ) log = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- SEC_BASE = "https://www.sec.gov" DATA_SEC = "https://data.sec.gov" EFTS_SEC = "https://efts.sec.gov/LATEST/search-index" ARCHIVES = f"{SEC_BASE}/Archives/edgar/data" SERIES_CLASS_CSV_URL = ( "https://www.sec.gov/files/investment/data/other/" "investment-company-series-class-information/" "investment-company-series-class-2025.csv" ) PROSPECTUS_FORM_TYPES = {"485BPOS", "485APOS", "497", "497K", "N-1A"} USER_AGENT = "SECFundFetcher/1.0 research@university.edu" HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"} REQUEST_INTERVAL = 0.12 # ~8 req/s to stay under SEC's 10/s limit # --------------------------------------------------------------------------- # Data classes # --------------------------------------------------------------------------- @dataclass class 
ShareClass: class_id: str # C###### class_name: str ticker: str = "" cusip: str = "" @dataclass class FundSeries: series_id: str # S###### series_name: str classes: list[ShareClass] = field(default_factory=list) @dataclass class FundTrust: cik: str # 10-digit, zero-padded trust_name: str file_number: str = "" series: list[FundSeries] = field(default_factory=list) @dataclass class Filing: accession_number: str form_type: str filing_date: str primary_document: str description: str = "" document_url: str = "" text_content: str = "" @dataclass class FundDataset: """One record in the final dataset: trust + series + classes + filings.""" trust: FundTrust prospectus_filings: list[Filing] = field(default_factory=list) supplement_filings: list[Filing] = field(default_factory=list) # --------------------------------------------------------------------------- # SEC API helpers # --------------------------------------------------------------------------- _last_request_time = 0.0 def _throttled_get(url: str, **kwargs) -> requests.Response: """GET with rate-limiting and proper User-Agent.""" global _last_request_time elapsed = time.time() - _last_request_time if elapsed < REQUEST_INTERVAL: time.sleep(REQUEST_INTERVAL - elapsed) kwargs.setdefault("headers", {}).update(HEADERS) kwargs.setdefault("timeout", 30) resp = requests.get(url, **kwargs) _last_request_time = time.time() if resp.status_code == 403: log.warning("403 Forbidden for %s — check User-Agent header", url) resp.raise_for_status() return resp # --------------------------------------------------------------------------- # 1. Load the SEC Series/Class reference data (CSV) # --------------------------------------------------------------------------- def load_series_class_reference(csv_url: str = SERIES_CLASS_CSV_URL) -> dict[str, FundTrust]: """ Download the SEC Investment Company Series and Class CSV. Returns a dict keyed by CIK -> FundTrust with nested series/classes. 
Actual CSV columns (2025): Reporting File Number, CIK Number, Entity Name, Entity Org Type, Series ID, Series Name, Class ID, Class Name, Class Ticker, Address_1, Address_2, City, State, Zip Code """ log.info("Downloading series/class reference CSV …") resp = _throttled_get(csv_url) resp.encoding = "utf-8" reader = csv.DictReader(io.StringIO(resp.text)) trusts: dict[str, FundTrust] = {} for row in reader: cik = row.get("CIK Number", "").strip().zfill(10) if not cik or cik == "0" * 10: continue if cik not in trusts: trusts[cik] = FundTrust( cik=cik, trust_name=row.get("Entity Name", "").strip(), file_number=row.get("Reporting File Number", "").strip(), ) trust = trusts[cik] series_id = row.get("Series ID", "").strip() if not series_id: continue existing_series = {s.series_id: s for s in trust.series} if series_id not in existing_series: series = FundSeries( series_id=series_id, series_name=row.get("Series Name", "").strip(), ) trust.series.append(series) existing_series[series_id] = series series = existing_series[series_id] class_id = row.get("Class ID", "").strip() if class_id and not any(c.class_id == class_id for c in series.classes): series.classes.append(ShareClass( class_id=class_id, class_name=row.get("Class Name", "").strip(), ticker=row.get("Class Ticker", "").strip(), )) log.info("Loaded %d investment company trusts from CSV", len(trusts)) return trusts # --------------------------------------------------------------------------- # 2. 
#    Fetch filing history for a CIK via the Submissions API
# ---------------------------------------------------------------------------


def fetch_submissions(cik: str) -> dict:
    """Return the full JSON from data.sec.gov/submissions/CIK{cik}.json.

    Args:
        cik: Central Index Key; zero-padded to 10 digits before use.

    Raises:
        requests.RequestException: If the request fails or returns an
            error status.
        ValueError: If the response body is not valid JSON.
    """
    cik_padded = cik.zfill(10)
    url = f"{DATA_SEC}/submissions/CIK{cik_padded}.json"
    log.info("Fetching submissions for CIK %s", cik_padded)
    resp = _throttled_get(url)
    return resp.json()


def extract_filings(submissions_json: dict, form_types: set[str]) -> list[Filing]:
    """Extract filings of given form types from the submissions JSON.

    Reads the parallel arrays under ``filings.recent`` (``accessionNumber``,
    ``form``, ``filingDate``, ``primaryDocument``, ``primaryDocDescription``)
    and builds one ``Filing`` per entry whose form is in *form_types*, in
    the order the entries appear.

    Args:
        submissions_json: Parsed submissions payload, as returned by
            ``fetch_submissions``.
        form_types: Form type strings to keep.

    Returns:
        The matching filings. ``document_url`` is left empty when the entry
        has no primary document name or the payload has no numeric ``cik``,
        so no URL to a directory or to a bogus CIK path is produced.
    """
    recent = (submissions_json.get("filings") or {}).get("recent") or {}

    # The archive path uses the CIK without leading zeros.
    cik_text = str(submissions_json.get("cik") or "").strip()
    cik_path = str(int(cik_text)) if cik_text.isdecimal() else ""

    accessions = recent.get("accessionNumber") or []
    forms = recent.get("form") or []
    dates = recent.get("filingDate") or []
    docs = recent.get("primaryDocument") or []
    descs = recent.get("primaryDocDescription") or []

    filings: list[Filing] = []
    # zip() stops at the shortest array, so ragged input cannot raise
    # IndexError the way positional indexing would.
    rows = zip(accessions, forms, dates, docs)
    for i, (accession, form, filed_on, doc) in enumerate(rows):
        if form not in form_types:
            continue

        if cik_path and doc:
            folder = accession.replace("-", "")
            doc_url = f"{ARCHIVES}/{cik_path}/{folder}/{doc}"
        else:
            doc_url = ""

        filings.append(Filing(
            accession_number=accession,
            form_type=form,
            filing_date=filed_on,
            primary_document=doc,
            description=descs[i] if i < len(descs) else "",
            document_url=doc_url,
        ))

    return filings


# ---------------------------------------------------------------------------
# 3.
#    Download and parse a filing document (HTML/XML → text)
# ---------------------------------------------------------------------------


def _read_capped(resp: requests.Response, max_bytes: int) -> bytes:
    """Read at most *max_bytes* of the response body, chunk by chunk."""
    buf = bytearray()
    for chunk in resp.iter_content(chunk_size=65536):
        buf.extend(chunk)
        if len(buf) >= max_bytes:
            break
    return bytes(buf[:max_bytes])


def download_filing_text(filing: Filing, max_bytes: int = 5_000_000) -> str:
    """Download a filing document and extract plain text from HTML/XML.

    Args:
        filing: Filing whose ``document_url`` is fetched.
        max_bytes: Upper bound on the number of body bytes read and parsed.

    Returns:
        The extracted text with runs of three or more newlines collapsed to
        two. Returns "" when the filing has no URL or when anything fails
        (the failure is logged), and a fixed placeholder string when the
        response's Content-Type mentions PDF.
    """
    if not filing.document_url:
        return ""

    try:
        # The context manager closes the streamed response on every path,
        # including the early PDF return.
        with _throttled_get(filing.document_url, stream=True) as resp:
            content_type = resp.headers.get("Content-Type", "")
            if "pdf" in content_type.lower():
                log.info("Skipping PDF document: %s", filing.document_url)
                return "[PDF — binary content not extracted]"
            # Stop reading once the cap is reached instead of pulling the
            # whole body and slicing it afterwards.
            raw = _read_capped(resp, max_bytes)

        text = raw.decode("utf-8", errors="replace")
        soup = BeautifulSoup(text, "lxml")
        for tag in soup(["script", "style", "meta", "link"]):
            tag.decompose()

        plain = soup.get_text(separator="\n", strip=True)
        plain = re.sub(r"\n{3,}", "\n\n", plain)
        return plain
    except Exception as e:
        # Deliberate best-effort: one bad document must not stop the run.
        log.warning("Failed to download %s: %s", filing.document_url, e)
        return ""


# ---------------------------------------------------------------------------
# 4. Search EDGAR full-text index for filings
# ---------------------------------------------------------------------------


def search_filings(
    query: str = "",
    forms: str = "485BPOS,497,497K",
    start_date: str = "2024-01-01",
    end_date: str = "2025-12-31",
    cik: str = "",
    max_results: int = 20,
) -> list[dict]:
    """
    Use the EDGAR Full-Text Search API (efts.sec.gov) to find filings.

    Args:
        query: Search expression; "*" is sent when empty.
        forms: Comma-separated form types.
        start_date: Start of the date range, sent as ``startdt``.
        end_date: End of the date range, sent as ``enddt``.
        cik: If given, it is quoted and used as the query when *query* is
            empty, otherwise appended as ``"{query} AND {cik}"``.
        max_results: Value sent as the ``size`` parameter.

    Returns:
        A list of hit dicts (each hit's ``_source``) from the Elasticsearch
        response.

    Raises:
        requests.RequestException: If the request fails.
    """
    search_expr = query or "*"
    if cik:
        search_expr = f"{query} AND {cik}" if query else f'"{cik}"'

    params: dict = {
        "q": search_expr,
        "forms": forms,
        "dateRange": "custom",
        "startdt": start_date,
        "enddt": end_date,
        "from": 0,
        "size": max_results,
    }

    log.info("EFTS search: forms=%s, date=%s–%s, q=%s",
             forms, start_date, end_date, params["q"])
    resp = _throttled_get(EFTS_SEC, params=params)
    data = resp.json()

    hits = data.get("hits", {}).get("hits", [])
    log.info("EFTS returned %d hits", len(hits))
    return [h.get("_source", {}) for h in hits]


# ---------------------------------------------------------------------------
# 5. Build the full dataset for a list of CIKs
# ---------------------------------------------------------------------------

EXAMPLE_FUNDS = [
    {
        "cik": "0000036405",
        "name": "Vanguard Index Funds (Vanguard 500 Index Fund, VOO, VFIAX)",
    },
    {
        "cik": "0000024238",
        "name": "Fidelity Contrafund (FCNTX, FCNKX)",
    },
    {
        "cik": "0001100663",
        "name": "iShares Trust (IVV, iShares Core S&P 500 ETF)",
    },
    {
        "cik": "0000773757",
        "name": "Columbia Funds Series Trust I",
    },
    {
        "cik": "0001795351",
        "name": "T. Rowe Price Exchange-Traded Funds, Inc.",
    },
]


def build_dataset(
    ciks: list[str],
    output_dir: str = "dataset",
    max_prospectus_filings: int = 5,
    max_supplement_filings: int = 5,
    download_text: bool = True,
    reference_data: Optional[dict[str, FundTrust]] = None,
) -> list[FundDataset]:
    """
    For each CIK:
        1. Merge reference data (series/class info)
        2. Fetch filing history from Submissions API
        3. Extract prospectus & supplement filings
        4. Optionally download filing text
        5. Save to JSON

    Args:
        ciks: CIKs to process; each is zero-padded to 10 digits.
        output_dir: Directory (created if needed) that receives one
            ``{cik}.json`` per fund and a combined ``manifest.json``.
        max_prospectus_filings: Cap on 485BPOS / 485APOS / N-1A filings
            kept per fund.
        max_supplement_filings: Cap on 497 / 497K filings kept per fund.
        download_text: Whether to fetch and attach each filing's text.
        reference_data: Optional CIK -> FundTrust mapping, as produced by
            ``load_series_class_reference``.

    Returns:
        One ``FundDataset`` per CIK whose submissions could be fetched;
        CIKs that fail are logged and skipped.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    datasets: list[FundDataset] = []

    for cik in tqdm(ciks, desc="Processing funds"):
        cik_padded = cik.zfill(10)

        # Resolve reference data
        if reference_data and cik_padded in reference_data:
            trust = reference_data[cik_padded]
        else:
            trust = FundTrust(cik=cik_padded,
                              trust_name="(unknown — CSV not loaded)")

        # Fetch submissions. Timeouts, connection errors and malformed JSON
        # are caught as well as HTTP errors, so one bad CIK cannot abort
        # the whole run.
        try:
            subs = fetch_submissions(cik_padded)
        except (requests.RequestException, ValueError) as e:
            log.error("Could not fetch submissions for CIK %s: %s", cik_padded, e)
            continue

        # Fill in the placeholder name from the API when the CSV had none.
        api_name = subs.get("name", "")
        if api_name and "(unknown" in trust.trust_name:
            trust.trust_name = api_name

        # Extract filings
        all_filings = extract_filings(subs, PROSPECTUS_FORM_TYPES)
        log.info("CIK %s: found %d prospectus-related filings",
                 cik_padded, len(all_filings))

        prospectus_filings = [
            f for f in all_filings
            if f.form_type in ("485BPOS", "485APOS", "N-1A")
        ][:max_prospectus_filings]

        supplement_filings = [
            f for f in all_filings
            if f.form_type in ("497", "497K")
        ][:max_supplement_filings]

        if download_text:
            for f in prospectus_filings + supplement_filings:
                f.text_content = download_filing_text(f)
                log.info(
                    " %s %s → %d chars",
                    f.form_type, f.filing_date, len(f.text_content),
                )

        ds = FundDataset(
            trust=trust,
            prospectus_filings=prospectus_filings,
            supplement_filings=supplement_filings,
        )
        datasets.append(ds)

        # Save individual fund JSON
        fund_file = out / f"{cik_padded}.json"
        with open(fund_file, "w", encoding="utf-8") as fp:
            json.dump(asdict(ds), fp, indent=2, ensure_ascii=False)
        log.info("Saved %s", fund_file)

    # Save combined manifest
    manifest = [
        {
            "cik": ds.trust.cik,
            "trust_name": ds.trust.trust_name,
            "num_series": len(ds.trust.series),
            "num_classes": sum(len(s.classes) for s in ds.trust.series),
            "num_prospectus_filings": len(ds.prospectus_filings),
            "num_supplement_filings": len(ds.supplement_filings),
        }
        for ds in datasets
    ]

    manifest_file = out / "manifest.json"
    with open(manifest_file, "w", encoding="utf-8") as fp:
        # ensure_ascii=False matches how the per-fund files are written.
        json.dump(manifest, fp, indent=2, ensure_ascii=False)
    log.info("Saved manifest with %d funds → %s", len(manifest), manifest_file)

    return datasets


# ---------------------------------------------------------------------------
# 6. Download XBRL Risk/Return data (quarterly ZIP)
# ---------------------------------------------------------------------------


def download_xbrl_risk_return(
    quarter: str = "2025q2",
    output_dir: str = "dataset/xbrl_rr",
) -> Path:
    """
    Download a quarterly Mutual Fund Prospectus Risk/Return Summary ZIP
    from the SEC and extract it.

    Args:
        quarter: Quarter label used in the file name, e.g. "2025q2".
        output_dir: Directory (created if needed) that receives the ZIP;
            its contents are extracted into a ``{quarter}`` subdirectory.

    Returns:
        Path of the directory the archive was extracted into.

    Raises:
        requests.RequestException: If the download fails.
        zipfile.BadZipFile: If the downloaded file is not a valid ZIP.
    """
    url = (
        f"https://www.sec.gov/files/dera/data/"
        f"mutual-fund-prospectus-risk/return-summary-data-sets/{quarter}_rr1.zip"
    )
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    zip_path = out / f"{quarter}_rr1.zip"
    extract_dir = out / quarter

    log.info("Downloading XBRL Risk/Return dataset: %s", url)
    # Close the streamed response once the body has been written to disk.
    with _throttled_get(url, stream=True) as resp, open(zip_path, "wb") as fp:
        for chunk in resp.iter_content(chunk_size=8192):
            fp.write(chunk)

    with zipfile.ZipFile(zip_path, "r") as zf:
        zf.extractall(extract_dir)
    log.info("Extracted to %s", extract_dir)

    return extract_dir


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


def main():
    """Parse command-line options, build the dataset and print a summary."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Fetch SEC fund prospectus + reference data for LLM training."
    )
    parser.add_argument(
        "--ciks", nargs="*",
        help="CIK numbers to fetch (default: 5 example funds)",
    )
    parser.add_argument(
        "--output", default="dataset",
        help="Output directory (default: dataset)",
    )
    parser.add_argument(
        "--max-prospectus", type=int, default=3,
        help="Max prospectus filings per fund (default: 3)",
    )
    parser.add_argument(
        "--max-supplements", type=int, default=5,
        help="Max supplement filings per fund (default: 5)",
    )
    parser.add_argument(
        "--no-download-text", action="store_true",
        help="Skip downloading filing document text",
    )
    parser.add_argument(
        "--load-reference-csv", action="store_true",
        help="Download and load the full SEC series/class CSV (~15 MB)",
    )
    parser.add_argument(
        "--download-xbrl-rr", type=str, default="",
        help="Also download XBRL Risk/Return dataset for this quarter (e.g. 2025q2)",
    )
    args = parser.parse_args()

    # An absent or empty --ciks falls back to the bundled examples.
    ciks = args.ciks or [f["cik"] for f in EXAMPLE_FUNDS]

    ref_data = None
    if args.load_reference_csv:
        ref_data = load_series_class_reference()

    datasets = build_dataset(
        ciks=ciks,
        output_dir=args.output,
        max_prospectus_filings=args.max_prospectus,
        max_supplement_filings=args.max_supplements,
        download_text=not args.no_download_text,
        reference_data=ref_data,
    )

    if args.download_xbrl_rr:
        download_xbrl_risk_return(args.download_xbrl_rr, f"{args.output}/xbrl_rr")

    print(f"\nDone. Processed {len(datasets)} funds → ./{args.output}/")
    print("\nSummary:")
    for ds in datasets:
        n_classes = sum(len(s.classes) for s in ds.trust.series)
        print(
            f" {ds.trust.cik} | {ds.trust.trust_name[:50]:50s} | "
            f"{len(ds.trust.series)} series, {n_classes} classes | "
            f"{len(ds.prospectus_filings)} prospectus, "
            f"{len(ds.supplement_filings)} supplements"
        )


if __name__ == "__main__":
    main()