Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.
- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
all books per trust), samples (per-fund segmentation, marker + plain
serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
538 lines
17 KiB
Python
538 lines
17 KiB
Python
"""
SEC Fund Prospectus & Reference Data Fetcher

Fetches prospectuses (485BPOS, 497, 497K), supplements, and amendments
for US SEC-registered investment funds from EDGAR. Pairs the legal
documents with structured reference data (series/class identifiers,
tickers, CUSIPs) to build an LLM training dataset.

Data sources:
  1. EDGAR Submissions API — filing history per CIK
  2. EDGAR Full-Text Search — search filings by form type
  3. EDGAR Archives — download actual filing documents
  4. SEC Series/Class CSV — fund & share-class reference data
  5. XBRL Risk/Return Datasets — structured prospectus extracts
"""
|
||
|
||
import csv
|
||
import io
|
||
import json
|
||
import logging
|
||
import os
|
||
import re
|
||
import time
|
||
import zipfile
|
||
from dataclasses import dataclass, field, asdict
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
from urllib.parse import urljoin
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
from tqdm import tqdm
|
||
|
||
# Configure the root logger once at import time (INFO level, timestamped
# records) and create the module-level logger used throughout this file.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
log = logging.getLogger(__name__)
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Configuration
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Base hosts of the SEC services used by this module.
SEC_BASE = "https://www.sec.gov"
DATA_SEC = "https://data.sec.gov"
EFTS_SEC = "https://efts.sec.gov/LATEST/search-index"

# Root of the EDGAR filing archive; filing URLs are built below this prefix.
ARCHIVES = SEC_BASE + "/Archives/edgar/data"

# Location of the investment-company series/class reference CSV (2025 edition).
SERIES_CLASS_CSV_URL = "".join(
    [
        "https://www.sec.gov/files/investment/data/other/",
        "investment-company-series-class-information/",
        "investment-company-series-class-2025.csv",
    ]
)

# Form types pulled out of a registrant's filing history.
PROSPECTUS_FORM_TYPES = {"485BPOS", "485APOS", "497", "497K", "N-1A"}

# Identification sent with every request.
USER_AGENT = "SECFundFetcher/1.0 research@university.edu"
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept-Encoding": "gzip, deflate",
}

# Minimum spacing between requests in seconds:
# ~8 req/s to stay under SEC's 10/s limit.
REQUEST_INTERVAL = 0.12
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Data classes
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@dataclass
class ShareClass:
    """One share class belonging to a fund series."""

    # SEC class identifier, written as C######.
    class_id: str
    class_name: str
    # Optional identifiers; left empty when not known.
    ticker: str = ""
    cusip: str = ""
|
||
|
||
|
||
@dataclass
class FundSeries:
    """A fund series together with the share classes issued under it."""

    # SEC series identifier, written as S######.
    series_id: str
    series_name: str
    # Every instance gets its own fresh list (default_factory).
    classes: list[ShareClass] = field(default_factory=list)
|
||
|
||
|
||
@dataclass
class FundTrust:
    """A registrant (trust) identified by CIK, holding its fund series."""

    # Central Index Key: 10-digit, zero-padded.
    cik: str
    trust_name: str
    file_number: str = ""
    # Every instance gets its own fresh list (default_factory).
    series: list[FundSeries] = field(default_factory=list)
|
||
|
||
|
||
@dataclass
class Filing:
    """Metadata for one EDGAR filing plus, optionally, its extracted text."""

    # Required fields, taken from the registrant's filing history.
    accession_number: str
    form_type: str
    filing_date: str
    primary_document: str
    # Optional fields; empty until filled in.
    description: str = ""
    document_url: str = ""
    text_content: str = ""
|
||
|
||
|
||
@dataclass
class FundDataset:
    """A single dataset record: one trust (with its series and classes) and its filings."""

    trust: FundTrust
    # Separate lists for prospectus filings and supplement filings; each
    # instance gets its own fresh lists (default_factory).
    prospectus_filings: list[Filing] = field(default_factory=list)
    supplement_filings: list[Filing] = field(default_factory=list)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# SEC API helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# time.monotonic() reading taken when the most recent request finished.
# Module-level state shared by every call to _throttled_get().
_last_request_time = 0.0


def _throttled_get(url: str, **kwargs) -> requests.Response:
    """
    GET ``url`` with rate limiting and the module's identifying headers.

    Sleeps as needed so that consecutive calls are at least
    REQUEST_INTERVAL seconds apart, then issues ``requests.get``.

    Args:
        url: Address to fetch.
        **kwargs: Passed through to ``requests.get``. ``headers`` is merged
            with HEADERS (HEADERS win on conflicting keys) and ``timeout``
            defaults to 30 seconds when not supplied.

    Returns:
        The successful response.

    Raises:
        requests.HTTPError: For 4xx/5xx responses (via ``raise_for_status``).
        requests.RequestException: For transport-level failures raised by
            ``requests.get``.
    """
    global _last_request_time

    # Monotonic clock: unaffected by wall-clock adjustments (NTP, DST).
    wait = REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
    if wait > 0:
        time.sleep(wait)

    # Build a new dict so the caller's headers object is never mutated, and
    # tolerate an explicit headers=None.
    caller_headers = kwargs.get("headers") or {}
    kwargs["headers"] = {**caller_headers, **HEADERS}
    kwargs.setdefault("timeout", 30)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Stamp the time even when the request raised, so that a caller's
        # immediate retry is still throttled.
        _last_request_time = time.monotonic()

    if resp.status_code == 403:
        log.warning("403 Forbidden for %s — check User-Agent header", url)
    resp.raise_for_status()
    return resp
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 1. Load the SEC Series/Class reference data (CSV)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def load_series_class_reference(csv_url: str = SERIES_CLASS_CSV_URL) -> dict[str, FundTrust]:
    """
    Download the SEC Investment Company Series and Class CSV.

    Returns a dict keyed by CIK (10-digit, zero-padded) -> FundTrust with
    nested series/classes. Rows without a CIK are skipped; rows without a
    Series ID only contribute the trust itself. Each series is added once
    per trust and each class once per series.

    Actual CSV columns (2025):
        Reporting File Number, CIK Number, Entity Name, Entity Org Type,
        Series ID, Series Name, Class ID, Class Name, Class Ticker,
        Address_1, Address_2, City, State, Zip Code

    Raises:
        requests.HTTPError: If the download fails (raised by _throttled_get).
    """
    log.info("Downloading series/class reference CSV …")
    resp = _throttled_get(csv_url)
    # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
    # byte-order mark, which would otherwise become part of the first
    # column name and make "Reporting File Number" lookups come back empty.
    resp.encoding = "utf-8-sig"

    def cell(row: dict, column: str) -> str:
        # DictReader fills missing trailing cells with None; treat as empty.
        return (row.get(column) or "").strip()

    trusts: dict[str, FundTrust] = {}
    # Lookup tables so each row is handled in constant time instead of
    # re-scanning the trust's series and the series' classes.
    series_by_key: dict[tuple[str, str], FundSeries] = {}
    seen_classes: set[tuple[str, str, str]] = set()

    for row in csv.DictReader(io.StringIO(resp.text)):
        cik = cell(row, "CIK Number").zfill(10)
        if cik == "0" * 10:  # missing or all-zero CIK
            continue

        trust = trusts.get(cik)
        if trust is None:
            trust = trusts[cik] = FundTrust(
                cik=cik,
                trust_name=cell(row, "Entity Name"),
                file_number=cell(row, "Reporting File Number"),
            )

        series_id = cell(row, "Series ID")
        if not series_id:
            continue

        series_key = (cik, series_id)
        series = series_by_key.get(series_key)
        if series is None:
            series = FundSeries(
                series_id=series_id,
                series_name=cell(row, "Series Name"),
            )
            trust.series.append(series)
            series_by_key[series_key] = series

        class_id = cell(row, "Class ID")
        class_key = (cik, series_id, class_id)
        if class_id and class_key not in seen_classes:
            seen_classes.add(class_key)
            series.classes.append(ShareClass(
                class_id=class_id,
                class_name=cell(row, "Class Name"),
                ticker=cell(row, "Class Ticker"),
            ))

    log.info("Loaded %d investment company trusts from CSV", len(trusts))
    return trusts
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 2. Fetch filing history for a CIK via the Submissions API
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def fetch_submissions(cik: str) -> dict:
    """Fetch data.sec.gov/submissions/CIK{cik}.json and return the parsed JSON.

    The CIK is left-padded with zeros to 10 digits before the URL is built.
    """
    padded = cik.zfill(10)
    log.info("Fetching submissions for CIK %s", padded)
    endpoint = f"{DATA_SEC}/submissions/CIK{padded}.json"
    return _throttled_get(endpoint).json()
|
||
|
||
|
||
def extract_filings(submissions_json: dict, form_types: set[str]) -> list[Filing]:
    """
    Extract filings of the given form types from a submissions JSON document.

    Only the ``filings.recent`` section is read. That section stores one
    parallel list per attribute; entries are matched by position.

    Args:
        submissions_json: Parsed submissions document for one registrant.
        form_types: Form types to keep (exact string match).

    Returns:
        One Filing per matching entry, in the order the entries appear.
        ``document_url`` is left empty when the entry names no primary
        document, so downstream download code skips it.
    """
    cik = str(submissions_json.get("cik", "")).zfill(10)
    recent = submissions_json.get("filings", {}).get("recent", {})

    accessions = recent.get("accessionNumber", [])
    forms = recent.get("form", [])
    dates = recent.get("filingDate", [])
    docs = recent.get("primaryDocument", [])
    descs = recent.get("primaryDocDescription", [])

    def at(values: list, index: int) -> str:
        # Tolerate columns shorter than accessionNumber instead of raising
        # IndexError on a ragged response.
        return values[index] if index < len(values) else ""

    filings: list[Filing] = []
    for i, accession in enumerate(accessions):
        form = at(forms, i)
        if form not in form_types:
            continue

        primary_doc = at(docs, i)
        if primary_doc:
            # Archive paths use the CIK without leading zeros and the
            # accession number without dashes.
            doc_url = f"{ARCHIVES}/{int(cik)}/{accession.replace('-', '')}/{primary_doc}"
        else:
            # Without a document name the URL would point at the filing's
            # directory rather than a document.
            doc_url = ""

        filings.append(Filing(
            accession_number=accession,
            form_type=form,
            filing_date=at(dates, i),
            primary_document=primary_doc,
            description=at(descs, i),
            document_url=doc_url,
        ))

    return filings
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 3. Download and parse a filing document (HTML/XML → text)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def download_filing_text(filing: Filing, max_bytes: int = 5_000_000) -> str:
    """
    Download a filing document and extract plain text from HTML/XML.

    Args:
        filing: Filing whose ``document_url`` is fetched.
        max_bytes: Upper bound on the number of body bytes read from the
            response; a non-positive value reads nothing.

    Returns:
        The extracted text; a fixed placeholder string when the response is
        a PDF; or "" when the filing has no URL or the download/parse fails
        (failures are logged as warnings, never raised).
    """
    if not filing.document_url:
        return ""

    try:
        resp = _throttled_get(filing.document_url, stream=True)
        try:
            content_type = resp.headers.get("Content-Type", "")
            if "pdf" in content_type.lower():
                log.info("Skipping PDF document: %s", filing.document_url)
                return "[PDF — binary content not extracted]"

            # Read the streamed body chunk by chunk and stop at the cap, so
            # max_bytes bounds what is transferred rather than only slicing
            # a fully downloaded body.
            chunks: list[bytes] = []
            remaining = max_bytes
            if remaining > 0:
                for chunk in resp.iter_content(chunk_size=65536):
                    chunks.append(chunk[:remaining])
                    remaining -= len(chunk)
                    if remaining <= 0:
                        break
        finally:
            # A streamed response holds its connection until closed.
            resp.close()

        # errors="replace" also covers a multi-byte character cut at the cap.
        text = b"".join(chunks).decode("utf-8", errors="replace")
        soup = BeautifulSoup(text, "lxml")

        # Drop elements that carry no document prose.
        for tag in soup(["script", "style", "meta", "link"]):
            tag.decompose()

        plain = soup.get_text(separator="\n", strip=True)
        # Collapse runs of three or more newlines to one blank line.
        return re.sub(r"\n{3,}", "\n\n", plain)

    except Exception as e:  # deliberate best-effort: one bad document must not abort a batch
        log.warning("Failed to download %s: %s", filing.document_url, e)
        return ""
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 4. Search EDGAR full-text index for filings
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def search_filings(
    query: str = "",
    forms: str = "485BPOS,497,497K",
    start_date: str = "2024-01-01",
    end_date: str = "2025-12-31",
    cik: str = "",
    max_results: int = 20,
) -> list[dict]:
    """
    Query the EDGAR Full-Text Search API (efts.sec.gov) for filings.

    The search text is ``query`` (or "*" when empty). When ``cik`` is given
    it is folded into the search text: quoted on its own, or appended as
    "<query> AND <cik>" when a query is also present.

    Returns the ``_source`` dict of each hit in the response (an empty dict
    for a hit without one).
    """
    if cik:
        search_text = f"{query} AND {cik}" if query else f'"{cik}"'
    else:
        search_text = query or "*"

    params: dict = {
        "q": search_text,
        "forms": forms,
        "dateRange": "custom",
        "startdt": start_date,
        "enddt": end_date,
        "from": 0,
        "size": max_results,
    }

    log.info("EFTS search: forms=%s, date=%s–%s, q=%s", forms, start_date, end_date, params["q"])
    payload = _throttled_get(EFTS_SEC, params=params).json()

    hits = payload.get("hits", {}).get("hits", [])
    log.info("EFTS returned %d hits", len(hits))

    sources = []
    for hit in hits:
        sources.append(hit.get("_source", {}))
    return sources
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 5. Build the full dataset for a list of CIKs
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Default registrants used when no CIKs are supplied: (CIK, label) pairs
# expanded into the list-of-dicts shape the rest of the module reads.
EXAMPLE_FUNDS = [
    {"cik": cik, "name": name}
    for cik, name in (
        ("0000036405", "Vanguard Index Funds (Vanguard 500 Index Fund, VOO, VFIAX)"),
        ("0000024238", "Fidelity Contrafund (FCNTX, FCNKX)"),
        ("0001100663", "iShares Trust (IVV, iShares Core S&P 500 ETF)"),
        ("0000773757", "Columbia Funds Series Trust I"),
        ("0001795351", "T. Rowe Price Exchange-Traded Funds, Inc."),
    )
]
|
||
|
||
|
||
def build_dataset(
    ciks: list[str],
    output_dir: str = "dataset",
    max_prospectus_filings: int = 5,
    max_supplement_filings: int = 5,
    download_text: bool = True,
    reference_data: Optional[dict[str, FundTrust]] = None,
) -> list[FundDataset]:
    """
    Build and save one dataset record per CIK.

    For each CIK:
        1. Merge reference data (series/class info)
        2. Fetch filing history from Submissions API
        3. Extract prospectus & supplement filings
        4. Optionally download filing text
        5. Save to JSON

    Args:
        ciks: CIK numbers; each is zero-padded to 10 digits.
        output_dir: Directory for the per-fund ``<cik>.json`` files and
            ``manifest.json``; created if missing.
        max_prospectus_filings: Cap on 485BPOS/485APOS/N-1A filings kept.
        max_supplement_filings: Cap on 497/497K filings kept.
        download_text: When true, fetch and store each kept filing's text.
        reference_data: Optional CIK -> FundTrust mapping; a placeholder
            trust is used for CIKs not found in it.

    Returns:
        The records built. A CIK whose submissions could not be fetched is
        logged and omitted.
    """
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    prospectus_forms = ("485BPOS", "485APOS", "N-1A")
    supplement_forms = ("497", "497K")

    datasets: list[FundDataset] = []

    for cik in tqdm(ciks, desc="Processing funds"):
        cik_padded = cik.zfill(10)

        # Resolve reference data
        if reference_data and cik_padded in reference_data:
            trust = reference_data[cik_padded]
        else:
            trust = FundTrust(cik=cik_padded, trust_name="(unknown — CSV not loaded)")

        # Fetch submissions. RequestException covers HTTP errors as well as
        # timeouts and connection failures; ValueError covers an unparseable
        # JSON body. Either way, skip this CIK instead of aborting the batch.
        try:
            subs = fetch_submissions(cik_padded)
        except (requests.RequestException, ValueError) as e:
            log.error("Could not fetch submissions for CIK %s: %s", cik_padded, e)
            continue

        # Replace the placeholder name with the one reported by the API.
        api_name = subs.get("name", "")
        if api_name and "(unknown" in trust.trust_name:
            trust.trust_name = api_name

        # Extract filings
        all_filings = extract_filings(subs, PROSPECTUS_FORM_TYPES)
        log.info("CIK %s: found %d prospectus-related filings", cik_padded, len(all_filings))

        prospectus_filings = [
            f for f in all_filings if f.form_type in prospectus_forms
        ][:max_prospectus_filings]
        supplement_filings = [
            f for f in all_filings if f.form_type in supplement_forms
        ][:max_supplement_filings]

        if download_text:
            for f in prospectus_filings + supplement_filings:
                f.text_content = download_filing_text(f)
                log.info(
                    " %s %s → %d chars",
                    f.form_type, f.filing_date, len(f.text_content),
                )

        ds = FundDataset(
            trust=trust,
            prospectus_filings=prospectus_filings,
            supplement_filings=supplement_filings,
        )
        datasets.append(ds)

        # Save individual fund JSON
        fund_file = out / f"{cik_padded}.json"
        with open(fund_file, "w", encoding="utf-8") as fp:
            json.dump(asdict(ds), fp, indent=2, ensure_ascii=False)
        log.info("Saved %s", fund_file)

    # Save combined manifest: one summary row per record built above.
    manifest = [
        {
            "cik": ds.trust.cik,
            "trust_name": ds.trust.trust_name,
            "num_series": len(ds.trust.series),
            "num_classes": sum(len(s.classes) for s in ds.trust.series),
            "num_prospectus_filings": len(ds.prospectus_filings),
            "num_supplement_filings": len(ds.supplement_filings),
        }
        for ds in datasets
    ]

    manifest_file = out / "manifest.json"
    with open(manifest_file, "w", encoding="utf-8") as fp:
        # ensure_ascii=False matches the per-fund files, so non-ASCII trust
        # names are written as-is rather than as \u escapes.
        json.dump(manifest, fp, indent=2, ensure_ascii=False)
    log.info("Saved manifest with %d funds → %s", len(manifest), manifest_file)

    return datasets
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# 6. Download XBRL Risk/Return data (quarterly ZIP)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def download_xbrl_risk_return(
    quarter: str = "2025q2",
    output_dir: str = "dataset/xbrl_rr",
) -> Path:
    """
    Download a quarterly Mutual Fund Prospectus Risk/Return Summary
    ZIP from the SEC and extract it.

    Args:
        quarter: Quarter label used in the archive name, e.g. "2025q2".
        output_dir: Directory receiving ``<quarter>_rr1.zip`` and the
            extracted ``<quarter>/`` folder; created if missing.

    Returns:
        Path of the directory the archive was extracted into.

    Raises:
        requests.HTTPError: If the download fails (raised by _throttled_get).
        zipfile.BadZipFile: If the downloaded file is not a valid ZIP.
    """
    url = (
        "https://www.sec.gov/files/dera/data/"
        f"mutual-fund-prospectus-risk/return-summary-data-sets/{quarter}_rr1.zip"
    )
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    zip_path = out / f"{quarter}_rr1.zip"
    extract_dir = out / quarter

    log.info("Downloading XBRL Risk/Return dataset: %s", url)
    resp = _throttled_get(url, stream=True)
    try:
        with open(zip_path, "wb") as fp:
            for chunk in resp.iter_content(chunk_size=8192):
                fp.write(chunk)
    finally:
        # A streamed response holds its connection until closed.
        resp.close()

    with zipfile.ZipFile(zip_path, "r") as zf:
        zf.extractall(extract_dir)
    log.info("Extracted to %s", extract_dir)

    return extract_dir
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Main
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def main():
    """Command-line entry point: parse options, build the dataset, print a summary."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Fetch SEC fund prospectus + reference data for LLM training."
    )

    # Options are declared as data and registered in this order, which is
    # also the order they appear in --help.
    option_table = [
        ("--ciks", {
            "nargs": "*",
            "help": "CIK numbers to fetch (default: 5 example funds)",
        }),
        ("--output", {
            "default": "dataset",
            "help": "Output directory (default: dataset)",
        }),
        ("--max-prospectus", {
            "type": int,
            "default": 3,
            "help": "Max prospectus filings per fund (default: 3)",
        }),
        ("--max-supplements", {
            "type": int,
            "default": 5,
            "help": "Max supplement filings per fund (default: 5)",
        }),
        ("--no-download-text", {
            "action": "store_true",
            "help": "Skip downloading filing document text",
        }),
        ("--load-reference-csv", {
            "action": "store_true",
            "help": "Download and load the full SEC series/class CSV (~15 MB)",
        }),
        ("--download-xbrl-rr", {
            "type": str,
            "default": "",
            "help": "Also download XBRL Risk/Return dataset for this quarter (e.g. 2025q2)",
        }),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    # Fall back to the bundled example registrants when no CIKs were given.
    ciks = args.ciks if args.ciks else [fund["cik"] for fund in EXAMPLE_FUNDS]

    ref_data = load_series_class_reference() if args.load_reference_csv else None

    datasets = build_dataset(
        ciks=ciks,
        output_dir=args.output,
        max_prospectus_filings=args.max_prospectus,
        max_supplement_filings=args.max_supplements,
        download_text=not args.no_download_text,
        reference_data=ref_data,
    )

    if args.download_xbrl_rr:
        download_xbrl_risk_return(args.download_xbrl_rr, f"{args.output}/xbrl_rr")

    print(f"\nDone. Processed {len(datasets)} funds → ./{args.output}/")
    print("\nSummary:")
    for ds in datasets:
        trust = ds.trust
        n_classes = sum(len(s.classes) for s in trust.series)
        print(
            f" {trust.cik} | {trust.trust_name[:50]:50s} | "
            f"{len(trust.series)} series, {n_classes} classes | "
            f"{len(ds.prospectus_filings)} prospectus, "
            f"{len(ds.supplement_filings)} supplements"
        )
|
||
|
||
|
||
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|