fund_rfid_data/fetch_filings.py
Florian Herzog 1993658fb2 Add SEC fund prospectus -> RDF triple dataset pipeline
Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.

- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
  all books per trust), samples (per-fund segmentation, marker + plain
  serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 10:31:35 +02:00

270 lines
9.3 KiB
Python

"""
Step 2: Fetch prospectus filings for the fund universe.
For each trust (CIK) in the database, fetch filing history from the
SEC Submissions API and download prospectus documents
(485BPOS, 485APOS, 497, 497K, N-1A, N-1A/A).
Stores filing metadata, extracted text, and raw HTML in the database.
"""
import logging
import re
import time
import warnings
from typing import Optional
import requests
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
from tqdm import tqdm
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
from fund_db import FundDatabase
# Module-level logger; main() installs the handler and level.
log = logging.getLogger(__name__)

# Base URLs: the submissions JSON endpoint is built from DATA_SEC and the
# filing document URLs from ARCHIVES.
DATA_SEC = "https://data.sec.gov"
ARCHIVES = "https://www.sec.gov/Archives/edgar/data"

# Form types kept by fetch_filing_metadata(); every other form is skipped.
PROSPECTUS_FORM_TYPES = {"485BPOS", "485APOS", "497", "497K", "N-1A", "N-1A/A"}

# Headers applied to every request sent through _throttled_get().
# NOTE(review): the contact address looks like a placeholder — confirm it is
# replaced with a real one before running against the live service.
USER_AGENT = "FundDataResearch/1.0 research@university.edu"
HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"}

# Minimum spacing in seconds between requests (at most ~8.3 requests/second).
# NOTE(review): presumably chosen to stay under the SEC rate limit — confirm.
REQUEST_INTERVAL = 0.12

# Timestamp of the most recent request made by _throttled_get(); stays 0.0
# until the first request has been issued.
_last_request_time = 0.0
def _throttled_get(url: str, **kwargs) -> requests.Response:
    """Issue a rate-limited GET request and return the successful response.

    Sleeps as long as needed so that consecutive calls are spaced at least
    REQUEST_INTERVAL seconds apart, then delegates to ``requests.get``.

    Args:
        url: Address to fetch.
        **kwargs: Forwarded to ``requests.get``. ``timeout`` defaults to 30
            seconds. Any ``headers`` supplied are copied and the module-level
            HEADERS are layered on top (HEADERS win on conflicts).

    Returns:
        The response, after ``raise_for_status()`` has passed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Exception: Anything raised by ``requests.get`` propagates unchanged.
    """
    global _last_request_time

    # A monotonic clock cannot jump backwards, so a system clock adjustment
    # can never turn into an oversized sleep.
    elapsed = time.monotonic() - _last_request_time
    if elapsed < REQUEST_INTERVAL:
        time.sleep(REQUEST_INTERVAL - elapsed)

    # Build a fresh dict instead of updating the caller's in place; this also
    # tolerates an explicit headers=None.
    caller_headers = kwargs.pop("headers", None) or {}
    kwargs["headers"] = {**caller_headers, **HEADERS}
    kwargs.setdefault("timeout", 30)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Record the attempt even when it failed (timeout, connection error),
        # so that a retry is throttled like any other request.
        _last_request_time = time.monotonic()

    resp.raise_for_status()
    return resp
def fetch_filing_metadata(db: FundDatabase, cik: str,
                          max_prospectus: int = 3,
                          max_supplements: int = 3) -> int:
    """Fetch filing metadata for a single CIK and store it in the database.

    Reads the ``filings.recent`` section of the submissions JSON for *cik*
    and inserts one row per filing whose form type is in
    PROSPECTUS_FORM_TYPES. Filings are taken in the order the API lists them
    (NOTE(review): presumably newest first — confirm), subject to two
    independent quotas.

    Args:
        db: Database receiving the rows through ``insert_filing``.
        cik: CIK used verbatim in the submissions URL (the CLI zero-pads it
            to 10 digits); it must be numeric.
        max_prospectus: Maximum number of 485BPOS / 485APOS / N-1A / N-1A/A
            filings to store.
        max_supplements: Maximum number of filings of the remaining accepted
            form types (497, 497K) to store.

    Returns:
        Number of filings passed to ``db.insert_filing``.

    Raises:
        ValueError: If *cik* is not numeric.
        IndexError: If the form, date or document arrays are shorter than
            the accession array.
        Exception: Request and JSON errors propagate to the caller.
    """
    url = f"{DATA_SEC}/submissions/CIK{cik}.json"
    data = _throttled_get(url).json()

    # "recent" holds parallel arrays, one entry per filing.
    recent = data.get("filings", {}).get("recent", {})
    accessions = recent.get("accessionNumber", [])
    forms = recent.get("form", [])
    dates = recent.get("filingDate", [])
    docs = recent.get("primaryDocument", [])
    descs = recent.get("primaryDocDescription", [])

    # Loop-invariant: str(int(...)) drops leading zeros for the archive path.
    cik_path = str(int(cik))
    full_prospectus_forms = ("485BPOS", "485APOS", "N-1A", "N-1A/A")

    prospectus_count = 0
    supplement_count = 0
    total_saved = 0

    for i, accession in enumerate(accessions):
        # Nothing further can be accepted once both quotas are exhausted.
        if (prospectus_count >= max_prospectus
                and supplement_count >= max_supplements):
            break

        form_type = forms[i]
        if form_type not in PROSPECTUS_FORM_TYPES:
            continue

        is_prospectus = form_type in full_prospectus_forms
        if is_prospectus and prospectus_count >= max_prospectus:
            continue
        if not is_prospectus and supplement_count >= max_supplements:
            continue

        primary_doc = docs[i]
        if primary_doc:
            folder = accession.replace("-", "")
            doc_url = f"{ARCHIVES}/{cik_path}/{folder}/{primary_doc}"
        else:
            # Without a primary document the URL would point at the filing
            # directory; store an empty URL so the download step skips it.
            doc_url = ""

        db.insert_filing(
            accession_number=accession,
            cik=cik,
            form_type=form_type,
            filing_date=dates[i],
            primary_document=primary_doc,
            document_url=doc_url,
            description=descs[i] if i < len(descs) else "",
        )
        total_saved += 1
        if is_prospectus:
            prospectus_count += 1
        else:
            supplement_count += 1

    return total_saved
def download_filing_content(url: str, max_bytes: int = 30_000_000) -> tuple[str, str]:
    """
    Download a filing document. Returns (plain_text, raw_html).

    Raw HTML is preserved so table structure, inline XBRL tags,
    and other markup remain available for downstream processing.

    Args:
        url: Location of the filing document.
        max_bytes: Upper bound on the number of body bytes read; anything
            beyond it is never pulled from the connection.

    Returns:
        ``(plain_text, raw_html)`` for a readable document, a placeholder
        text with empty HTML when the Content-Type mentions PDF, and
        ``("", "")`` when anything fails. Failures are logged as warnings
        and never raised.
    """
    try:
        resp = _throttled_get(url, stream=True)
        try:
            content_type = resp.headers.get("Content-Type", "")
            if "pdf" in content_type.lower():
                return "[PDF — binary content not extracted]", ""

            # Read the streamed body chunk by chunk and stop at the cap.
            # Accessing resp.content would buffer the whole document first,
            # so max_bytes would not bound memory use.
            remaining = max(max_bytes, 0)
            chunks = []
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                chunks.append(chunk[:remaining])
                remaining -= len(chunk)
                if remaining <= 0:
                    break
            raw = b"".join(chunks)
        finally:
            # A streamed response keeps its connection checked out until it
            # is closed or fully consumed.
            resp.close()

        # errors="replace" also absorbs a multi-byte character cut at the cap.
        html = raw.decode("utf-8", errors="replace")
        soup = BeautifulSoup(html, "lxml")
        # Remove elements that contribute no readable text.
        for tag in soup(["script", "style", "meta", "link"]):
            tag.decompose()
        plain = soup.get_text(separator="\n", strip=True)
        # Collapse runs of three or more newlines into a single blank line.
        plain = re.sub(r"\n{3,}", "\n\n", plain)
        return plain, html
    except Exception as e:
        # Best effort: one unreadable filing must not abort a batch run.
        log.warning("Failed to download %s: %s", url, e)
        return "", ""
def fetch_filings_for_universe(db: FundDatabase,
                               ciks: Optional[list[str]] = None,
                               limit: int = 0,
                               download_text: bool = True,
                               max_prospectus: int = 3,
                               max_supplements: int = 3) -> int:
    """Fetch filing metadata and text for a list of CIKs.

    Args:
        db: Database used for pipeline status tracking and storage.
        ciks: CIKs to process; ``None`` means every CIK the database
            reports as pending for the "fetch_filings" step.
        limit: When positive, only the first *limit* CIKs are processed and
            the text download is capped at ``limit * 10`` filings.
        download_text: Whether to run download_pending_texts() afterwards.
        max_prospectus: Per-CIK quota passed to fetch_filing_metadata().
        max_supplements: Per-CIK quota passed to fetch_filing_metadata().

    Returns:
        Total number of filing records saved across all processed CIKs.
    """
    if ciks is None:
        ciks = db.get_pending_ciks("fetch_filings")
    if limit > 0:
        ciks = ciks[:limit]

    log.info("Fetching filings for %d CIKs", len(ciks))
    total_filings = 0

    for cik in tqdm(ciks, desc="Fetching filings"):
        # CIKs already marked done are not fetched again.
        if db.get_pipeline_status(cik, "fetch_filings") == "done":
            continue

        db.set_pipeline_status(cik, "fetch_filings", "running")
        try:
            count = fetch_filing_metadata(db, cik, max_prospectus, max_supplements)
            db.set_pipeline_status(cik, "fetch_filings", "done", items_processed=count)
            total_filings += count
        except Exception as e:
            # Record the failure against this CIK and carry on with the rest;
            # the stored message is truncated to 500 characters.
            db.set_pipeline_status(cik, "fetch_filings", "error",
                                   error_message=str(e)[:500])
            log.warning("Failed for CIK %s: %s", cik, e)

    log.info("Saved %d filing records", total_filings)

    if download_text:
        download_pending_texts(db, limit=limit * 10 if limit else 0)

    return total_filings
def download_pending_texts(db: FundDatabase, limit: int = 0):
    """Download text + raw HTML for filings that don't have text yet.

    Selects every filing that has a non-empty document URL and no row in
    ``filing_text``, downloads it, and stores the result through
    ``db.save_filing_text``. Filings whose download yields no text are left
    untouched, so they are selected again on the next run.

    Args:
        db: Database providing the pending filings and receiving the text.
        limit: Maximum number of filings to process; 0 or less means all.
    """
    query = """
        SELECT f.accession_number, f.document_url, f.form_type
        FROM filing f
        LEFT JOIN filing_text ft ON f.accession_number = ft.accession_number
        WHERE ft.accession_number IS NULL
          AND f.document_url IS NOT NULL
          AND f.document_url != ''
    """
    # Coerce to int so only a number can ever be spliced into the SQL text.
    row_cap = int(limit)
    if row_cap > 0:
        query += f" LIMIT {row_cap}"

    with db.conn() as c:
        rows = c.execute(query).fetchall()

    log.info("Downloading text + HTML for %d filings", len(rows))

    saved = 0
    for row in tqdm(rows, desc="Downloading filings"):
        acc = row["accession_number"]
        url = row["document_url"]
        plain, html = download_filing_content(url)
        if not plain:
            # download_filing_content() has already logged any failure.
            continue
        db.save_filing_text(acc, plain, html_content=html if html else None)
        saved += 1
        log.debug("  %s: %d text chars, %d HTML chars",
                  acc, len(plain), len(html))

    log.info("Stored text for %d of %d filings", saved, len(rows))
def backfill_html(db: FundDatabase, limit: int = 0):
    """Re-download raw HTML for filings that have text but no HTML stored.

    Selects filings with a row in ``filing_text``, no row in ``filing_html``
    and a non-empty document URL, downloads each one again, and stores the
    result through ``db.save_filing_text`` whenever HTML was obtained.

    Args:
        db: Database providing the candidates and receiving the HTML.
        limit: Maximum number of filings to process; 0 or less means all.
    """
    query = """
        SELECT f.accession_number, f.document_url, f.form_type
        FROM filing f
        JOIN filing_text ft ON f.accession_number = ft.accession_number
        LEFT JOIN filing_html fh ON f.accession_number = fh.accession_number
        WHERE fh.accession_number IS NULL
          AND f.document_url IS NOT NULL
          AND f.document_url != ''
    """
    # Coerce to int so only a number can ever be spliced into the SQL text.
    row_cap = int(limit)
    if row_cap > 0:
        query += f" LIMIT {row_cap}"

    with db.conn() as c:
        rows = c.execute(query).fetchall()

    log.info("Backfilling HTML for %d filings", len(rows))

    for row in tqdm(rows, desc="Backfilling HTML"):
        acc = row["accession_number"]
        url = row["document_url"]
        plain, html = download_filing_content(url)
        if html:
            # The freshly extracted text is passed along with the HTML.
            db.save_filing_text(acc, plain, html_content=html)
            log.debug("  %s: %d HTML chars", acc, len(html))
def _build_arg_parser():
    """Create the command-line parser for the filing fetcher."""
    import argparse

    cli = argparse.ArgumentParser(description="Fetch SEC fund filings")
    cli.add_argument("--db", default="fund_data.db", help="Database path")
    cli.add_argument("--limit", type=int, default=0,
                     help="Max CIKs to process (0=all pending)")
    cli.add_argument("--ciks", nargs="*", help="Specific CIKs to fetch")
    cli.add_argument("--max-prospectus", type=int, default=3,
                     help="Max prospectus filings per trust")
    cli.add_argument("--max-supplements", type=int, default=3,
                     help="Max supplement filings per trust")
    cli.add_argument("--no-text", action="store_true",
                     help="Skip downloading filing text (metadata only)")
    cli.add_argument("--text-only", action="store_true",
                     help="Only download text for existing filings")
    cli.add_argument("--backfill-html", action="store_true",
                     help="Re-download HTML for filings missing raw HTML")
    return cli


def main():
    """Command-line entry point.

    Configures logging, parses the arguments and runs exactly one mode:
    HTML backfill, text-only download, or the full metadata (+ text) fetch.
    Prints the filing and filing-text counts from the database afterwards.
    """
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s")
    opts = _build_arg_parser().parse_args()
    database = FundDatabase(opts.db)

    if opts.backfill_html:
        backfill_html(database, limit=opts.limit)
    elif opts.text_only:
        download_pending_texts(database, limit=opts.limit)
    else:
        # CIKs given on the command line are left-padded with zeros to 10
        # characters; without any, the pending set from the database is used.
        selected = None
        if opts.ciks:
            selected = [raw.zfill(10) for raw in opts.ciks]
        fetch_filings_for_universe(
            database,
            ciks=selected,
            limit=opts.limit,
            download_text=not opts.no_text,
            max_prospectus=opts.max_prospectus,
            max_supplements=opts.max_supplements,
        )

    counts = database.get_stats()
    print("\nDatabase stats:")
    print(f" Filings: {counts['filing']:>10,}")
    print(f" Filing texts: {counts['filing_text']:>10,}")
# Run the command-line interface only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()