Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.
- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
all books per trust), samples (per-fund segmentation, marker + plain
serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
270 lines
9.3 KiB
Python
270 lines
9.3 KiB
Python
"""
|
|
Step 2: Fetch prospectus filings for the fund universe.

For each trust (CIK) in the database, fetch the filing history from the
SEC Submissions API and download prospectus documents
(485BPOS, 485APOS, 497, 497K, N-1A, N-1A/A).
Stores filing metadata, extracted plain text and raw HTML in the database.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
import time
|
|
import warnings
|
|
from typing import Optional
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
|
|
from tqdm import tqdm
|
|
|
|
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
|
|
|
|
from fund_db import FundDatabase
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Base URLs: the JSON submissions API and the filing document archive.
DATA_SEC = "https://data.sec.gov"
ARCHIVES = "https://www.sec.gov/Archives/edgar/data"

# Form types that are kept when scanning a trust's filing history.
PROSPECTUS_FORM_TYPES = {
    "485BPOS",
    "485APOS",
    "497",
    "497K",
    "N-1A",
    "N-1A/A",
}

# Identification and compression headers merged into every outgoing request.
USER_AGENT = "FundDataResearch/1.0 research@university.edu"
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept-Encoding": "gzip, deflate",
}

# Minimum spacing, in seconds, between two consecutive HTTP requests.
REQUEST_INTERVAL = 0.12

# Timestamp of the most recent request; maintained by _throttled_get.
_last_request_time = 0.0
|
|
|
|
|
|
def _throttled_get(url: str, **kwargs) -> requests.Response:
    """Issue a rate-limited GET request and return the successful response.

    Sleeps as needed so that consecutive calls are spaced at least
    ``REQUEST_INTERVAL`` seconds apart, merges the module-level ``HEADERS``
    into the request headers and applies a 30 second default timeout.

    Args:
        url: Address to fetch.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``.
            A caller-supplied ``headers`` mapping is copied rather than
            mutated; on key collisions the module ``HEADERS`` take
            precedence. ``timeout`` defaults to 30 when not given.

    Returns:
        The response object, after ``raise_for_status()`` has passed.

    Raises:
        requests.HTTPError: If the server answered with an error status.
        Any exception raised by ``requests.get`` itself is propagated.
    """
    global _last_request_time

    # A monotonic clock is used so that wall-clock adjustments (NTP steps,
    # DST) can never yield a negative elapsed time and an oversized sleep.
    wait = REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
    if wait > 0:
        time.sleep(wait)

    # Build a fresh dict instead of updating the caller's mapping in place.
    kwargs["headers"] = {**(kwargs.get("headers") or {}), **HEADERS}
    kwargs.setdefault("timeout", 30)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Failed attempts also hit the server, so they count for throttling.
        _last_request_time = time.monotonic()

    resp.raise_for_status()
    return resp
|
|
|
|
|
|
def fetch_filing_metadata(db: FundDatabase, cik: str,
                          max_prospectus: int = 3,
                          max_supplements: int = 3) -> int:
    """Fetch filing metadata for a single CIK and store it in the database.

    Reads ``filings.recent`` from the submissions JSON of ``cik``, keeps the
    entries whose form type is in ``PROSPECTUS_FORM_TYPES`` and passes each
    kept entry to ``db.insert_filing``. Entries are taken in the order the
    API returns them until the per-category caps are reached.

    Args:
        db: Database handle; only ``insert_filing`` is called on it.
        cik: CIK exactly as it must appear in the submissions URL
            (``main`` zero-pads command-line values to 10 digits).
        max_prospectus: Cap on 485BPOS / 485APOS / N-1A / N-1A/A entries.
        max_supplements: Cap on the remaining accepted form types.

    Returns:
        Number of filing records handed to ``db.insert_filing``.

    Raises:
        ValueError: If the ``form``, ``filingDate`` or ``primaryDocument``
            column is shorter than ``accessionNumber``.
        Exceptions from the HTTP request or JSON decoding are propagated.
    """
    resp = _throttled_get(f"{DATA_SEC}/submissions/CIK{cik}.json")
    recent = resp.json().get("filings", {}).get("recent", {})

    accessions = recent.get("accessionNumber", [])
    forms = recent.get("form", [])
    dates = recent.get("filingDate", [])
    docs = recent.get("primaryDocument", [])
    descs = recent.get("primaryDocDescription", [])

    # The payload is columnar: every required column must cover every
    # accession. Fail up front with a clear message instead of raising an
    # IndexError half-way through after some rows were already inserted.
    row_count = len(accessions)
    if min(len(forms), len(dates), len(docs)) < row_count:
        raise ValueError(
            f"Inconsistent filing columns for CIK {cik}: "
            f"{row_count} accession numbers but {len(forms)} forms, "
            f"{len(dates)} dates, {len(docs)} primary documents"
        )
    # Descriptions are optional; missing ones are stored as empty strings.
    descs = list(descs) + [""] * max(0, row_count - len(descs))

    # Archive paths use the CIK as a plain integer (no zero padding).
    cik_path = str(int(cik))
    full_prospectus_forms = ("485BPOS", "485APOS", "N-1A", "N-1A/A")

    prospectus_count = 0
    supplement_count = 0
    total_saved = 0

    rows = zip(accessions, forms, dates, docs, descs)
    for accession, form_type, filing_date, primary_doc, description in rows:
        # Nothing further can be accepted once both caps are exhausted.
        if (prospectus_count >= max_prospectus
                and supplement_count >= max_supplements):
            break

        if form_type not in PROSPECTUS_FORM_TYPES:
            continue

        is_prospectus = form_type in full_prospectus_forms
        if is_prospectus:
            if prospectus_count >= max_prospectus:
                continue
        elif supplement_count >= max_supplements:
            continue

        if primary_doc:
            folder = accession.replace("-", "")
            doc_url = f"{ARCHIVES}/{cik_path}/{folder}/{primary_doc}"
        else:
            # Without a primary document the URL would point at the filing
            # folder; store '' so the download queries skip this record.
            doc_url = ""

        db.insert_filing(
            accession_number=accession,
            cik=cik,
            form_type=form_type,
            filing_date=filing_date,
            primary_document=primary_doc,
            document_url=doc_url,
            description=description,
        )
        total_saved += 1

        if is_prospectus:
            prospectus_count += 1
        else:
            supplement_count += 1

    return total_saved
|
|
|
|
|
|
def _read_body_capped(resp, max_bytes: int, chunk_size: int = 65536) -> bytes:
    """Read at most ``max_bytes`` of a streamed response body."""
    buffer = bytearray()
    for chunk in resp.iter_content(chunk_size=chunk_size):
        buffer.extend(chunk)
        if len(buffer) >= max_bytes:
            break
    return bytes(buffer[:max_bytes])


def download_filing_content(url: str, max_bytes: int = 30_000_000) -> tuple[str, str]:
    """Download a filing document and return ``(plain_text, raw_html)``.

    Raw HTML is preserved so table structure, inline XBRL tags and other
    markup remain available for downstream processing. The plain text is
    the document text with script/style/meta/link elements removed and
    runs of three or more newlines collapsed to two.

    Args:
        url: Document address.
        max_bytes: Upper bound on the number of body bytes that are read;
            the download stops once this many bytes have been received.

    Returns:
        ``(plain_text, raw_html)``. For a PDF content type the result is a
        placeholder text and an empty HTML string. On any failure the error
        is logged and ``("", "")`` is returned (best-effort download).
    """
    try:
        resp = _throttled_get(url, stream=True)
        try:
            content_type = resp.headers.get("Content-Type", "")
            if "pdf" in content_type.lower():
                return "[PDF — binary content not extracted]", ""

            # Stream in chunks so the size cap limits what is actually
            # transferred and held in memory, not just what is kept.
            raw = _read_body_capped(resp, max_bytes)
        finally:
            # A streamed response keeps its connection until it is closed.
            resp.close()

        # A cut in the middle of a multi-byte character is tolerated by
        # errors="replace".
        html = raw.decode("utf-8", errors="replace")

        soup = BeautifulSoup(html, "lxml")
        for tag in soup(["script", "style", "meta", "link"]):
            tag.decompose()

        plain = soup.get_text(separator="\n", strip=True)
        plain = re.sub(r"\n{3,}", "\n\n", plain)

        return plain, html
    except Exception as e:
        # Deliberate best-effort: one bad document must not stop the batch.
        log.warning("Failed to download %s: %s", url, e)
        return "", ""
|
|
|
|
|
|
def fetch_filings_for_universe(db: FundDatabase,
                               ciks: list[str] = None,
                               limit: int = 0,
                               download_text: bool = True,
                               max_prospectus: int = 3,
                               max_supplements: int = 3):
    """Collect filing metadata (and optionally document text) for many CIKs.

    When ``ciks`` is None the CIKs pending for the "fetch_filings" stage are
    taken from the database. A positive ``limit`` truncates that list. Each
    CIK not already marked "done" is marked "running", processed with
    ``fetch_filing_metadata`` and then marked "done" or, on any exception,
    "error" (the message is cut to 500 characters). If ``download_text`` is
    set, ``download_pending_texts`` runs afterwards with ten times ``limit``
    (or no limit when ``limit`` is 0).

    Returns the number of filing records saved across all CIKs.
    """
    stage = "fetch_filings"

    targets = db.get_pending_ciks(stage) if ciks is None else ciks
    if limit > 0:
        targets = targets[:limit]

    log.info("Fetching filings for %d CIKs", len(targets))

    saved_records = 0
    for trust_cik in tqdm(targets, desc="Fetching filings"):
        if db.get_pipeline_status(trust_cik, stage) == "done":
            continue

        db.set_pipeline_status(trust_cik, stage, "running")
        try:
            n_records = fetch_filing_metadata(
                db, trust_cik, max_prospectus, max_supplements
            )
            db.set_pipeline_status(
                trust_cik, stage, "done", items_processed=n_records
            )
            saved_records += n_records
        except Exception as exc:
            db.set_pipeline_status(
                trust_cik, stage, "error", error_message=str(exc)[:500]
            )
            log.warning("Failed for CIK %s: %s", trust_cik, exc)

    log.info("Saved %d filing records", saved_records)

    if download_text:
        text_limit = limit * 10 if limit else 0
        download_pending_texts(db, limit=text_limit)

    return saved_records
|
|
|
|
|
|
def download_pending_texts(db: FundDatabase, limit: int = 0):
    """Download text + raw HTML for filings that have no stored text yet.

    Selects filings with a non-empty document URL and no matching
    ``filing_text`` row (at most ``limit`` of them when ``limit`` is
    positive), downloads each one and saves the result whenever the
    extracted plain text is non-empty.
    """
    sql = """
        SELECT f.accession_number, f.document_url, f.form_type
        FROM filing f
        LEFT JOIN filing_text ft ON f.accession_number = ft.accession_number
        WHERE ft.accession_number IS NULL
          AND f.document_url IS NOT NULL
          AND f.document_url != ''
    """
    limit_clause = f" LIMIT {limit}" if limit > 0 else ""

    with db.conn() as connection:
        pending = connection.execute(sql + limit_clause).fetchall()

    log.info("Downloading text + HTML for %d filings", len(pending))

    for record in tqdm(pending, desc="Downloading filings"):
        accession = record["accession_number"]
        text, markup = download_filing_content(record["document_url"])
        if not text:
            # Nothing usable was extracted; the filing stays pending.
            continue
        db.save_filing_text(accession, text, html_content=markup or None)
        log.debug(" %s → %d text chars, %d HTML chars",
                  accession, len(text), len(markup))
|
|
|
|
|
|
def backfill_html(db: FundDatabase, limit: int = 0):
    """Re-download raw HTML for filings that have text but no HTML stored.

    Selects filings that have a ``filing_text`` row, no ``filing_html`` row
    and a non-empty document URL (at most ``limit`` of them when ``limit``
    is positive). Each one is downloaded again and saved, together with the
    freshly extracted text, whenever the download yields non-empty HTML.
    """
    sql = """
        SELECT f.accession_number, f.document_url, f.form_type
        FROM filing f
        JOIN filing_text ft ON f.accession_number = ft.accession_number
        LEFT JOIN filing_html fh ON f.accession_number = fh.accession_number
        WHERE fh.accession_number IS NULL
          AND f.document_url IS NOT NULL
          AND f.document_url != ''
    """
    limit_clause = f" LIMIT {limit}" if limit > 0 else ""

    with db.conn() as connection:
        missing = connection.execute(sql + limit_clause).fetchall()

    log.info("Backfilling HTML for %d filings", len(missing))

    for record in tqdm(missing, desc="Backfilling HTML"):
        accession = record["accession_number"]
        text, markup = download_filing_content(record["document_url"])
        if not markup:
            continue
        db.save_filing_text(accession, text, html_content=markup)
        log.debug(" %s → %d HTML chars", accession, len(markup))
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Parses the options, opens the database and runs exactly one mode:
    HTML backfill (``--backfill-html``), text download only
    (``--text-only``) or the full metadata + text fetch. Afterwards the
    filing and filing-text counts from ``db.get_stats()`` are printed.
    """
    import argparse

    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s")

    # Option table: (flag, keyword arguments for add_argument).
    option_specs = [
        ("--db", {"default": "fund_data.db", "help": "Database path"}),
        ("--limit", {"type": int, "default": 0,
                     "help": "Max CIKs to process (0=all pending)"}),
        ("--ciks", {"nargs": "*", "help": "Specific CIKs to fetch"}),
        ("--max-prospectus", {"type": int, "default": 3,
                              "help": "Max prospectus filings per trust"}),
        ("--max-supplements", {"type": int, "default": 3,
                               "help": "Max supplement filings per trust"}),
        ("--no-text", {"action": "store_true",
                       "help": "Skip downloading filing text (metadata only)"}),
        ("--text-only", {"action": "store_true",
                         "help": "Only download text for existing filings"}),
        ("--backfill-html", {"action": "store_true",
                             "help": "Re-download HTML for filings missing raw HTML"}),
    ]
    cli = argparse.ArgumentParser(description="Fetch SEC fund filings")
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    args = cli.parse_args()

    db = FundDatabase(args.db)

    if args.backfill_html:
        backfill_html(db, limit=args.limit)
    elif args.text_only:
        download_pending_texts(db, limit=args.limit)
    else:
        # CIKs given on the command line are left-padded to 10 digits.
        requested = None
        if args.ciks:
            requested = [raw_cik.zfill(10) for raw_cik in args.ciks]
        fetch_filings_for_universe(
            db,
            ciks=requested,
            limit=args.limit,
            download_text=not args.no_text,
            max_prospectus=args.max_prospectus,
            max_supplements=args.max_supplements,
        )

    counts = db.get_stats()
    print("\nDatabase stats:")
    print(" Filings: {:>10,}".format(counts["filing"]))
    print(" Filing texts: {:>10,}".format(counts["filing_text"]))


if __name__ == "__main__":
    main()
|