""" Step 2: Fetch prospectus filings for the fund universe. For each trust (CIK) in the database, fetch filing history from the SEC Submissions API and download prospectus documents (485BPOS, 497, 497K, N-1A). Stores filing metadata and extracted text in the database. """ import logging import re import time import warnings from typing import Optional import requests from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning from tqdm import tqdm warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning) from fund_db import FundDatabase log = logging.getLogger(__name__) DATA_SEC = "https://data.sec.gov" ARCHIVES = "https://www.sec.gov/Archives/edgar/data" PROSPECTUS_FORM_TYPES = {"485BPOS", "485APOS", "497", "497K", "N-1A", "N-1A/A"} USER_AGENT = "FundDataResearch/1.0 research@university.edu" HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"} REQUEST_INTERVAL = 0.12 _last_request_time = 0.0 def _throttled_get(url: str, **kwargs) -> requests.Response: global _last_request_time elapsed = time.time() - _last_request_time if elapsed < REQUEST_INTERVAL: time.sleep(REQUEST_INTERVAL - elapsed) kwargs.setdefault("headers", {}).update(HEADERS) kwargs.setdefault("timeout", 30) resp = requests.get(url, **kwargs) _last_request_time = time.time() resp.raise_for_status() return resp def fetch_filing_metadata(db: FundDatabase, cik: str, max_prospectus: int = 3, max_supplements: int = 3) -> int: """Fetch filing metadata for a single CIK and store in database.""" url = f"{DATA_SEC}/submissions/CIK{cik}.json" resp = _throttled_get(url) data = resp.json() recent = data.get("filings", {}).get("recent", {}) accessions = recent.get("accessionNumber", []) forms = recent.get("form", []) dates = recent.get("filingDate", []) docs = recent.get("primaryDocument", []) descs = recent.get("primaryDocDescription", []) prospectus_count = 0 supplement_count = 0 total_saved = 0 for i in range(len(accessions)): form_type = forms[i] if form_type not in PROSPECTUS_FORM_TYPES: continue is_prospectus = form_type in ("485BPOS", "485APOS", "N-1A", "N-1A/A") if is_prospectus and prospectus_count >= max_prospectus: continue if not is_prospectus and supplement_count >= max_supplements: continue accession_no_dashes = accessions[i].replace("-", "") cik_int = str(int(cik)) doc_url = f"{ARCHIVES}/{cik_int}/{accession_no_dashes}/{docs[i]}" db.insert_filing( accession_number=accessions[i], cik=cik, form_type=form_type, filing_date=dates[i], primary_document=docs[i], document_url=doc_url, description=descs[i] if i < len(descs) else "", ) total_saved += 1 if is_prospectus: prospectus_count += 1 else: supplement_count += 1 return total_saved def download_filing_content(url: str, max_bytes: int = 30_000_000) -> tuple[str, str]: """ Download a filing document. Returns (plain_text, raw_html). Raw HTML is preserved so table structure, inline XBRL tags, and other markup remain available for downstream processing. """ try: resp = _throttled_get(url, stream=True) content_type = resp.headers.get("Content-Type", "") if "pdf" in content_type.lower(): return "[PDF — binary content not extracted]", "" raw = resp.content[:max_bytes] html = raw.decode("utf-8", errors="replace") soup = BeautifulSoup(html, "lxml") for tag in soup(["script", "style", "meta", "link"]): tag.decompose() plain = soup.get_text(separator="\n", strip=True) plain = re.sub(r"\n{3,}", "\n\n", plain) return plain, html except Exception as e: log.warning("Failed to download %s: %s", url, e) return "", "" def fetch_filings_for_universe(db: FundDatabase, ciks: list[str] = None, limit: int = 0, download_text: bool = True, max_prospectus: int = 3, max_supplements: int = 3): """Fetch filing metadata and text for a list of CIKs.""" if ciks is None: ciks = db.get_pending_ciks("fetch_filings") if limit > 0: ciks = ciks[:limit] log.info("Fetching filings for %d CIKs", len(ciks)) total_filings = 0 for cik in tqdm(ciks, desc="Fetching filings"): status = db.get_pipeline_status(cik, "fetch_filings") if status == "done": continue db.set_pipeline_status(cik, "fetch_filings", "running") try: count = fetch_filing_metadata(db, cik, max_prospectus, max_supplements) db.set_pipeline_status(cik, "fetch_filings", "done", items_processed=count) total_filings += count except Exception as e: db.set_pipeline_status(cik, "fetch_filings", "error", error_message=str(e)[:500]) log.warning("Failed for CIK %s: %s", cik, e) log.info("Saved %d filing records", total_filings) if download_text: download_pending_texts(db, limit=limit * 10 if limit else 0) return total_filings def download_pending_texts(db: FundDatabase, limit: int = 0): """Download text + raw HTML for filings that don't have text yet.""" with db.conn() as c: query = """ SELECT f.accession_number, f.document_url, f.form_type FROM filing f LEFT JOIN filing_text ft ON f.accession_number = ft.accession_number WHERE ft.accession_number IS NULL AND f.document_url IS NOT NULL AND f.document_url != '' """ if limit > 0: query += f" LIMIT {limit}" rows = c.execute(query).fetchall() log.info("Downloading text + HTML for %d filings", len(rows)) for row in tqdm(rows, desc="Downloading filings"): acc = row["accession_number"] url = row["document_url"] plain, html = download_filing_content(url) if plain: db.save_filing_text(acc, plain, html_content=html if html else None) log.debug(" %s → %d text chars, %d HTML chars", acc, len(plain), len(html)) def backfill_html(db: FundDatabase, limit: int = 0): """Re-download raw HTML for filings that have text but no HTML stored.""" with db.conn() as c: query = """ SELECT f.accession_number, f.document_url, f.form_type FROM filing f JOIN filing_text ft ON f.accession_number = ft.accession_number LEFT JOIN filing_html fh ON f.accession_number = fh.accession_number WHERE fh.accession_number IS NULL AND f.document_url IS NOT NULL AND f.document_url != '' """ if limit > 0: query += f" LIMIT {limit}" rows = c.execute(query).fetchall() log.info("Backfilling HTML for %d filings", len(rows)) for row in tqdm(rows, desc="Backfilling HTML"): acc = row["accession_number"] url = row["document_url"] plain, html = download_filing_content(url) if html: db.save_filing_text(acc, plain, html_content=html) log.debug(" %s → %d HTML chars", acc, len(html)) def main(): import argparse logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") parser = argparse.ArgumentParser(description="Fetch SEC fund filings") parser.add_argument("--db", default="fund_data.db", help="Database path") parser.add_argument("--limit", type=int, default=0, help="Max CIKs to process (0=all pending)") parser.add_argument("--ciks", nargs="*", help="Specific CIKs to fetch") parser.add_argument("--max-prospectus", type=int, default=3, help="Max prospectus filings per trust") parser.add_argument("--max-supplements", type=int, default=3, help="Max supplement filings per trust") parser.add_argument("--no-text", action="store_true", help="Skip downloading filing text (metadata only)") parser.add_argument("--text-only", action="store_true", help="Only download text for existing filings") parser.add_argument("--backfill-html", action="store_true", help="Re-download HTML for filings missing raw HTML") args = parser.parse_args() db = FundDatabase(args.db) if args.backfill_html: backfill_html(db, limit=args.limit) elif args.text_only: download_pending_texts(db, limit=args.limit) else: ciks = [c.zfill(10) for c in args.ciks] if args.ciks else None fetch_filings_for_universe( db, ciks=ciks, limit=args.limit, download_text=not args.no_text, max_prospectus=args.max_prospectus, max_supplements=args.max_supplements, ) stats = db.get_stats() print(f"\nDatabase stats:") print(f" Filings: {stats['filing']:>10,}") print(f" Filing texts: {stats['filing_text']:>10,}") if __name__ == "__main__": main()