# fund_rfid_data/fetch_filings.py

"""
Step 2: Fetch prospectus filings for the fund universe.

For each trust (CIK) in the database, fetch filing history from the
SEC Submissions API and download prospectus documents (485BPOS, 485APOS,
497, 497K, N-1A, N-1A/A). Stores filing metadata, extracted text, and raw
HTML in the database.
"""

import logging
import re
import time
import warnings
from typing import Optional

import requests
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
from tqdm import tqdm

warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

from fund_db import FundDatabase

log = logging.getLogger(__name__)

# Base URL of the SEC JSON API; the per-CIK submissions URL is built from it.
DATA_SEC = "https://data.sec.gov"
# Base URL for filing documents, addressed as <cik>/<accession>/<file name>.
ARCHIVES = "https://www.sec.gov/Archives/edgar/data"
# Form types kept by fetch_filing_metadata; every other form type is skipped.
PROSPECTUS_FORM_TYPES = {"485BPOS", "485APOS", "497", "497K", "N-1A", "N-1A/A"}

# Identification headers merged into every request sent by _throttled_get.
USER_AGENT = "FundDataResearch/1.0 research@university.edu"
HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"}
# Minimum spacing between consecutive requests, in seconds (~8 requests/s).
REQUEST_INTERVAL = 0.12

# Timestamp of the most recent request; maintained by _throttled_get.
_last_request_time = 0.0


def _throttled_get(url: str, **kwargs) -> requests.Response:
    """
    GET ``url`` while keeping at least REQUEST_INTERVAL seconds between requests.

    The module-level HEADERS are merged over any caller-supplied ``headers``
    (HEADERS win on conflicts) without mutating the caller's mapping, and a
    30-second timeout is applied unless the caller passes its own. All other
    keyword arguments are forwarded to ``requests.get`` unchanged.

    Returns:
        The response, already checked with ``raise_for_status()``.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: the request itself failed (connection
            error, timeout, ...).
    """
    global _last_request_time

    # Use the monotonic clock so wall-clock adjustments (NTP, DST) can neither
    # skip the pause nor turn it into a very long sleep.
    elapsed = time.monotonic() - _last_request_time
    if elapsed < REQUEST_INTERVAL:
        # Never wait longer than one interval, whatever the stored timestamp is.
        time.sleep(min(REQUEST_INTERVAL - elapsed, REQUEST_INTERVAL))

    # Build a fresh dict: the caller's headers object must not be modified,
    # and an explicit headers=None is treated like "no extra headers".
    kwargs["headers"] = {**(kwargs.get("headers") or {}), **HEADERS}
    kwargs.setdefault("timeout", 30)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Record the attempt even when it raised, so that a retry right after
        # a connection failure is throttled as well.
        _last_request_time = time.monotonic()

    resp.raise_for_status()
    return resp


def fetch_filing_metadata(db: FundDatabase, cik: str,
                          max_prospectus: int = 3,
                          max_supplements: int = 3) -> int:
    """
    Fetch filing metadata for a single CIK and store it in the database.

    Reads the ``filings.recent`` section of the SEC submissions JSON for
    ``cik`` and walks it in the order the API returns it. Entries whose form
    type is in PROSPECTUS_FORM_TYPES are stored through ``db.insert_filing``:
    at most ``max_prospectus`` prospectus-type filings (485BPOS, 485APOS,
    N-1A, N-1A/A) and at most ``max_supplements`` filings of the remaining
    accepted types.

    Args:
        db: Database the filing rows are written to.
        cik: CIK of the trust, as used in the submissions URL.
        max_prospectus: Cap on stored prospectus-type filings.
        max_supplements: Cap on stored filings of the other accepted types.

    Returns:
        Number of filings passed to ``db.insert_filing``.

    Raises:
        requests.RequestException: the submissions request failed.
        ValueError: ``cik`` is not numeric, or the response's parallel arrays
            are shorter than its accession list.
    """
    url = f"{DATA_SEC}/submissions/CIK{cik}.json"
    resp = _throttled_get(url)
    data = resp.json()

    recent = data.get("filings", {}).get("recent", {})
    accessions = recent.get("accessionNumber", [])
    forms = recent.get("form", [])
    dates = recent.get("filingDate", [])
    docs = recent.get("primaryDocument", [])
    descs = recent.get("primaryDocDescription", [])

    # The arrays are parallel; fail up front with a clear message instead of
    # hitting an IndexError after some rows have already been inserted.
    if min(len(forms), len(dates), len(docs)) < len(accessions):
        raise ValueError(
            f"Inconsistent submissions data for CIK {cik}: "
            f"{len(accessions)} accessions, {len(forms)} forms, "
            f"{len(dates)} dates, {len(docs)} documents"
        )

    # Archive paths use the CIK without leading zeros; loop-invariant.
    cik_int = str(int(cik))
    prospectus_forms = ("485BPOS", "485APOS", "N-1A", "N-1A/A")

    prospectus_count = 0
    supplement_count = 0
    total_saved = 0

    rows = zip(accessions, forms, dates, docs)
    for i, (accession, form_type, filing_date, primary_doc) in enumerate(rows):
        # Both quotas full: nothing further down the history can be stored.
        if (prospectus_count >= max_prospectus
                and supplement_count >= max_supplements):
            break

        if form_type not in PROSPECTUS_FORM_TYPES:
            continue

        is_prospectus = form_type in prospectus_forms
        if is_prospectus and prospectus_count >= max_prospectus:
            continue
        if not is_prospectus and supplement_count >= max_supplements:
            continue

        if primary_doc:
            accession_no_dashes = accession.replace("-", "")
            doc_url = f"{ARCHIVES}/{cik_int}/{accession_no_dashes}/{primary_doc}"
        else:
            # Without a primary document the URL would point at the filing's
            # directory; store an empty URL so the text downloader's
            # "document_url != ''" filter skips this row.
            doc_url = ""

        db.insert_filing(
            accession_number=accession,
            cik=cik,
            form_type=form_type,
            filing_date=filing_date,
            primary_document=primary_doc,
            document_url=doc_url,
            description=descs[i] if i < len(descs) else "",
        )
        total_saved += 1

        if is_prospectus:
            prospectus_count += 1
        else:
            supplement_count += 1

    return total_saved


def download_filing_content(url: str, max_bytes: int = 30_000_000) -> tuple[str, str]:
    """
    Download a filing document and return ``(plain_text, raw_html)``.

    Raw HTML is preserved so table structure, inline XBRL tags,
    and other markup remain available for downstream processing.

    At most ``max_bytes`` bytes of the body are read from the network; the
    rest of an oversized document is never downloaded. The bytes are decoded
    as UTF-8 with undecodable sequences replaced. Plain text is produced by
    dropping script/style/meta/link elements and collapsing runs of three or
    more newlines to a single blank line.

    Returns:
        ``(plain_text, raw_html)`` on success;
        ``("[PDF — binary content not extracted]", "")`` when the response's
        Content-Type mentions PDF;
        ``("", "")`` when anything fails (the failure is logged as a warning).
    """
    try:
        resp = _throttled_get(url, stream=True)
        try:
            content_type = resp.headers.get("Content-Type", "")

            if "pdf" in content_type.lower():
                return "[PDF — binary content not extracted]", ""

            # Read the stream in chunks and stop at the cap. Accessing
            # resp.content would fetch the whole body before any slicing.
            chunks = []
            remaining = max_bytes
            if remaining > 0:
                for chunk in resp.iter_content(chunk_size=64 * 1024):
                    chunks.append(chunk[:remaining])
                    remaining -= len(chunk)
                    if remaining <= 0:
                        break
            raw = b"".join(chunks)
        finally:
            # A streamed response holds its connection until closed; this also
            # covers the PDF early return and a download stopped at the cap.
            resp.close()

        # errors="replace" also absorbs a multi-byte character cut by the cap.
        html = raw.decode("utf-8", errors="replace")

        soup = BeautifulSoup(html, "lxml")
        for tag in soup(["script", "style", "meta", "link"]):
            tag.decompose()

        plain = soup.get_text(separator="\n", strip=True)
        plain = re.sub(r"\n{3,}", "\n\n", plain)

        return plain, html
    except Exception as e:
        # Deliberate best effort: one bad document must not stop a batch run.
        log.warning("Failed to download %s: %s", url, e)
        return "", ""


def fetch_filings_for_universe(db: FundDatabase,
                                ciks: list[str] = None,
                                limit: int = 0,
                                download_text: bool = True,
                                max_prospectus: int = 3,
                                max_supplements: int = 3):
    """
    Fetch filing metadata (and optionally text) for a set of CIKs.

    When ``ciks`` is None the CIKs pending for the "fetch_filings" stage are
    taken from the database. A positive ``limit`` keeps only the first
    ``limit`` CIKs. CIKs whose stage status is already "done" are skipped;
    every other CIK is marked "running", then "done" (with the number of
    saved filings) or "error" (with the message truncated to 500 characters).
    When ``download_text`` is true, pending filing texts are downloaded
    afterwards with a limit of ``limit * 10`` (0 meaning no limit).

    Returns the total number of filing records saved.
    """
    stage = "fetch_filings"

    targets = db.get_pending_ciks(stage) if ciks is None else ciks
    if limit > 0:
        targets = targets[:limit]

    log.info("Fetching filings for %d CIKs", len(targets))
    saved_total = 0

    for cik in tqdm(targets, desc="Fetching filings"):
        if db.get_pipeline_status(cik, stage) == "done":
            continue

        db.set_pipeline_status(cik, stage, "running")
        try:
            saved = fetch_filing_metadata(db, cik, max_prospectus, max_supplements)
            db.set_pipeline_status(cik, stage, "done", items_processed=saved)
        except Exception as exc:
            db.set_pipeline_status(cik, stage, "error",
                                   error_message=str(exc)[:500])
            log.warning("Failed for CIK %s: %s", cik, exc)
        else:
            saved_total += saved

    log.info("Saved %d filing records", saved_total)

    if download_text:
        # limit == 0 yields 0 here, which the downloader treats as "no limit".
        download_pending_texts(db, limit=limit * 10)

    return saved_total


def download_pending_texts(db: FundDatabase, limit: int = 0):
    """
    Download text and raw HTML for filings that have no stored text yet.

    Selects filings with a non-empty document URL and no matching
    ``filing_text`` row (at most ``limit`` of them when ``limit`` is
    positive), downloads each one, and saves the result when any plain text
    was extracted. HTML is passed along only when it is non-empty.
    """
    sql = """
            SELECT f.accession_number, f.document_url, f.form_type
            FROM filing f
            LEFT JOIN filing_text ft ON f.accession_number = ft.accession_number
            WHERE ft.accession_number IS NULL
              AND f.document_url IS NOT NULL
              AND f.document_url != ''
        """
    if limit > 0:
        sql = f"{sql} LIMIT {limit}"

    with db.conn() as conn:
        pending = conn.execute(sql).fetchall()

    log.info("Downloading text + HTML for %d filings", len(pending))

    for record in tqdm(pending, desc="Downloading filings"):
        accession = record["accession_number"]
        text, markup = download_filing_content(record["document_url"])
        if not text:
            # Nothing extracted (download failed or empty document).
            continue

        db.save_filing_text(accession, text, html_content=markup or None)
        log.debug("  %s → %d text chars, %d HTML chars",
                  accession, len(text), len(markup))


def backfill_html(db: FundDatabase, limit: int = 0):
    """
    Re-download raw HTML for filings that have text but no HTML stored.

    Selects filings with a non-empty document URL that have a ``filing_text``
    row but no ``filing_html`` row (at most ``limit`` of them when ``limit``
    is positive). Each one is downloaded again and, when HTML came back, the
    freshly extracted text and the HTML are saved together.
    """
    sql = """
            SELECT f.accession_number, f.document_url, f.form_type
            FROM filing f
            JOIN filing_text ft ON f.accession_number = ft.accession_number
            LEFT JOIN filing_html fh ON f.accession_number = fh.accession_number
            WHERE fh.accession_number IS NULL
              AND f.document_url IS NOT NULL
              AND f.document_url != ''
        """
    if limit > 0:
        sql = f"{sql} LIMIT {limit}"

    with db.conn() as conn:
        missing = conn.execute(sql).fetchall()

    log.info("Backfilling HTML for %d filings", len(missing))

    for record in tqdm(missing, desc="Backfilling HTML"):
        accession = record["accession_number"]
        text, markup = download_filing_content(record["document_url"])
        if not markup:
            # No HTML came back (failed download or PDF); leave the row as is.
            continue

        db.save_filing_text(accession, text, html_content=markup)
        log.debug("  %s → %d HTML chars", accession, len(markup))


def main():
    """
    Command-line entry point.

    Parses the options, opens the database, and runs exactly one mode:
    ``--backfill-html`` takes precedence, then ``--text-only``, otherwise the
    full metadata fetch (plus text download unless ``--no-text`` is given).
    Finishes by printing the filing and filing-text counts.
    """
    import argparse

    logging.basicConfig(format="%(asctime)s [%(levelname)s] %(message)s",
                        level=logging.INFO)

    cli = argparse.ArgumentParser(description="Fetch SEC fund filings")
    cli.add_argument("--db", default="fund_data.db", help="Database path")
    cli.add_argument("--limit", type=int, default=0,
                     help="Max CIKs to process (0=all pending)")
    cli.add_argument("--ciks", nargs="*", help="Specific CIKs to fetch")
    cli.add_argument("--max-prospectus", type=int, default=3,
                     help="Max prospectus filings per trust")
    cli.add_argument("--max-supplements", type=int, default=3,
                     help="Max supplement filings per trust")
    cli.add_argument("--no-text", action="store_true",
                     help="Skip downloading filing text (metadata only)")
    cli.add_argument("--text-only", action="store_true",
                     help="Only download text for existing filings")
    cli.add_argument("--backfill-html", action="store_true",
                     help="Re-download HTML for filings missing raw HTML")
    opts = cli.parse_args()

    db = FundDatabase(opts.db)

    if opts.backfill_html:
        backfill_html(db, limit=opts.limit)
    elif opts.text_only:
        download_pending_texts(db, limit=opts.limit)
    else:
        # CIKs given on the command line are zero-padded to 10 digits; an
        # absent or empty --ciks means "use the pending CIKs from the database".
        selected = None
        if opts.ciks:
            selected = [raw.zfill(10) for raw in opts.ciks]
        fetch_filings_for_universe(
            db,
            ciks=selected,
            limit=opts.limit,
            download_text=not opts.no_text,
            max_prospectus=opts.max_prospectus,
            max_supplements=opts.max_supplements,
        )

    counts = db.get_stats()
    print("\nDatabase stats:")
    for label, key in (("Filings:", "filing"), ("Filing texts:", "filing_text")):
        print(f"  {label:<15}{counts[key]:>10,}")


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()