# fund_rfid_data/fetch_universe.py

"""
Step 1: Build the fund universe from SEC Series/Class CSV.

Downloads the SEC Investment Company Series and Class Information CSV,
parses it, and loads all trusts, series, and share classes into the
SQLite database. This gives us the full universe of ~15K trusts,
~50K series (funds), and ~100K+ share classes.
"""

import csv
import io
import logging
import time

import requests
from tqdm import tqdm

from fund_db import FundDatabase

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Base hosts: the main SEC site and the data.sec.gov JSON API host.
SEC_BASE = "https://www.sec.gov"
DATA_SEC = "https://data.sec.gov"

# Location of the 2025 Investment Company Series/Class CSV on the main site.
SERIES_CLASS_CSV_URL = (
    f"{SEC_BASE}/files/investment/data/other"
    "/investment-company-series-class-information"
    "/investment-company-series-class-2025.csv"
)

# Headers merged into every request made through _throttled_get.
USER_AGENT = "FundDataResearch/1.0 research@university.edu"
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept-Encoding": "gzip, deflate",
}

# Minimum spacing between requests, in seconds.
# NOTE(review): presumably chosen to stay under the SEC's published
# request-rate limit -- confirm against the current fair-access policy.
REQUEST_INTERVAL = 0.12

# Timestamp of the most recent request; maintained by _throttled_get.
_last_request_time = 0.0


def _throttled_get(url: str, **kwargs) -> requests.Response:
    """GET ``url`` with the module's default headers, rate-limited per process.

    Sleeps as needed so that a request starts at least ``REQUEST_INTERVAL``
    seconds after the previous one finished (successfully or not).

    Args:
        url: Address to fetch.
        **kwargs: Passed through to ``requests.get``. ``timeout`` defaults to
            60 seconds. Entries of ``HEADERS`` are merged into any
            caller-supplied ``headers`` and win on conflicting keys.

    Returns:
        The response, already checked with ``raise_for_status()``.

    Raises:
        requests.HTTPError: If the server answered with an error status.
        requests.RequestException: If the request itself failed
            (connection error, timeout, ...).
    """
    global _last_request_time

    # Use the monotonic clock: wall-clock adjustments (NTP, DST) must not
    # shorten or stretch the pause between requests.
    wait = REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
    if wait > 0:
        time.sleep(wait)

    # Build a fresh dict instead of updating the caller's in place, and
    # tolerate an explicit headers=None.
    kwargs["headers"] = {**(kwargs.get("headers") or {}), **HEADERS}
    kwargs.setdefault("timeout", 60)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Stamp failed attempts too; otherwise a run of connection errors
        # or timeouts would be retried back-to-back with no throttling.
        _last_request_time = time.monotonic()

    resp.raise_for_status()
    return resp


def _csv_field(row: dict, column: str) -> str:
    """Return ``row[column]`` without surrounding whitespace, or ``""``.

    ``csv.DictReader`` fills columns missing from a short row with ``None``,
    so ``row.get(column, "")`` alone does not guarantee a string.
    """
    return (row.get(column) or "").strip()


def load_series_class_csv(db: FundDatabase, csv_url: str = SERIES_CLASS_CSV_URL):
    """Download the SEC Series/Class CSV and load it into the database.

    Each row is read for a trust (keyed by zero-padded 10-digit CIK), a
    series and a share class. A trust or series is upserted the first time
    it is seen in the file; a share class is upserted for every row that
    carries a Class ID. Rows without a usable CIK are skipped entirely, and
    rows without a Series ID contribute only their trust.

    Args:
        db: Database receiving the upserts and the bulk-download record.
        csv_url: Location of the CSV to download.

    Returns:
        ``(trust_count, series_count, class_count)``: distinct trusts,
        distinct series, and rows with a Class ID that were upserted.

    Raises:
        requests.RequestException: If the download fails.
    """
    log.info("Downloading SEC Series/Class CSV from %s", csv_url)
    resp = _throttled_get(csv_url)
    # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
    # byte-order mark, which would otherwise become part of the first
    # header name and make that column unreadable by name.
    resp.encoding = "utf-8-sig"

    reader = csv.DictReader(io.StringIO(resp.text))

    trust_count = 0
    series_count = 0
    class_count = 0
    seen_trusts = set()
    seen_series = set()

    for row in reader:
        cik = _csv_field(row, "CIK Number").zfill(10)
        # A blank CIK pads out to all zeros, so one check covers both the
        # missing and the explicit-zero case.
        if cik == "0" * 10:
            continue

        if cik not in seen_trusts:
            db.upsert_trust_simple(
                cik=cik,
                trust_name=_csv_field(row, "Entity Name"),
                file_number=_csv_field(row, "Reporting File Number"),
                entity_type=_csv_field(row, "Entity Org Type"),
            )
            seen_trusts.add(cik)
            trust_count += 1

        series_id = _csv_field(row, "Series ID")
        if not series_id:
            continue

        if series_id not in seen_series:
            db.upsert_series(
                series_id=series_id,
                cik=cik,
                series_name=_csv_field(row, "Series Name"),
            )
            seen_series.add(series_id)
            series_count += 1

        class_id = _csv_field(row, "Class ID")
        if class_id:
            db.upsert_share_class(
                class_id=class_id,
                series_id=series_id,
                cik=cik,
                class_name=_csv_field(row, "Class Name"),
                ticker=_csv_field(row, "Class Ticker"),
            )
            class_count += 1

    log.info("Loaded %d trusts, %d series, %d share classes from CSV",
             trust_count, series_count, class_count)
    # NOTE(review): the period label is fixed at "2025" even when csv_url
    # points at a different year's file.
    db.record_bulk_download("series_class_csv", "2025", csv_url, class_count)
    return trust_count, series_count, class_count


def enrich_from_submissions_api(db: FundDatabase, ciks: list[str] = None,
                                 limit: int = 0):
    """Enrich trust records with data from the SEC Submissions API.

    For each CIK not yet marked "done" for the ``enrich_submissions`` step,
    fetches ``{DATA_SEC}/submissions/CIK{cik}.json`` and stores the name,
    state of incorporation, fiscal year end, website and (when present) SIC
    code. Progress is recorded per CIK as "running", then "done" or "error",
    so an interrupted run can be resumed.

    Args:
        db: Database holding the trust rows and the pipeline status.
        ciks: CIKs to process; ``None`` means every CIK in the database.
        limit: Maximum number of not-yet-done trusts to attempt in this
            call. ``0`` (or ``None``) means no limit.

    Returns:
        Number of trusts enriched successfully in this call.
    """
    step = "enrich_submissions"
    if ciks is None:
        ciks = db.get_all_ciks()

    # Filter out finished CIKs *before* applying the limit. Slicing first
    # meant a resumed run with a limit kept revisiting the same leading
    # CIKs and never advanced once those were done. dict.fromkeys drops
    # duplicates while keeping order, so no CIK is fetched twice.
    pending = [
        cik for cik in dict.fromkeys(ciks)
        if db.get_pipeline_status(cik, step) != "done"
    ]
    if limit and limit > 0:
        pending = pending[:limit]

    log.info("Enriching %d trusts from Submissions API", len(pending))
    enriched = 0

    for cik in tqdm(pending, desc="Enriching trusts"):
        db.set_pipeline_status(cik, step, "running")
        try:
            resp = _throttled_get(f"{DATA_SEC}/submissions/CIK{cik}.json")
            data = resp.json()

            # `or ""` also maps JSON null to an empty string; .get's default
            # only applies when the key is absent altogether.
            db.upsert_trust_simple(
                cik=cik,
                trust_name=data.get("name") or "",
                state_of_inc=data.get("stateOfIncorporation") or "",
                fiscal_year_end=data.get("fiscalYearEnd") or "",
                website=data.get("website") or "",
            )

            sic = data.get("sic") or ""
            if sic:
                with db.conn() as c:
                    c.execute("UPDATE trust SET sic_code=? WHERE cik=?", (sic, cik))

            db.set_pipeline_status(cik, step, "done", items_processed=1)
            enriched += 1
        except Exception as e:
            # Per-item boundary: one failing CIK is recorded and logged,
            # and the run continues with the next one.
            db.set_pipeline_status(cik, step, "error",
                                   error_message=str(e)[:500])
            log.warning("Failed to enrich CIK %s: %s", cik, e)

    log.info("Enriched %d trusts", enriched)
    return enriched


def main():
    """Command-line entry point: load the universe, optionally enrich it.

    Parses the command line, opens the database, loads the Series/Class CSV
    unless ``--skip-csv`` is given, enriches trusts when ``--enrich`` is
    given, and finally prints the row count of every non-empty table.
    """
    import argparse

    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(message)s",
        level=logging.INFO,
    )

    cli = argparse.ArgumentParser(description="Build SEC fund universe")
    cli.add_argument("--db", default="fund_data.db", help="Database path")
    cli.add_argument(
        "--skip-csv",
        action="store_true",
        help="Skip CSV download (if already loaded)",
    )
    cli.add_argument(
        "--enrich",
        action="store_true",
        help="Enrich trusts from Submissions API",
    )
    cli.add_argument(
        "--enrich-limit",
        type=int,
        default=0,
        help="Max trusts to enrich (0=all)",
    )
    opts = cli.parse_args()

    database = FundDatabase(opts.db)

    if not opts.skip_csv:
        trusts, series, classes = load_series_class_csv(database)
        print(
            f"\nUniverse loaded: {trusts:,} trusts, "
            f"{series:,} series, {classes:,} share classes"
        )

    if opts.enrich:
        enriched = enrich_from_submissions_api(database, limit=opts.enrich_limit)
        print(f"Enriched {enriched:,} trusts from Submissions API")

    table_counts = database.get_stats()
    print("\nDatabase stats:")
    for name, rows in table_counts.items():
        # Empty tables are left out of the summary.
        if not rows > 0:
            continue
        print(f"  {name:30s} {rows:>10,}")


# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()