"""
Step 1: Build the fund universe from SEC Series/Class CSV.

Downloads the SEC Investment Company Series and Class Information CSV,
parses it, and loads all trusts, series, and share classes into the
SQLite database.

This gives us the full universe of ~15K trusts, ~50K series (funds),
and ~100K+ share classes.
"""

import argparse
import csv
import io
import logging
import re
import time
from typing import Optional

import requests
from tqdm import tqdm

from fund_db import FundDatabase

log = logging.getLogger(__name__)

SEC_BASE = "https://www.sec.gov"
DATA_SEC = "https://data.sec.gov"
SERIES_CLASS_CSV_URL = (
    "https://www.sec.gov/files/investment/data/other/"
    "investment-company-series-class-information/"
    "investment-company-series-class-2025.csv"
)

USER_AGENT = "FundDataResearch/1.0 research@university.edu"
HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"}

# Minimum spacing between outgoing requests, in seconds (~8 requests/second).
# NOTE(review): presumably chosen to stay under SEC's fair-access rate limit
# -- confirm against the current SEC guidance.
REQUEST_INTERVAL = 0.12
_last_request_time = 0.0

# Four-digit year immediately before the ".csv" suffix of the download URL.
_CSV_YEAR_RE = re.compile(r"(\d{4})\.csv$", re.IGNORECASE)


def _throttled_get(url: str, **kwargs) -> requests.Response:
    """GET *url*, keeping at least REQUEST_INTERVAL seconds between requests.

    Args:
        url: Address to fetch.
        **kwargs: Passed through to ``requests.get``. ``timeout`` defaults to
            60 seconds. Any ``headers`` supplied are merged with the module
            level HEADERS; on a key clash HEADERS wins.

    Returns:
        The response object, already checked with ``raise_for_status()``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status.
    """
    global _last_request_time

    # A monotonic clock is used so that a wall-clock adjustment can never
    # produce a negative "elapsed" value and therefore an oversized sleep.
    remaining = REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
    if remaining > 0:
        time.sleep(remaining)

    # Build a fresh dict rather than updating the caller's in place.
    kwargs["headers"] = {**(kwargs.get("headers") or {}), **HEADERS}
    kwargs.setdefault("timeout", 60)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Record the attempt even when it fails, so that a run of failing
        # requests is still spaced out.
        _last_request_time = time.monotonic()

    resp.raise_for_status()
    return resp


def _field(row: dict, column: str) -> str:
    """Return the stripped value of *column* in *row*, or "" if absent.

    ``csv.DictReader`` fills the missing trailing fields of a short row with
    ``None``, so the value is normalised before ``strip()`` is called.
    """
    return (row.get(column) or "").strip()


def _csv_year_label(csv_url: str, default: str = "2025") -> str:
    """Return the year embedded in a ``...-YYYY.csv`` URL, else *default*."""
    found = _CSV_YEAR_RE.search(csv_url)
    return found.group(1) if found else default


def load_series_class_csv(
    db: FundDatabase, csv_url: str = SERIES_CLASS_CSV_URL
) -> tuple[int, int, int]:
    """Download SEC Series/Class CSV and load into the database.

    Each row is read by header name. A trust is upserted the first time its
    CIK is seen, a series the first time its Series ID is seen, and a share
    class for every row that carries a Class ID. Rows without a usable CIK
    are skipped; rows without a Series ID contribute only their trust.

    Args:
        db: Database wrapper that receives the upserts.
        csv_url: Location of the CSV to download.

    Returns:
        ``(trust_count, series_count, class_count)`` for this run.

    Raises:
        requests.RequestException: If the download fails.
    """
    log.info("Downloading SEC Series/Class CSV from %s", csv_url)
    resp = _throttled_get(csv_url)
    # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading BOM,
    # which would otherwise become part of the first column's header name.
    resp.encoding = "utf-8-sig"
    reader = csv.DictReader(io.StringIO(resp.text))

    trust_count = 0
    series_count = 0
    class_count = 0
    seen_trusts = set()
    seen_series = set()

    for row in reader:
        # CIKs are zero-padded to 10 digits; a blank or all-zero value pads
        # to ten zeros and is treated as "no CIK".
        cik = _field(row, "CIK Number").zfill(10)
        if cik == "0" * 10:
            continue

        if cik not in seen_trusts:
            db.upsert_trust_simple(
                cik=cik,
                trust_name=_field(row, "Entity Name"),
                file_number=_field(row, "Reporting File Number"),
                entity_type=_field(row, "Entity Org Type"),
            )
            seen_trusts.add(cik)
            trust_count += 1

        series_id = _field(row, "Series ID")
        if not series_id:
            continue

        if series_id not in seen_series:
            db.upsert_series(
                series_id=series_id,
                cik=cik,
                series_name=_field(row, "Series Name"),
            )
            seen_series.add(series_id)
            series_count += 1

        class_id = _field(row, "Class ID")
        if class_id:
            db.upsert_share_class(
                class_id=class_id,
                series_id=series_id,
                cik=cik,
                class_name=_field(row, "Class Name"),
                ticker=_field(row, "Class Ticker"),
            )
            class_count += 1

    log.info(
        "Loaded %d trusts, %d series, %d share classes from CSV",
        trust_count, series_count, class_count,
    )
    # The label used to be the literal "2025", matching the default URL; it
    # is now taken from the URL so that another year's file is not recorded
    # under the wrong label.
    db.record_bulk_download(
        "series_class_csv", _csv_year_label(csv_url), csv_url, class_count
    )
    return trust_count, series_count, class_count


def enrich_from_submissions_api(
    db: FundDatabase, ciks: Optional[list[str]] = None, limit: int = 0
):
    """
    Enrich trust records with data from the SEC Submissions API.
    Adds: fiscal year end, SIC code, state of incorporation, website.

    For each CIK not already marked "done" for the "enrich_submissions"
    step, fetches ``{DATA_SEC}/submissions/CIK{cik}.json``, upserts the trust
    with the returned fields (including the name), and stores the SIC code
    when one is present. A failure for one CIK is recorded as an "error"
    status and logged; the loop then moves on to the next CIK.

    Args:
        db: Database wrapper holding the trust and pipeline-status records.
        ciks: CIKs to process; when None, ``db.get_all_ciks()`` is used.
        limit: If greater than zero, only the first *limit* CIKs are
            processed.

    Returns:
        Number of trusts enriched during this call.
    """
    if ciks is None:
        ciks = db.get_all_ciks()
    if limit > 0:
        ciks = ciks[:limit]

    log.info("Enriching %d trusts from Submissions API", len(ciks))
    enriched = 0

    for cik in tqdm(ciks, desc="Enriching trusts"):
        # Work completed by an earlier run is not repeated.
        if db.get_pipeline_status(cik, "enrich_submissions") == "done":
            continue

        db.set_pipeline_status(cik, "enrich_submissions", "running")
        try:
            url = f"{DATA_SEC}/submissions/CIK{cik}.json"
            resp = _throttled_get(url)
            data = resp.json()

            db.upsert_trust_simple(
                cik=cik,
                trust_name=data.get("name", ""),
                state_of_inc=data.get("stateOfIncorporation", ""),
                fiscal_year_end=data.get("fiscalYearEnd", ""),
                website=data.get("website", ""),
            )

            # The SIC code is written with a direct UPDATE rather than via
            # upsert_trust_simple.
            sic = data.get("sic", "")
            if sic:
                with db.conn() as c:
                    c.execute(
                        "UPDATE trust SET sic_code=? WHERE cik=?", (sic, cik)
                    )

            db.set_pipeline_status(
                cik, "enrich_submissions", "done", items_processed=1
            )
            enriched += 1

        except Exception as e:
            # Deliberately broad: one bad trust must not abort the batch.
            db.set_pipeline_status(
                cik, "enrich_submissions", "error",
                error_message=str(e)[:500],
            )
            log.warning("Failed to enrich CIK %s: %s", cik, e)

    log.info("Enriched %d trusts", enriched)
    return enriched


def main():
    """Command-line entry point: load the CSV, optionally enrich, print stats."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )

    parser = argparse.ArgumentParser(description="Build SEC fund universe")
    parser.add_argument("--db", default="fund_data.db", help="Database path")
    parser.add_argument("--skip-csv", action="store_true",
                        help="Skip CSV download (if already loaded)")
    parser.add_argument("--enrich", action="store_true",
                        help="Enrich trusts from Submissions API")
    parser.add_argument("--enrich-limit", type=int, default=0,
                        help="Max trusts to enrich (0=all)")
    args = parser.parse_args()

    db = FundDatabase(args.db)

    if not args.skip_csv:
        t, s, c = load_series_class_csv(db)
        print(f"\nUniverse loaded: {t:,} trusts, {s:,} series, {c:,} share classes")

    if args.enrich:
        n = enrich_from_submissions_api(db, limit=args.enrich_limit)
        print(f"Enriched {n:,} trusts from Submissions API")

    stats = db.get_stats()
    print("\nDatabase stats:")
    for table, count in stats.items():
        # Empty tables are left out of the summary.
        if count > 0:
            print(f" {table:30s} {count:>10,}")


if __name__ == "__main__":
    main()