"""
Step 5: Load N-CEN data (service providers, fund classification, ETF info).

Downloads quarterly N-CEN data set ZIPs from SEC, parses the TSV files,
and loads fund classification and service provider data into the database.
"""

import argparse
import csv
import logging
import shutil
import time
import zipfile
from pathlib import Path

import requests
from tqdm import tqdm

from fund_db import FundDatabase

log = logging.getLogger(__name__)

USER_AGENT = "FundDataResearch/1.0 research@university.edu"
HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"}
# Minimum number of seconds between two outgoing requests.
REQUEST_INTERVAL = 0.12
# time.monotonic() reading taken when the previous request finished.
_last_request_time = 0.0

NCEN_BASE_URL = "https://www.sec.gov/files/dera/data/form-n-cen-data-sets"

# Suffixes treated as "looks like a TSV" when searching an extracted data set.
_TSV_SUFFIXES = (".tsv", ".txt", "")

_ETF_INSERT_SQL = """
    INSERT OR REPLACE INTO ncen_fund_info
        (accession_number, cik, series_id, report_period,
         is_etf, index_name, fund_type)
    VALUES (?, ?, ?, ?, 1, ?, 'ETF')
"""

# NOTE(review): this statement combines OR REPLACE with an ON CONFLICT upsert
# clause. It relies on ncen_fund_info having a UNIQUE / PRIMARY KEY constraint
# on (accession_number, series_id) -- confirm against the schema in fund_db.
_INDEX_INSERT_SQL = """
    INSERT OR REPLACE INTO ncen_fund_info
        (accession_number, cik, series_id, report_period,
         is_index_fund, index_name)
    VALUES (?, ?, ?, ?, 1, ?)
    ON CONFLICT(accession_number, series_id) DO UPDATE SET
        is_index_fund=1,
        index_name=COALESCE(NULLIF(excluded.index_name,''), ncen_fund_info.index_name)
"""

_SERVICE_PROVIDER_INSERT_SQL = """
    INSERT OR IGNORE INTO ncen_service_provider
        (cik, report_period, provider_role, provider_name, provider_lei)
    VALUES (?, ?, ?, ?, ?)
"""

# Columns tried, in order, when looking for a service provider's name.
_PROVIDER_NAME_COLUMNS = ("NAME", "COMPANY_NAME", "CUSTODIAN_NAME", "FIRM_NAME")


def _throttled_get(url: str, **kwargs) -> requests.Response:
    """GET *url*, waiting so requests are at least REQUEST_INTERVAL apart.

    The module-level HEADERS are merged over any caller-supplied ``headers``
    and a 120 second timeout is applied unless the caller passes one.

    Args:
        url: Address to fetch.
        **kwargs: Forwarded to ``requests.get``.

    Returns:
        The response, already checked with ``raise_for_status()``.

    Raises:
        requests.RequestException: On connection failure or HTTP error status.
    """
    global _last_request_time

    # A monotonic clock keeps the computed wait within [0, REQUEST_INTERVAL]
    # even if the wall clock is adjusted between two calls.
    wait = REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
    if wait > 0:
        time.sleep(wait)

    # Build a fresh dict so the caller's own mapping is never mutated and a
    # ``headers=None`` argument is tolerated.
    kwargs["headers"] = {**(kwargs.get("headers") or {}), **HEADERS}
    kwargs.setdefault("timeout", 120)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Failed attempts count towards the rate limit as well.
        _last_request_time = time.monotonic()
    resp.raise_for_status()
    return resp


def download_ncen_zip(quarter: str, output_dir: str = "data/ncen") -> Path:
    """Download a quarterly N-CEN ZIP from SEC and extract it.

    The archive is stored as ``<output_dir>/<quarter>_ncen.zip`` and unpacked
    into ``<output_dir>/<quarter>``. If that directory already exists and is
    non-empty, nothing is downloaded.

    Args:
        quarter: Quarter label used in the file name (e.g. ``2025q3``).
        output_dir: Directory for the archive and the extracted files.

    Returns:
        Path of the directory holding the extracted files.

    Raises:
        requests.RequestException: If the download fails.
        zipfile.BadZipFile: If the downloaded file is not a valid ZIP.
        OSError: On file system errors.
    """
    url = f"{NCEN_BASE_URL}/{quarter}_ncen.zip"
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    zip_path = out / f"{quarter}_ncen.zip"
    extract_dir = out / quarter

    if extract_dir.exists() and any(extract_dir.iterdir()):
        log.info("Already extracted: %s", extract_dir)
        return extract_dir

    log.info("Downloading N-CEN data set: %s", url)
    # Stream into a ".part" file and rename once complete, so an interrupted
    # download never leaves a truncated archive under the final name.
    partial = zip_path.with_name(zip_path.name + ".part")
    try:
        with _throttled_get(url, stream=True) as resp, open(partial, "wb") as fp:
            for chunk in resp.iter_content(chunk_size=65536):
                if chunk:
                    fp.write(chunk)
        partial.replace(zip_path)
    finally:
        partial.unlink(missing_ok=True)

    # Unpack into a staging directory first: the early return above treats any
    # non-empty extract_dir as complete, so a half-finished extraction must
    # never become visible under that name.
    staging = out / f"{quarter}.extracting"
    shutil.rmtree(staging, ignore_errors=True)
    try:
        staging.mkdir(parents=True)
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(staging)
        if extract_dir.exists():
            # Only reachable when extract_dir exists but is empty.
            extract_dir.rmdir()
        staging.replace(extract_dir)
    finally:
        # No-op after a successful rename; removes leftovers after a failure.
        shutil.rmtree(staging, ignore_errors=True)

    log.info("Extracted to %s", extract_dir)
    return extract_dir


def _find_tsv(extract_dir: Path, pattern: str) -> Path:
    """Locate the data file under *extract_dir* whose name contains *pattern*.

    Several files can contain the same substring (a search for ``ADVISOR``
    also matches ``SUB_ADVISOR``), so the matches are ranked to make the
    choice deterministic: files with a .tsv/.txt/no suffix first, then an
    exact (case-insensitive) stem match, then the shortest file name.

    Args:
        extract_dir: Root directory searched recursively.
        pattern: Substring the file name must contain.

    Returns:
        The best matching file, or ``extract_dir / f"{pattern}.tsv"`` (which
        may not exist) when nothing matches.
    """
    wanted = pattern.casefold()

    def rank(path: Path) -> tuple:
        return (
            path.suffix.lower() not in _TSV_SUFFIXES,
            path.stem.casefold() != wanted,
            len(path.name),
            str(path),
        )

    # Directories can match the glob too; only regular files are readable.
    matches = [p for p in extract_dir.rglob(f"*{pattern}*") if p.is_file()]
    if matches:
        return min(matches, key=rank)
    return extract_dir / f"{pattern}.tsv"


def _read_tsv(filepath: Path) -> list[dict]:
    """Read a tab-separated file into a list of dicts keyed by header name.

    Args:
        filepath: File to read.

    Returns:
        One dict per data row; an empty list (with a warning logged) when the
        file does not exist.
    """
    if not filepath.exists():
        log.warning("File not found: %s", filepath)
        return []
    # "utf-8-sig" drops a leading BOM, which would otherwise become part of
    # the first header name; newline="" is what the csv module asks for.
    with open(filepath, "r", encoding="utf-8-sig", errors="replace", newline="") as f:
        return list(csv.DictReader(f, delimiter="\t"))


def _field(row: dict, *names: str) -> str:
    """Return the first non-empty, stripped value among *names* in *row*.

    csv.DictReader stores None for columns missing from a short row, so a
    plain ``row.get(name, "").strip()`` can raise; None is treated as empty.
    """
    for name in names:
        value = (row.get(name) or "").strip()
        if value:
            return value
    return ""


def _normalize_cik(raw: str) -> str:
    """Zero-pad a CIK to 10 characters, keeping an empty value empty.

    Padding unconditionally would turn a missing CIK into "0000000000", which
    is truthy and would slip past the loaders' ``if not cik`` guards.
    """
    return raw.zfill(10) if raw else ""


def _insert_rows(db: FundDatabase, sql: str, param_rows: list, label: str) -> int:
    """Execute *sql* once per parameter tuple and return the success count.

    Failures stay best-effort: each is logged at DEBUG and the loop continues,
    but a WARNING with the failure total is emitted so problems are visible at
    the default log level.

    Args:
        db: Database wrapper; ``db.conn()`` is used as a context manager.
        sql: Parameterized statement to execute.
        param_rows: Parameter tuples, one per statement execution.
        label: Short name used in log messages.

    Returns:
        Number of statements that executed without raising.
    """
    inserted = 0
    failed = 0
    with db.conn() as c:
        for params in param_rows:
            try:
                c.execute(sql, params)
            except Exception as e:
                failed += 1
                log.debug("%s insert error: %s", label, e)
            else:
                inserted += 1
    if failed:
        log.warning("%d %s insert(s) failed; enable DEBUG logging for details",
                    failed, label)
    return inserted


def load_etf_data(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load ETF classification data from N-CEN.

    Rows without a CIK or an accession number are skipped.

    Args:
        db: Target database.
        extract_dir: Directory holding the extracted data set.
        quarter: Quarter label stored in ``report_period``.

    Returns:
        Number of rows written.
    """
    filepath = _find_tsv(extract_dir, "ETF")
    if not filepath.exists():
        log.info("No ETF file found in %s", extract_dir)
        return 0

    param_rows = []
    for row in _read_tsv(filepath):
        cik = _normalize_cik(_field(row, "CIK"))
        acc = _field(row, "ACCESSION_NUMBER")
        if not cik or not acc:
            continue
        param_rows.append((
            acc,
            cik,
            _field(row, "SERIES_ID"),
            quarter,
            _field(row, "INDEX_NAME"),
        ))

    count = _insert_rows(db, _ETF_INSERT_SQL, param_rows, "ETF")
    log.info("Loaded %d ETF records from %s", count, quarter)
    return count


def load_index_data(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load index fund data from N-CEN.

    Rows without a CIK or an accession number are skipped. The index name is
    read from ``INDEX_NAME`` when the file has that column, otherwise from
    ``NAME``.

    Args:
        db: Target database.
        extract_dir: Directory holding the extracted data set.
        quarter: Quarter label stored in ``report_period``.

    Returns:
        Number of rows written.
    """
    filepath = _find_tsv(extract_dir, "INDEX")
    if not filepath.exists():
        log.info("No INDEX file found in %s", extract_dir)
        return 0

    param_rows = []
    for row in _read_tsv(filepath):
        cik = _normalize_cik(_field(row, "CIK"))
        acc = _field(row, "ACCESSION_NUMBER")
        if not cik or not acc:
            continue
        name_column = "INDEX_NAME" if "INDEX_NAME" in row else "NAME"
        param_rows.append((
            acc,
            cik,
            _field(row, "SERIES_ID"),
            quarter,
            _field(row, name_column),
        ))

    count = _insert_rows(db, _INDEX_INSERT_SQL, param_rows, "Index")
    log.info("Loaded %d index fund records from %s", count, quarter)
    return count


def load_service_providers(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load service provider data from various N-CEN tables.

    Each entry of the file-to-role table below is looked up in *extract_dir*;
    files that are absent are skipped. Rows without a CIK or a provider name
    are skipped.

    Args:
        db: Target database.
        extract_dir: Directory holding the extracted data set.
        quarter: Quarter label stored in ``report_period``.

    Returns:
        Total number of rows written across all provider files.
    """
    # NOTE(review): Form N-CEN spells the role "adviser"; confirm that the
    # ADVISOR / SUB_ADVISOR patterns match the file names in the data set.
    provider_files = {
        "CUSTODIAN": "custodian",
        "TRANSFER_AGENT": "transfer_agent",
        "ADVISOR": "adviser",
        "SUB_ADVISOR": "sub_adviser",
        "ADMINISTRATOR": "administrator",
        "AUDITOR": "auditor",
    }

    total = 0
    for filename, role in provider_files.items():
        filepath = _find_tsv(extract_dir, filename)
        if not filepath.exists():
            continue

        param_rows = []
        for row in _read_tsv(filepath):
            cik = _normalize_cik(_field(row, "CIK"))
            name = _field(row, *_PROVIDER_NAME_COLUMNS)
            if not cik or not name:
                continue
            param_rows.append((cik, quarter, role, name, _field(row, "LEI")))

        count = _insert_rows(
            db, _SERVICE_PROVIDER_INSERT_SQL, param_rows, "Service provider"
        )
        log.info("  %s: %d records", role, count)
        total += count

    log.info("Loaded %d total service provider records from %s", total, quarter)
    return total


def main():
    """Command-line entry point: download and load the requested quarters.

    A failure in one quarter is logged with its traceback and does not stop
    the remaining quarters. Table row counts are printed at the end.
    """
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s")

    parser = argparse.ArgumentParser(description="Load N-CEN data")
    parser.add_argument("--db", default="fund_data.db", help="Database path")
    parser.add_argument("--quarters", nargs="+", default=["2025q3"],
                        help="Quarters to download (e.g. 2025q3)")
    parser.add_argument("--data-dir", default="data/ncen",
                        help="Directory for downloaded files")
    args = parser.parse_args()

    db = FundDatabase(args.db)

    for quarter in args.quarters:
        print(f"\n{'='*60}")
        print(f"Processing N-CEN {quarter}")
        print(f"{'='*60}")

        try:
            extract_dir = download_ncen_zip(quarter, args.data_dir)
            load_etf_data(db, extract_dir, quarter)
            load_index_data(db, extract_dir, quarter)
            load_service_providers(db, extract_dir, quarter)
        except Exception as e:
            # Top-level boundary: keep going with the next quarter, but record
            # the traceback so the failure can be diagnosed.
            log.exception("Failed to process %s: %s", quarter, e)

    stats = db.get_stats()
    print("\nDatabase stats:")
    for table, count in stats.items():
        if count > 0:
            print(f"  {table:30s} {count:>10,}")


if __name__ == "__main__":
    main()