# fund_rfid_data/load_ncen.py

"""
Step 5: Load N-CEN data (service providers, fund classification, ETF info).

Downloads quarterly N-CEN data set ZIPs from SEC, parses the TSV files,
and loads fund classification and service provider data into the database.
"""

import csv
import logging
import shutil
import time
import zipfile
from pathlib import Path

import requests
from tqdm import tqdm

from fund_db import FundDatabase

# Module-level logger; handlers and level are configured by main().
log = logging.getLogger(__name__)

# Headers that _throttled_get() merges into every outgoing request.
# NOTE(review): presumably a contact-style User-Agent as requested by SEC's
# fair-access guidance -- confirm the address is the intended one.
USER_AGENT = "FundDataResearch/1.0 research@university.edu"
HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"}
# Minimum spacing, in seconds, that _throttled_get() keeps between requests.
REQUEST_INTERVAL = 0.12
# Clock reading taken after the most recent request; 0.0 until the first one.
_last_request_time = 0.0

# Base URL; download_ncen_zip() appends "/<quarter>_ncen.zip" to it.
NCEN_BASE_URL = "https://www.sec.gov/files/dera/data/form-n-cen-data-sets"


def _throttled_get(url: str, **kwargs) -> requests.Response:
    """Issue a rate-limited GET request and return the successful response.

    Sleeps as needed so that at least ``REQUEST_INTERVAL`` seconds separate
    consecutive requests made through this function, then calls
    ``requests.get``.

    Args:
        url: Address to fetch.
        **kwargs: Passed through to ``requests.get``. ``headers`` is merged
            with the module-level ``HEADERS`` (the module values win on a key
            clash); ``timeout`` defaults to 120 seconds when not supplied.

    Returns:
        The response, after ``raise_for_status()`` has succeeded.

    Raises:
        requests.RequestException: On connection failure or a non-2xx status.
    """
    global _last_request_time

    # A monotonic clock cannot jump backwards, so a system clock adjustment
    # can never produce a negative elapsed time and an over-long sleep.
    remaining = REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
    if remaining > 0:
        time.sleep(remaining)

    # Build a fresh dict instead of updating the caller's in place; tolerate
    # an explicit headers=None.
    kwargs["headers"] = {**(kwargs.get("headers") or {}), **HEADERS}
    kwargs.setdefault("timeout", 120)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Record the attempt even when it failed so that retries after an
        # error are still spaced out.
        _last_request_time = time.monotonic()

    try:
        resp.raise_for_status()
    except requests.HTTPError:
        # With stream=True the connection stays checked out until closed.
        resp.close()
        raise
    return resp


def download_ncen_zip(quarter: str, output_dir: str = "data/ncen") -> Path:
    """Download and extract a quarterly N-CEN ZIP from SEC.

    Args:
        quarter: Quarter label used in the file name, e.g. ``"2025q3"``.
        output_dir: Directory that receives ``<quarter>_ncen.zip`` and the
            ``<quarter>/`` extraction directory. Created if missing.

    Returns:
        Path of the directory holding the extracted files. When that
        directory already exists and is non-empty, nothing is downloaded.

    Raises:
        requests.RequestException: If the download fails.
        zipfile.BadZipFile: If the downloaded archive cannot be read.
    """
    url = f"{NCEN_BASE_URL}/{quarter}_ncen.zip"
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    zip_path = out / f"{quarter}_ncen.zip"
    extract_dir = out / quarter

    if extract_dir.exists() and any(extract_dir.iterdir()):
        log.info("Already extracted: %s", extract_dir)
        return extract_dir

    log.info("Downloading N-CEN data set: %s", url)
    # Stream into a ".part" file and rename once complete, so an interrupted
    # download never leaves a truncated archive under the final name.
    part_path = zip_path.with_name(zip_path.name + ".part")
    try:
        with _throttled_get(url, stream=True) as resp, open(part_path, "wb") as fp:
            for chunk in resp.iter_content(chunk_size=65536):
                fp.write(chunk)
        part_path.replace(zip_path)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        part_path.unlink(missing_ok=True)

    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(extract_dir)
    except BaseException:
        # extract_dir was missing or empty before this point, so removing it
        # loses nothing. Leaving a half-extracted directory behind would make
        # the "already extracted" check above skip the quarter on later runs.
        shutil.rmtree(extract_dir, ignore_errors=True)
        raise
    log.info("Extracted to %s", extract_dir)
    return extract_dir


def _find_tsv(extract_dir: Path, pattern: str) -> Path:
    """Locate the data file for one N-CEN table inside an extracted data set.

    Searches ``extract_dir`` recursively for regular files whose name
    contains ``pattern``. Files with a ``.tsv``, ``.txt`` or empty suffix are
    preferred over other matches. Within each group the order is: a stem equal
    to ``pattern`` (case-insensitive), then the shortest file name, then path
    order. This makes the choice deterministic and stops ``"ADVISOR"`` from
    resolving to ``SUB_ADVISOR.tsv`` when both files are present.

    Args:
        extract_dir: Root directory of the extracted data set.
        pattern: Table name to look for, e.g. ``"ETF"``.

    Returns:
        The best matching file, or ``extract_dir / f"{pattern}.tsv"`` (which
        does not exist) when nothing matches; callers test ``.exists()``.
    """
    wanted = pattern.lower()

    def rank(path: Path):
        return (path.stem.lower() != wanted, len(path.name), str(path))

    # Directories can match the glob too; only regular files are usable.
    matches = sorted(
        (p for p in extract_dir.rglob(f"*{pattern}*") if p.is_file()),
        key=rank,
    )
    tabular = [p for p in matches if p.suffix.lower() in (".tsv", ".txt", "")]
    if tabular:
        return tabular[0]
    if matches:
        return matches[0]
    return extract_dir / f"{pattern}.tsv"


def _read_tsv(filepath: Path) -> list[dict]:
    """Read a tab-separated file with a header row into a list of dicts.

    Args:
        filepath: File to read.

    Returns:
        One dict per data row, keyed by the header names. Columns missing
        from a short row are filled with ``""``. Returns an empty list (and
        logs a warning) when the file does not exist.
    """
    if not filepath.exists():
        log.warning("File not found: %s", filepath)
        return []
    # utf-8-sig strips a leading BOM that would otherwise become part of the
    # first column name. newline="" lets the csv module handle line endings
    # itself, as its documentation requires.
    with open(filepath, "r", encoding="utf-8-sig", errors="replace", newline="") as f:
        # restval="" keeps short rows from yielding None values, which the
        # loaders would otherwise crash on when calling .strip().
        reader = csv.DictReader(f, delimiter="\t", restval="")
        return list(reader)


def load_etf_data(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load ETF classification data from N-CEN.

    Reads the file that ``_find_tsv`` resolves for ``"ETF"`` and writes one
    ``ncen_fund_info`` row per record with ``is_etf=1`` and
    ``fund_type='ETF'``.

    Args:
        db: Database wrapper; ``db.conn()`` must be a context manager
            yielding an object with ``execute(sql, params)``.
        extract_dir: Directory holding the extracted data set.
        quarter: Quarter label, stored in the ``report_period`` column.

    Returns:
        Number of rows for which the INSERT statement ran without error.
        Rows lacking a CIK or accession number are skipped and not counted.
    """
    filepath = _find_tsv(extract_dir, "ETF")
    if not filepath.exists():
        log.info("No ETF file found in %s", extract_dir)
        return 0

    rows = _read_tsv(filepath)
    count = 0
    skipped = 0
    failed = 0

    with db.conn() as c:
        for row in rows:
            # "or ''" guards against None values from short rows.
            raw_cik = (row.get("CIK") or "").strip()
            acc = (row.get("ACCESSION_NUMBER") or "").strip()
            series_id = (row.get("SERIES_ID") or "").strip()

            # Test emptiness before padding: "".zfill(10) is "0000000000",
            # which is truthy and would defeat this guard.
            if not raw_cik or not acc:
                skipped += 1
                continue
            cik = raw_cik.zfill(10)

            try:
                c.execute("""
                    INSERT OR REPLACE INTO ncen_fund_info
                    (accession_number, cik, series_id, report_period, is_etf,
                     index_name, fund_type)
                    VALUES (?, ?, ?, ?, 1, ?, 'ETF')
                """, (
                    acc, cik, series_id, quarter,
                    (row.get("INDEX_NAME") or "").strip(),
                ))
                count += 1
            except Exception as e:
                # Best effort: one bad row must not abort the whole file.
                failed += 1
                log.debug("ETF insert error: %s", e)

    if skipped:
        log.warning("Skipped %d ETF rows without CIK or accession number in %s",
                    skipped, quarter)
    if failed:
        log.warning("%d ETF rows from %s failed to insert (details at DEBUG level)",
                    failed, quarter)
    log.info("Loaded %d ETF records from %s", count, quarter)
    return count


def load_index_data(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load index fund data from N-CEN.

    Reads the file that ``_find_tsv`` resolves for ``"INDEX"`` and upserts
    one ``ncen_fund_info`` row per record. When a row with the same
    ``(accession_number, series_id)`` already exists, only ``is_index_fund``
    and (when the new value is non-empty) ``index_name`` are updated.

    Args:
        db: Database wrapper; ``db.conn()`` must be a context manager
            yielding an object with ``execute(sql, params)``.
        extract_dir: Directory holding the extracted data set.
        quarter: Quarter label, stored in the ``report_period`` column.

    Returns:
        Number of rows for which the statement ran without error. Rows
        lacking a CIK or accession number are skipped and not counted.
    """
    filepath = _find_tsv(extract_dir, "INDEX")
    if not filepath.exists():
        log.info("No INDEX file found in %s", extract_dir)
        return 0

    rows = _read_tsv(filepath)
    count = 0
    skipped = 0
    failed = 0

    with db.conn() as c:
        for row in rows:
            # "or ''" guards against None values from short rows.
            raw_cik = (row.get("CIK") or "").strip()
            acc = (row.get("ACCESSION_NUMBER") or "").strip()
            series_id = (row.get("SERIES_ID") or "").strip()

            # Test emptiness before padding: "".zfill(10) is "0000000000",
            # which is truthy and would defeat this guard.
            if not raw_cik or not acc:
                skipped += 1
                continue
            cik = raw_cik.zfill(10)

            # NAME is consulted only when the file has no INDEX_NAME column.
            raw_name = row["INDEX_NAME"] if "INDEX_NAME" in row else row.get("NAME")
            index_name = (raw_name or "").strip()

            try:
                # NOTE(review): "OR REPLACE" is combined with an upsert
                # clause; the statement is kept as written -- confirm
                # against the ncen_fund_info schema that both are intended.
                c.execute("""
                    INSERT OR REPLACE INTO ncen_fund_info
                    (accession_number, cik, series_id, report_period,
                     is_index_fund, index_name)
                    VALUES (?, ?, ?, ?, 1, ?)
                    ON CONFLICT(accession_number, series_id) DO UPDATE SET
                        is_index_fund=1,
                        index_name=COALESCE(NULLIF(excluded.index_name,''), ncen_fund_info.index_name)
                """, (
                    acc, cik, series_id, quarter,
                    index_name,
                ))
                count += 1
            except Exception as e:
                # Best effort: one bad row must not abort the whole file.
                failed += 1
                log.debug("Index insert error: %s", e)

    if skipped:
        log.warning("Skipped %d index rows without CIK or accession number in %s",
                    skipped, quarter)
    if failed:
        log.warning("%d index rows from %s failed to insert (details at DEBUG level)",
                    failed, quarter)
    log.info("Loaded %d index fund records from %s", count, quarter)
    return count


def load_service_providers(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load service provider data from various N-CEN tables.

    For each known provider table, resolves its file with ``_find_tsv`` and
    issues one ``INSERT OR IGNORE`` into ``ncen_service_provider`` per record
    that has both a CIK and a provider name. Tables whose file is absent are
    skipped silently.

    Args:
        db: Database wrapper; ``db.conn()`` must be a context manager
            yielding an object with ``execute(sql, params)``.
        extract_dir: Directory holding the extracted data set.
        quarter: Quarter label, stored in the ``report_period`` column.

    Returns:
        Total number of rows, across all tables, for which the INSERT ran
        without error. Rows ignored by ``OR IGNORE`` are still counted.
    """
    provider_files = {
        "CUSTODIAN": "custodian",
        "TRANSFER_AGENT": "transfer_agent",
        "ADVISOR": "adviser",
        "SUB_ADVISOR": "sub_adviser",
        "ADMINISTRATOR": "administrator",
        "AUDITOR": "auditor",
    }
    # Columns that may carry the provider's name, in order of preference.
    name_columns = ("NAME", "COMPANY_NAME", "CUSTODIAN_NAME", "FIRM_NAME")

    total = 0
    for filename, role in provider_files.items():
        filepath = _find_tsv(extract_dir, filename)
        if not filepath.exists():
            continue

        rows = _read_tsv(filepath)
        count = 0
        failed = 0

        with db.conn() as c:
            for row in rows:
                # "or ''" guards against None values from short rows.
                raw_cik = (row.get("CIK") or "").strip()
                name = next(
                    (value for column in name_columns
                     if (value := (row.get(column) or "").strip())),
                    "",
                )
                lei = (row.get("LEI") or "").strip()

                # Test emptiness before padding: "".zfill(10) is
                # "0000000000", which is truthy and would defeat this guard.
                if not raw_cik or not name:
                    continue
                cik = raw_cik.zfill(10)

                try:
                    c.execute("""
                        INSERT OR IGNORE INTO ncen_service_provider
                        (cik, report_period, provider_role, provider_name, provider_lei)
                        VALUES (?, ?, ?, ?, ?)
                    """, (cik, quarter, role, name, lei))
                    count += 1
                except Exception as e:
                    # Best effort: one bad row must not abort the whole file.
                    failed += 1
                    log.debug("Service provider insert error: %s", e)

        if failed:
            log.warning("  %s: %d rows failed to insert (details at DEBUG level)",
                        role, failed)
        log.info("  %s: %d records", role, count)
        total += count

    log.info("Loaded %d total service provider records from %s", total, quarter)
    return total


def main():
    """Command-line entry point.

    Parses ``--db``, ``--quarters`` and ``--data-dir``, then for each quarter
    downloads the data set and runs the ETF, index and service-provider
    loaders in that order. A failure in one quarter is logged and the next
    quarter is processed. Finishes by printing the non-empty table counts
    reported by ``db.get_stats()``.
    """
    import argparse

    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(message)s",
        level=logging.INFO,
    )

    cli = argparse.ArgumentParser(description="Load N-CEN data")
    cli.add_argument("--db", help="Database path", default="fund_data.db")
    cli.add_argument(
        "--quarters",
        nargs="+",
        help="Quarters to download (e.g. 2025q3)",
        default=["2025q3"],
    )
    cli.add_argument(
        "--data-dir",
        help="Directory for downloaded files",
        default="data/ncen",
    )
    opts = cli.parse_args()

    database = FundDatabase(opts.db)
    rule = "=" * 60

    for quarter in opts.quarters:
        print(f"\n{rule}")
        print(f"Processing N-CEN {quarter}")
        print(rule)
        try:
            extracted = download_ncen_zip(quarter, opts.data_dir)
            # Loader order matters: ETF rows are written before the index
            # loader updates them.
            load_etf_data(database, extracted, quarter)
            load_index_data(database, extracted, quarter)
            load_service_providers(database, extracted, quarter)
        except Exception as e:
            log.error("Failed to process %s: %s", quarter, e)

    stats = database.get_stats()
    print("\nDatabase stats:")
    for table, n_rows in stats.items():
        if not n_rows > 0:
            continue
        print(f"  {table:30s} {n_rows:>10,}")


if __name__ == "__main__":
    main()