# fund_rfid_data/load_nport.py

"""
Step 4: Load N-PORT data (portfolio holdings, fund-level financials, returns).

Downloads quarterly N-PORT data set ZIPs from SEC, parses the TSV files,
and loads holdings, fund-level info, and monthly returns into the database.
"""

import csv
import logging
import math
import shutil
import time
import zipfile
from collections import defaultdict
from pathlib import Path

import requests
from tqdm import tqdm

from fund_db import FundDatabase

# Module-level logger; handlers and level are configured in main().
log = logging.getLogger(__name__)

# Identification string sent as the User-Agent header on every request.
USER_AGENT = "FundDataResearch/1.0 research@university.edu"
# Headers that _throttled_get merges into every outgoing request.
HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"}
# Minimum spacing, in seconds, that _throttled_get enforces between requests.
# NOTE(review): presumably chosen to stay under SEC's fair-access rate
# limit -- confirm against current SEC guidance.
REQUEST_INTERVAL = 0.12
# Timestamp of the most recent request, maintained by _throttled_get.
_last_request_time = 0.0

# Base URL under which download_nport_zip looks for "<quarter>_nport.zip".
NPORT_BASE_URL = "https://www.sec.gov/files/dera/data/form-n-port-data-sets"


def _throttled_get(url: str, **kwargs) -> requests.Response:
    """Issue a rate-limited GET request and return the successful response.

    Sleeps as needed so that consecutive calls are at least
    ``REQUEST_INTERVAL`` seconds apart, merges the module-level ``HEADERS``
    into the request headers, and applies a 300-second default timeout.

    Args:
        url: Address to fetch.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``.
            A caller-supplied ``headers`` mapping is copied, never mutated;
            on conflicting keys the module-level ``HEADERS`` take precedence.

    Returns:
        The response, after ``raise_for_status()`` has passed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection errors or timeouts.
    """
    global _last_request_time

    # Use a monotonic clock so wall-clock adjustments cannot distort the
    # spacing. The upper bound keeps the pause at no more than one interval
    # even if the stored timestamp is somehow ahead of the current clock.
    remaining = REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
    if 0 < remaining <= REQUEST_INTERVAL:
        time.sleep(remaining)

    # Build a fresh dict: the caller's mapping must not be modified, and an
    # explicit headers=None must not crash.
    kwargs["headers"] = {**(kwargs.get("headers") or {}), **HEADERS}
    kwargs.setdefault("timeout", 300)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # A failed attempt still hit the server, so it counts for throttling.
        _last_request_time = time.monotonic()

    try:
        resp.raise_for_status()
    except requests.HTTPError:
        # With stream=True the connection stays open until closed explicitly.
        resp.close()
        raise
    return resp


def download_nport_zip(quarter: str, output_dir: str = "data/nport") -> Path:
    """Download a quarterly N-PORT ZIP from SEC and extract it.

    If ``<output_dir>/<quarter>`` already exists and is non-empty, nothing is
    downloaded and that directory is returned as is.

    Args:
        quarter: Quarter label used in the file name, e.g. ``"2025q3"``.
        output_dir: Directory that receives the ZIP and the extracted files;
            created if missing.

    Returns:
        Path of the directory containing the extracted data set.

    Raises:
        requests.RequestException: If the download fails.
        zipfile.BadZipFile: If the downloaded file is not a valid ZIP.
        OSError: On file-system errors while writing or extracting.
    """
    url = f"{NPORT_BASE_URL}/{quarter}_nport.zip"
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    zip_path = out / f"{quarter}_nport.zip"
    extract_dir = out / quarter

    if extract_dir.exists() and any(extract_dir.iterdir()):
        log.info("Already extracted: %s", extract_dir)
        return extract_dir

    log.info("Downloading N-PORT data set: %s (this may take several minutes)", url)
    # Stream into a ".part" file and rename only once complete, so an
    # interrupted transfer never leaves a truncated ZIP under the final name.
    part_path = zip_path.with_name(zip_path.name + ".part")
    with _throttled_get(url, stream=True) as resp:
        total_size = int(resp.headers.get("content-length", 0))
        with open(part_path, "wb") as fp, tqdm(
            total=total_size or None,  # None = size unknown
            unit="B",
            unit_scale=True,
            desc=f"Downloading {quarter}",
        ) as pbar:
            for chunk in resp.iter_content(chunk_size=65536):
                if chunk:  # skip empty keep-alive chunks
                    fp.write(chunk)
                    pbar.update(len(chunk))
    part_path.replace(zip_path)

    log.info("Extracting %s", zip_path)
    # Extract into a staging directory first: the "already extracted" check
    # above only tests for a non-empty directory, so a half-finished
    # extraction in the final location would be mistaken for a complete one.
    staging_dir = out / f"{quarter}.extracting"
    if staging_dir.exists():
        shutil.rmtree(staging_dir)  # leftover from an interrupted run
    with zipfile.ZipFile(zip_path, "r") as zf:
        zf.extractall(staging_dir)
    if extract_dir.exists():
        # Only reachable when the directory is empty (see the check above).
        extract_dir.rmdir()
    staging_dir.rename(extract_dir)
    log.info("Extracted to %s", extract_dir)
    return extract_dir


def _find_tsv(extract_dir: Path, pattern: str) -> Path:
    """Find a TSV file matching a pattern in the extract directory."""
    candidates = list(extract_dir.rglob(f"*{pattern}*"))
    if candidates:
        return candidates[0]
    return extract_dir / f"{pattern}.tsv"


def _read_tsv_streaming(filepath: Path, chunk_size: int = 10000):
    """Read a large TSV file in chunks, yielding lists of dicts."""
    if not filepath.exists():
        log.warning("File not found: %s", filepath)
        return

    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        reader = csv.DictReader(f, delimiter="\t")
        chunk = []
        for row in reader:
            chunk.append(row)
            if len(chunk) >= chunk_size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk


def load_fund_reported_info(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load the FUND_REPORTED_INFO table from N-PORT data.

    Each row with a non-empty ACCESSION_NUMBER is inserted into
    ``nport_fund_info`` with ``INSERT OR IGNORE``. Rows that fail to insert
    are skipped; their number is reported in a single warning.

    NOTE(review): CIK and REPORT_DATE are read from this file's own rows. If
    the TSV has no such columns, the stored CIK is "0000000000" and the
    report date is "" -- confirm against the data set layout.

    Args:
        db: Database wrapper; ``db.conn()`` must be a context manager that
            yields an object with ``execute(sql, params)``.
        extract_dir: Directory holding the extracted data set.
        quarter: Quarter label, used only in log messages.

    Returns:
        Number of rows submitted without error. Rows ignored as duplicates
        by ``INSERT OR IGNORE`` are included in this number. 0 if the file
        is missing.
    """
    filepath = _find_tsv(extract_dir, "FUND_REPORTED_INFO")
    if not filepath.exists():
        log.warning("FUND_REPORTED_INFO not found in %s", extract_dir)
        return 0

    def text(row, key):
        # csv.DictReader fills fields missing from short rows with None;
        # treat those like empty strings instead of crashing on .strip().
        return (row.get(key) or "").strip()

    insert_sql = """
        INSERT OR IGNORE INTO nport_fund_info
        (accession_number, cik, series_id, report_date,
         total_assets, total_liabilities, net_assets)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """

    count = 0
    failed = 0
    with db.conn() as c:
        for chunk in _read_tsv_streaming(filepath):
            for row in chunk:
                acc = text(row, "ACCESSION_NUMBER")
                if not acc:
                    continue
                try:
                    c.execute(insert_sql, (
                        acc,
                        text(row, "CIK").zfill(10),
                        text(row, "SERIES_ID"),
                        text(row, "REPORT_DATE"),
                        _to_float(row.get("TOTAL_ASSETS")),
                        _to_float(row.get("TOTAL_LIABILITIES")),
                        _to_float(row.get("NET_ASSETS")),
                    ))
                    count += 1
                except Exception as e:
                    # Best effort: one bad row must not abort the whole load.
                    failed += 1
                    log.debug("Fund info insert error: %s", e)

    if failed:
        # Per-row details are at DEBUG level; surface the total so data
        # loss is visible at the default log level.
        log.warning("Skipped %d fund info records from %s due to insert errors",
                    failed, quarter)
    log.info("Loaded %d fund info records from %s", count, quarter)
    return count


def load_monthly_returns(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load monthly total returns from N-PORT data.

    Each row of the MONTHLY_TOTAL_RETURN file with a non-empty
    ACCESSION_NUMBER is inserted into ``nport_monthly_return`` with
    ``INSERT OR IGNORE``. Rows that fail to insert are skipped; their number
    is reported in a single warning.

    NOTE(review): CIK and REPORT_DATE are read from this file's own rows. If
    the TSV has no such columns, the stored CIK is "0000000000" and the
    report date is "" -- confirm against the data set layout.

    Args:
        db: Database wrapper; ``db.conn()`` must be a context manager that
            yields an object with ``execute(sql, params)``.
        extract_dir: Directory holding the extracted data set.
        quarter: Quarter label, used only in log messages.

    Returns:
        Number of rows submitted without error. Rows ignored as duplicates
        by ``INSERT OR IGNORE`` are included in this number. 0 if the file
        is missing.
    """
    filepath = _find_tsv(extract_dir, "MONTHLY_TOTAL_RETURN")
    if not filepath.exists():
        log.warning("MONTHLY_TOTAL_RETURN not found in %s", extract_dir)
        return 0

    def text(row, key):
        # csv.DictReader fills fields missing from short rows with None;
        # treat those like empty strings instead of crashing on .strip().
        return (row.get(key) or "").strip()

    insert_sql = """
        INSERT OR IGNORE INTO nport_monthly_return
        (accession_number, cik, class_id, report_date,
         month1_return, month2_return, month3_return)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """

    count = 0
    failed = 0
    with db.conn() as c:
        for chunk in _read_tsv_streaming(filepath):
            for row in chunk:
                acc = text(row, "ACCESSION_NUMBER")
                if not acc:
                    continue
                try:
                    c.execute(insert_sql, (
                        acc,
                        text(row, "CIK").zfill(10),
                        text(row, "CLASS_ID"),
                        text(row, "REPORT_DATE"),
                        _to_float(row.get("MONTHLY_TOTAL_RETURN1")),
                        _to_float(row.get("MONTHLY_TOTAL_RETURN2")),
                        _to_float(row.get("MONTHLY_TOTAL_RETURN3")),
                    ))
                    count += 1
                except Exception as e:
                    # Best effort: one bad row must not abort the whole load.
                    failed += 1
                    log.debug("Monthly return insert error: %s", e)

    if failed:
        # Per-row details are at DEBUG level; surface the total so data
        # loss is visible at the default log level.
        log.warning("Skipped %d monthly return records from %s due to insert errors",
                    failed, quarter)
    log.info("Loaded %d monthly return records from %s", count, quarter)
    return count


def load_holdings(db: FundDatabase, extract_dir: Path, quarter: str,
                  cik_filter: set = None):
    """
    Load portfolio holdings from N-PORT data.

    Rows of the FUND_REPORTED_HOLDING file are converted to dicts and handed
    to ``db.bulk_insert_holdings`` in batches of 5000.

    Args:
        db: Database wrapper providing ``bulk_insert_holdings(list_of_dicts)``.
        extract_dir: Directory holding the extracted data set.
        quarter: Quarter label, used only in log messages.
        cik_filter: If provided (not None), only holdings whose zero-padded
            10-digit CIK is in this collection are loaded. An empty
            collection therefore loads nothing. None disables filtering.
            NOTE(review): entries must already be 10-digit zero-padded
            strings to match -- confirm what the caller supplies.

    Returns:
        Number of holdings passed to the database layer (0 if the file is
        missing or the filter is empty).
    """
    filepath = _find_tsv(extract_dir, "FUND_REPORTED_HOLDING")
    if not filepath.exists():
        log.warning("FUND_REPORTED_HOLDING not found in %s", extract_dir)
        return 0

    # "is not None" rather than truthiness: an empty filter means "no CIK
    # qualifies", not "filtering is off".
    if cik_filter is not None and not cik_filter:
        log.info("CIK filter is empty; no holdings loaded from %s", quarter)
        return 0

    def text(row, *keys):
        # Return the stripped value of the first listed column that has a
        # value. csv.DictReader uses None for fields missing from short
        # rows, so None is treated the same as an absent column.
        for key in keys:
            value = row.get(key)
            if value is not None:
                return value.strip()
        return ""

    def flag(value):
        # Map Y/N markers to 1/0; anything else is parsed as a number.
        # NOTE(review): N-PORT appears to report IS_DEFAULT as Y/N --
        # confirm against the data set readme.
        marker = (value or "").strip().upper()
        if marker in ("Y", "N"):
            return 1 if marker == "Y" else 0
        return _to_int(value)

    count = 0
    batch = []
    batch_size = 5000

    for chunk in _read_tsv_streaming(filepath, chunk_size=10000):
        for row in chunk:
            cik = text(row, "CIK").zfill(10)
            if cik_filter is not None and cik not in cik_filter:
                continue

            asset_category = text(row, "ASSET_CAT")
            batch.append({
                "accession_number": text(row, "ACCESSION_NUMBER"),
                "cik": cik,
                "report_date": text(row, "REPORT_DATE"),
                "holding_name": text(row, "NAME_OF_ISSUER")[:200],
                "lei": text(row, "LEI"),
                "cusip": text(row, "CUSIP"),
                "isin": text(row, "ISIN"),
                "ticker": text(row, "TICKER"),
                "asset_category": asset_category,
                "issuer_category": text(row, "ISSUER_CAT"),
                "inv_country": text(row, "INV_COUNTRY"),
                "currency": text(row, "CURRENCY_CODE", "CUR_CD"),
                "quantity": _to_float(row.get("BALANCE")),
                "value_usd": _to_float(row.get("VAL_USD", row.get("VALUE_USD"))),
                "pct_val": _to_float(row.get("PCT_VAL")),
                # Only the N-PORT category "DBT" is debt. A bare "D" prefix
                # would also flag the derivative categories (DCO, DCR, DE,
                # DFE, DIR, DO).
                "is_debt": 1 if asset_category.upper() == "DBT" else 0,
                "coupon_rate": _to_float(row.get("COUPON_RATE")),
                "maturity_date": text(row, "MATURITY_DATE"),
                "is_default": flag(row.get("IS_DEFAULT")),
                "fair_value_level": text(row, "FAIR_VAL_LEVEL"),
            })
            count += 1

            if len(batch) >= batch_size:
                db.bulk_insert_holdings(batch)
                batch = []

    # Flush whatever is left after the last full batch.
    if batch:
        db.bulk_insert_holdings(batch)

    log.info("Loaded %d holdings from %s", count, quarter)
    return count


def _to_float(val):
    if val is None:
        return None
    val = str(val).strip()
    if not val or val.lower() in ("", "n/a", "none"):
        return None
    try:
        return float(val)
    except (ValueError, TypeError):
        return None


def _to_int(val):
    if val is None:
        return None
    val = str(val).strip()
    if not val or val.lower() in ("", "n/a", "none"):
        return None
    try:
        return int(float(val))
    except (ValueError, TypeError):
        return None


def main():
    """Command-line entry point: download and load one or more N-PORT quarters.

    For each requested quarter the data set is downloaded and extracted,
    then fund-level info and monthly returns are loaded, followed by
    holdings unless ``--skip-holdings`` is given. A failure in one quarter is
    logged with its traceback and does not stop the remaining quarters.
    Finally the non-zero table counts from ``db.get_stats()`` are printed.
    """
    import argparse
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s")

    parser = argparse.ArgumentParser(description="Load N-PORT data")
    parser.add_argument("--db", default="fund_data.db", help="Database path")
    parser.add_argument("--quarters", nargs="+",
                        default=["2025q3"],
                        help="Quarters to download (e.g. 2025q3 2025q4)")
    parser.add_argument("--data-dir", default="data/nport",
                        help="Directory for downloaded files")
    parser.add_argument("--skip-holdings", action="store_true",
                        help="Skip loading individual holdings (large)")
    parser.add_argument("--holdings-cik-filter", action="store_true",
                        help="Only load holdings for CIKs already in DB")
    args = parser.parse_args()

    db = FundDatabase(args.db)

    cik_filter = None
    if args.holdings_cik_filter:
        cik_filter = set(db.get_all_ciks())
        log.info("Filtering holdings to %d CIKs in database", len(cik_filter))

    failed_quarters = []
    for quarter in args.quarters:
        print(f"\n{'='*60}")
        print(f"Processing N-PORT {quarter}")
        print(f"{'='*60}")
        try:
            extract_dir = download_nport_zip(quarter, args.data_dir)
            load_fund_reported_info(db, extract_dir, quarter)
            load_monthly_returns(db, extract_dir, quarter)

            if not args.skip_holdings:
                load_holdings(db, extract_dir, quarter, cik_filter=cik_filter)
        except Exception as e:
            # Top-level boundary: keep going with the next quarter, but keep
            # the traceback so the failure can be diagnosed.
            log.exception("Failed to process %s: %s", quarter, e)
            failed_quarters.append(quarter)

    if failed_quarters:
        log.warning("%d of %d quarter(s) failed: %s",
                    len(failed_quarters), len(args.quarters),
                    ", ".join(failed_quarters))

    stats = db.get_stats()
    print("\nDatabase stats:")
    for table, count in stats.items():
        if count > 0:
            print(f"  {table:30s} {count:>10,}")


# Run the loader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()