"""
Step 4: Load N-PORT data (portfolio holdings, fund-level financials, returns).

Downloads quarterly N-PORT data set ZIPs from SEC, parses the TSV files,
and loads holdings, fund-level info, and monthly returns into the database.
"""

import csv
import logging
import shutil
import time
import zipfile
from collections import defaultdict
from pathlib import Path

import requests
from tqdm import tqdm

from fund_db import FundDatabase

log = logging.getLogger(__name__)

USER_AGENT = "FundDataResearch/1.0 research@university.edu"
HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"}

# Minimum number of seconds between two outgoing HTTP requests.
REQUEST_INTERVAL = 0.12
# Monotonic-clock timestamp of the most recent request (see _throttled_get).
_last_request_time = 0.0

NPORT_BASE_URL = "https://www.sec.gov/files/dera/data/form-n-port-data-sets"

# Cell values (compared lower-cased) that the numeric converters treat as NULL.
_NULL_TOKENS = frozenset({"", "n/a", "none"})


def _throttled_get(url: str, **kwargs) -> requests.Response:
    """Issue a GET request, spacing calls at least REQUEST_INTERVAL apart.

    Args:
        url: Address to fetch.
        **kwargs: Passed through to ``requests.get``. ``timeout`` defaults to
            300 seconds. ``headers`` is merged on top of the module-level
            ``HEADERS``; caller-supplied values win and the caller's dict is
            not modified.

    Returns:
        The response object. With ``stream=True`` the caller is responsible
        for closing it (it can be used as a context manager).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection-level failures.
    """
    global _last_request_time

    # A monotonic clock keeps the wait correct even if the wall clock is
    # adjusted while the script is running.
    elapsed = time.monotonic() - _last_request_time
    if elapsed < REQUEST_INTERVAL:
        time.sleep(REQUEST_INTERVAL - elapsed)

    # Build a fresh dict so neither HEADERS nor the caller's dict is mutated.
    merged_headers = dict(HEADERS)
    merged_headers.update(kwargs.pop("headers", None) or {})
    kwargs["headers"] = merged_headers
    kwargs.setdefault("timeout", 300)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Failed attempts also count towards the request spacing.
        _last_request_time = time.monotonic()

    try:
        resp.raise_for_status()
    except requests.HTTPError:
        # Release the connection of a (possibly streamed) error response.
        resp.close()
        raise
    return resp


def download_nport_zip(quarter: str, output_dir: str = "data/nport") -> Path:
    """Download a quarterly N-PORT ZIP from SEC and extract it.

    The archive is written to ``<output_dir>/<quarter>_nport.zip`` and
    extracted into ``<output_dir>/<quarter>``. Both steps go through a
    temporary name first, so an interrupted run never leaves a truncated
    archive or a half-populated extraction directory under the final name.

    Args:
        quarter: Quarter label used in the file name, e.g. ``"2025q3"``.
        output_dir: Directory that receives the archive and the extracted
            files; created if missing.

    Returns:
        Path of the directory containing the extracted files.

    Raises:
        requests.RequestException: If the download fails.
        zipfile.BadZipFile: If the downloaded archive cannot be read.
    """
    url = f"{NPORT_BASE_URL}/{quarter}_nport.zip"
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    zip_path = out / f"{quarter}_nport.zip"
    extract_dir = out / quarter

    if extract_dir.exists() and any(extract_dir.iterdir()):
        log.info("Already extracted: %s", extract_dir)
        return extract_dir

    if zip_path.is_file() and zipfile.is_zipfile(zip_path):
        # A complete archive from an earlier run; skip the long download.
        log.info("Reusing previously downloaded archive: %s", zip_path)
    else:
        log.info("Downloading N-PORT data set: %s (this may take several minutes)", url)
        # Stream into "<name>.part" and rename only once the body is complete.
        # A stale ".part" file from an aborted run is simply overwritten.
        partial_zip = zip_path.with_name(zip_path.name + ".part")
        with _throttled_get(url, stream=True) as resp:
            total_size = int(resp.headers.get("content-length", 0))
            with open(partial_zip, "wb") as fp, tqdm(
                total=total_size or None,
                unit="B",
                unit_scale=True,
                desc=f"Downloading {quarter}",
            ) as pbar:
                for chunk in resp.iter_content(chunk_size=65536):
                    fp.write(chunk)
                    pbar.update(len(chunk))
        partial_zip.replace(zip_path)

    log.info("Extracting %s", zip_path)
    # Extract into a staging directory so that a failure part-way through
    # cannot be mistaken for a finished extraction on the next run.
    staging_dir = out / f"{quarter}.extracting"
    if staging_dir.exists():
        shutil.rmtree(staging_dir)
    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(staging_dir)
    except Exception:
        shutil.rmtree(staging_dir, ignore_errors=True)
        raise

    if extract_dir.exists():
        # Only reachable when the directory exists but is empty.
        extract_dir.rmdir()
    staging_dir.replace(extract_dir)

    log.info("Extracted to %s", extract_dir)
    return extract_dir


def _find_tsv(extract_dir: Path, pattern: str) -> Path:
    """Find a TSV file matching a pattern in the extract directory.

    Only regular files are considered. A file named exactly
    ``<pattern>.tsv`` (case-insensitive) is preferred; otherwise the first
    match in sorted order is returned so the choice is deterministic.

    Args:
        extract_dir: Directory searched recursively.
        pattern: Substring that the file name must contain.

    Returns:
        The matching path, or ``extract_dir / "<pattern>.tsv"`` (which may
        not exist) when nothing matches.
    """
    candidates = sorted(
        path for path in extract_dir.rglob(f"*{pattern}*") if path.is_file()
    )
    if not candidates:
        return extract_dir / f"{pattern}.tsv"

    exact_name = f"{pattern}.tsv".lower()
    for path in candidates:
        if path.name.lower() == exact_name:
            return path
    return candidates[0]


def _read_tsv_streaming(filepath: Path, chunk_size: int = 10000):
    """Read a large TSV file in chunks, yielding lists of dicts.

    Args:
        filepath: Tab-separated file whose first row is the header.
        chunk_size: Maximum number of rows per yielded list.

    Yields:
        Lists of row dicts keyed by column name. Nothing is yielded (and a
        warning is logged) when the file does not exist.
    """
    if not filepath.exists():
        log.warning("File not found: %s", filepath)
        return

    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation requires for files passed to a reader.
    with open(filepath, "r", encoding="utf-8", errors="replace", newline="") as f:
        reader = csv.DictReader(f, delimiter="\t")
        chunk = []
        for row in reader:
            chunk.append(row)
            if len(chunk) >= chunk_size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk


def _text(row: dict, *keys: str) -> str:
    """Return the stripped value of the first of *keys* set in *row*.

    ``csv.DictReader`` stores ``None`` for columns missing from a short row,
    so a plain ``row.get(key, "").strip()`` can raise; this helper treats
    both an absent key and a ``None`` value as "not set".

    Returns:
        The stripped string, or ``""`` when none of the keys is set.
    """
    for key in keys:
        value = row.get(key)
        if value is not None:
            return str(value).strip()
    return ""


def _insert_rows(db: FundDatabase, filepath: Path, sql: str, build_params, label: str) -> int:
    """Execute *sql* once per TSV row that has an accession number.

    Args:
        db: Database wrapper; ``db.conn()`` is used as a context manager and
            the yielded object's ``execute`` method is called per row.
        filepath: TSV file to read.
        sql: Parameterised INSERT statement.
        build_params: Callable mapping a row dict to the parameter tuple.
        label: Prefix for the per-row error log message.

    Returns:
        Number of rows for which ``execute`` completed without raising. With
        ``INSERT OR IGNORE`` this includes rows the database skipped as
        duplicates.
    """
    executed = 0
    failed = 0
    with db.conn() as c:
        for chunk in _read_tsv_streaming(filepath):
            for row in chunk:
                if not _text(row, "ACCESSION_NUMBER"):
                    continue
                try:
                    c.execute(sql, build_params(row))
                    executed += 1
                except Exception as e:
                    # Best effort: one bad row must not abort the whole load.
                    failed += 1
                    log.debug("%s insert error: %s", label, e)
    if failed:
        # Surface the total at a level that is visible by default.
        log.warning("%s: %d rows could not be inserted from %s", label, failed, filepath)
    return executed


def load_fund_reported_info(db: FundDatabase, extract_dir: Path, quarter: str):
    """Load the FUND_REPORTED_INFO table from N-PORT data.

    Args:
        db: Target database.
        extract_dir: Directory holding the extracted TSV files.
        quarter: Quarter label, used for log messages only.

    Returns:
        Number of rows submitted to ``nport_fund_info`` (0 when the file is
        missing).
    """
    filepath = _find_tsv(extract_dir, "FUND_REPORTED_INFO")
    if not filepath.exists():
        log.warning("FUND_REPORTED_INFO not found in %s", extract_dir)
        return 0

    sql = """
        INSERT OR IGNORE INTO nport_fund_info
            (accession_number, cik, series_id, report_date,
             total_assets, total_liabilities, net_assets)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """

    def build_params(row):
        return (
            _text(row, "ACCESSION_NUMBER"),
            _text(row, "CIK").zfill(10),
            _text(row, "SERIES_ID"),
            _text(row, "REPORT_DATE"),
            _to_float(row.get("TOTAL_ASSETS")),
            _to_float(row.get("TOTAL_LIABILITIES")),
            _to_float(row.get("NET_ASSETS")),
        )

    count = _insert_rows(db, filepath, sql, build_params, "Fund info")
    log.info("Loaded %d fund info records from %s", count, quarter)
    return count


def load_monthly_returns(db: FundDatabase, extract_dir: Path, quarter: str):
    """Load monthly total returns from N-PORT data.

    Args:
        db: Target database.
        extract_dir: Directory holding the extracted TSV files.
        quarter: Quarter label, used for log messages only.

    Returns:
        Number of rows submitted to ``nport_monthly_return`` (0 when the
        file is missing).
    """
    filepath = _find_tsv(extract_dir, "MONTHLY_TOTAL_RETURN")
    if not filepath.exists():
        log.warning("MONTHLY_TOTAL_RETURN not found in %s", extract_dir)
        return 0

    sql = """
        INSERT OR IGNORE INTO nport_monthly_return
            (accession_number, cik, class_id, report_date,
             month1_return, month2_return, month3_return)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """

    def build_params(row):
        return (
            _text(row, "ACCESSION_NUMBER"),
            _text(row, "CIK").zfill(10),
            _text(row, "CLASS_ID"),
            _text(row, "REPORT_DATE"),
            _to_float(row.get("MONTHLY_TOTAL_RETURN1")),
            _to_float(row.get("MONTHLY_TOTAL_RETURN2")),
            _to_float(row.get("MONTHLY_TOTAL_RETURN3")),
        )

    count = _insert_rows(db, filepath, sql, build_params, "Monthly return")
    log.info("Loaded %d monthly return records from %s", count, quarter)
    return count


def load_holdings(db: FundDatabase, extract_dir: Path, quarter: str,
                  cik_filter: set = None):
    """
    Load portfolio holdings from N-PORT data.

    If cik_filter is provided, only load holdings for those CIKs. ``None``
    means "no filtering"; an empty collection means "load nothing". Filter
    entries are normalised to 10-character zero-padded strings, the same
    form the row CIKs are compared in.

    Args:
        db: Target database; rows are written via ``db.bulk_insert_holdings``
            in batches of 5000.
        extract_dir: Directory holding the extracted TSV files.
        quarter: Quarter label, used for log messages only.
        cik_filter: Optional collection of CIKs to keep.

    Returns:
        Number of holdings handed to the database (0 when the file is
        missing or the filter is empty).
    """
    filepath = _find_tsv(extract_dir, "FUND_REPORTED_HOLDING")
    if not filepath.exists():
        log.warning("FUND_REPORTED_HOLDING not found in %s", extract_dir)
        return 0

    allowed_ciks = None
    if cik_filter is not None:
        allowed_ciks = {str(cik).strip().zfill(10) for cik in cik_filter}
        if not allowed_ciks:
            # Nothing can match; avoid scanning the (large) holdings file.
            log.warning("CIK filter is empty; no holdings loaded from %s", quarter)
            return 0

    count = 0
    batch = []
    batch_size = 5000

    for chunk in _read_tsv_streaming(filepath, chunk_size=10000):
        for row in chunk:
            cik = _text(row, "CIK").zfill(10)
            if allowed_ciks is not None and cik not in allowed_ciks:
                continue

            asset_category = _text(row, "ASSET_CAT")
            holding = {
                "accession_number": _text(row, "ACCESSION_NUMBER"),
                "cik": cik,
                "report_date": _text(row, "REPORT_DATE"),
                "holding_name": _text(row, "NAME_OF_ISSUER")[:200],
                "lei": _text(row, "LEI"),
                "cusip": _text(row, "CUSIP"),
                "isin": _text(row, "ISIN"),
                "ticker": _text(row, "TICKER"),
                "asset_category": asset_category,
                "issuer_category": _text(row, "ISSUER_CAT"),
                "inv_country": _text(row, "INV_COUNTRY"),
                # Two column spellings are accepted for the currency code.
                "currency": _text(row, "CURRENCY_CODE", "CUR_CD"),
                "quantity": _to_float(row.get("BALANCE")),
                "value_usd": _to_float(row.get("VAL_USD", row.get("VALUE_USD"))),
                "pct_val": _to_float(row.get("PCT_VAL")),
                "is_debt": 1 if asset_category.startswith("D") else 0,
                "coupon_rate": _to_float(row.get("COUPON_RATE")),
                "maturity_date": _text(row, "MATURITY_DATE"),
                "is_default": _to_int(row.get("IS_DEFAULT")),
                "fair_value_level": _text(row, "FAIR_VAL_LEVEL"),
            }
            batch.append(holding)
            count += 1

            if len(batch) >= batch_size:
                db.bulk_insert_holdings(batch)
                batch = []

    if batch:
        db.bulk_insert_holdings(batch)

    log.info("Loaded %d holdings from %s", count, quarter)
    return count


def _to_float(val):
    """Convert a TSV cell to ``float``.

    Returns:
        The parsed number, or ``None`` for ``None``, blank, ``"n/a"``,
        ``"none"`` (any case) or unparsable input.
    """
    if val is None:
        return None
    text = str(val).strip()
    if text.lower() in _NULL_TOKENS:
        return None
    try:
        return float(text)
    except (ValueError, TypeError):
        return None


def _to_int(val):
    """Convert a TSV cell to ``int``.

    Plain integer text is parsed exactly; other numeric text (e.g. ``"3.0"``
    or ``"1e3"``) is parsed as a float and truncated towards zero.

    Returns:
        The parsed integer, or ``None`` for ``None``, blank, ``"n/a"``,
        ``"none"`` (any case), unparsable or non-finite input.
    """
    if val is None:
        return None
    text = str(val).strip()
    if text.lower() in _NULL_TOKENS:
        return None
    try:
        # Exact path: avoids float rounding for large integers.
        return int(text)
    except ValueError:
        pass
    try:
        return int(float(text))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")) is not representable.
        return None


def main():
    """Command-line entry point: download and load the requested quarters.

    Each quarter is processed independently; a failure is logged with its
    traceback and the remaining quarters are still attempted.
    """
    import argparse

    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s")

    parser = argparse.ArgumentParser(description="Load N-PORT data")
    parser.add_argument("--db", default="fund_data.db", help="Database path")
    parser.add_argument("--quarters", nargs="+", default=["2025q3"],
                        help="Quarters to download (e.g. 2025q3 2025q4)")
    parser.add_argument("--data-dir", default="data/nport",
                        help="Directory for downloaded files")
    parser.add_argument("--skip-holdings", action="store_true",
                        help="Skip loading individual holdings (large)")
    parser.add_argument("--holdings-cik-filter", action="store_true",
                        help="Only load holdings for CIKs already in DB")
    args = parser.parse_args()

    db = FundDatabase(args.db)

    cik_filter = None
    if args.holdings_cik_filter:
        cik_filter = set(db.get_all_ciks())
        log.info("Filtering holdings to %d CIKs in database", len(cik_filter))

    failed_quarters = []
    for quarter in args.quarters:
        print(f"\n{'='*60}")
        print(f"Processing N-PORT {quarter}")
        print(f"{'='*60}")

        try:
            extract_dir = download_nport_zip(quarter, args.data_dir)

            load_fund_reported_info(db, extract_dir, quarter)
            load_monthly_returns(db, extract_dir, quarter)

            if not args.skip_holdings:
                load_holdings(db, extract_dir, quarter, cik_filter=cik_filter)
        except Exception as e:
            # exc_info keeps the traceback, which the message alone loses.
            log.error("Failed to process %s: %s", quarter, e, exc_info=True)
            failed_quarters.append(quarter)

    if failed_quarters:
        log.warning("Quarters that failed to process: %s", ", ".join(failed_quarters))

    stats = db.get_stats()
    print("\nDatabase stats:")
    for table, count in stats.items():
        if count > 0:
            print(f"  {table:30s} {count:>10,}")


if __name__ == "__main__":
    main()