fund_rfid_data/load_nport.py
Florian Herzog 1993658fb2 Add SEC fund prospectus -> RDF triple dataset pipeline
Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.

- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
  all books per trust), samples (per-fund segmentation, marker + plain
  serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 10:31:35 +02:00

301 lines
11 KiB
Python

"""
Step 4: Load N-PORT data (portfolio holdings, fund-level financials, returns).
Downloads quarterly N-PORT data set ZIPs from SEC, parses the TSV files,
and loads holdings, fund-level info, and monthly returns into the database.
"""
import csv
import logging
import time
import zipfile
from collections import defaultdict
from pathlib import Path
import requests
from tqdm import tqdm
from fund_db import FundDatabase
# Module-level logger; main() configures output via logging.basicConfig().
log = logging.getLogger(__name__)
# Identification string sent with every request (merged in through HEADERS).
USER_AGENT = "FundDataResearch/1.0 research@university.edu"
# Headers that _throttled_get() adds to each outgoing request.
HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"}
# Minimum spacing, in seconds, that _throttled_get() keeps between requests.
# NOTE(review): presumably chosen to stay under SEC's published request-rate
# limit -- confirm against the current SEC fair-access guidance.
REQUEST_INTERVAL = 0.12
# Clock reading stored by _throttled_get() after its most recent request;
# stays 0.0 until the first request has been made.
_last_request_time = 0.0
# Base URL under which download_nport_zip() requests "<quarter>_nport.zip".
NPORT_BASE_URL = "https://www.sec.gov/files/dera/data/form-n-port-data-sets"
def _throttled_get(url: str, **kwargs) -> requests.Response:
    """Perform a rate-limited GET and return the successful response.

    Sleeps as needed so that consecutive calls are at least
    ``REQUEST_INTERVAL`` seconds apart, merges ``HEADERS`` into the request
    headers and applies a default timeout of 300 seconds.

    Args:
        url: URL to fetch.
        **kwargs: Passed through to ``requests.get``. A ``headers`` mapping
            is copied (never mutated); entries in ``HEADERS`` take precedence
            over caller-supplied ones, as before.

    Returns:
        The response, after ``raise_for_status()`` has passed.

    Raises:
        requests.HTTPError: For 4xx/5xx responses (the response is closed
            first so a streamed connection is not leaked).
        requests.RequestException: For connection-level failures.
    """
    global _last_request_time

    # A monotonic clock cannot jump backwards (NTP, manual clock changes), so
    # the computed wait can never balloon; the clamp additionally guards the
    # very first call, where the stored value is still the 0.0 placeholder.
    wait = REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
    if wait > 0:
        time.sleep(min(wait, REQUEST_INTERVAL))

    # Build a fresh dict: the caller's mapping must not be modified, and
    # ``headers=None`` must not crash.
    headers = dict(kwargs.get("headers") or {})
    headers.update(HEADERS)
    kwargs["headers"] = headers
    kwargs.setdefault("timeout", 300)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Record the attempt even when it failed, so a retry is throttled too.
        _last_request_time = time.monotonic()

    try:
        resp.raise_for_status()
    except requests.HTTPError:
        resp.close()
        raise
    return resp
def download_nport_zip(quarter: str, output_dir: str = "data/nport") -> Path:
    """Download a quarterly N-PORT ZIP from SEC and extract it.

    If ``<output_dir>/<quarter>`` already exists and is non-empty, nothing is
    downloaded and that directory is returned immediately.

    Args:
        quarter: Quarter label used in the file name, e.g. ``"2025q3"``.
        output_dir: Directory that receives the ZIP and the extracted folder.

    Returns:
        Path of the directory holding the extracted files.

    Raises:
        requests.RequestException: If the download fails.
        zipfile.BadZipFile: If the downloaded archive cannot be read.
    """
    # Local import, following main(), so this block needs no file-level change.
    import shutil

    url = f"{NPORT_BASE_URL}/{quarter}_nport.zip"
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    zip_path = out / f"{quarter}_nport.zip"
    extract_dir = out / quarter

    if extract_dir.exists() and any(extract_dir.iterdir()):
        log.info("Already extracted: %s", extract_dir)
        return extract_dir

    log.info("Downloading N-PORT data set: %s (this may take several minutes)", url)
    # Stream into a ".part" file and rename on success, so an interrupted
    # download never leaves a truncated file under the final ZIP name.
    part_path = out / f"{quarter}_nport.zip.part"
    resp = _throttled_get(url, stream=True)
    try:
        # Unknown size -> None, which makes tqdm show a plain counter.
        total_size = int(resp.headers.get("content-length", 0)) or None
        with open(part_path, "wb") as fp, tqdm(
            total=total_size, unit="B", unit_scale=True,
            desc=f"Downloading {quarter}",
        ) as pbar:
            for chunk in resp.iter_content(chunk_size=65536):
                if chunk:  # skip keep-alive chunks
                    fp.write(chunk)
                    pbar.update(len(chunk))
        part_path.replace(zip_path)
    except BaseException:
        # Clean up on any failure, including Ctrl-C, then re-raise unchanged.
        if part_path.exists():
            part_path.unlink()
        raise
    finally:
        # A streamed response holds its connection until explicitly closed.
        resp.close()

    log.info("Extracting %s", zip_path)
    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(extract_dir)
    except BaseException:
        # A half-extracted directory would pass the "already extracted" check
        # on the next run, so remove it before propagating the error.
        shutil.rmtree(extract_dir, ignore_errors=True)
        raise
    log.info("Extracted to %s", extract_dir)
    return extract_dir
def _find_tsv(extract_dir: Path, pattern: str) -> Path:
"""Find a TSV file matching a pattern in the extract directory."""
candidates = list(extract_dir.rglob(f"*{pattern}*"))
if candidates:
return candidates[0]
return extract_dir / f"{pattern}.tsv"
def _read_tsv_streaming(filepath: Path, chunk_size: int = 10000):
"""Read a large TSV file in chunks, yielding lists of dicts."""
if not filepath.exists():
log.warning("File not found: %s", filepath)
return
with open(filepath, "r", encoding="utf-8", errors="replace") as f:
reader = csv.DictReader(f, delimiter="\t")
chunk = []
for row in reader:
chunk.append(row)
if len(chunk) >= chunk_size:
yield chunk
chunk = []
if chunk:
yield chunk
def load_fund_reported_info(db: FundDatabase, extract_dir: Path, quarter: str):
    """Load the FUND_REPORTED_INFO table from N-PORT data.

    Rows without an accession number are skipped. Each remaining row is
    written with ``INSERT OR IGNORE`` into ``nport_fund_info``; a row whose
    insert raises is skipped (best effort) and reported in a warning.

    Args:
        db: Database wrapper; ``db.conn()`` must be usable as a context
            manager yielding an object with ``execute(sql, params)``.
        extract_dir: Directory containing the extracted N-PORT files.
        quarter: Quarter label, used only in log messages.

    Returns:
        Number of rows whose insert statement ran without raising (rows
        that ``INSERT OR IGNORE`` skips are still counted). 0 if the file
        is missing.
    """
    filepath = _find_tsv(extract_dir, "FUND_REPORTED_INFO")
    if not filepath.exists():
        log.warning("FUND_REPORTED_INFO not found in %s", extract_dir)
        return 0

    # NOTE(review): column names below are taken as-is from the original
    # code; verify them against the SEC N-PORT data set readme.
    sql = """
        INSERT OR IGNORE INTO nport_fund_info
        (accession_number, cik, series_id, report_date,
         total_assets, total_liabilities, net_assets)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """
    count = 0
    failed = 0
    with db.conn() as c:
        for chunk in _read_tsv_streaming(filepath):
            for row in chunk:
                # ``or ""``: csv.DictReader stores None for fields missing
                # from a short row, and None has no .strip().
                acc = (row.get("ACCESSION_NUMBER") or "").strip()
                if not acc:
                    continue
                params = (
                    acc,
                    (row.get("CIK") or "").strip().zfill(10),
                    (row.get("SERIES_ID") or "").strip(),
                    (row.get("REPORT_DATE") or "").strip(),
                    _to_float(row.get("TOTAL_ASSETS")),
                    _to_float(row.get("TOTAL_LIABILITIES")),
                    _to_float(row.get("NET_ASSETS")),
                )
                try:
                    c.execute(sql, params)
                except Exception as e:
                    # Stay best-effort, but surface the first failure at a
                    # level that is visible with the default INFO logging.
                    failed += 1
                    if failed == 1:
                        log.warning(
                            "Fund info insert error (further errors "
                            "logged at DEBUG): %s", e)
                    else:
                        log.debug("Fund info insert error: %s", e)
                else:
                    count += 1

    if failed:
        log.warning("Skipped %d fund info rows from %s due to insert errors",
                    failed, quarter)
    log.info("Loaded %d fund info records from %s", count, quarter)
    return count
def load_monthly_returns(db: FundDatabase, extract_dir: Path, quarter: str):
    """Load monthly total returns from N-PORT data.

    Rows without an accession number are skipped. Each remaining row is
    written with ``INSERT OR IGNORE`` into ``nport_monthly_return``; a row
    whose insert raises is skipped (best effort) and reported in a warning.

    Args:
        db: Database wrapper; ``db.conn()`` must be usable as a context
            manager yielding an object with ``execute(sql, params)``.
        extract_dir: Directory containing the extracted N-PORT files.
        quarter: Quarter label, used only in log messages.

    Returns:
        Number of rows whose insert statement ran without raising (rows
        that ``INSERT OR IGNORE`` skips are still counted). 0 if the file
        is missing.
    """
    filepath = _find_tsv(extract_dir, "MONTHLY_TOTAL_RETURN")
    if not filepath.exists():
        log.warning("MONTHLY_TOTAL_RETURN not found in %s", extract_dir)
        return 0

    # NOTE(review): column names below are taken as-is from the original
    # code; verify them against the SEC N-PORT data set readme.
    sql = """
        INSERT OR IGNORE INTO nport_monthly_return
        (accession_number, cik, class_id, report_date,
         month1_return, month2_return, month3_return)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """
    count = 0
    failed = 0
    with db.conn() as c:
        for chunk in _read_tsv_streaming(filepath):
            for row in chunk:
                # ``or ""``: csv.DictReader stores None for fields missing
                # from a short row, and None has no .strip().
                acc = (row.get("ACCESSION_NUMBER") or "").strip()
                if not acc:
                    continue
                params = (
                    acc,
                    (row.get("CIK") or "").strip().zfill(10),
                    (row.get("CLASS_ID") or "").strip(),
                    (row.get("REPORT_DATE") or "").strip(),
                    _to_float(row.get("MONTHLY_TOTAL_RETURN1")),
                    _to_float(row.get("MONTHLY_TOTAL_RETURN2")),
                    _to_float(row.get("MONTHLY_TOTAL_RETURN3")),
                )
                try:
                    c.execute(sql, params)
                except Exception as e:
                    # Stay best-effort, but surface the first failure at a
                    # level that is visible with the default INFO logging.
                    failed += 1
                    if failed == 1:
                        log.warning(
                            "Monthly return insert error (further errors "
                            "logged at DEBUG): %s", e)
                    else:
                        log.debug("Monthly return insert error: %s", e)
                else:
                    count += 1

    if failed:
        log.warning(
            "Skipped %d monthly return rows from %s due to insert errors",
            failed, quarter)
    log.info("Loaded %d monthly return records from %s", count, quarter)
    return count
def _row_text(row: dict, *keys: str) -> str:
    """Return the stripped text of the first of *keys* present in *row*."""
    for key in keys:
        if key in row:
            # csv.DictReader stores None for fields missing from a short
            # row; treat that like an empty string instead of crashing.
            return (row[key] or "").strip()
    return ""


def load_holdings(db: FundDatabase, extract_dir: Path, quarter: str,
                  cik_filter: set = None):
    """
    Load portfolio holdings from N-PORT data.

    If cik_filter is provided, only load holdings for those CIKs. An empty
    filter therefore loads nothing; pass None to load every holding.

    Args:
        db: Database wrapper providing ``bulk_insert_holdings(list_of_dicts)``.
        extract_dir: Directory containing the extracted N-PORT files.
        quarter: Quarter label, used only in log messages.
        cik_filter: Optional collection of CIK strings; rows are compared
            using the CIK zero-padded to 10 digits.

    Returns:
        Number of holdings handed to ``db.bulk_insert_holdings``; 0 if the
        file is missing.
    """
    filepath = _find_tsv(extract_dir, "FUND_REPORTED_HOLDING")
    if not filepath.exists():
        log.warning("FUND_REPORTED_HOLDING not found in %s", extract_dir)
        return 0

    count = 0
    batch = []
    batch_size = 5000
    for chunk in _read_tsv_streaming(filepath, chunk_size=10000):
        for row in chunk:
            cik = _row_text(row, "CIK").zfill(10)
            # ``is not None`` rather than truthiness: an empty filter set
            # must exclude everything, not silently disable filtering.
            if cik_filter is not None and cik not in cik_filter:
                continue

            # NOTE(review): column names are taken as-is from the original
            # code; verify them against the SEC N-PORT data set readme.
            asset_category = _row_text(row, "ASSET_CAT")
            batch.append({
                "accession_number": _row_text(row, "ACCESSION_NUMBER"),
                "cik": cik,
                "report_date": _row_text(row, "REPORT_DATE"),
                "holding_name": _row_text(row, "NAME_OF_ISSUER")[:200],
                "lei": _row_text(row, "LEI"),
                "cusip": _row_text(row, "CUSIP"),
                "isin": _row_text(row, "ISIN"),
                "ticker": _row_text(row, "TICKER"),
                "asset_category": asset_category,
                "issuer_category": _row_text(row, "ISSUER_CAT"),
                "inv_country": _row_text(row, "INV_COUNTRY"),
                # Two spellings of the column are accepted; the first one
                # present in the file wins.
                "currency": _row_text(row, "CURRENCY_CODE", "CUR_CD"),
                "quantity": _to_float(row.get("BALANCE")),
                "value_usd": _to_float(row.get("VAL_USD", row.get("VALUE_USD"))),
                "pct_val": _to_float(row.get("PCT_VAL")),
                "is_debt": 1 if asset_category.startswith("D") else 0,
                "coupon_rate": _to_float(row.get("COUPON_RATE")),
                "maturity_date": _row_text(row, "MATURITY_DATE"),
                "is_default": _to_int(row.get("IS_DEFAULT")),
                "fair_value_level": _row_text(row, "FAIR_VAL_LEVEL"),
            })
            count += 1

            # Flush in fixed-size batches to bound memory on large files.
            if len(batch) >= batch_size:
                db.bulk_insert_holdings(batch)
                batch = []

    if batch:
        db.bulk_insert_holdings(batch)
    log.info("Loaded %d holdings from %s", count, quarter)
    return count
def _to_float(val):
if val is None:
return None
val = str(val).strip()
if not val or val.lower() in ("", "n/a", "none"):
return None
try:
return float(val)
except (ValueError, TypeError):
return None
def _to_int(val):
if val is None:
return None
val = str(val).strip()
if not val or val.lower() in ("", "n/a", "none"):
return None
try:
return int(float(val))
except (ValueError, TypeError):
return None
def main():
    """Command-line entry point: download and load N-PORT quarters.

    For each requested quarter the data set is downloaded/extracted, then
    fund info, monthly returns and (unless ``--skip-holdings``) holdings are
    loaded. A failure in one quarter is logged with its traceback and does
    not stop the remaining quarters. Database statistics are printed at the
    end.
    """
    import argparse

    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s")

    parser = argparse.ArgumentParser(description="Load N-PORT data")
    parser.add_argument("--db", default="fund_data.db", help="Database path")
    parser.add_argument("--quarters", nargs="+",
                        default=["2025q3"],
                        help="Quarters to download (e.g. 2025q3 2025q4)")
    parser.add_argument("--data-dir", default="data/nport",
                        help="Directory for downloaded files")
    parser.add_argument("--skip-holdings", action="store_true",
                        help="Skip loading individual holdings (large)")
    parser.add_argument("--holdings-cik-filter", action="store_true",
                        help="Only load holdings for CIKs already in DB")
    args = parser.parse_args()

    db = FundDatabase(args.db)

    cik_filter = None
    if args.holdings_cik_filter:
        cik_filter = set(db.get_all_ciks())
        log.info("Filtering holdings to %d CIKs in database", len(cik_filter))

    failed_quarters = []
    for quarter in args.quarters:
        print(f"\n{'='*60}")
        print(f"Processing N-PORT {quarter}")
        print(f"{'='*60}")
        try:
            extract_dir = download_nport_zip(quarter, args.data_dir)
            load_fund_reported_info(db, extract_dir, quarter)
            load_monthly_returns(db, extract_dir, quarter)
            if not args.skip_holdings:
                load_holdings(db, extract_dir, quarter, cik_filter=cik_filter)
        except Exception as e:
            # Top-level boundary: keep going with the next quarter, but log
            # the traceback so the cause can actually be diagnosed.
            log.exception("Failed to process %s: %s", quarter, e)
            failed_quarters.append(quarter)

    if failed_quarters:
        log.warning("Quarters that failed to load: %s",
                    ", ".join(failed_quarters))

    stats = db.get_stats()
    print("\nDatabase stats:")
    for table, count in stats.items():
        if count > 0:
            print(f"  {table:30s} {count:>10,}")


if __name__ == "__main__":
    main()