Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.
- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
all books per trust), samples (per-fund segmentation, marker + plain
serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
301 lines
11 KiB
Python
301 lines
11 KiB
Python
"""
|
|
Step 4: Load N-PORT data (portfolio holdings, fund-level financials, returns).
|
|
|
|
Downloads quarterly N-PORT data set ZIPs from SEC, parses the TSV files,
|
|
and loads holdings, fund-level info, and monthly returns into the database.
|
|
"""
|
|
|
|
import csv
import logging
import math
import time
import zipfile
from collections import defaultdict
from pathlib import Path

import requests
from tqdm import tqdm

from fund_db import FundDatabase
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Identification string sent as the User-Agent header on every request
# issued through _throttled_get.
USER_AGENT = "FundDataResearch/1.0 research@university.edu"
# Headers merged into every outgoing request by _throttled_get.
HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"}
# Minimum spacing, in seconds, that _throttled_get keeps between two
# consecutive requests (0.12 s is roughly 8 requests per second).
REQUEST_INTERVAL = 0.12
# Timestamp of the most recent request; read and updated by _throttled_get.
_last_request_time = 0.0

# Base URL under which download_nport_zip looks for "<quarter>_nport.zip".
NPORT_BASE_URL = "https://www.sec.gov/files/dera/data/form-n-port-data-sets"
|
|
|
|
|
|
def _throttled_get(url: str, **kwargs) -> requests.Response:
    """Issue a rate-limited GET request and return the successful response.

    Sleeps as needed so that consecutive calls are spaced at least
    ``REQUEST_INTERVAL`` seconds apart, merges the module-level ``HEADERS``
    into the request headers and applies a 300-second default timeout.

    Args:
        url: Address to fetch.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``.
            Entries of ``HEADERS`` win over same-named caller headers.

    Returns:
        The response, after ``raise_for_status`` has accepted its status.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Any exception raised by ``requests.get`` itself propagates unchanged.
    """
    global _last_request_time

    # A monotonic clock cannot step backwards, so a wall-clock adjustment can
    # never turn into an unexpectedly long sleep. (With the initial value of
    # 0.0 the very first call may at worst wait one extra interval.)
    wait = REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
    if wait > 0:
        time.sleep(wait)

    # Build a fresh dict instead of updating the caller's in place; this also
    # tolerates an explicit ``headers=None``.
    kwargs["headers"] = {**(kwargs.get("headers") or {}), **HEADERS}
    kwargs.setdefault("timeout", 300)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Failed attempts hit the server too, so they count for throttling.
        _last_request_time = time.monotonic()

    resp.raise_for_status()
    return resp
|
|
|
|
|
|
def download_nport_zip(quarter: str, output_dir: str = "data/nport") -> Path:
    """Download a quarterly N-PORT ZIP from SEC and extract it.

    The archive is stored as ``<output_dir>/<quarter>_nport.zip`` and unpacked
    into ``<output_dir>/<quarter>``. When that directory already exists and is
    non-empty, nothing is downloaded and it is returned as-is.

    Both steps are staged so an interrupted run cannot be mistaken for a
    finished one: the download goes to a ``.part`` file that is renamed only
    once complete, and extraction goes to a ``.extracting`` directory that is
    renamed only once every member has been written.

    Args:
        quarter: Quarter label used in the archive name (e.g. ``2025q3``).
        output_dir: Directory for the archive and the extracted files.

    Returns:
        Path of the directory holding the extracted files.

    Raises:
        zipfile.BadZipFile: If the downloaded file is not a valid archive.
        Exceptions from ``_throttled_get`` and file-system errors propagate.
    """
    import shutil  # only needed here; keeps the file's import block untouched

    url = f"{NPORT_BASE_URL}/{quarter}_nport.zip"
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    zip_path = out / f"{quarter}_nport.zip"
    extract_dir = out / quarter

    if extract_dir.exists() and any(extract_dir.iterdir()):
        log.info("Already extracted: %s", extract_dir)
        return extract_dir

    if zip_path.exists() and zipfile.is_zipfile(zip_path):
        # A truncated archive lacks its end-of-archive record and fails
        # is_zipfile, so only an intact file is reused.
        log.info("Reusing existing archive: %s", zip_path)
    else:
        log.info("Downloading N-PORT data set: %s (this may take several minutes)", url)
        part_path = zip_path.with_name(zip_path.name + ".part")
        try:
            # The context manager releases the streamed connection even when
            # the transfer is aborted half-way.
            with _throttled_get(url, stream=True) as resp:
                total_size = int(resp.headers.get("content-length", 0))
                with open(part_path, "wb") as fp, tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {quarter}",
                ) as pbar:
                    for chunk in resp.iter_content(chunk_size=65536):
                        if not chunk:
                            continue
                        fp.write(chunk)
                        pbar.update(len(chunk))
            part_path.replace(zip_path)
        finally:
            # After a successful rename the .part file no longer exists; after
            # a failure this removes the incomplete download.
            if part_path.exists():
                part_path.unlink()

    staging_dir = out / f"{quarter}.extracting"
    if staging_dir.exists():
        # Leftover from an earlier, interrupted extraction.
        shutil.rmtree(staging_dir)

    log.info("Extracting %s", zip_path)
    with zipfile.ZipFile(zip_path, "r") as zf:
        zf.extractall(staging_dir)

    if extract_dir.exists():
        # Reaching this point means the directory is empty (see check above).
        extract_dir.rmdir()
    staging_dir.rename(extract_dir)

    log.info("Extracted to %s", extract_dir)
    return extract_dir
|
|
|
|
|
|
def _find_tsv(extract_dir: Path, pattern: str) -> Path:
|
|
"""Find a TSV file matching a pattern in the extract directory."""
|
|
candidates = list(extract_dir.rglob(f"*{pattern}*"))
|
|
if candidates:
|
|
return candidates[0]
|
|
return extract_dir / f"{pattern}.tsv"
|
|
|
|
|
|
def _read_tsv_streaming(filepath: Path, chunk_size: int = 10000):
|
|
"""Read a large TSV file in chunks, yielding lists of dicts."""
|
|
if not filepath.exists():
|
|
log.warning("File not found: %s", filepath)
|
|
return
|
|
|
|
with open(filepath, "r", encoding="utf-8", errors="replace") as f:
|
|
reader = csv.DictReader(f, delimiter="\t")
|
|
chunk = []
|
|
for row in reader:
|
|
chunk.append(row)
|
|
if len(chunk) >= chunk_size:
|
|
yield chunk
|
|
chunk = []
|
|
if chunk:
|
|
yield chunk
|
|
|
|
|
|
def load_fund_reported_info(db: FundDatabase, extract_dir: Path, quarter: str):
    """Load the FUND_REPORTED_INFO table from N-PORT data.

    Each row with a non-empty ACCESSION_NUMBER is written to
    ``nport_fund_info`` with ``INSERT OR IGNORE``. Rows whose insert raises
    are skipped; the number skipped is reported once at warning level.

    Args:
        db: Database wrapper; ``db.conn()`` must be a context manager yielding
            an object with ``execute(sql, params)``.
        extract_dir: Directory holding the extracted quarter.
        quarter: Quarter label, used only in log messages.

    Returns:
        Number of rows submitted without error (rows the database ignored as
        duplicates are still counted), or 0 when the file is missing.
    """
    # NOTE(review): CIK and REPORT_DATE are read from this file; confirm that
    # FUND_REPORTED_INFO really carries them. An absent CIK column is stored
    # as "0000000000" and an absent REPORT_DATE as "".
    filepath = _find_tsv(extract_dir, "FUND_REPORTED_INFO")
    if not filepath.exists():
        log.warning("FUND_REPORTED_INFO not found in %s", extract_dir)
        return 0

    insert_sql = """
        INSERT OR IGNORE INTO nport_fund_info
        (accession_number, cik, series_id, report_date,
         total_assets, total_liabilities, net_assets)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """

    def text(row, key):
        # DictReader fills columns missing from a short row with None.
        return (row.get(key) or "").strip()

    count = 0
    failed = 0
    with db.conn() as c:
        for chunk in _read_tsv_streaming(filepath):
            for row in chunk:
                acc = text(row, "ACCESSION_NUMBER")
                if not acc:
                    continue
                params = (
                    acc,
                    text(row, "CIK").zfill(10),
                    text(row, "SERIES_ID"),
                    text(row, "REPORT_DATE"),
                    _to_float(row.get("TOTAL_ASSETS")),
                    _to_float(row.get("TOTAL_LIABILITIES")),
                    _to_float(row.get("NET_ASSETS")),
                )
                try:
                    c.execute(insert_sql, params)
                except Exception as e:
                    # Best effort per row: the driver's exception type is not
                    # visible from this module, so the catch stays broad.
                    failed += 1
                    log.debug("Fund info insert error: %s", e)
                else:
                    count += 1

    if failed:
        log.warning("Skipped %d fund info records from %s after insert errors",
                    failed, quarter)
    log.info("Loaded %d fund info records from %s", count, quarter)
    return count
|
|
|
|
|
|
def load_monthly_returns(db: FundDatabase, extract_dir: Path, quarter: str):
    """Load monthly total returns from N-PORT data.

    Each row of MONTHLY_TOTAL_RETURN with a non-empty ACCESSION_NUMBER is
    written to ``nport_monthly_return`` with ``INSERT OR IGNORE``. Rows whose
    insert raises are skipped; the number skipped is reported once at warning
    level.

    Args:
        db: Database wrapper; ``db.conn()`` must be a context manager yielding
            an object with ``execute(sql, params)``.
        extract_dir: Directory holding the extracted quarter.
        quarter: Quarter label, used only in log messages.

    Returns:
        Number of rows submitted without error (rows the database ignored as
        duplicates are still counted), or 0 when the file is missing.
    """
    # NOTE(review): CIK and REPORT_DATE are read from this file; confirm that
    # MONTHLY_TOTAL_RETURN really carries them. An absent CIK column is stored
    # as "0000000000" and an absent REPORT_DATE as "".
    filepath = _find_tsv(extract_dir, "MONTHLY_TOTAL_RETURN")
    if not filepath.exists():
        log.warning("MONTHLY_TOTAL_RETURN not found in %s", extract_dir)
        return 0

    insert_sql = """
        INSERT OR IGNORE INTO nport_monthly_return
        (accession_number, cik, class_id, report_date,
         month1_return, month2_return, month3_return)
        VALUES (?, ?, ?, ?, ?, ?, ?)
    """

    def text(row, key):
        # DictReader fills columns missing from a short row with None.
        return (row.get(key) or "").strip()

    count = 0
    failed = 0
    with db.conn() as c:
        for chunk in _read_tsv_streaming(filepath):
            for row in chunk:
                acc = text(row, "ACCESSION_NUMBER")
                if not acc:
                    continue
                params = (
                    acc,
                    text(row, "CIK").zfill(10),
                    text(row, "CLASS_ID"),
                    text(row, "REPORT_DATE"),
                    _to_float(row.get("MONTHLY_TOTAL_RETURN1")),
                    _to_float(row.get("MONTHLY_TOTAL_RETURN2")),
                    _to_float(row.get("MONTHLY_TOTAL_RETURN3")),
                )
                try:
                    c.execute(insert_sql, params)
                except Exception as e:
                    # Best effort per row: the driver's exception type is not
                    # visible from this module, so the catch stays broad.
                    failed += 1
                    log.debug("Monthly return insert error: %s", e)
                else:
                    count += 1

    if failed:
        log.warning("Skipped %d monthly return records from %s after insert errors",
                    failed, quarter)
    log.info("Loaded %d monthly return records from %s", count, quarter)
    return count
|
|
|
|
|
|
def load_holdings(db: FundDatabase, extract_dir: Path, quarter: str,
                  cik_filter: set = None):
    """
    Load portfolio holdings from N-PORT data.

    Rows of FUND_REPORTED_HOLDING are converted to dicts and handed to
    ``db.bulk_insert_holdings`` in batches of 5000.

    If cik_filter is provided, only load holdings for those CIKs. The filter
    is compared against the CIK zero-padded to 10 digits. An empty set is
    treated as "provided" and therefore loads nothing; pass ``None`` to load
    every row.

    Args:
        db: Database wrapper providing ``bulk_insert_holdings(list_of_dicts)``.
        extract_dir: Directory holding the extracted quarter.
        quarter: Quarter label, used only in log messages.
        cik_filter: Optional set of 10-digit CIK strings to keep.

    Returns:
        Number of holdings passed to the database, or 0 when the file is
        missing.
    """
    # NOTE(review): the column names read below (CIK, REPORT_DATE,
    # NAME_OF_ISSUER, CUSIP, VAL_USD, COUPON_RATE, IS_DEFAULT, ...) should be
    # checked against the published FUND_REPORTED_HOLDING layout. A column
    # that is absent silently becomes "" / None, and a missing CIK column
    # would make every row "0000000000".
    filepath = _find_tsv(extract_dir, "FUND_REPORTED_HOLDING")
    if not filepath.exists():
        log.warning("FUND_REPORTED_HOLDING not found in %s", extract_dir)
        return 0

    def text(row, key):
        # DictReader fills columns missing from a short row with None; calling
        # .strip() on that would abort the whole load.
        return (row.get(key) or "").strip()

    count = 0
    batch = []
    batch_size = 5000

    for chunk in _read_tsv_streaming(filepath, chunk_size=10000):
        for row in chunk:
            cik = text(row, "CIK").zfill(10)
            if cik_filter is not None and cik not in cik_filter:
                continue

            asset_cat = text(row, "ASSET_CAT")
            # CUR_CD is only consulted when the file has no CURRENCY_CODE column.
            currency_key = "CURRENCY_CODE" if "CURRENCY_CODE" in row else "CUR_CD"

            batch.append({
                "accession_number": text(row, "ACCESSION_NUMBER"),
                "cik": cik,
                "report_date": text(row, "REPORT_DATE"),
                "holding_name": text(row, "NAME_OF_ISSUER")[:200],
                "lei": text(row, "LEI"),
                "cusip": text(row, "CUSIP"),
                "isin": text(row, "ISIN"),
                "ticker": text(row, "TICKER"),
                "asset_category": asset_cat,
                "issuer_category": text(row, "ISSUER_CAT"),
                "inv_country": text(row, "INV_COUNTRY"),
                "currency": text(row, currency_key),
                "quantity": _to_float(row.get("BALANCE")),
                "value_usd": _to_float(row.get("VAL_USD", row.get("VALUE_USD"))),
                "pct_val": _to_float(row.get("PCT_VAL")),
                # N-PORT codes debt as "DBT"; the derivative categories (DCO,
                # DCR, DE, DFE, DIR, DO) also begin with "D", so a prefix test
                # would flag them as debt.
                "is_debt": 1 if asset_cat.upper() == "DBT" else 0,
                "coupon_rate": _to_float(row.get("COUPON_RATE")),
                "maturity_date": text(row, "MATURITY_DATE"),
                # NOTE(review): if IS_DEFAULT is a Y/N flag, _to_int yields
                # None for every row - confirm the column's encoding.
                "is_default": _to_int(row.get("IS_DEFAULT")),
                "fair_value_level": text(row, "FAIR_VAL_LEVEL"),
            })
            count += 1

            if len(batch) >= batch_size:
                db.bulk_insert_holdings(batch)
                batch = []

    if batch:
        db.bulk_insert_holdings(batch)

    log.info("Loaded %d holdings from %s", count, quarter)
    return count
|
|
|
|
|
|
def _to_float(val):
|
|
if val is None:
|
|
return None
|
|
val = str(val).strip()
|
|
if not val or val.lower() in ("", "n/a", "none"):
|
|
return None
|
|
try:
|
|
return float(val)
|
|
except (ValueError, TypeError):
|
|
return None
|
|
|
|
|
|
def _to_int(val):
|
|
if val is None:
|
|
return None
|
|
val = str(val).strip()
|
|
if not val or val.lower() in ("", "n/a", "none"):
|
|
return None
|
|
try:
|
|
return int(float(val))
|
|
except (ValueError, TypeError):
|
|
return None
|
|
|
|
|
|
def main():
    """Command-line entry point: download and load N-PORT quarters.

    Parses the command line, opens the database, then for each requested
    quarter downloads/extracts the archive and loads fund info, monthly
    returns and (unless ``--skip-holdings`` is given) holdings. A quarter that
    raises is logged with its traceback and the remaining quarters are still
    processed. Database statistics are printed at the end.
    """
    import argparse

    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s")

    parser = argparse.ArgumentParser(description="Load N-PORT data")
    parser.add_argument("--db", default="fund_data.db", help="Database path")
    parser.add_argument("--quarters", nargs="+",
                        default=["2025q3"],
                        help="Quarters to download (e.g. 2025q3 2025q4)")
    parser.add_argument("--data-dir", default="data/nport",
                        help="Directory for downloaded files")
    parser.add_argument("--skip-holdings", action="store_true",
                        help="Skip loading individual holdings (large)")
    parser.add_argument("--holdings-cik-filter", action="store_true",
                        help="Only load holdings for CIKs already in DB")
    args = parser.parse_args()

    db = FundDatabase(args.db)

    cik_filter = None
    if args.holdings_cik_filter:
        # load_holdings compares against CIKs zero-padded to 10 digits, so the
        # filter is brought to the same form in case the database keeps them
        # unpadded or as integers.
        cik_filter = {str(cik).strip().zfill(10) for cik in db.get_all_ciks()}
        log.info("Filtering holdings to %d CIKs in database", len(cik_filter))

    failed_quarters = []
    for quarter in args.quarters:
        print(f"\n{'='*60}")
        print(f"Processing N-PORT {quarter}")
        print(f"{'='*60}")
        try:
            extract_dir = download_nport_zip(quarter, args.data_dir)
            load_fund_reported_info(db, extract_dir, quarter)
            load_monthly_returns(db, extract_dir, quarter)

            if not args.skip_holdings:
                load_holdings(db, extract_dir, quarter, cik_filter=cik_filter)
        except Exception as e:
            # Top-level boundary: keep going with the next quarter, but keep
            # the traceback so the failure can be diagnosed.
            log.error("Failed to process %s: %s", quarter, e, exc_info=True)
            failed_quarters.append(quarter)

    if failed_quarters:
        log.warning("Quarters that failed: %s", ", ".join(failed_quarters))

    stats = db.get_stats()
    print("\nDatabase stats:")
    for table, count in stats.items():
        if count > 0:
            print(f" {table:30s} {count:>10,}")


if __name__ == "__main__":
    main()
|