fund_rfid_data/load_ncen.py
Florian Herzog 1993658fb2 Add SEC fund prospectus -> RDF triple dataset pipeline
Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.

- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
  all books per trust), samples (per-fund segmentation, marker + plain
  serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 10:31:35 +02:00

247 lines
8.1 KiB
Python

"""
Step 5: Load N-CEN data (service providers, fund classification, ETF info).
Downloads quarterly N-CEN data set ZIPs from SEC, parses the TSV files,
and loads fund classification and service provider data into the database.
"""
import csv
import logging
import time
import zipfile
from pathlib import Path
import requests
from tqdm import tqdm
from fund_db import FundDatabase
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Identification string sent with every request through HEADERS.
# NOTE(review): presumably there to satisfy SEC's request for a declared
# User-Agent with a contact address — confirm the address is a real contact.
USER_AGENT = "FundDataResearch/1.0 research@university.edu"
# Headers that _throttled_get applies to each outgoing request.
HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"}
# Minimum number of seconds _throttled_get leaves between two requests.
REQUEST_INTERVAL = 0.12
# Time of the most recent request, as recorded and read by _throttled_get.
_last_request_time = 0.0
# URL prefix under which download_ncen_zip requests "<quarter>_ncen.zip".
NCEN_BASE_URL = "https://www.sec.gov/files/dera/data/form-n-cen-data-sets"
def _throttled_get(url: str, **kwargs) -> requests.Response:
    """Issue a rate-limited GET request and return the successful response.

    Waits as needed so that at least ``REQUEST_INTERVAL`` seconds pass
    between consecutive calls, then sends the request with the module's
    ``HEADERS`` applied on top of any caller-supplied headers.

    Args:
        url: Address to fetch.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``.
            ``timeout`` defaults to 120 seconds when not given.

    Returns:
        The response, after ``raise_for_status()`` has passed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        Any exception raised by ``requests.get`` itself propagates unchanged.
    """
    global _last_request_time

    # Monotonic clock: a wall-clock adjustment must not stretch the wait.
    wait = REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
    if wait > 0:
        # Clamp so the sleep can never exceed one interval, whatever value
        # the shared timestamp happens to hold.
        time.sleep(min(wait, REQUEST_INTERVAL))

    # Build a fresh dict instead of updating the caller's in place; this also
    # tolerates an explicit headers=None. HEADERS still take precedence.
    kwargs["headers"] = {**(kwargs.get("headers") or {}), **HEADERS}
    kwargs.setdefault("timeout", 120)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Failed attempts hit the server too, so they count for throttling.
        _last_request_time = time.monotonic()
    resp.raise_for_status()
    return resp
def download_ncen_zip(quarter: str, output_dir: str = "data/ncen") -> Path:
    """Download a quarterly N-CEN ZIP from SEC and extract it.

    Args:
        quarter: Quarter label used in the file name, e.g. ``"2025q3"``.
        output_dir: Directory that receives the ZIP and the extracted folder.

    Returns:
        Path of the directory holding the extracted files
        (``<output_dir>/<quarter>``). If that directory already exists and is
        non-empty, nothing is downloaded and it is returned as is.

    Raises:
        requests.HTTPError: If the download is answered with an error status.
        zipfile.BadZipFile: If the downloaded file is not a valid ZIP archive.
    """
    import shutil

    url = f"{NCEN_BASE_URL}/{quarter}_ncen.zip"
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    zip_path = out / f"{quarter}_ncen.zip"
    extract_dir = out / quarter

    if extract_dir.exists() and any(extract_dir.iterdir()):
        log.info("Already extracted: %s", extract_dir)
        return extract_dir

    log.info("Downloading N-CEN data set: %s", url)
    # Stream into a side file and rename afterwards, so an interrupted
    # download never leaves a truncated archive under the final name.
    part_path = zip_path.with_name(zip_path.name + ".part")
    try:
        # The response is used as a context manager so the streamed
        # connection is released even if writing fails.
        with _throttled_get(url, stream=True) as resp, open(part_path, "wb") as fp:
            for chunk in resp.iter_content(chunk_size=65536):
                if chunk:
                    fp.write(chunk)
    except Exception:
        part_path.unlink(missing_ok=True)
        raise
    part_path.replace(zip_path)

    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(extract_dir)
    except Exception:
        # A half-extracted directory would pass the "already extracted" check
        # on the next run, so remove it before re-raising.
        shutil.rmtree(extract_dir, ignore_errors=True)
        raise

    log.info("Extracted to %s", extract_dir)
    return extract_dir
def _find_tsv(extract_dir: Path, pattern: str) -> Path:
candidates = list(extract_dir.rglob(f"*{pattern}*"))
tsv_candidates = [c for c in candidates if c.suffix.lower() in (".tsv", ".txt", "")]
if tsv_candidates:
return tsv_candidates[0]
if candidates:
return candidates[0]
return extract_dir / f"{pattern}.tsv"
def _read_tsv(filepath: Path) -> list[dict]:
if not filepath.exists():
log.warning("File not found: %s", filepath)
return []
with open(filepath, "r", encoding="utf-8", errors="replace") as f:
reader = csv.DictReader(f, delimiter="\t")
return list(reader)
def load_etf_data(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load ETF classification data from N-CEN.

    Every usable row of the ETF table is written to ``ncen_fund_info`` with
    ``is_etf = 1`` and ``fund_type = 'ETF'``.

    Args:
        db: Database wrapper; ``db.conn()`` must yield an object with
            ``execute``.
        extract_dir: Directory holding the extracted N-CEN files.
        quarter: Quarter label, stored as ``report_period``.

    Returns:
        Number of rows inserted without error (0 when no ETF file exists).
    """
    filepath = _find_tsv(extract_dir, "ETF")
    if not filepath.exists():
        log.info("No ETF file found in %s", extract_dir)
        return 0

    rows = _read_tsv(filepath)
    count = 0
    failed = 0
    with db.conn() as c:
        for row in rows:
            # DictReader yields None for cells missing from a short row,
            # hence the "or ''" before strip().
            raw_cik = (row.get("CIK") or "").strip()
            acc = (row.get("ACCESSION_NUMBER") or "").strip()
            # Check before padding: a zero-padded empty CIK is never falsy.
            if not raw_cik or not acc:
                continue
            cik = raw_cik.zfill(10)
            series_id = (row.get("SERIES_ID") or "").strip()
            index_name = (row.get("INDEX_NAME") or "").strip()
            try:
                # NOTE(review): OR REPLACE overwrites the whole existing row,
                # including any is_index_fund flag — confirm this is intended.
                c.execute("""
                    INSERT OR REPLACE INTO ncen_fund_info
                    (accession_number, cik, series_id, report_period, is_etf,
                    index_name, fund_type)
                    VALUES (?, ?, ?, ?, 1, ?, 'ETF')
                """, (acc, cik, series_id, quarter, index_name))
                count += 1
            except Exception as e:
                # Best effort per row; the summary below makes failures visible.
                failed += 1
                log.debug("ETF insert error: %s", e)

    if failed:
        log.warning("%d ETF rows from %s could not be inserted", failed, quarter)
    log.info("Loaded %d ETF records from %s", count, quarter)
    return count
def load_index_data(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load index fund data from N-CEN.

    Each usable row of the INDEX table is upserted into ``ncen_fund_info``:
    a new row is inserted with ``is_index_fund = 1``; for an existing
    ``(accession_number, series_id)`` row only ``is_index_fund`` and, when the
    new value is non-empty, ``index_name`` are updated.

    Args:
        db: Database wrapper; ``db.conn()`` must yield an object with
            ``execute``.
        extract_dir: Directory holding the extracted N-CEN files.
        quarter: Quarter label, stored as ``report_period`` on insert.

    Returns:
        Number of rows written without error (0 when no INDEX file exists).
    """
    filepath = _find_tsv(extract_dir, "INDEX")
    if not filepath.exists():
        log.info("No INDEX file found in %s", extract_dir)
        return 0

    rows = _read_tsv(filepath)
    count = 0
    failed = 0
    with db.conn() as c:
        for row in rows:
            # DictReader yields None for cells missing from a short row,
            # hence the "or ''" before strip().
            raw_cik = (row.get("CIK") or "").strip()
            acc = (row.get("ACCESSION_NUMBER") or "").strip()
            # Check before padding: a zero-padded empty CIK is never falsy.
            if not raw_cik or not acc:
                continue
            cik = raw_cik.zfill(10)
            series_id = (row.get("SERIES_ID") or "").strip()
            # NAME is consulted only when there is no INDEX_NAME column.
            index_name = (row.get("INDEX_NAME", row.get("NAME", "")) or "").strip()
            try:
                # Plain INSERT with an upsert clause. Combining it with
                # OR REPLACE would let REPLACE delete the existing row on any
                # other uniqueness conflict and lose its remaining columns.
                # NOTE(review): requires a UNIQUE/PRIMARY KEY constraint on
                # (accession_number, series_id) — confirm against the schema.
                c.execute("""
                    INSERT INTO ncen_fund_info
                    (accession_number, cik, series_id, report_period,
                    is_index_fund, index_name)
                    VALUES (?, ?, ?, ?, 1, ?)
                    ON CONFLICT(accession_number, series_id) DO UPDATE SET
                    is_index_fund=1,
                    index_name=COALESCE(NULLIF(excluded.index_name,''), ncen_fund_info.index_name)
                """, (acc, cik, series_id, quarter, index_name))
                count += 1
            except Exception as e:
                # Best effort per row; the summary below makes failures visible.
                failed += 1
                log.debug("Index insert error: %s", e)

    if failed:
        log.warning(
            "%d index fund rows from %s could not be written", failed, quarter
        )
    log.info("Loaded %d index fund records from %s", count, quarter)
    return count
def load_service_providers(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load service provider data from various N-CEN tables.

    For each known provider table that is present, every row with a CIK and a
    provider name is submitted to ``ncen_service_provider`` with
    ``INSERT OR IGNORE``.

    Args:
        db: Database wrapper; ``db.conn()`` must yield an object with
            ``execute``.
        extract_dir: Directory holding the extracted N-CEN files.
        quarter: Quarter label, stored as ``report_period``.

    Returns:
        Number of rows submitted without error across all tables. Rows that
        the database ignores as duplicates are still counted.
    """
    # File-name pattern -> role label stored in provider_role.
    provider_files = {
        "CUSTODIAN": "custodian",
        "TRANSFER_AGENT": "transfer_agent",
        "ADVISOR": "adviser",
        "SUB_ADVISOR": "sub_adviser",
        "ADMINISTRATOR": "administrator",
        "AUDITOR": "auditor",
    }
    # Columns that may carry the provider name, in order of preference.
    name_columns = ("NAME", "COMPANY_NAME", "CUSTODIAN_NAME", "FIRM_NAME")

    total = 0
    for filename, role in provider_files.items():
        filepath = _find_tsv(extract_dir, filename)
        if not filepath.exists():
            continue

        rows = _read_tsv(filepath)
        count = 0
        failed = 0
        with db.conn() as c:
            for row in rows:
                # DictReader yields None for cells missing from a short row,
                # hence the "or ''" before strip().
                raw_cik = (row.get("CIK") or "").strip()
                # First name column with a non-blank value wins; a
                # whitespace-only cell must not block the fallbacks.
                stripped = ((row.get(col) or "").strip() for col in name_columns)
                name = next((value for value in stripped if value), "")
                # Check before padding: a zero-padded empty CIK is never falsy.
                if not raw_cik or not name:
                    continue
                cik = raw_cik.zfill(10)
                lei = (row.get("LEI") or "").strip()
                try:
                    c.execute("""
                        INSERT OR IGNORE INTO ncen_service_provider
                        (cik, report_period, provider_role, provider_name, provider_lei)
                        VALUES (?, ?, ?, ?, ?)
                    """, (cik, quarter, role, name, lei))
                    count += 1
                except Exception as e:
                    # Best effort per row; the summary below makes failures
                    # visible.
                    failed += 1
                    log.debug("Service provider insert error: %s", e)

        if failed:
            log.warning(
                "%d %s rows from %s could not be inserted", failed, role, quarter
            )
        log.info(" %s: %d records", role, count)
        total += count

    log.info("Loaded %d total service provider records from %s", total, quarter)
    return total
def main():
    """Command-line entry point: download and load N-CEN data per quarter.

    Parses ``--db``, ``--quarters`` and ``--data-dir``, processes each quarter
    independently (a failure in one quarter does not stop the others), then
    prints the non-empty table counts reported by ``db.get_stats()``.

    Returns:
        0 when every quarter was processed, 1 when at least one failed.
    """
    import argparse

    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s")
    parser = argparse.ArgumentParser(description="Load N-CEN data")
    parser.add_argument("--db", default="fund_data.db", help="Database path")
    parser.add_argument("--quarters", nargs="+",
                        default=["2025q3"],
                        help="Quarters to download (e.g. 2025q3)")
    parser.add_argument("--data-dir", default="data/ncen",
                        help="Directory for downloaded files")
    args = parser.parse_args()

    db = FundDatabase(args.db)
    failed_quarters = []
    for quarter in args.quarters:
        print(f"\n{'='*60}")
        print(f"Processing N-CEN {quarter}")
        print(f"{'='*60}")
        try:
            extract_dir = download_ncen_zip(quarter, args.data_dir)
            load_etf_data(db, extract_dir, quarter)
            load_index_data(db, extract_dir, quarter)
            load_service_providers(db, extract_dir, quarter)
        except Exception as e:
            # Keep going with the remaining quarters, but keep the traceback
            # and remember the failure for the exit status.
            failed_quarters.append(quarter)
            log.exception("Failed to process %s: %s", quarter, e)

    stats = db.get_stats()
    print("\nDatabase stats:")
    for table, count in stats.items():
        if count > 0:
            print(f" {table:30s} {count:>10,}")

    if failed_quarters:
        log.error("Quarters that failed: %s", ", ".join(failed_quarters))
        return 1
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so a failed run is visible to the shell.
    raise SystemExit(main())