Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.
- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
all books per trust), samples (per-fund segmentation, marker + plain
serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
247 lines
8.1 KiB
Python
247 lines
8.1 KiB
Python
"""
Step 5: Load N-CEN data (service providers, fund classification, ETF info).

Downloads quarterly N-CEN data set ZIPs from SEC, parses the TSV files,
and loads fund classification and service provider data into the database.
"""
|
|
|
|
import csv
|
|
import logging
|
|
import time
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
from tqdm import tqdm
|
|
|
|
from fund_db import FundDatabase
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Identification and encoding headers merged into every request made here.
USER_AGENT = "FundDataResearch/1.0 research@university.edu"
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept-Encoding": "gzip, deflate",
}

# Minimum spacing, in seconds, kept between two consecutive requests.
REQUEST_INTERVAL = 0.12
# Clock reading taken at the most recent request; maintained by _throttled_get().
_last_request_time = 0.0

# Base URL; download_ncen_zip() appends "/<quarter>_ncen.zip" to it.
NCEN_BASE_URL = "https://www.sec.gov/files/dera/data/form-n-cen-data-sets"
|
|
|
|
|
|
def _throttled_get(url: str, **kwargs) -> requests.Response:
    """Issue a rate-limited GET request and return the successful response.

    Sleeps as needed so that consecutive calls are at least
    ``REQUEST_INTERVAL`` seconds apart, merges the module ``HEADERS`` into
    the request headers (module values win on key clashes) and applies a
    120-second timeout unless the caller supplies one.

    Args:
        url: Address to fetch.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``.

    Returns:
        The response, after ``raise_for_status()`` has passed.

    Raises:
        requests.RequestException: On connection/timeout failures or when
            the server answers with an error status.
    """
    global _last_request_time

    # A monotonic clock measures intervals correctly even if the wall clock
    # is adjusted while the script runs.
    wait = REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
    if wait > 0:
        time.sleep(wait)

    # Build a fresh dict so the caller's headers mapping is never mutated;
    # ``headers=None`` is tolerated as well.
    kwargs["headers"] = {**(kwargs.get("headers") or {}), **HEADERS}
    kwargs.setdefault("timeout", 120)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Record the attempt even when it fails so retries stay throttled.
        _last_request_time = time.monotonic()

    resp.raise_for_status()
    return resp
|
|
|
|
|
|
def download_ncen_zip(quarter: str, output_dir: str = "data/ncen") -> Path:
    """Download a quarterly N-CEN ZIP from SEC and extract it.

    The archive is fetched from ``{NCEN_BASE_URL}/{quarter}_ncen.zip``,
    stored as ``<output_dir>/<quarter>_ncen.zip`` and unpacked into
    ``<output_dir>/<quarter>``. If that directory already exists and is not
    empty, nothing is downloaded.

    Args:
        quarter: Quarter label used in the file name (e.g. ``"2025q3"``).
        output_dir: Directory that receives the archive and the extraction.

    Returns:
        Path of the directory holding the extracted files.

    Raises:
        requests.RequestException: If the download fails.
        zipfile.BadZipFile: If the downloaded archive cannot be read.
    """
    import shutil  # local import: only needed for cleanup in this function

    url = f"{NCEN_BASE_URL}/{quarter}_ncen.zip"
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    zip_path = out / f"{quarter}_ncen.zip"
    extract_dir = out / quarter

    if extract_dir.exists() and any(extract_dir.iterdir()):
        log.info("Already extracted: %s", extract_dir)
        return extract_dir

    log.info("Downloading N-CEN data set: %s", url)

    # Stream into a side file and rename only once complete, so an
    # interrupted download never leaves a truncated archive at zip_path.
    partial_path = zip_path.with_name(zip_path.name + ".part")
    try:
        # The response is used as a context manager so the streamed
        # connection is released even if writing fails.
        with _throttled_get(url, stream=True) as resp, open(partial_path, "wb") as fp:
            for chunk in resp.iter_content(chunk_size=65536):
                if chunk:
                    fp.write(chunk)
        partial_path.replace(zip_path)
    finally:
        partial_path.unlink(missing_ok=True)

    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(extract_dir)
    except BaseException:
        # A half-extracted directory is non-empty and would be mistaken for
        # a finished extraction on the next run, so remove it.
        shutil.rmtree(extract_dir, ignore_errors=True)
        raise

    log.info("Extracted to %s", extract_dir)
    return extract_dir
|
|
|
|
|
|
def _find_tsv(extract_dir: Path, pattern: str) -> Path:
    """Locate the data file for table *pattern* somewhere below *extract_dir*.

    All regular files whose name contains *pattern* are considered. The
    choice among them is deterministic: files with a ``.tsv``/``.txt``/empty
    suffix come first, then a file whose stem equals *pattern*
    (case-insensitively), then the shortest name. This keeps ``"ADVISOR"``
    from resolving to ``SUB_ADVISOR.tsv`` when both files are present.

    Args:
        extract_dir: Root directory that is searched recursively.
        pattern: Substring expected in the file name.

    Returns:
        The best matching file, or ``extract_dir / f"{pattern}.tsv"`` (which
        does not exist) when nothing matches, so callers can test
        ``.exists()``.
    """
    # Directories can match the glob too; only real files can be read.
    files = [p for p in extract_dir.rglob(f"*{pattern}*") if p.is_file()]
    if not files:
        return extract_dir / f"{pattern}.tsv"

    wanted = pattern.lower()

    def rank(path: Path) -> tuple:
        return (
            path.suffix.lower() not in (".tsv", ".txt", ""),
            path.stem.lower() != wanted,
            len(path.name),
            path.as_posix(),  # final tie-breaker independent of scan order
        )

    return min(files, key=rank)
|
|
|
|
|
|
def _read_tsv(filepath: Path) -> list[dict]:
    """Read a tab-separated file with a header row into a list of dicts.

    Args:
        filepath: File to read.

    Returns:
        One dict per data row, keyed by the header names. Fields missing
        from a short row are returned as ``""``. An empty list is returned
        (and a warning logged) when the file does not exist.
    """
    if not filepath.exists():
        log.warning("File not found: %s", filepath)
        return []

    # utf-8-sig drops a leading BOM so the first header keeps its real name;
    # newline="" lets the csv module do its own line-ending handling, as its
    # documentation requires.
    with open(filepath, "r", encoding="utf-8-sig", errors="replace", newline="") as f:
        # restval="" keeps short rows from producing None values, which
        # callers would otherwise trip over when calling .strip().
        return list(csv.DictReader(f, delimiter="\t", restval=""))
|
|
|
|
|
|
def load_etf_data(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load ETF classification data from N-CEN.

    Reads the file matching ``"ETF"`` under *extract_dir* and writes one
    ``ncen_fund_info`` row per input row with ``is_etf=1`` and
    ``fund_type='ETF'``. Rows lacking a CIK or accession number are skipped.
    A row whose insert fails is skipped as well; the number of such rows is
    reported in a warning.

    Args:
        db: Database wrapper; ``db.conn()`` must yield an object with
            ``execute``.
        extract_dir: Directory holding the extracted N-CEN files.
        quarter: Quarter label, stored in the ``report_period`` column.

    Returns:
        Number of rows for which the insert statement succeeded.
    """
    filepath = _find_tsv(extract_dir, "ETF")
    if not filepath.exists():
        log.info("No ETF file found in %s", extract_dir)
        return 0

    rows = _read_tsv(filepath)
    count = 0
    failed = 0

    with db.conn() as c:
        for row in rows:
            # ``or ""`` guards against None values from short input rows.
            raw_cik = (row.get("CIK") or "").strip()
            acc = (row.get("ACCESSION_NUMBER") or "").strip()
            series_id = (row.get("SERIES_ID") or "").strip()

            # Test the raw value: "".zfill(10) is "0000000000", which is
            # truthy and would let rows without a CIK through.
            if not raw_cik or not acc:
                continue
            cik = raw_cik.zfill(10)

            try:
                c.execute("""
                    INSERT OR REPLACE INTO ncen_fund_info
                    (accession_number, cik, series_id, report_period, is_etf,
                     index_name, fund_type)
                    VALUES (?, ?, ?, ?, 1, ?, 'ETF')
                """, (
                    acc, cik, series_id, quarter,
                    (row.get("INDEX_NAME") or "").strip(),
                ))
                count += 1
            except Exception as e:
                # Best effort per row: keep loading, but remember the failure.
                failed += 1
                log.debug("ETF insert error: %s", e)

    if failed:
        log.warning("Skipped %d ETF rows from %s due to insert errors",
                    failed, quarter)
    log.info("Loaded %d ETF records from %s", count, quarter)
    return count
|
|
|
|
|
|
def load_index_data(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load index fund data from N-CEN.

    Reads the file matching ``"INDEX"`` under *extract_dir* and upserts one
    ``ncen_fund_info`` row per input row with ``is_index_fund=1``. On a
    conflict on ``(accession_number, series_id)`` the existing row is kept
    and only ``is_index_fund`` and, if the new value is non-empty,
    ``index_name`` are updated. Rows lacking a CIK or accession number are
    skipped; rows whose insert fails are skipped and counted in a warning.

    Args:
        db: Database wrapper; ``db.conn()`` must yield an object with
            ``execute``.
        extract_dir: Directory holding the extracted N-CEN files.
        quarter: Quarter label, stored in the ``report_period`` column.

    Returns:
        Number of rows for which the statement succeeded.
    """
    filepath = _find_tsv(extract_dir, "INDEX")
    if not filepath.exists():
        log.info("No INDEX file found in %s", extract_dir)
        return 0

    rows = _read_tsv(filepath)
    count = 0
    failed = 0

    with db.conn() as c:
        for row in rows:
            # ``or ""`` guards against None values from short input rows.
            raw_cik = (row.get("CIK") or "").strip()
            acc = (row.get("ACCESSION_NUMBER") or "").strip()
            series_id = (row.get("SERIES_ID") or "").strip()

            # Test the raw value: "".zfill(10) is "0000000000", which is
            # truthy and would let rows without a CIK through.
            if not raw_cik or not acc:
                continue
            cik = raw_cik.zfill(10)

            # NAME is consulted only when the INDEX_NAME column is absent.
            index_name = (row.get("INDEX_NAME", row.get("NAME")) or "").strip()

            try:
                # NOTE(review): this statement combines OR REPLACE with an
                # ON CONFLICT upsert and needs a UNIQUE constraint on
                # (accession_number, series_id) - confirm against the schema.
                c.execute("""
                    INSERT OR REPLACE INTO ncen_fund_info
                    (accession_number, cik, series_id, report_period,
                     is_index_fund, index_name)
                    VALUES (?, ?, ?, ?, 1, ?)
                    ON CONFLICT(accession_number, series_id) DO UPDATE SET
                    is_index_fund=1,
                    index_name=COALESCE(NULLIF(excluded.index_name,''), ncen_fund_info.index_name)
                """, (
                    acc, cik, series_id, quarter,
                    index_name,
                ))
                count += 1
            except Exception as e:
                # Best effort per row: keep loading, but remember the failure.
                failed += 1
                log.debug("Index insert error: %s", e)

    if failed:
        log.warning("Skipped %d index fund rows from %s due to insert errors",
                    failed, quarter)
    log.info("Loaded %d index fund records from %s", count, quarter)
    return count
|
|
|
|
|
|
def load_service_providers(db: FundDatabase, extract_dir: Path, quarter: str) -> int:
    """Load service provider data from various N-CEN tables.

    For each known provider table (custodian, transfer agent, adviser,
    sub-adviser, administrator, auditor) the matching file under
    *extract_dir* is read and one ``ncen_service_provider`` row is inserted
    per input row with ``INSERT OR IGNORE``. Tables without a file are
    skipped, as are rows lacking a CIK or a provider name. Rows whose insert
    fails are skipped and counted in a warning.

    Args:
        db: Database wrapper; ``db.conn()`` must yield an object with
            ``execute``.
        extract_dir: Directory holding the extracted N-CEN files.
        quarter: Quarter label, stored in the ``report_period`` column.

    Returns:
        Total number of rows, across all tables, for which the insert
        statement succeeded (rows ignored as duplicates are included).
    """
    # File-name fragment -> role label stored in provider_role.
    provider_files = {
        "CUSTODIAN": "custodian",
        "TRANSFER_AGENT": "transfer_agent",
        "ADVISOR": "adviser",
        "SUB_ADVISOR": "sub_adviser",
        "ADMINISTRATOR": "administrator",
        "AUDITOR": "auditor",
    }
    # Columns that may carry the provider name, in order of preference.
    name_columns = ("NAME", "COMPANY_NAME", "CUSTODIAN_NAME", "FIRM_NAME")

    total = 0
    for filename, role in provider_files.items():
        filepath = _find_tsv(extract_dir, filename)
        if not filepath.exists():
            continue

        rows = _read_tsv(filepath)
        count = 0
        failed = 0

        with db.conn() as c:
            for row in rows:
                # ``or ""`` guards against None values from short input rows.
                raw_cik = (row.get("CIK") or "").strip()
                lei = (row.get("LEI") or "").strip()

                name = ""
                for column in name_columns:
                    name = (row.get(column) or "").strip()
                    if name:
                        break

                # Test the raw value: "".zfill(10) is "0000000000", which is
                # truthy and would let rows without a CIK through.
                if not raw_cik or not name:
                    continue
                cik = raw_cik.zfill(10)

                try:
                    c.execute("""
                        INSERT OR IGNORE INTO ncen_service_provider
                        (cik, report_period, provider_role, provider_name, provider_lei)
                        VALUES (?, ?, ?, ?, ?)
                    """, (cik, quarter, role, name, lei))
                    count += 1
                except Exception as e:
                    # Best effort per row: keep loading, remember the failure.
                    failed += 1
                    log.debug("Service provider insert error: %s", e)

        if failed:
            log.warning("  %s: skipped %d rows due to insert errors", role, failed)
        log.info("  %s: %d records", role, count)
        total += count

    log.info("Loaded %d total service provider records from %s", total, quarter)
    return total
|
|
|
|
|
|
def main():
    """Command-line entry point: download and load N-CEN data per quarter.

    Parses ``--db``, ``--quarters`` and ``--data-dir``, then for every
    requested quarter downloads/extracts the archive and loads ETF, index
    fund and service provider data. A failure in one quarter is logged with
    its traceback and processing continues with the next quarter. Finally
    the non-zero table counts from ``db.get_stats()`` are printed.

    Returns:
        ``0`` when every quarter was processed, ``1`` if at least one failed.
    """
    import argparse

    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s")

    parser = argparse.ArgumentParser(description="Load N-CEN data")
    parser.add_argument("--db", default="fund_data.db", help="Database path")
    parser.add_argument("--quarters", nargs="+",
                        default=["2025q3"],
                        help="Quarters to download (e.g. 2025q3)")
    parser.add_argument("--data-dir", default="data/ncen",
                        help="Directory for downloaded files")
    args = parser.parse_args()

    db = FundDatabase(args.db)

    failed_quarters = []
    for quarter in args.quarters:
        print(f"\n{'='*60}")
        print(f"Processing N-CEN {quarter}")
        print(f"{'='*60}")
        try:
            extract_dir = download_ncen_zip(quarter, args.data_dir)
            load_etf_data(db, extract_dir, quarter)
            load_index_data(db, extract_dir, quarter)
            load_service_providers(db, extract_dir, quarter)
        except Exception as e:
            # Keep going with the remaining quarters, but keep the traceback
            # and remember the failure for the exit status.
            log.exception("Failed to process %s: %s", quarter, e)
            failed_quarters.append(quarter)

    stats = db.get_stats()
    print("\nDatabase stats:")
    for table, count in stats.items():
        if count > 0:
            print(f"  {table:30s} {count:>10,}")

    return 1 if failed_quarters else 0


if __name__ == "__main__":
    # Propagate failures to the shell instead of always exiting with 0.
    raise SystemExit(main())
|