fund_rfid_data/fetch_universe.py
Florian Herzog 1993658fb2 Add SEC fund prospectus -> RDF triple dataset pipeline
Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.

- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
  all books per trust), samples (per-fund segmentation, marker + plain
  serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 10:31:35 +02:00

191 lines
6.1 KiB
Python

"""
Step 1: Build the fund universe from SEC Series/Class CSV.
Downloads the SEC Investment Company Series and Class Information CSV,
parses it, and loads all trusts, series, and share classes into the
SQLite database. This gives us the full universe of ~15K trusts,
~50K series (funds), and ~100K+ share classes.
"""
import csv
import io
import logging
import time
import requests
from tqdm import tqdm
from fund_db import FundDatabase
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Hosts used when building SEC request URLs.
SEC_BASE = "https://www.sec.gov"
DATA_SEC = "https://data.sec.gov"

# Default location of the Series/Class CSV (the 2025 file) on www.sec.gov.
SERIES_CLASS_CSV_URL = (
    f"{SEC_BASE}/files/investment/data/other"
    "/investment-company-series-class-information"
    "/investment-company-series-class-2025.csv"
)

# Identification headers merged into every request made by _throttled_get.
USER_AGENT = "FundDataResearch/1.0 research@university.edu"
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept-Encoding": "gzip, deflate",
}

# Minimum spacing, in seconds, that _throttled_get keeps between requests.
# NOTE(review): presumably chosen to stay below the SEC's published request
# rate limit -- confirm against the current fair-access policy.
REQUEST_INTERVAL = 0.12

# Clock reading taken by _throttled_get after its most recent request;
# starts at 0.0 before any request has been made.
_last_request_time = 0.0
def _throttled_get(url: str, **kwargs) -> requests.Response:
    """Issue a rate-limited GET request and return the successful response.

    Sleeps as needed so that at least ``REQUEST_INTERVAL`` seconds separate
    this request from the previous one made through this function (the
    bookkeeping lives in the module-global ``_last_request_time``).

    Args:
        url: Address to fetch.
        **kwargs: Passed through to ``requests.get``. ``HEADERS`` is merged
            into any ``headers`` supplied (the module defaults win on
            conflicting keys), and ``timeout`` defaults to 60 seconds.

    Returns:
        The ``requests.Response`` for a request that did not fail
        ``raise_for_status()``.

    Raises:
        Whatever ``requests.get`` raises, plus the error raised by
        ``Response.raise_for_status()`` on an HTTP error status.
    """
    global _last_request_time

    # A monotonic clock is immune to wall-clock adjustments, which could
    # otherwise shorten the pause or stretch it arbitrarily.
    wait = REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
    if wait > 0:
        time.sleep(wait)

    # Build a fresh dict so the caller's headers mapping is never mutated;
    # "or {}" also tolerates an explicit headers=None.
    kwargs["headers"] = {**(kwargs.get("headers") or {}), **HEADERS}
    kwargs.setdefault("timeout", 60)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Record the attempt even when it failed (timeout, connection error)
        # so that the next call is still spaced out from this one.
        _last_request_time = time.monotonic()

    resp.raise_for_status()
    return resp
def load_series_class_csv(
    db: FundDatabase, csv_url: str = SERIES_CLASS_CSV_URL
) -> tuple[int, int, int]:
    """Download SEC Series/Class CSV and load into the database.

    Each CSV row is read by header name. The first time a CIK is seen the
    trust is upserted; the first time a Series ID is seen the series is
    upserted; every row that carries a Class ID upserts a share class.
    Rows without a usable CIK are skipped entirely, and rows without a
    Series ID contribute only their trust.

    Args:
        db: Database receiving the ``upsert_*`` calls and the final
            ``record_bulk_download`` entry.
        csv_url: Location of the CSV to download.

    Returns:
        ``(trust_count, series_count, class_count)``. Trusts and series are
        counted once per distinct ID; ``class_count`` counts every row that
        has a Class ID.

    Raises:
        Whatever ``_throttled_get`` raises when the download fails.
    """
    log.info("Downloading SEC Series/Class CSV from %s", csv_url)
    resp = _throttled_get(csv_url)
    # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
    # byte-order mark. Left in place, a BOM would be glued onto the first
    # header name, every "CIK Number" lookup would miss, and all rows would
    # be skipped without any error.
    resp.encoding = "utf-8-sig"
    reader = csv.DictReader(io.StringIO(resp.text))

    trust_count = 0
    series_count = 0
    class_count = 0
    seen_trusts = set()
    seen_series = set()

    for row in reader:
        # CIKs are stored zero-padded to 10 digits; an empty cell pads to
        # all zeros, which is treated as "no CIK".
        cik = _csv_field(row, "CIK Number").zfill(10)
        if cik == "0" * 10:
            continue

        if cik not in seen_trusts:
            db.upsert_trust_simple(
                cik=cik,
                trust_name=_csv_field(row, "Entity Name"),
                file_number=_csv_field(row, "Reporting File Number"),
                entity_type=_csv_field(row, "Entity Org Type"),
            )
            seen_trusts.add(cik)
            trust_count += 1

        series_id = _csv_field(row, "Series ID")
        if not series_id:
            continue

        if series_id not in seen_series:
            db.upsert_series(
                series_id=series_id,
                cik=cik,
                series_name=_csv_field(row, "Series Name"),
            )
            seen_series.add(series_id)
            series_count += 1

        class_id = _csv_field(row, "Class ID")
        if class_id:
            db.upsert_share_class(
                class_id=class_id,
                series_id=series_id,
                cik=cik,
                class_name=_csv_field(row, "Class Name"),
                ticker=_csv_field(row, "Class Ticker"),
            )
            class_count += 1

    log.info("Loaded %d trusts, %d series, %d share classes from CSV",
             trust_count, series_count, class_count)
    db.record_bulk_download("series_class_csv", "2025", csv_url, class_count)
    return trust_count, series_count, class_count


def _csv_field(row: dict, column: str) -> str:
    """Return the whitespace-stripped value of *column*, or "" if missing."""
    # csv.DictReader fills columns that a short row lacks with None, so
    # row.get(column, "") alone can still return None and break .strip().
    value = row.get(column)
    return value.strip() if value else ""
def enrich_from_submissions_api(db: FundDatabase, ciks: list[str] = None,
                                limit: int = 0):
    """
    Enrich trust records with data from the SEC Submissions API.
    Adds: fiscal year end, SIC code, state of incorporation, website.
    The trust name is also rewritten from the API's "name" field.

    Progress is tracked per CIK under the pipeline step
    "enrich_submissions": trusts already marked "done" are skipped, a trust
    is marked "running" before its request, and ends as "done" or "error".
    A failure for one CIK is logged and recorded, then the loop moves on.

    Args:
        db: Database that supplies the CIK list and pipeline status and
            receives the enriched fields.
        ciks: CIKs to process; when None, ``db.get_all_ciks()`` is used.
        limit: If greater than 0, only the first ``limit`` CIKs of the list
            are considered. The cut happens before already-done trusts are
            skipped, so a limited run can enrich fewer than ``limit`` trusts.

    Returns:
        Number of trusts enriched successfully during this call.
    """
    if ciks is None:
        ciks = db.get_all_ciks()
    if limit > 0:
        ciks = ciks[:limit]
    log.info("Enriching %d trusts from Submissions API", len(ciks))

    enriched = 0
    for cik in tqdm(ciks, desc="Enriching trusts"):
        if db.get_pipeline_status(cik, "enrich_submissions") == "done":
            continue
        db.set_pipeline_status(cik, "enrich_submissions", "running")
        try:
            # The endpoint is addressed by the 10-digit, zero-padded CIK.
            # Padding here is a no-op for CIKs that are already padded and
            # keeps caller-supplied short CIKs from requesting a wrong URL.
            # Database keys keep the CIK exactly as it was passed in.
            url = f"{DATA_SEC}/submissions/CIK{str(cik).zfill(10)}.json"
            resp = _throttled_get(url)
            data = resp.json()
            db.upsert_trust_simple(
                cik=cik,
                trust_name=data.get("name", ""),
                state_of_inc=data.get("stateOfIncorporation", ""),
                fiscal_year_end=data.get("fiscalYearEnd", ""),
                website=data.get("website", ""),
            )
            # The SIC code is written with a direct UPDATE and only when
            # the API actually returned one.
            sic = data.get("sic", "")
            if sic:
                with db.conn() as c:
                    c.execute("UPDATE trust SET sic_code=? WHERE cik=?", (sic, cik))
            db.set_pipeline_status(cik, "enrich_submissions", "done", items_processed=1)
            enriched += 1
        except Exception as e:
            # Deliberate per-trust boundary: one bad CIK must not abort the
            # whole batch. The message is capped at 500 characters.
            db.set_pipeline_status(cik, "enrich_submissions", "error",
                                   error_message=str(e)[:500])
            log.warning("Failed to enrich CIK %s: %s", cik, e)

    log.info("Enriched %d trusts", enriched)
    return enriched
def main():
    """Command-line entry point for building the fund universe.

    Unless ``--skip-csv`` is given, loads the Series/Class CSV into the
    database at ``--db``. With ``--enrich``, additionally enriches trusts
    from the Submissions API, capped by ``--enrich-limit`` (0 means all).
    Finishes by printing every non-empty entry of ``db.get_stats()``.
    """
    import argparse

    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(message)s",
        level=logging.INFO,
    )

    cli = argparse.ArgumentParser(description="Build SEC fund universe")
    cli.add_argument("--db", default="fund_data.db", help="Database path")
    cli.add_argument(
        "--skip-csv",
        action="store_true",
        help="Skip CSV download (if already loaded)",
    )
    cli.add_argument(
        "--enrich",
        action="store_true",
        help="Enrich trusts from Submissions API",
    )
    cli.add_argument(
        "--enrich-limit",
        type=int,
        default=0,
        help="Max trusts to enrich (0=all)",
    )
    opts = cli.parse_args()

    db = FundDatabase(opts.db)

    if not opts.skip_csv:
        trusts, series, classes = load_series_class_csv(db)
        print(f"\nUniverse loaded: {trusts:,} trusts, {series:,} series, {classes:,} share classes")

    if opts.enrich:
        enriched_total = enrich_from_submissions_api(db, limit=opts.enrich_limit)
        print(f"Enriched {enriched_total:,} trusts from Submissions API")

    # Summary of what the database now holds; empty tables are left out.
    stats = db.get_stats()
    print("\nDatabase stats:")
    for table, count in stats.items():
        if count <= 0:
            continue
        print(f" {table:30s} {count:>10,}")


if __name__ == "__main__":
    main()