Builds a relationship-rich finance dataset for text-to-RDF-triple extraction
from SEC fund disclosures, the dataset for the thesis 'Magical RDF Triples and
how to synthetize them'.
- build_rdf_dataset.py: gold (N-CEN graphs), fetch (EDGAR prospectus prose,
all books per trust), samples (per-fund segmentation, marker + plain
serializations), split (trust-level 80/10/10, no leakage)
- score_baseline.py: no-model string-match baseline + strong-model scorer
- dataset_description.{tex,pdf}: scientific description of the dataset
- data/rdf_poc/gold_graphs.jsonl: structured gold knowledge graph (2025Q3)
- Large prose/sample files and raw SEC downloads are gitignored (reproducible)
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
191 lines
6.1 KiB
Python
191 lines
6.1 KiB
Python
"""
|
|
Step 1: Build the fund universe from SEC Series/Class CSV.

Downloads the SEC Investment Company Series and Class Information CSV,
parses it, and loads all trusts, series, and share classes into the
SQLite database. This gives us the full universe of ~15K trusts,
~50K series (funds), and ~100K+ share classes.
|
|
"""
|
|
|
|
import csv
|
|
import io
|
|
import logging
|
|
import time
|
|
|
|
import requests
|
|
from tqdm import tqdm
|
|
|
|
from fund_db import FundDatabase
|
|
|
|
# Module-level logger; handlers/levels are configured by main().
log = logging.getLogger(__name__)

# Hosts used by this module: the main SEC site (bulk CSV) and the data API.
SEC_BASE = "https://www.sec.gov"
DATA_SEC = "https://data.sec.gov"

# Location of the Series/Class information CSV (2025 edition).
SERIES_CLASS_CSV_URL = (
    f"{SEC_BASE}/files/investment/data/other"
    "/investment-company-series-class-information"
    "/investment-company-series-class-2025.csv"
)

# Headers that _throttled_get attaches to every outgoing request.
USER_AGENT = "FundDataResearch/1.0 research@university.edu"
HEADERS = {
    "User-Agent": USER_AGENT,
    "Accept-Encoding": "gzip, deflate",
}

# Minimum spacing, in seconds, that _throttled_get keeps between requests.
# NOTE(review): presumably chosen to stay under the SEC's published request
# rate limit -- confirm against the current fair-access policy.
REQUEST_INTERVAL = 0.12

# Timestamp of the most recent request; read and updated by _throttled_get.
_last_request_time = 0.0
|
|
|
|
|
|
def _throttled_get(url: str, **kwargs) -> requests.Response:
    """Perform a rate-limited HTTP GET and return the successful response.

    Sleeps as needed so that at least ``REQUEST_INTERVAL`` seconds separate
    this request from the previous one made through this function, then
    issues ``requests.get(url, **kwargs)``.

    Args:
        url: The URL to fetch.
        **kwargs: Passed through to ``requests.get``. ``headers`` (optional,
            may be ``None``) is merged with the module-level ``HEADERS``;
            on a key collision ``HEADERS`` wins. ``timeout`` defaults to 60
            when not supplied.

    Returns:
        The ``requests.Response`` for a request that did not fail
        ``raise_for_status()``.

    Raises:
        requests.HTTPError: If ``raise_for_status()`` rejects the response.
        requests.RequestException: Whatever ``requests.get`` itself raises.
    """
    global _last_request_time

    # Use the monotonic clock: a wall-clock adjustment (e.g. NTP stepping the
    # time backwards) must not turn into a negative "elapsed" and a huge sleep.
    remaining = REQUEST_INTERVAL - (time.monotonic() - _last_request_time)
    if remaining > 0:
        time.sleep(remaining)

    # Build a fresh dict so the caller's headers mapping is never mutated,
    # and tolerate an explicit headers=None.
    kwargs["headers"] = {**(kwargs.get("headers") or {}), **HEADERS}
    kwargs.setdefault("timeout", 60)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Record the attempt even when the request raised, so that a retry
        # after a timeout/connection error is still rate limited.
        _last_request_time = time.monotonic()

    resp.raise_for_status()
    return resp
|
|
|
|
|
|
def _csv_field(row: dict, column: str) -> str:
    """Return the whitespace-stripped value of *column*, or "" if missing/None."""
    # csv.DictReader fills columns missing from a short row with None, so a
    # plain row.get(column, "") can still hand back None.
    return (row.get(column) or "").strip()


def load_series_class_csv(db: FundDatabase, csv_url: str = SERIES_CLASS_CSV_URL):
    """Download the SEC Series/Class CSV and load it into the database.

    Each CSV row is processed as follows:

    * the CIK is zero-padded to 10 digits; rows with an empty or all-zero
      CIK are skipped;
    * the first time a CIK is seen, ``db.upsert_trust_simple`` is called;
    * rows without a "Series ID" stop there; otherwise the first time a
      series ID is seen, ``db.upsert_series`` is called;
    * if the row has a "Class ID", ``db.upsert_share_class`` is called.

    After the loop the download is registered via ``db.record_bulk_download``.

    Args:
        db: Database wrapper receiving the upserts.
        csv_url: URL of the CSV to download; defaults to
            ``SERIES_CLASS_CSV_URL``.

    Returns:
        A ``(trust_count, series_count, class_count)`` tuple: distinct CIKs
        loaded, distinct series IDs loaded, and number of rows that carried
        a class ID.

    Raises:
        requests.RequestException: Propagated from ``_throttled_get`` when
            the download fails.
    """
    log.info("Downloading SEC Series/Class CSV from %s", csv_url)
    resp = _throttled_get(csv_url)
    # "utf-8-sig" decodes exactly like UTF-8 but drops a leading BOM if one
    # is present; with plain "utf-8" a BOM would stay glued to the first
    # header name and that column would silently read as empty.
    resp.encoding = "utf-8-sig"

    reader = csv.DictReader(io.StringIO(resp.text))

    empty_cik = "0" * 10
    trust_count = 0
    series_count = 0
    class_count = 0
    seen_trusts = set()
    seen_series = set()

    for row in reader:
        # An empty CIK pads to all zeros, so this one comparison covers both
        # the "missing" and the "literally zero" case.
        cik = _csv_field(row, "CIK Number").zfill(10)
        if cik == empty_cik:
            continue

        if cik not in seen_trusts:
            db.upsert_trust_simple(
                cik=cik,
                trust_name=_csv_field(row, "Entity Name"),
                file_number=_csv_field(row, "Reporting File Number"),
                entity_type=_csv_field(row, "Entity Org Type"),
            )
            seen_trusts.add(cik)
            trust_count += 1

        series_id = _csv_field(row, "Series ID")
        if not series_id:
            continue

        if series_id not in seen_series:
            db.upsert_series(
                series_id=series_id,
                cik=cik,
                series_name=_csv_field(row, "Series Name"),
            )
            seen_series.add(series_id)
            series_count += 1

        class_id = _csv_field(row, "Class ID")
        if class_id:
            db.upsert_share_class(
                class_id=class_id,
                series_id=series_id,
                cik=cik,
                class_name=_csv_field(row, "Class Name"),
                ticker=_csv_field(row, "Class Ticker"),
            )
            class_count += 1

    log.info("Loaded %d trusts, %d series, %d share classes from CSV",
             trust_count, series_count, class_count)
    # NOTE(review): the period label "2025" is fixed even when a different
    # csv_url is passed in -- confirm whether callers rely on that.
    db.record_bulk_download("series_class_csv", "2025", csv_url, class_count)
    return trust_count, series_count, class_count
|
|
|
|
|
|
def enrich_from_submissions_api(db: FundDatabase, ciks: list[str] = None,
                                limit: int = 0):
    """
    Fill in trust records using the SEC Submissions API.

    For every CIK whose "enrich_submissions" pipeline status is not already
    "done", fetches ``{DATA_SEC}/submissions/CIK{cik}.json`` and passes the
    name, state of incorporation, fiscal year end and website to
    ``db.upsert_trust_simple``; a non-empty SIC code is written with a
    direct UPDATE on the ``trust`` table. Failures for a single CIK are
    recorded as status "error" (message truncated to 500 characters),
    logged as a warning, and do not stop the loop.

    Args:
        db: Database wrapper used for status tracking and upserts.
        ciks: CIKs to process; when None, ``db.get_all_ciks()`` is used.
        limit: When greater than zero, only the first ``limit`` CIKs are
            processed.

    Returns:
        Number of trusts enriched successfully during this call.
    """
    stage = "enrich_submissions"

    targets = db.get_all_ciks() if ciks is None else ciks
    if limit > 0:
        targets = targets[:limit]

    log.info("Enriching %d trusts from Submissions API", len(targets))
    enriched = 0

    for cik in tqdm(targets, desc="Enriching trusts"):
        # CIKs finished in an earlier run are skipped.
        already_done = db.get_pipeline_status(cik, stage) == "done"
        if already_done:
            continue

        db.set_pipeline_status(cik, stage, "running")
        try:
            submissions_url = f"{DATA_SEC}/submissions/CIK{cik}.json"
            payload = _throttled_get(submissions_url).json()

            db.upsert_trust_simple(
                cik=cik,
                trust_name=payload.get("name", ""),
                state_of_inc=payload.get("stateOfIncorporation", ""),
                fiscal_year_end=payload.get("fiscalYearEnd", ""),
                website=payload.get("website", ""),
            )

            sic_code = payload.get("sic", "")
            if sic_code:
                with db.conn() as handle:
                    handle.execute(
                        "UPDATE trust SET sic_code=? WHERE cik=?",
                        (sic_code, cik),
                    )

            db.set_pipeline_status(cik, stage, "done", items_processed=1)
            enriched += 1
        except Exception as exc:
            # Per-CIK boundary: record the failure and move on to the next.
            db.set_pipeline_status(cik, stage, "error",
                                   error_message=str(exc)[:500])
            log.warning("Failed to enrich CIK %s: %s", cik, exc)

    log.info("Enriched %d trusts", enriched)
    return enriched
|
|
|
|
|
|
def main():
    """Command-line entry point for building the fund universe.

    Configures logging, parses the command-line options, opens the database
    at ``--db``, loads the Series/Class CSV unless ``--skip-csv`` is given,
    optionally enriches trusts from the Submissions API (``--enrich``,
    capped by ``--enrich-limit``), and finally prints every non-zero entry
    returned by ``db.get_stats()``.
    """
    import argparse

    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(message)s",
        level=logging.INFO,
    )

    cli = argparse.ArgumentParser(description="Build SEC fund universe")
    cli.add_argument("--db", default="fund_data.db", help="Database path")
    cli.add_argument(
        "--skip-csv",
        action="store_true",
        help="Skip CSV download (if already loaded)",
    )
    cli.add_argument(
        "--enrich",
        action="store_true",
        help="Enrich trusts from Submissions API",
    )
    cli.add_argument(
        "--enrich-limit",
        type=int,
        default=0,
        help="Max trusts to enrich (0=all)",
    )
    options = cli.parse_args()

    database = FundDatabase(options.db)

    if not options.skip_csv:
        trusts, series, classes = load_series_class_csv(database)
        print(f"\nUniverse loaded: {trusts:,} trusts, {series:,} series, {classes:,} share classes")

    if options.enrich:
        enriched = enrich_from_submissions_api(database, limit=options.enrich_limit)
        print(f"Enriched {enriched:,} trusts from Submissions API")

    table_counts = database.get_stats()
    print("\nDatabase stats:")
    for table_name, row_count in table_counts.items():
        # Empty tables are left out of the summary.
        if row_count <= 0:
            continue
        print(f" {table_name:30s} {row_count:>10,}")
|
|
|
|
|
|
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects beyond its definitions.
if __name__ == "__main__":
    main()
|