# fund_rfid_data/load_xbrl_rr.py

"""
Step 3: Load XBRL Risk/Return Summary data into the database.

Downloads quarterly ZIP files from SEC containing structured fee,
performance, and objective data extracted from mutual fund prospectuses.
Parses the SUB, NUM, and TXT flat files (the TAG file is not read) and loads
structured records into the xbrl_fee, xbrl_performance, and xbrl_objective tables.
"""

import csv
import io
import logging
import os
import time
import zipfile
from collections import defaultdict
from pathlib import Path

import requests
from tqdm import tqdm

from fund_db import FundDatabase

log = logging.getLogger(__name__)

# Identification string sent as the User-Agent header on every request.
USER_AGENT = "FundDataResearch/1.0 research@university.edu"
# Default headers that _throttled_get merges into each request.
HEADERS = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip, deflate"}
# Minimum spacing, in seconds, that _throttled_get enforces between requests.
# NOTE(review): presumably chosen to stay under SEC's request-rate limit — confirm.
REQUEST_INTERVAL = 0.12
# Timestamp of the most recent request; read and updated by _throttled_get.
_last_request_time = 0.0

# Base URL of the quarterly Risk/Return ZIP files; download_xbrl_rr_zip
# appends "/<quarter>_rr1.zip" to it.
XBRL_RR_BASE_URL = (
    "https://www.sec.gov/files/dera/data/"
    "mutual-fund-prospectus-risk/return-summary-data-sets"
)

# Numeric tags (matched against the "tag" column of the NUM file) mapped to
# the xbrl_fee column that receives the value.
FEE_TAGS = {
    "MaximumSalesChargeImposedOnPurchasesOverOfferingPrice": "max_sales_charge_pct",
    "MaximumDeferredSalesChargeOverOther": "max_deferred_charge_pct",
    "RedemptionFeeOverRedemption": "redemption_fee_pct",
    "ManagementFeesOverAssets": "management_fee_pct",
    "Distribution12b1FeesOverAssets": "dist_12b1_fee_pct",
    "OtherExpensesOverAssets": "other_expenses_pct",
    "AcquiredFundFeesAndExpensesOverAssets": "acquired_fund_fees_pct",
    "TotalAnnualFundOperatingExpensesOverAssets": "total_expenses_pct",
    "FeeWaiverOrReimbursementOverAssets": "fee_waiver_pct",
    "TotalAnnualFundOperatingExpensesAfterFeeWaiverOverAssets": "net_expenses_pct",
    "ExpenseExampleYear01": "expense_example_1yr",
    "ExpenseExampleYear03": "expense_example_3yr",
    "ExpenseExampleYear05": "expense_example_5yr",
    "ExpenseExampleYear10": "expense_example_10yr",
}

# Numeric tags (from the NUM file) mapped to xbrl_performance columns.
PERFORMANCE_TAGS = {
    "AverageAnnualReturnYear01": "return_year_1",
    "AverageAnnualReturnYear05": "return_year_5",
    "AverageAnnualReturnYear10": "return_year_10",
    "AverageAnnualReturnSinceInception": "return_since_incep",
    "HighestQuarterlyReturnValue": "best_quarter_return",
    "LowestQuarterlyReturnValue": "worst_quarter_return",
    "AnnualTurnover": "portfolio_turnover",
}

# Text tags (from the TXT file) mapped to xbrl_performance columns; these are
# stored as strings alongside the numeric performance values.
PERFORMANCE_TEXT_TAGS = {
    "HighestQuarterlyReturnLabel": "best_quarter_label",
    "LowestQuarterlyReturnLabel": "worst_quarter_label",
    "ShareClassInceptionDate": "inception_date",
}

# Text tags (from the TXT file) mapped to xbrl_objective columns.
OBJECTIVE_TEXT_TAGS = {
    "ObjectivePrimaryTextBlock": "objective_text",
    "StrategyNarrativeTextBlock": "strategy_text",
    "RiskNarrativeTextBlock": "risk_text",
}


def _throttled_get(url: str, **kwargs) -> requests.Response:
    """
    Perform a rate-limited GET request and return the successful response.

    Sleeps as needed so that consecutive calls are at least
    ``REQUEST_INTERVAL`` seconds apart, layers the module's default
    ``HEADERS`` underneath any caller-supplied headers, and applies a
    120-second timeout unless the caller passes its own.

    Args:
        url: Address to fetch.
        **kwargs: Extra keyword arguments forwarded to ``requests.get``
            (for example ``stream=True``). A ``headers`` mapping given here
            is copied, never modified, and its entries take precedence over
            the defaults.

    Returns:
        The response, after ``raise_for_status()`` has passed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection failures and timeouts.
    """
    global _last_request_time

    # A monotonic clock cannot jump backwards, so a system clock adjustment
    # can never turn into an arbitrarily long sleep.
    elapsed = time.monotonic() - _last_request_time
    if elapsed < REQUEST_INTERVAL:
        time.sleep(REQUEST_INTERVAL - elapsed)

    # Build a fresh dict so the caller's mapping is left untouched and its
    # explicit choices are not overwritten by the defaults.
    caller_headers = kwargs.pop("headers", None) or {}
    kwargs["headers"] = {**HEADERS, **caller_headers}
    kwargs.setdefault("timeout", 120)

    try:
        resp = requests.get(url, **kwargs)
    finally:
        # Failed attempts still hit the server, so they count for throttling.
        _last_request_time = time.monotonic()

    try:
        resp.raise_for_status()
    except requests.HTTPError:
        # With stream=True the connection stays open until explicitly closed.
        resp.close()
        raise
    return resp


def download_xbrl_rr_zip(quarter: str, output_dir: str = "data/xbrl_rr") -> Path:
    """
    Download and extract one quarterly XBRL Risk/Return ZIP from SEC.

    The archive is saved as ``<output_dir>/<quarter>_rr1.zip`` and unpacked
    into ``<output_dir>/<quarter>``. If that directory already exists and is
    not empty, nothing is downloaded.

    Args:
        quarter: Quarter identifier used in the file name, e.g. ``"2025q1"``.
        output_dir: Directory that holds the ZIP files and extracted folders;
            created if missing.

    Returns:
        Path of the directory containing the extracted files.

    Raises:
        requests.RequestException: If the download fails.
        zipfile.BadZipFile: If the downloaded file is not a valid archive.
        OSError: If files cannot be written.
    """
    import shutil  # local import: only needed to clean up a failed extraction

    url = f"{XBRL_RR_BASE_URL}/{quarter}_rr1.zip"
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    zip_path = out / f"{quarter}_rr1.zip"
    extract_dir = out / quarter

    if extract_dir.exists() and any(extract_dir.iterdir()):
        log.info("Already extracted: %s", extract_dir)
        return extract_dir

    log.info("Downloading XBRL Risk/Return: %s", url)
    # Stream into a ".part" file and rename only when complete, so an
    # interrupted transfer never leaves a truncated ZIP under the final name.
    partial_path = out / f"{quarter}_rr1.zip.part"
    try:
        with _throttled_get(url, stream=True) as resp, open(partial_path, "wb") as fp:
            for chunk in resp.iter_content(chunk_size=65536):
                fp.write(chunk)
        partial_path.replace(zip_path)
    except BaseException:
        partial_path.unlink(missing_ok=True)
        raise

    try:
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(extract_dir)
    except BaseException:
        # A half-populated directory would satisfy the "already extracted"
        # check above on the next run, so remove it before re-raising.
        shutil.rmtree(extract_dir, ignore_errors=True)
        raise
    log.info("Extracted to %s", extract_dir)
    return extract_dir


def _read_tsv(filepath: Path) -> list[dict]:
    """
    Read a tab-separated file into a list of row dicts.

    The first line supplies the column names. Bytes that are not valid UTF-8
    are replaced rather than raising.

    Args:
        filepath: Location of the TSV file.

    Returns:
        One dict per data row keyed by column name, or an empty list (with a
        warning logged) when the file does not exist.
    """
    if not filepath.exists():
        log.warning("File not found: %s", filepath)
        return []
    # newline="" hands raw line endings to the csv module, as its
    # documentation requires; otherwise CR/LF sequences inside quoted fields
    # are rewritten before the parser sees them.
    with open(filepath, "r", encoding="utf-8", errors="replace", newline="") as f:
        return list(csv.DictReader(f, delimiter="\t"))


def _cell(row: dict, column: str) -> str:
    """Return ``row[column]`` stripped of whitespace, or "" if absent or None."""
    value = row.get(column)
    return value.strip() if value else ""


def _insert_records(cursor, table: str, value_columns: tuple, records: dict,
                    label: str) -> int:
    """Insert one row per (adsh, cik) entry of *records*; return how many succeeded."""
    columns = ("accession_number", "cik", "filing_date", "fund_name", *value_columns)
    # Table and column names come from module constants, never from file
    # data, so formatting them into the statement is safe; every row value
    # is passed as a bound parameter.
    sql = "INSERT OR REPLACE INTO {} ({}) VALUES ({})".format(
        table, ", ".join(columns), ", ".join("?" for _ in columns)
    )

    loaded = 0
    failed = 0
    for (adsh, cik), vals in records.items():
        sub_info = vals.get("_sub", {})
        params = (
            adsh,
            cik,
            sub_info.get("filing_date", ""),
            sub_info.get("name", ""),
            # Tags that never appeared for this filing are stored as NULL.
            *(vals.get(column) for column in value_columns),
        )
        try:
            cursor.execute(sql, params)
        except Exception as e:
            # Best effort: one bad row must not abort the rest of the quarter.
            failed += 1
            log.debug("%s insert error for %s: %s", label, adsh, e)
        else:
            loaded += 1

    if failed:
        # Per-row details are at debug level; make the total visible by default.
        log.warning("%d %s record(s) failed to insert into %s",
                    failed, label.lower(), table)
    return loaded


def parse_and_load_xbrl_rr(db: FundDatabase, extract_dir: Path,
                           quarter: str) -> tuple[int, int, int]:
    """
    Parse the XBRL Risk/Return flat files and load them into the database.

    The data set has:
      - sub.tsv: submission metadata (accession, CIK, filing date, etc.)
      - num.tsv: numeric values (tag, value, per accession + dimension)
      - txt.tsv: text values (tag, value, per accession + dimension)

    Values are grouped by (accession number, CIK) and written with
    ``INSERT OR REPLACE`` into xbrl_fee, xbrl_performance and xbrl_objective.
    The value columns of each table are taken from the module's tag maps.

    NOTE(review): grouping uses only (adsh, cik), so when a filing contains
    the same tag more than once, the last row read wins. Confirm that this
    is the intended granularity.

    Args:
        db: Database wrapper providing ``conn()`` and ``record_bulk_download()``.
        extract_dir: Directory holding the extracted flat files.
        quarter: Quarter identifier recorded with the bulk-download entry.

    Returns:
        ``(fee_count, perf_count, obj_count)`` of rows inserted. When no
        submission file can be found, an error is logged and ``(0, 0, 0)``
        is returned, so callers can always unpack the result.
    """
    sub_file = extract_dir / "sub.tsv"
    num_file = extract_dir / "num.tsv"
    txt_file = extract_dir / "txt.tsv"

    # If the expected names are absent, fall back to the first file whose
    # name contains the table name.
    if not (sub_file.exists() and num_file.exists()):
        sub_file = next(extract_dir.glob("*sub*"), None)
        num_file = next(extract_dir.glob("*num*"), None)
        txt_file = next(extract_dir.glob("*txt*"), None)

    if not sub_file or not sub_file.exists():
        log.error("Cannot find sub file in %s", extract_dir)
        return 0, 0, 0

    log.info("Reading submission metadata from %s", sub_file)
    sub_map = {}
    for s in _read_tsv(sub_file):
        adsh = _cell(s, "adsh")
        if adsh:
            sub_map[adsh] = {
                "cik": _cell(s, "cik").zfill(10),
                "filing_date": _cell(s, "filed"),
                "form": _cell(s, "form"),
                "name": _cell(s, "name"),
            }

    log.info("Reading numeric data from %s", num_file)
    nums = _read_tsv(num_file) if num_file and num_file.exists() else []

    fee_data = defaultdict(dict)
    perf_data = defaultdict(dict)

    for row in tqdm(nums, desc="Parsing numeric data"):
        # _cell tolerates short rows, for which DictReader yields None.
        adsh = _cell(row, "adsh")
        tag = _cell(row, "tag")
        value_str = _cell(row, "value")
        if not adsh or not tag or not value_str:
            continue

        try:
            value = float(value_str)
        except (ValueError, TypeError):
            continue

        # Rows whose accession is missing from the SUB file are kept, with
        # an empty CIK and empty metadata.
        sub_info = sub_map.get(adsh, {})
        key = (adsh, sub_info.get("cik", ""))

        if tag in FEE_TAGS:
            fee_data[key][FEE_TAGS[tag]] = value
            fee_data[key]["_sub"] = sub_info
        elif tag in PERFORMANCE_TAGS:
            perf_data[key][PERFORMANCE_TAGS[tag]] = value
            perf_data[key]["_sub"] = sub_info

    log.info("Reading text data from %s", txt_file)
    txts = _read_tsv(txt_file) if txt_file and txt_file.exists() else []

    obj_data = defaultdict(dict)
    for row in tqdm(txts, desc="Parsing text data"):
        adsh = _cell(row, "adsh")
        tag = _cell(row, "tag")
        value = _cell(row, "value")
        if not adsh or not tag or not value:
            continue

        sub_info = sub_map.get(adsh, {})
        key = (adsh, sub_info.get("cik", ""))

        if tag in PERFORMANCE_TEXT_TAGS:
            perf_data[key][PERFORMANCE_TEXT_TAGS[tag]] = value
            perf_data[key]["_sub"] = sub_info
        elif tag in OBJECTIVE_TEXT_TAGS:
            obj_data[key][OBJECTIVE_TEXT_TAGS[tag]] = value
            obj_data[key]["_sub"] = sub_info

    with db.conn() as c:
        fee_count = _insert_records(
            c, "xbrl_fee", tuple(FEE_TAGS.values()), fee_data, "Fee")
        perf_count = _insert_records(
            c, "xbrl_performance",
            (*PERFORMANCE_TAGS.values(), *PERFORMANCE_TEXT_TAGS.values()),
            perf_data, "Performance")
        obj_count = _insert_records(
            c, "xbrl_objective", tuple(OBJECTIVE_TEXT_TAGS.values()),
            obj_data, "Objective")

    log.info("Loaded %d fee records, %d performance records, %d objective records",
             fee_count, perf_count, obj_count)
    db.record_bulk_download("xbrl_rr", quarter, str(extract_dir), fee_count + perf_count + obj_count)
    return fee_count, perf_count, obj_count


def main():
    """
    Command-line entry point: download and load the requested quarters.

    Parses ``--db``, ``--quarters`` and ``--data-dir``, processes each
    quarter in turn, and prints the totals followed by the non-empty table
    counts reported by the database. A failure in one quarter is logged with
    its traceback and does not stop the remaining quarters.
    """
    import argparse
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s [%(levelname)s] %(message)s")

    parser = argparse.ArgumentParser(description="Load XBRL Risk/Return data")
    parser.add_argument("--db", default="fund_data.db", help="Database path")
    parser.add_argument("--quarters", nargs="+",
                        default=["2025q1", "2025q2", "2025q3", "2025q4"],
                        help="Quarters to download (e.g. 2025q1 2025q2)")
    parser.add_argument("--data-dir", default="data/xbrl_rr",
                        help="Directory for downloaded files")
    args = parser.parse_args()

    db = FundDatabase(args.db)

    total_fees = 0
    total_perf = 0
    total_obj = 0
    failed_quarters = []

    for quarter in args.quarters:
        print(f"\n{'='*60}")
        print(f"Processing {quarter}")
        print(f"{'='*60}")
        try:
            extract_dir = download_xbrl_rr_zip(quarter, args.data_dir)
            counts = parse_and_load_xbrl_rr(db, extract_dir, quarter)
        except Exception as e:
            # Top-level boundary: keep going, but preserve the traceback.
            log.exception("Failed to process %s: %s", quarter, e)
            failed_quarters.append(quarter)
            continue

        # The loader may return nothing when the data files are missing;
        # treat that as a failed quarter instead of unpacking None.
        if counts is None:
            log.error("Failed to process %s: %s", quarter, "no records loaded")
            failed_quarters.append(quarter)
            continue

        fees, perf, objectives = counts
        total_fees += fees
        total_perf += perf
        total_obj += objectives

    print(f"\nTotal loaded: {total_fees} fee records, {total_perf} performance, {total_obj} objectives")
    if failed_quarters:
        print(f"Quarters not loaded: {', '.join(failed_quarters)}")

    stats = db.get_stats()
    print("\nDatabase stats:")
    for table, count in stats.items():
        if count > 0:
            print(f"  {table:30s} {count:>10,}")


# Run the loader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()