#!/usr/bin/env bash
# ------------------------------------------------------------------
# finalize_dataset.sh — consolidates all match part files and builds
# the training-ready dataset by running the `label` and `trainset`
# steps of build_rdf_dataset.py. The `split` step is NOT run here; the
# closing message prints the command to run it manually.
#
#   bash finalize_dataset.sh
#
# Consolidation: only CLEAN trust records (n_failed_windows == 0 and a
# non-null "triples" field) from the match_*.jsonl part files are
# written to match_all.jsonl. match_all.jsonl is rebuilt from scratch
# on every run and is not itself one of the inputs, so the
# consolidation step can be re-run safely.
# ------------------------------------------------------------------
set -euo pipefail

# All data paths below are relative to the directory containing this script.
cd -- "$(dirname -- "$0")"

python3 - <<'PY'
import glob
import json
import os

DATA_DIR = "data/rdf_poc"

# Denominator printed in the summary line. Hard-coded; presumably the total
# number of trusts in the dataset -- TODO confirm it matches match_input.jsonl.
TOTAL_TRUSTS = 335

final_part = f"{DATA_DIR}/match_remaining_final.jsonl"

# glob.glob() returns matches in arbitrary order. Sort them so that the
# "last record for a cik wins" rule below is deterministic, and read the
# *_final part exactly once, after all other parts.
numbered_parts = sorted(
    p for p in glob.glob(f"{DATA_DIR}/match_remaining_*.jsonl") if p != final_part
)
parts = [
    f"{DATA_DIR}/match_all_clean79.jsonl",
    f"{DATA_DIR}/match_remaining.jsonl",
    *numbered_parts,
    final_part,
]

# cik -> clean record; a clean record from a later part replaces an earlier one.
good = {}
for path in parts:
    if not os.path.exists(path):
        continue
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            try:
                rec = json.loads(line)
            except ValueError:
                # Malformed or blank line (JSONDecodeError is a ValueError):
                # skip it, best-effort.
                continue
            if not isinstance(rec, dict):
                # Valid JSON but not an object; cannot be a trust record.
                continue
            if rec.get("n_failed_windows", 0) == 0 and rec.get("triples") is not None:
                good[rec["cik"]] = rec

# ensure_ascii=False emits raw non-ASCII characters, so pin the file encoding
# instead of depending on the locale default.
with open(f"{DATA_DIR}/match_all.jsonl", "w", encoding="utf-8") as out:
    for rec in good.values():
        out.write(json.dumps(rec, ensure_ascii=False) + "\n")

# Input records that do not yet have a clean consolidated result.
with open(f"{DATA_DIR}/match_input.jsonl", encoding="utf-8") as fh:
    all_inputs = [json.loads(line) for line in fh if line.strip()]
remaining = [rec for rec in all_inputs if rec["cik"] not in good]

# Share of triples with a truthy "llm_grounded" flag across all clean records.
triples_total = 0
triples_grounded = 0
for rec in good.values():
    for triple in rec["triples"]:
        triples_total += 1
        if triple.get("llm_grounded"):
            triples_grounded += 1

print(f"konsolidiert: {len(good)}/{TOTAL_TRUSTS} Trusts, "
      f"{100*triples_grounded/max(1,triples_total):.0f}% Uebereinstimmung, "
      f"{len(remaining)} noch offen")
PY

echo ""
echo "=== label (FULL/PARTIAL/NONE) ==="
python3 build_rdf_dataset.py label

echo ""
echo "=== trainset (Auszuege + grounded Tripel) ==="
python3 build_rdf_dataset.py trainset

echo ""
echo "Fertig. trainset.jsonl ist der trainingsfertige Datensatz."
echo "Split: python3 build_rdf_dataset.py split (auf samples-Basis)"