#!/usr/bin/env python3
"""Demo runner: feed a stored ChunkData payload through the EmbedderWorker.

The script loads optional KEY=VALUE settings from a ``.env`` file located next
to this file, validates the JSON payload at ``INPUT_MODEL`` into a
``ChunkData`` model and runs the ``EmbedderWorker`` flow on it.
"""
from __future__ import annotations

import asyncio
import json
import logging
import os
import random
from pathlib import Path
from typing import Any, List, Dict

from librarian_vspace.vecembed.embedder_worker import EmbedderWorker, EmbedderInput
from librarian_core.workers.base import FlowArtifact
from librarian_core.temp_payloads.chunk_data import ChunkData

# ------------------------------------------------------------------ #
# Configuration
# ------------------------------------------------------------------ #
# Folder with the small sample dataset (3 × .md files)
DEMO_PATH: Path = Path(
    "/home/gra/PycharmProjects/librarian_vspace/examples/chunks/moodle_chunks/51cd7cf6-e782-4f17-af00-30852cdcd5fc/51cd7cf6-e782-4f17-af00-30852cdcd5fc/data/FS25/Effiziente_Algorithmen"
).expanduser()
# DEMO_PATH: Path = Path("/home/gra/PycharmProjects/librarian_vspace/examples/chunks/chunk_md").expanduser()

# Candidate course ids for randomised runs. Currently only referenced by the
# commented-out alternative in `build_course`, which pins a fixed id instead.
COURSE_ID_POOL = [16301, 16091, 17505, 18239, 17503, 15512]

logger = logging.getLogger(__name__)

# JSON file holding the serialized payload that is validated into ChunkData.
INPUT_MODEL = Path(
    "/home/gra/PycharmProjects/librarian_vspace/examples/chunks/moodle_chunks/51cd7cf6-e782-4f17-af00-30852cdcd5fc/51cd7cf6-e782-4f17-af00-30852cdcd5fc/result.json"
)


# ------------------------------------------------------------------ #
def _load_env(path: Path) -> None:
    """Load KEY=VALUE pairs from a .env file if present.

    Blank lines, comment lines (first non-blank character is ``#``) and lines
    without ``=`` are skipped. Variables that already exist in the process
    environment are left untouched (``setdefault``).
    """
    if not path.is_file():
        return
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        # Strip before testing for "#" so indented comments that happen to
        # contain "=" are not exported as variables.
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = (part.strip() for part in line.split("=", 1))
        os.environ.setdefault(key, value)


def discover_chunks(root: Path) -> List[Path]:
    """Return all markdown files directly inside *root*, sorted by path."""
    return sorted(root.glob("*.md"))


def build_course(root: Path) -> Dict[str, Any]:
    """Minimal dict that satisfies EmbedderWorker's `chunk_course`.

    Each markdown file in *root* becomes one entry with its file name and a
    random 24-bit number (as a string) used as a throw-away file id.

    Raises:
        FileNotFoundError: if *root* contains no ``.md`` files.
    """
    files = [
        {"file_name": p.name, "file_id": str(random.getrandbits(24))}
        for p in discover_chunks(root)
    ]
    if not files:
        raise FileNotFoundError(f"No .md files found in {root}")
    return {
        "path": str(root),
        "files": files,
        # "course_id": str(random.choice(COURSE_ID_POOL)),
        "course_id": "18240",
    }


# ------------------------------------------------------------------ #
async def _main() -> None:
    """Validate the stored payload and run it through the EmbedderWorker flow."""
    # The returned course dict is not part of the ChunkData payload; the call
    # is kept because it fails fast (FileNotFoundError) when the demo dataset
    # folder contains no markdown files.
    build_course(DEMO_PATH)
    concat_path = DEMO_PATH

    # `model_validate_json` expects the raw JSON text (str/bytes), not an
    # already-decoded dict, so hand it the file content directly.
    raw_json = INPUT_MODEL.read_text(encoding="utf-8")

    # payload = EmbedderInput(chunk_course=course, concat_path=concat_path)
    payload = ChunkData.model_validate_json(raw_json)

    worker = EmbedderWorker()
    logger.info("🔨 Launching EmbedderWorker …")
    art = FlowArtifact.new(run_id="", dir=concat_path, data=payload)
    result = await worker.flow()(art)  # type: ignore[arg-type]

    logger.info("✅ Worker finished: %s", result)


# ------------------------------------------------------------------ #
if __name__ == "__main__":
    APP_DIR = Path(__file__).resolve().parent
    _load_env(APP_DIR / ".env")

    logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
    asyncio.run(_main())