#!/usr/bin/env python3
from __future__ import annotations

import asyncio
import json
import logging
import os
import random
from pathlib import Path
from typing import Any, Dict, List

from librarian_core.temp_payloads.chunk_data import ChunkData
from librarian_core.workers.base import FlowArtifact
from librarian_vspace.vecembed.embedder_worker import EmbedderWorker, EmbedderInput
|
||
# ------------------------------------------------------------------ #
# Configuration
# ------------------------------------------------------------------ #
# Folder with the small sample dataset (3 × .md files)
# NOTE(review): absolute, machine-specific path — adjust before running on
# another machine.
DEMO_PATH: Path = Path("/home/gra/PycharmProjects/librarian_vspace/examples/chunks/moodle_chunks/51cd7cf6-e782-4f17-af00-30852cdcd5fc/51cd7cf6-e782-4f17-af00-30852cdcd5fc/data/FS25/Effiziente_Algorithmen").expanduser()
#DEMO_PATH: Path = Path("/home/gra/PycharmProjects/librarian_vspace/examples/chunks/chunk_md").expanduser()

# Where to write the concatenated text file
# (one level above the dataset folder keeps things tidy)
# Candidate Moodle course ids; currently unused because build_course()
# hard-codes course_id "18240" (the random.choice line there is commented out).
COURSE_ID_POOL = [16301, 16091, 17505, 18239, 17503, 15512]

# Module-level logger; handlers/level are configured in the __main__ guard.
logger = logging.getLogger(__name__)

# Pre-chunked payload (ChunkData as JSON) consumed by _main().
# NOTE(review): absolute, machine-specific path as well.
INPUT_MODEL=Path("/home/gra/PycharmProjects/librarian_vspace/examples/chunks/moodle_chunks/51cd7cf6-e782-4f17-af00-30852cdcd5fc/51cd7cf6-e782-4f17-af00-30852cdcd5fc/result.json")
|
||
|
||
# ------------------------------------------------------------------ #
|
||
def _load_env(path: Path) -> None:
|
||
"""Load KEY=VALUE pairs from a .env file if present."""
|
||
if not path.is_file():
|
||
return
|
||
for line in path.read_text().splitlines():
|
||
if line.strip() and not line.startswith("#") and "=" in line:
|
||
k, v = [p.strip() for p in line.split("=", 1)]
|
||
os.environ.setdefault(k, v)
|
||
|
||
|
||
def discover_chunks(root: Path) -> List[Path]:
    """Collect every markdown chunk file in *root*, in deterministic name order."""
    markdown_files = list(root.glob("*.md"))
    markdown_files.sort()
    return markdown_files
|
||
|
||
|
||
def build_course(root: Path) -> Dict[str, Any]:
    """Minimal dict that satisfies EmbedderWorker's `chunk_course`.

    Raises:
        FileNotFoundError: If *root* contains no ``.md`` files.
    """
    file_entries = []
    for md_file in discover_chunks(root):
        # 24 random bits give a throwaway, demo-only file id.
        file_entries.append(
            {"file_name": md_file.name, "file_id": str(random.getrandbits(24))}
        )
    if not file_entries:
        raise FileNotFoundError(f"No .md files found in {root}")
    # course_id is pinned for the demo; a random pick from COURSE_ID_POOL
    # (str(random.choice(COURSE_ID_POOL))) was used previously.
    course: Dict[str, Any] = {
        "path": str(root),
        "files": file_entries,
        "course_id": "18240",
    }
    return course
|
||
|
||
|
||
# ------------------------------------------------------------------ #
|
||
async def _main() -> None:
    """Build the demo course, load the pre-chunked payload, run EmbedderWorker."""
    course = build_course(DEMO_PATH)
    concat_path = DEMO_PATH

    with open(INPUT_MODEL, 'r') as file:
        json_data = json.load(file)

    #payload = EmbedderInput(chunk_course=course, concat_path=concat_path)
    # BUG FIX: model_validate_json() requires a JSON *string/bytes*, but
    # json.load() already returns the parsed object, so the old call
    # ChunkData.model_validate_json(json_data) fails at runtime for a plain
    # JSON file. Validate the parsed object directly instead.
    # (NOTE(review): only if result.json were double-encoded JSON would the
    # old call have worked — confirm against how result.json is produced.)
    payload = ChunkData.model_validate(json_data)
    worker = EmbedderWorker()
    logger.info("🔨 Launching EmbedderWorker …")
    art = FlowArtifact.new(run_id="", dir=concat_path, data=payload)
    result = await worker.flow()(art)  # type: ignore[arg-type]

    logger.info("✅ Worker finished: %s", result)
|
||
|
||
# ------------------------------------------------------------------ #
|
||
if __name__ == "__main__":
    # Pick up optional KEY=VALUE overrides from a .env next to this script.
    script_dir = Path(__file__).resolve().parent
    _load_env(script_dir / ".env")

    logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)
    asyncio.run(_main())
|