2025-05-24 12:15:48 +02:00

87 lines
3.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
from __future__ import annotations
import asyncio
import logging
import os
import random
from pathlib import Path
from typing import Any, List, Dict
import json
from librarian_vspace.vecembed.embedder_worker import EmbedderWorker, EmbedderInput
from librarian_core.workers.base import FlowArtifact
from librarian_core.temp_payloads.chunk_data import ChunkData
# ------------------------------------------------------------------ #
# Configuration
# ------------------------------------------------------------------ #
# Folder with the small sample dataset (3 × .md files).
# NOTE(review): absolute, machine-specific path — must exist on the host
# running this demo; adjust before running elsewhere.
DEMO_PATH: Path = Path("/home/gra/PycharmProjects/librarian_vspace/examples/chunks/moodle_chunks/51cd7cf6-e782-4f17-af00-30852cdcd5fc/51cd7cf6-e782-4f17-af00-30852cdcd5fc/data/FS25/Effiziente_Algorithmen").expanduser()
#DEMO_PATH: Path = Path("/home/gra/PycharmProjects/librarian_vspace/examples/chunks/chunk_md").expanduser()
# Where to write the concatenated text file
# (one level above the dataset folder keeps things tidy)
# Pool of known course ids; currently unused because build_course() pins
# course_id to "18240" (the random.choice line there is commented out).
COURSE_ID_POOL = [16301, 16091, 17505, 18239, 17503, 15512]
logger = logging.getLogger(__name__)
# Pre-chunked result.json consumed by _main() and parsed into a ChunkData
# payload. Also an absolute, machine-specific path — TODO make configurable.
INPUT_MODEL=Path("/home/gra/PycharmProjects/librarian_vspace/examples/chunks/moodle_chunks/51cd7cf6-e782-4f17-af00-30852cdcd5fc/51cd7cf6-e782-4f17-af00-30852cdcd5fc/result.json")
# ------------------------------------------------------------------ #
def _load_env(path: Path) -> None:
"""Load KEY=VALUE pairs from a .env file if present."""
if not path.is_file():
return
for line in path.read_text().splitlines():
if line.strip() and not line.startswith("#") and "=" in line:
k, v = [p.strip() for p in line.split("=", 1)]
os.environ.setdefault(k, v)
def discover_chunks(root: Path) -> List[Path]:
    """Collect every ``*.md`` chunk file directly inside *root*, in sorted order."""
    markdown_files = list(root.glob("*.md"))
    markdown_files.sort()
    return markdown_files
def build_course(root: Path) -> Dict[str, Any]:
    """Assemble the minimal dict EmbedderWorker expects as `chunk_course`.

    Args:
        root: Dataset folder containing the ``*.md`` chunk files.

    Returns:
        Dict with ``path``, ``files`` (name + random 24-bit id per file),
        and a pinned ``course_id``.

    Raises:
        FileNotFoundError: When *root* contains no ``*.md`` files.
    """
    # discover_chunks() inlined: sorted markdown files directly under root.
    file_entries = [
        {"file_name": chunk.name, "file_id": str(random.getrandbits(24))}
        for chunk in sorted(root.glob("*.md"))
    ]
    if not file_entries:
        raise FileNotFoundError(f"No .md files found in {root}")
    # course_id was once drawn at random from COURSE_ID_POOL; it is
    # currently pinned to a fixed id for reproducible demo runs.
    return {
        "path": str(root),
        "files": file_entries,
        "course_id": "18240",
    }
# ------------------------------------------------------------------ #
async def _main() -> None:
    """Run a single EmbedderWorker flow over the demo dataset.

    Loads the pre-chunked ``result.json`` (INPUT_MODEL), validates it into a
    ChunkData payload, wraps it in a FlowArtifact rooted at DEMO_PATH, and
    awaits the worker flow.
    """
    # Kept for parity with the original pipeline even though the payload now
    # comes from INPUT_MODEL; also fails fast if DEMO_PATH has no .md files.
    course = build_course(DEMO_PATH)
    concat_path = DEMO_PATH
    with open(INPUT_MODEL, "r") as file:
        json_data = json.load(file)
    #payload = EmbedderInput(chunk_course=course, concat_path=concat_path)
    # BUG FIX: model_validate_json() expects a JSON *string/bytes*, but
    # json.load() already parsed the file into a dict — validating the
    # parsed object requires model_validate() instead.
    payload = ChunkData.model_validate(json_data)
    worker = EmbedderWorker()
    logger.info("🔨 Launching EmbedderWorker …")
    art = FlowArtifact.new(run_id="", dir=concat_path, data=payload)
    result = await worker.flow()(art)  # type: ignore[arg-type]
    logger.info("✅ Worker finished: %s", result)
# ------------------------------------------------------------------ #
if __name__ == "__main__":
    # Resolve the script's own directory so the .env lookup works
    # regardless of the current working directory.
    APP_DIR = Path(__file__).resolve().parent
    _load_env(APP_DIR / ".env")
    logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
    asyncio.run(_main())