#!/usr/bin/env python3
from __future__ import annotations

import asyncio
import json
import logging
import os
import random
from pathlib import Path
from typing import Any, Dict, List

from librarian_core.temp_payloads.chunk_data import ChunkData
from librarian_core.workers.base import FlowArtifact
from librarian_vspace.vecembed.embedder_worker import EmbedderWorker, EmbedderInput
|
||
# ------------------------------------------------------------------ #
# Configuration
# ------------------------------------------------------------------ #
# Folder with the small sample dataset (3 × .md files)
# NOTE(review): absolute, machine-specific path — adjust before running on
# another machine.
DEMO_PATH: Path = Path("/home/gra/PycharmProjects/librarian_vspace/examples/chunks/moodle_chunks/51cd7cf6-e782-4f17-af00-30852cdcd5fc/51cd7cf6-e782-4f17-af00-30852cdcd5fc/data/FS25/Effiziente_Algorithmen").expanduser()
#DEMO_PATH: Path = Path("/home/gra/PycharmProjects/librarian_vspace/examples/chunks/chunk_md").expanduser()

# Where to write the concatenated text file
# (one level above the dataset folder keeps things tidy)
# Candidate Moodle course ids; currently unused because build_course()
# hard-codes course_id "18240" (the random.choice line there is commented out).
COURSE_ID_POOL = [16301, 16091, 17505, 18239, 17503, 15512]

# Module-level logger; handlers/level are configured in the __main__ guard.
logger = logging.getLogger(__name__)

# Pre-chunked payload (ChunkData as JSON) consumed by _main().
# NOTE(review): absolute, machine-specific path as well.
INPUT_MODEL=Path("/home/gra/PycharmProjects/librarian_vspace/examples/chunks/moodle_chunks/51cd7cf6-e782-4f17-af00-30852cdcd5fc/51cd7cf6-e782-4f17-af00-30852cdcd5fc/result.json")
|
||
|
||
# ------------------------------------------------------------------ #
|
||
def _load_env(path: Path) -> None:
|
||
"""Load KEY=VALUE pairs from a .env file if present."""
|
||
if not path.is_file():
|
||
return
|
||
for line in path.read_text().splitlines():
|
||
if line.strip() and not line.startswith("#") and "=" in line:
|
||
k, v = [p.strip() for p in line.split("=", 1)]
|
||
os.environ.setdefault(k, v)
|
||
|
||
|
||
def discover_chunks(root: Path) -> List[Path]:
    """Collect every markdown chunk file in *root*, in deterministic name order."""
    markdown_files = list(root.glob("*.md"))
    markdown_files.sort()
    return markdown_files
|
||
|
||
|
||
def build_course(root: Path) -> Dict[str, Any]:
    """Minimal dict that satisfies EmbedderWorker's `chunk_course`.

    Raises:
        FileNotFoundError: If *root* contains no ``.md`` files.
    """
    file_entries = []
    for md_file in discover_chunks(root):
        # 24 random bits give a throwaway, demo-only file id.
        file_entries.append(
            {"file_name": md_file.name, "file_id": str(random.getrandbits(24))}
        )
    if not file_entries:
        raise FileNotFoundError(f"No .md files found in {root}")
    # course_id is pinned for the demo; a random pick from COURSE_ID_POOL
    # (str(random.choice(COURSE_ID_POOL))) was used previously.
    course: Dict[str, Any] = {
        "path": str(root),
        "files": file_entries,
        "course_id": "18240",
    }
    return course
|
||
|
||
|
||
# ------------------------------------------------------------------ #
|
||
async def _main() -> None:
    """Build the demo course, load the pre-chunked payload, run EmbedderWorker."""
    course = build_course(DEMO_PATH)
    concat_path = DEMO_PATH

    with open(INPUT_MODEL, 'r') as file:
        json_data = json.load(file)

    #payload = EmbedderInput(chunk_course=course, concat_path=concat_path)
    # BUG FIX: model_validate_json() requires a JSON *string/bytes*, but
    # json.load() already returns the parsed object, so the old call
    # ChunkData.model_validate_json(json_data) fails at runtime for a plain
    # JSON file. Validate the parsed object directly instead.
    # (NOTE(review): only if result.json were double-encoded JSON would the
    # old call have worked — confirm against how result.json is produced.)
    payload = ChunkData.model_validate(json_data)
    worker = EmbedderWorker()
    logger.info("🔨 Launching EmbedderWorker …")
    art = FlowArtifact.new(run_id="", dir=concat_path, data=payload)
    result = await worker.flow()(art)  # type: ignore[arg-type]

    logger.info("✅ Worker finished: %s", result)
|
||
|
||
# ------------------------------------------------------------------ #
|
||
if __name__ == "__main__":
    # Pick up optional KEY=VALUE overrides from a .env next to this script.
    script_dir = Path(__file__).resolve().parent
    _load_env(script_dir / ".env")

    logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)
    asyncio.run(_main())
|