Initialize Monorepo

commit f80792d739

.gitignore (vendored, new file, 364 lines)
@ -0,0 +1,364 @@
|
||||
# Created by https://www.toptal.com/developers/gitignore/api/macos,windows,linux,python,web,pycharm+all
|
||||
# Edit at https://www.toptal.com/developers/gitignore?templates=macos,windows,linux,python,web,pycharm+all
|
||||
|
||||
### Linux ###
|
||||
*~
|
||||
|
||||
# temporary files which can be created if a process still has a handle open of a deleted file
|
||||
.fuse_hidden*
|
||||
|
||||
# KDE directory preferences
|
||||
.directory
|
||||
|
||||
# Linux trash folder which might appear on any partition or disk
|
||||
.Trash-*
|
||||
|
||||
# .nfs files are created when an open file is removed but is still being accessed
|
||||
.nfs*
|
||||
|
||||
### macOS ###
|
||||
# General
|
||||
.DS_Store
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
|
||||
# Icon must end with two \r
|
||||
Icon
|
||||
|
||||
|
||||
# Thumbnails
|
||||
._*
|
||||
|
||||
# Files that might appear in the root of a volume
|
||||
.DocumentRevisions-V100
|
||||
.fseventsd
|
||||
.Spotlight-V100
|
||||
.TemporaryItems
|
||||
.Trashes
|
||||
.VolumeIcon.icns
|
||||
.com.apple.timemachine.donotpresent
|
||||
|
||||
# Directories potentially created on remote AFP share
|
||||
.AppleDB
|
||||
.AppleDesktop
|
||||
Network Trash Folder
|
||||
Temporary Items
|
||||
.apdisk
|
||||
|
||||
### macOS Patch ###
|
||||
# iCloud generated files
|
||||
*.icloud
|
||||
|
||||
### PyCharm+all ###
|
||||
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
|
||||
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||
|
||||
# User-specific stuff
|
||||
.idea/**/workspace.xml
|
||||
.idea/**/tasks.xml
|
||||
.idea/**/usage.statistics.xml
|
||||
.idea/**/dictionaries
|
||||
.idea/**/shelf
|
||||
|
||||
# AWS User-specific
|
||||
.idea/**/aws.xml
|
||||
|
||||
# Generated files
|
||||
.idea/**/contentModel.xml
|
||||
|
||||
# Sensitive or high-churn files
|
||||
.idea/**/dataSources/
|
||||
.idea/**/dataSources.ids
|
||||
.idea/**/dataSources.local.xml
|
||||
.idea/**/sqlDataSources.xml
|
||||
.idea/**/dynamic.xml
|
||||
.idea/**/uiDesigner.xml
|
||||
.idea/**/dbnavigator.xml
|
||||
|
||||
# Gradle
|
||||
.idea/**/gradle.xml
|
||||
.idea/**/libraries
|
||||
|
||||
# Gradle and Maven with auto-import
|
||||
# When using Gradle or Maven with auto-import, you should exclude module files,
|
||||
# since they will be recreated, and may cause churn. Uncomment if using
|
||||
# auto-import.
|
||||
# .idea/artifacts
|
||||
# .idea/compiler.xml
|
||||
# .idea/jarRepositories.xml
|
||||
# .idea/modules.xml
|
||||
# .idea/*.iml
|
||||
# .idea/modules
|
||||
# *.iml
|
||||
# *.ipr
|
||||
|
||||
# CMake
|
||||
cmake-build-*/
|
||||
|
||||
# Mongo Explorer plugin
|
||||
.idea/**/mongoSettings.xml
|
||||
|
||||
# File-based project format
|
||||
*.iws
|
||||
|
||||
# IntelliJ
|
||||
out/
|
||||
|
||||
# mpeltonen/sbt-idea plugin
|
||||
.idea_modules/
|
||||
|
||||
# JIRA plugin
|
||||
atlassian-ide-plugin.xml
|
||||
|
||||
# Cursive Clojure plugin
|
||||
.idea/replstate.xml
|
||||
|
||||
# SonarLint plugin
|
||||
.idea/sonarlint/
|
||||
|
||||
# Crashlytics plugin (for Android Studio and IntelliJ)
|
||||
com_crashlytics_export_strings.xml
|
||||
crashlytics.properties
|
||||
crashlytics-build.properties
|
||||
fabric.properties
|
||||
|
||||
# Editor-based Rest Client
|
||||
.idea/httpRequests
|
||||
|
||||
# Android studio 3.1+ serialized cache file
|
||||
.idea/caches/build_file_checksums.ser
|
||||
|
||||
### PyCharm+all Patch ###
|
||||
# Ignore everything but code style settings and run configurations
|
||||
# that are supposed to be shared within teams.
|
||||
|
||||
.idea/*
|
||||
|
||||
!.idea/codeStyles
|
||||
!.idea/runConfigurations
|
||||
|
||||
### Python ###
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
### Python Patch ###
|
||||
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
|
||||
poetry.toml
|
||||
|
||||
# ruff
|
||||
.ruff_cache/
|
||||
|
||||
# LSP config files
|
||||
pyrightconfig.json
|
||||
|
||||
### Web ###
|
||||
*.asp
|
||||
*.cer
|
||||
*.csr
|
||||
*.css
|
||||
*.htm
|
||||
*.html
|
||||
*.js
|
||||
*.jsp
|
||||
*.php
|
||||
*.rss
|
||||
*.wasm
|
||||
*.wat
|
||||
*.xhtml
|
||||
|
||||
### Windows ###
|
||||
# Windows thumbnail cache files
|
||||
Thumbs.db
|
||||
Thumbs.db:encryptable
|
||||
ehthumbs.db
|
||||
ehthumbs_vista.db
|
||||
|
||||
# Dump file
|
||||
*.stackdump
|
||||
|
||||
# Folder config file
|
||||
[Dd]esktop.ini
|
||||
|
||||
# Recycle Bin used on file shares
|
||||
$RECYCLE.BIN/
|
||||
|
||||
# Windows Installer files
|
||||
*.cab
|
||||
*.msi
|
||||
*.msix
|
||||
*.msm
|
||||
*.msp
|
||||
|
||||
# Windows shortcuts
|
||||
*.lnk
|
||||
|
||||
# End of https://www.toptal.com/developers/gitignore/api/macos,windows,linux,python,web,pycharm+all
|
||||
# local env files
|
||||
**/.env*.local
|
||||
**/.env
|
||||
!**/.env.example
|
||||
|
||||
# vercel
|
||||
.vercel
|
||||
|
||||
# typescript
|
||||
*.tsbuildinfo
|
||||
next-env.d.ts
|
librarian/librarian-core/README.md (new file, 13 lines)
@@ -0,0 +1,13 @@
# Usage

In your `pyproject.toml`, add the following:

```toml
dependencies = [
    "librarian-core",
    "...other dependencies"
]

[tool.uv.sources]
librarian-core = { git = "https://github.com/DotNaos/librarian-core", rev = "dev" }
```
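Once the source override resolves, importing the package should work; a brief, hypothetical smoke test (the worker name and run id below are placeholders, and the import paths come from the modules added later in this commit):

```python
# Hypothetical smoke test that the dependency is importable (placeholder names).
from librarian_core.storage.worker_store import WorkerStore
from librarian_core.utils import path_utils

print(path_utils.get_data_root())  # XDG-aware data root
store = WorkerStore.new(worker_name="demo", flow_id="run-0001")
print(store.entry_dir)             # per-run scratch directory
```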
librarian/librarian-core/pyproject.toml (new file, 38 lines)
@@ -0,0 +1,38 @@
[project]
name = "librarian-core"
version = "0.1.6"
readme = "README.md"
description = "Shared datamodel & utils for the Librarian project"
requires-python = ">=3.10"
authors = [
    { name = "DotNaos", email = "schuetzoliver00@gmail.com" }
]
dependencies = [
    "pandas>=2.2.3",
    "platformdirs>=4.3.7",
    "pydantic-settings>=2.9.1",
    "supabase",
    "tabulate>=0.9.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=8.0",   # Testing framework
    "pytest-cov",    # Coverage reporting
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

# src/ layout
[tool.hatch.build.targets.wheel]
packages = ["src/librarian_core"]

[tool.pytest.ini_options]
pythonpath = ["src"]
testpaths = ["tests"]
addopts = "--cov=librarian_core --cov-report=term-missing"

[tool.coverage.run]
source = ["librarian_core"]
librarian/librarian-core/src/librarian_core/__init__.py (new file, 20 lines)
@@ -0,0 +1,20 @@
import pkgutil
import importlib

__all__ = []

# Iterate over all modules in this package
for finder, module_name, is_pkg in pkgutil.iter_modules(__path__):
    # import the sub-module
    module = importlib.import_module(f"{__name__}.{module_name}")

    # decide which names to re-export:
    # use module.__all__ if it exists, otherwise every non-private attribute
    public_names = getattr(
        module, "__all__", [n for n in dir(module) if not n.startswith("_")]
    )

    # bring each name into the package namespace
    for name in public_names:
        globals()[name] = getattr(module, name)
        __all__.append(name)  # type: ignore
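The loop above makes every public name of the sub-packages reachable from the package root; a small sketch of the consumer-side effect (which names actually appear depends on the sub-modules installed):

```python
# Hypothetical view of the dynamic re-export.
import librarian_core

print(librarian_core.__all__)                             # aggregated public names
store_cls = getattr(librarian_core, "WorkerStore", None)  # re-exported from librarian_core.storage
```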
librarian/librarian-core/src/librarian_core/storage/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
from librarian_core.storage.worker_store import WorkerStore

__all__ = [
    "WorkerStore",
]
librarian/librarian-core/src/librarian_core/storage/worker_store.py (new file, 243 lines)
@@ -0,0 +1,243 @@
|
||||
"""
|
||||
librarian_core.storage.worker_store
|
||||
===================================
|
||||
|
||||
Persistent directory layout
|
||||
---------------------------
|
||||
<data_root>/flows/<worker>/<run_id>/
|
||||
meta.json # worker_name, state, timestamps …
|
||||
result.json # pydantic-serialised return model
|
||||
data/ # files staged by the worker
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import tempfile
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, Type
|
||||
|
||||
from pydantic import BaseModel, TypeAdapter
|
||||
|
||||
from librarian_core.utils import path_utils
|
||||
|
||||
|
||||
class WorkerStore:
|
||||
"""Never exposed to worker code – all access is via helper methods."""
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# constructors #
|
||||
# ------------------------------------------------------------------ #
|
||||
@classmethod
|
||||
def new(cls, *, worker_name: str, flow_id: str) -> "WorkerStore":
|
||||
run_dir = path_utils.get_run_dir(worker_name, flow_id, create=True)
|
||||
store = cls(run_dir, worker_name, flow_id)
|
||||
store._write_meta(state="RUNNING")
|
||||
return store
|
||||
|
||||
@classmethod
|
||||
def open(cls, run_id: str) -> "WorkerStore":
|
||||
"""
|
||||
Locate `<flows>/<worker>/<run_id>` regardless of worker name.
|
||||
"""
|
||||
flows_dir = path_utils.get_flows_dir()
|
||||
for worker_dir in flows_dir.iterdir():
|
||||
candidate = worker_dir / run_id
|
||||
if candidate.exists():
|
||||
meta_path = candidate / "meta.json"
|
||||
if not meta_path.is_file():
|
||||
continue
|
||||
meta = json.loads(meta_path.read_text())
|
||||
return cls(candidate, meta["worker_name"], run_id)
|
||||
raise FileNotFoundError(run_id)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# life-cycle #
|
||||
# ------------------------------------------------------------------ #
|
||||
def __init__(self, run_dir: Path, worker_name: str, flow_id: str):
|
||||
self._run_dir = run_dir
|
||||
self._worker_name = worker_name
|
||||
self._flow_id = flow_id
|
||||
|
||||
cache_root = path_utils.get_cache_root()
|
||||
self._work_dir = Path(
|
||||
tempfile.mkdtemp(prefix=f"{self._flow_id}-", dir=cache_root)
|
||||
)
|
||||
|
||||
self._entry_dir = self._work_dir / "entry"
|
||||
self._exit_dir = self._work_dir / "exit"
|
||||
self._entry_dir.mkdir(parents=True, exist_ok=True)
|
||||
self._exit_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# entry / exit handling #
|
||||
# ------------------------------------------------------------------ #
|
||||
@property
|
||||
def entry_dir(self) -> Path:
|
||||
return self._entry_dir
|
||||
|
||||
def prime_with_input(self, src: Optional[Path]) -> None:
|
||||
if src and src.exists():
|
||||
shutil.copytree(src, self._entry_dir, dirs_exist_ok=True)
|
||||
|
||||
def stage(
|
||||
self,
|
||||
src: Path | str,
|
||||
*,
|
||||
new_name: str | None = None,
|
||||
sanitize: bool = True,
|
||||
move: bool = False,
|
||||
) -> Path:
|
||||
src_path = Path(src).expanduser().resolve()
|
||||
if not src_path.exists():
|
||||
raise FileNotFoundError(src_path)
|
||||
|
||||
name = new_name or src_path.name
|
||||
if sanitize:
|
||||
name = path_utils._sanitize(name)
|
||||
|
||||
dst = self._exit_dir / name
|
||||
if dst.exists():
|
||||
if dst.is_file():
|
||||
dst.unlink()
|
||||
else:
|
||||
shutil.rmtree(dst)
|
||||
|
||||
if move:
|
||||
src_path.rename(dst)
|
||||
else:
|
||||
if src_path.is_dir():
|
||||
shutil.copytree(src_path, dst)
|
||||
else:
|
||||
dst.parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.copy2(src_path, dst)
|
||||
return dst
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# result persistence #
|
||||
# ------------------------------------------------------------------ #
|
||||
def save_model(
|
||||
self,
|
||||
model: BaseModel,
|
||||
*,
|
||||
filename: str = "result.json",
|
||||
**json_kwargs: Any,
|
||||
) -> Path:
|
||||
json_kwargs.setdefault("indent", 2)
|
||||
target = self._run_dir / filename
|
||||
target.write_text(model.model_dump_json(**json_kwargs))
|
||||
return target
|
||||
|
||||
def persist_exit(self) -> Path:
|
||||
"""
|
||||
Move the *exit* directory to the persistent *data* slot and mark
|
||||
the run completed.
|
||||
"""
|
||||
data_dir = self.data_dir
|
||||
if data_dir.exists():
|
||||
shutil.rmtree(data_dir)
|
||||
self._exit_dir.rename(data_dir)
|
||||
self._write_meta(state="COMPLETED")
|
||||
return data_dir
|
||||
|
||||
def cleanup(self) -> None:
|
||||
shutil.rmtree(self._work_dir, ignore_errors=True)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# public helpers (API needs these) #
|
||||
# ------------------------------------------------------------------ #
|
||||
@property
|
||||
def data_dir(self) -> Path:
|
||||
return self._run_dir / "data"
|
||||
|
||||
@property
|
||||
def meta_path(self) -> Path:
|
||||
return self._run_dir / "meta.json"
|
||||
|
||||
@property
|
||||
def metadata(self) -> dict[str, Any]:
|
||||
return json.loads(self.meta_path.read_text())
|
||||
|
||||
def load_model(self, *, as_dict: bool = False) -> dict | BaseModel | None:
|
||||
res_file = self._run_dir / "result.json"
|
||||
if not res_file.is_file():
|
||||
return None
|
||||
data = json.loads(res_file.read_text())
|
||||
if as_dict:
|
||||
return data
|
||||
# try to reconstruct a Pydantic model if possible
|
||||
try:
|
||||
OutputModel: Type[BaseModel] | None = self._guess_output_model()
|
||||
if OutputModel:
|
||||
return TypeAdapter(OutputModel).validate_python(data)
|
||||
except Exception:
|
||||
pass
|
||||
return data
|
||||
|
||||
@staticmethod
|
||||
# TODO: Should return a FlowArtifact, but a circular import currently prevents it
|
||||
def load_latest(worker_name: str) -> dict[str, Any] | None:
|
||||
flows_dir = path_utils.get_flows_dir()
|
||||
worker_dir = flows_dir / worker_name
|
||||
if not worker_dir.exists():
|
||||
return None
|
||||
|
||||
runs: list[tuple[datetime, Path]] = []
|
||||
for run_id in worker_dir.iterdir():
|
||||
if not run_id.is_dir():
|
||||
continue
|
||||
meta_path = run_id / "meta.json"
|
||||
if not meta_path.is_file():
|
||||
continue
|
||||
meta = json.loads(meta_path.read_text())
|
||||
if meta["state"] == "COMPLETED":
|
||||
runs.append((datetime.fromisoformat(meta["timestamp"]), run_id))
|
||||
|
||||
if not runs:
|
||||
return None
|
||||
sorted_runs = sorted(runs, key=lambda x: x[0])
|
||||
|
||||
latest_run_dir = sorted_runs[-1][1]
|
||||
|
||||
# Load the model
|
||||
return { # That is a FlowArtifact
|
||||
"run_id": latest_run_dir.name,
|
||||
"dir": latest_run_dir / "data",
|
||||
"data": WorkerStore.open(latest_run_dir.name).load_model(as_dict=True), # type: ignore
|
||||
}
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# internals #
|
||||
# ------------------------------------------------------------------ #
|
||||
def _write_meta(self, *, state: str) -> None:
|
||||
meta = {
|
||||
"worker_name": self._worker_name,
|
||||
"run_id": self._flow_id,
|
||||
"state": state,
|
||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||
}
|
||||
self.meta_path.write_text(json.dumps(meta, indent=2))
|
||||
|
||||
def _guess_output_model(self) -> Optional[Type[BaseModel]]:
|
||||
"""
|
||||
Best-effort import of `<worker_name>.output_model`.
|
||||
"""
|
||||
try:
|
||||
from importlib import import_module
|
||||
|
||||
# workers are registered with dotted names in plugin_loader
|
||||
mod = import_module(self._worker_name)
|
||||
return getattr(mod, "output_model", None)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# clean-up #
|
||||
# ------------------------------------------------------------------ #
|
||||
def __del__(self) -> None:
|
||||
try:
|
||||
shutil.rmtree(self._work_dir, ignore_errors=True)
|
||||
except Exception:
|
||||
pass
|
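To tie the pieces of `WorkerStore` together, a compact life-cycle sketch; the worker name, flow id, result model, and staged path are placeholders, and only methods defined above are used:

```python
# Hypothetical WorkerStore life-cycle (all literals are placeholders).
from pathlib import Path
from pydantic import BaseModel
from librarian_core.storage.worker_store import WorkerStore


class DemoResult(BaseModel):
    message: str


store = WorkerStore.new(worker_name="demo_worker", flow_id="run-0001")
store.stage(Path("/tmp/some-output.txt"))     # copy a produced file into the exit dir (path must exist)
store.save_model(DemoResult(message="done"))  # writes result.json into the run dir
data_dir = store.persist_exit()               # moves exit/ -> <run_dir>/data and marks COMPLETED
store.cleanup()                               # removes the temporary work dir
print(store.metadata["state"], data_dir)
```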
librarian/librarian-core/src/librarian_core/supabase/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .client import get_client, SupabaseGateway

__all__ = ["get_client", "SupabaseGateway"]
librarian/librarian-core/src/librarian_core/supabase/client.py (new file, 60 lines)
@@ -0,0 +1,60 @@
from __future__ import annotations
from typing import Any, Dict
from pydantic import BaseModel
from supabase import create_client, Client
import os, logging

log = logging.getLogger(__name__)


class _Cfg(BaseModel):
    url: str
    key: str
    db_schema: str = "library"


def _load_cfg() -> _Cfg:
    return _Cfg(
        url=os.getenv("SUPABASE_URL", ""),
        key=os.getenv("SUPABASE_API_KEY", ""),
    )


_client: Client | None = None
_cfg: _Cfg | None = None


def get_client() -> Client:
    global _client, _cfg
    if _client:
        return _client
    _cfg = _load_cfg()
    if not _cfg.url or not _cfg.key:
        raise RuntimeError("SUPABASE_URL or SUPABASE_API_KEY missing")
    _client = create_client(_cfg.url, _cfg.key)
    return _client


class SupabaseGateway:
    """
    Thin wrapper around Client with `schema()` pre-selected
    and a helper `_rpc()` that raises RuntimeError on error.
    """

    def __init__(self) -> None:
        self.client = get_client()
        self.schema = _cfg.db_schema if _cfg else "library"

    # ---------- internal ----------
    def _rpc(self, fn: str, payload: Dict[str, Any] | None = None):
        resp = (
            self.client.schema(self.schema)
            .rpc(fn, payload or {})
            .execute()
            .model_dump()
        )
        if resp.get("error"):
            log.error("%s error: %s", fn, resp["error"])
            raise RuntimeError(resp["error"])
        log.debug("%s OK", fn)
        return resp.get("data")
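A minimal, hedged sketch of configuring and calling the gateway; the URL, key, and RPC payload are placeholders (the RPC name mirrors `rpc.py` below), and in practice the credentials would come from `load_env()`:

```python
# Hypothetical configuration and RPC call (placeholder credentials).
import os

os.environ.setdefault("SUPABASE_URL", "https://example.supabase.co")
os.environ.setdefault("SUPABASE_API_KEY", "placeholder-key")

from librarian_core.supabase.client import SupabaseGateway

gw = SupabaseGateway()        # picks up the env-based config above
gw._rpc(
    "upsert_degree_program",  # RPC name taken from rpc.py
    {"p_program_id": "prog-1", "p_program_name": "Demo Program"},
)
```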
librarian/librarian-core/src/librarian_core/supabase/rpc.py (new file, 61 lines)
@@ -0,0 +1,61 @@
from __future__ import annotations

from typing import List

from librarian_scraper.models import CrawlCourse, CrawlTerm, MoodleIndex

from librarian_core.supabase.client import SupabaseGateway

gw = SupabaseGateway()  # singleton gateway


# -------- public API --------
def upload_index(index: MoodleIndex) -> None:
    dp = index.degree_program
    _upsert_degree_program(dp.id, dp.name)
    for term in dp.terms:
        _upsert_term(term)
        _upsert_courses(term.courses, term_id=term.id, prog_id=dp.id)


def upload_modules(modules_index) -> None:
    # TODO – same pattern
    ...


# -------- helpers --------
def _upsert_degree_program(dp_id: str, name: str):
    gw._rpc(
        "upsert_degree_program",
        {
            "p_program_id": dp_id,
            "p_program_name": name,
        },
    )


def _upsert_term(term: CrawlTerm):
    # TODO: Change to term, when supabase is updated
    gw._rpc(
        "upsert_semester",
        {
            "p_semester_id": term.id,
            "p_semester_name": term.name,
        },
    )


def _upsert_courses(courses: List[CrawlCourse], *, term_id: str, prog_id: str):
    # TODO: Change to term, when supabase is updated
    for c in courses:
        gw._rpc(
            "upsert_course",
            {
                "p_course_id": c.id,
                "p_course_name": c.name,
                "p_semester_id": term_id,
                "p_program_id": prog_id,
                "p_hero_image": c.hero_image,
                "p_content_ressource_id": c.content_ressource_id,
            },
        )
@@ -0,0 +1,11 @@
from .chunk_data import (
    ChunkCourse,
    ChunkFile,
    ChunkData,
)

__all__ = [
    "ChunkData",
    "ChunkCourse",
    "ChunkFile",
]
@@ -0,0 +1,19 @@
from typing import List

from pydantic import BaseModel, Field

# TODO: Move to librarian-chunker


class ChunkFile(BaseModel):
    name: str = Field(..., description="Name of the file")
    id: str = Field(..., description="ID of the file")


class ChunkCourse(BaseModel):
    id: str = Field(..., description="ID of the course")
    name: str = Field(..., description="Name of the course")
    files: List[ChunkFile] = Field(..., description="List of files in the course")


class ChunkData(BaseModel):
    courses: List[ChunkCourse] = Field(..., description="List of courses")
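For orientation, a tiny construction example for these models; the ids and names are invented, and the import path is a guess since the diff header above does not show the file's location:

```python
# Hypothetical instance of the chunk models (placeholder values; adjust the import to the real module path).
from librarian_core.models.chunk_data import ChunkCourse, ChunkData, ChunkFile  # assumed path

data = ChunkData(
    courses=[
        ChunkCourse(
            id="course-1",
            name="Algorithms",
            files=[ChunkFile(id="file-1", name="lecture01.pdf")],
        )
    ]
)
print(data.model_dump_json(indent=2))
```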
librarian/librarian-core/src/librarian_core/utils/__init__.py (new file, 25 lines)
@@ -0,0 +1,25 @@
from librarian_core.utils.path_utils import (
    copy_to_temp_dir,
    get_cache_root,
    get_config_root,
    get_data_root,
    get_flow_name_from_id,
    get_flows_dir,
    get_run_dir,
    get_temp_path,
    get_workers_dir,
)
from librarian_core.utils.secrets_loader import load_env

__all__ = [
    "load_env",
    "get_temp_path",
    "get_run_dir",
    "get_flow_name_from_id",
    "copy_to_temp_dir",
    "get_cache_root",
    "get_data_root",
    "get_config_root",
    "get_flows_dir",
    "get_workers_dir",
]
librarian/librarian-core/src/librarian_core/utils/path_utils.py (new file, 196 lines)
@@ -0,0 +1,196 @@
|
||||
"""
|
||||
librarian_core/utils/path_utils.py
|
||||
==================================
|
||||
|
||||
Unified helpers for every path the Atlas-Librarian project uses.
|
||||
|
||||
Key features
|
||||
------------
|
||||
* XDG- and ENV-aware roots for **data**, **config**, and **cache**.
|
||||
* Dedicated sub-trees for *flows* (per-worker run directories) and
|
||||
*workers* (registrations, static assets, …).
|
||||
* Convenience helpers:
|
||||
- `get_run_dir(worker, run_id)`
|
||||
- `get_flow_name_from_id(run_id)` ← Prefect lookup (lazy import)
|
||||
- `get_temp_path()` / `copy_to_temp_dir()`
|
||||
* **Single source of truth** – change the root once, everything follows.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from platformdirs import (
|
||||
user_cache_dir,
|
||||
user_config_dir,
|
||||
user_data_dir,
|
||||
)
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Root directories (honours $LIBRARIAN_*_DIR, falls back to XDG) #
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
_APP_NAME = "atlas-librarian"
|
||||
|
||||
_DATA_ROOT = Path(
|
||||
os.getenv("LIBRARIAN_DATA_DIR", user_data_dir(_APP_NAME))
|
||||
).expanduser()
|
||||
_CONFIG_ROOT = Path(
|
||||
os.getenv("LIBRARIAN_CONFIG_DIR", user_config_dir(_APP_NAME))
|
||||
).expanduser()
|
||||
_CACHE_ROOT = Path(
|
||||
os.getenv("LIBRARIAN_CACHE_DIR", user_cache_dir(_APP_NAME))
|
||||
).expanduser()
|
||||
|
||||
# Project-specific sub-trees
|
||||
_FLOWS_DIR = _DATA_ROOT / "flows" # <data>/flows/<worker>/<run_id>/
|
||||
_WORKERS_DIR = _DATA_ROOT / "workers" # static registration cache, etc.
|
||||
|
||||
# Ensure that the basic tree always exists
|
||||
for p in (_DATA_ROOT, _CONFIG_ROOT, _CACHE_ROOT, _FLOWS_DIR, _WORKERS_DIR):
|
||||
p.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Public helpers #
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
# -- roots --
|
||||
|
||||
|
||||
def get_data_root() -> Path:
|
||||
return _DATA_ROOT
|
||||
|
||||
|
||||
def get_config_root() -> Path:
|
||||
return _CONFIG_ROOT
|
||||
|
||||
|
||||
def get_cache_root() -> Path:
|
||||
return _CACHE_ROOT
|
||||
|
||||
|
||||
def get_flows_dir() -> Path:
|
||||
return _FLOWS_DIR
|
||||
|
||||
|
||||
def get_workers_dir() -> Path:
|
||||
return _WORKERS_DIR
|
||||
|
||||
|
||||
# -- flow-run directories ---------------------------------------------------- #
|
||||
|
||||
|
||||
def get_run_dir(worker_name: str, run_id: str, *, create: bool = True) -> Path:
|
||||
"""
|
||||
Absolute path for one specific Prefect flow-run.
|
||||
|
||||
Example
|
||||
-------
|
||||
>>> get_run_dir("downloader", "1234abcd")
|
||||
~/.local/share/atlas-librarian/flows/downloader/1234abcd
|
||||
"""
|
||||
safe_worker = _sanitize(worker_name)
|
||||
path = _FLOWS_DIR / safe_worker / run_id
|
||||
if create:
|
||||
path.mkdir(parents=True, exist_ok=True)
|
||||
return path
|
||||
|
||||
|
||||
def get_flow_name_from_id(run_id: str) -> Optional[str]:
|
||||
"""
|
||||
Resolve a Prefect *run-id* → *flow name*.
|
||||
|
||||
Returns
|
||||
-------
|
||||
The flow (worker) name or *None* if the ID cannot be found.
|
||||
"""
|
||||
try:
|
||||
from prefect.client.orchestration import get_client
|
||||
except ImportError: # Prefect not installed in caller env
|
||||
return None
|
||||
|
||||
try:
|
||||
import anyio
|
||||
|
||||
async def _lookup() -> Optional[str]:
|
||||
async with get_client() as client:
|
||||
fr = await client.read_flow_run(uuid.UUID(run_id))
|
||||
return fr.flow_name # type: ignore[attr-defined]
|
||||
|
||||
return anyio.run(_lookup)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
# -- temporary workspace helpers -------------------------------------------- #
|
||||
|
||||
|
||||
def get_temp_path(prefix: str = "atlas") -> Path:
|
||||
"""
|
||||
Create a *unique* temporary directory inside the user cache.
|
||||
|
||||
The directory is **not** deleted automatically – callers decide.
|
||||
"""
|
||||
ts = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
|
||||
rand = uuid.uuid4().hex[:8]
|
||||
tmp_root = _CACHE_ROOT / "tmp"
|
||||
tmp_root.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
path = Path(
|
||||
tempfile.mkdtemp(
|
||||
dir=tmp_root,
|
||||
prefix=f"{prefix}-{ts}-{rand}-",
|
||||
)
|
||||
)
|
||||
return path
|
||||
|
||||
|
||||
def copy_to_temp_dir(src: Path | str, *, prefix: str = "atlas") -> Path:
|
||||
"""
|
||||
Recursively copy *src* into a fresh temporary directory.
|
||||
|
||||
Returns the destination path.
|
||||
"""
|
||||
src_path = Path(src).expanduser().resolve()
|
||||
if not src_path.exists():
|
||||
raise FileNotFoundError(src_path)
|
||||
|
||||
dst = get_temp_path(prefix=prefix)
|
||||
shutil.copytree(src_path, dst, dirs_exist_ok=True)
|
||||
return dst
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# internal helpers #
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
def _sanitize(name: str) -> str:
|
||||
"""Replace path-hostile characters – keeps things safe across OSes."""
|
||||
return "".join(c if c.isalnum() or c in "-._" else "_" for c in name)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# exports #
|
||||
# --------------------------------------------------------------------------- #
|
||||
|
||||
__all__ = [
|
||||
# roots
|
||||
"get_data_root",
|
||||
"get_config_root",
|
||||
"get_cache_root",
|
||||
"get_flows_dir",
|
||||
"get_workers_dir",
|
||||
# flow-run helpers
|
||||
"get_run_dir",
|
||||
"get_flow_name_from_id",
|
||||
# temporary space
|
||||
"get_temp_path",
|
||||
"copy_to_temp_dir",
|
||||
]
|
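A short sketch of the helpers exported here; the worker name and run id mirror the docstring example, and `LIBRARIAN_DATA_DIR` must be set before import to take effect:

```python
# Hypothetical use of the path helpers (placeholder names).
from librarian_core.utils import path_utils

run_dir = path_utils.get_run_dir("downloader", "1234abcd")  # <data>/flows/downloader/1234abcd
tmp = path_utils.get_temp_path(prefix="demo")               # unique dir under <cache>/tmp; caller cleans up
print(run_dir, tmp)
```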
librarian/librarian-core/src/librarian_core/utils/secrets_loader.py (new file, 25 lines)
@@ -0,0 +1,25 @@
"""
Secrets live in a classic .env **outside** the JSON settings file.
Load order:

1. ENV LIBRARIAN_CREDENTIALS_PATH (override)
2. ~/.config/atlas-librarian/credentials.env (XDG path)
"""

from pathlib import Path
import os
import logging
import dotenv
from librarian_core.utils.path_utils import get_config_root

log = logging.getLogger(__name__)


def load_env() -> None:
    path = Path(os.getenv("LIBRARIAN_CREDENTIALS_PATH", get_config_root() / "credentials.env"))

    if path.exists():
        dotenv.load_dotenv(path)
        log.debug("Secrets loaded from %s", path)
    else:
        log.debug("No credentials.env found (looked in %s)", path)
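How a caller might point the loader at a non-default credentials file; the path below is a placeholder:

```python
# Hypothetical override of the credentials location (placeholder path).
import os

os.environ["LIBRARIAN_CREDENTIALS_PATH"] = "/secure/atlas-librarian/credentials.env"

from librarian_core.utils.secrets_loader import load_env

load_env()  # only logs at debug level if the file does not exist
```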
librarian/librarian-core/src/librarian_core/workers/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from librarian_core.workers.base import Worker

__all__ = ["Worker"]
librarian/librarian-core/src/librarian_core/workers/base.py (new file, 192 lines)
@@ -0,0 +1,192 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import inspect
|
||||
import uuid
|
||||
from collections.abc import Awaitable, Callable
|
||||
from pathlib import Path
|
||||
from typing import ClassVar, Generic, TypeVar
|
||||
|
||||
import pandas as pd
|
||||
import anyio
|
||||
from prefect import flow, get_run_logger
|
||||
from prefect.runtime import flow_run
|
||||
from pydantic import BaseModel, Field, create_model
|
||||
from prefect.artifacts import acreate_markdown_artifact
|
||||
|
||||
from librarian_core.storage.worker_store import WorkerStore
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# type parameters #
|
||||
# --------------------------------------------------------------------------- #
|
||||
InT = TypeVar("InT", bound=BaseModel)
|
||||
OutT = TypeVar("OutT", bound=BaseModel)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# envelope returned by every worker flow #
|
||||
# --------------------------------------------------------------------------- #
|
||||
class FlowArtifact(BaseModel, Generic[OutT]):
|
||||
run_id: str | None = None
|
||||
dir: Path | None = None
|
||||
data: OutT | None = None
|
||||
|
||||
@classmethod
|
||||
def new(cls, run_id: str | None = None, dir: Path | None = None, data: OutT | None = None) -> FlowArtifact:
|
||||
if not data:
|
||||
raise ValueError("data is required")
|
||||
# Intermediate Worker
|
||||
if run_id and dir:
|
||||
return FlowArtifact(run_id=run_id, dir=dir, data=data)
|
||||
|
||||
# Initial Worker
|
||||
else:
|
||||
return FlowArtifact(data=data)
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# metaclass: adds a Prefect flow + envelope to each Worker #
|
||||
# --------------------------------------------------------------------------- #
|
||||
class _WorkerMeta(type):
|
||||
def __new__(mcls, name, bases, ns, **kw):
|
||||
cls = super().__new__(mcls, name, bases, dict(ns))
|
||||
|
||||
if name == "Worker" and cls.__module__ == __name__:
|
||||
return cls # abstract base
|
||||
|
||||
if not (hasattr(cls, "input_model") and hasattr(cls, "output_model")):
|
||||
raise TypeError(f"{name}: declare 'input_model' / 'output_model'.")
|
||||
if "__run__" not in cls.__dict__:
|
||||
raise TypeError(f"{name}: implement async '__run__(payload)'.")
|
||||
|
||||
cls.worker_name = name # type: ignore
|
||||
cls._create_input_artifact()
|
||||
cls._prefect_flow = mcls._build_prefect_flow(cls) # type: ignore
|
||||
return cls
|
||||
|
||||
# --------------------------------------------------------------------- #
|
||||
@staticmethod
|
||||
def _build_prefect_flow(cls_ref):
|
||||
"""Create the Prefect flow and return it."""
|
||||
InArt = cls_ref.input_artifact # noqa: F841
|
||||
OutModel: type[BaseModel] = cls_ref.output_model # noqa: F841
|
||||
worker_name: str = cls_ref.worker_name
|
||||
|
||||
async def _core(in_art: FlowArtifact[InT]): # type: ignore[name-defined]
|
||||
logger = get_run_logger()
|
||||
run_id = flow_run.get_id() or uuid.uuid4().hex
|
||||
logger.info("%s started (run_id=%s)", worker_name, run_id)
|
||||
|
||||
store = WorkerStore.new(worker_name=worker_name, flow_id=run_id)
|
||||
|
||||
if in_art.dir and in_art.dir.exists() and in_art.dir != Path("."):
|
||||
store.prime_with_input(in_art.dir)
|
||||
|
||||
inst = cls_ref()
|
||||
inst._inject_store(store)
|
||||
# run worker ------------------------------------------------
|
||||
run_res = inst.__run__(in_art.data)
|
||||
# allow sync or async implementations
|
||||
if inspect.iscoroutine(run_res):
|
||||
result = await run_res
|
||||
else:
|
||||
result = run_res
|
||||
|
||||
store.save_model(result)
|
||||
store.persist_exit()
|
||||
store.cleanup()
|
||||
logger.info("%s finished", worker_name)
|
||||
|
||||
artifact = FlowArtifact(run_id=run_id, dir=store.data_dir, data=result)
|
||||
|
||||
md_table = await inst._to_markdown(result)
|
||||
await acreate_markdown_artifact(
|
||||
key=f"{worker_name.lower()}-artifact",
|
||||
markdown=md_table,
|
||||
description=f"{worker_name} output"
|
||||
)
|
||||
|
||||
# save the markdown artifact in the flow directory
|
||||
md_file = store._run_dir / "artifact.md"
|
||||
md_file.write_text(md_table)
|
||||
|
||||
return artifact
|
||||
|
||||
return flow(name=worker_name, log_prints=True)(_core)
|
||||
|
||||
# --------------------------------------------------------------------- #
|
||||
def _create_input_artifact(cls):
|
||||
"""Create & attach a pydantic model ‹InputArtifact› = {dir?, data}."""
|
||||
DirField = (Path | None, None)
|
||||
DataField = (cls.input_model, ...) # type: ignore # required
|
||||
art_name = f"{cls.__name__}InputArtifact"
|
||||
|
||||
artifact = create_model(art_name, dir=DirField, data=DataField) # type: ignore[arg-type]
|
||||
artifact.__doc__ = f"Artifact for {cls.__name__} input"
|
||||
cls.input_artifact = artifact # type: ignore[attr-defined]
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# public Worker base #
|
||||
# --------------------------------------------------------------------------- #
|
||||
class Worker(Generic[InT, OutT], metaclass=_WorkerMeta):
|
||||
"""
|
||||
Derive from this class, set *input_model* / *output_model*, and implement
|
||||
an **async** ``__run__(payload: input_model)``.
|
||||
"""
|
||||
|
||||
input_model: ClassVar[type[BaseModel]]
|
||||
output_model: ClassVar[type[BaseModel]]
|
||||
input_artifact: ClassVar[type[BaseModel]] # injected by metaclass
|
||||
worker_name: ClassVar[str]
|
||||
_prefect_flow: ClassVar[Callable[[FlowArtifact[InT]], Awaitable[FlowArtifact[OutT]]]]
|
||||
|
||||
# injected at runtime
|
||||
entry: Path
|
||||
_store: WorkerStore
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# internal wiring #
|
||||
# ------------------------------------------------------------------ #
|
||||
def _inject_store(self, store: WorkerStore) -> None:
|
||||
self._store = store
|
||||
self.entry = store.entry_dir
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# developer helper #
|
||||
# ------------------------------------------------------------------ #
|
||||
def stage(
|
||||
self,
|
||||
src: Path | str,
|
||||
*,
|
||||
new_name: str | None = None,
|
||||
sanitize: bool = True,
|
||||
move: bool = False,
|
||||
) -> Path:
|
||||
return self._store.stage(src, new_name=new_name, sanitize=sanitize, move=move)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# convenience wrappers #
|
||||
# ------------------------------------------------------------------ #
|
||||
@classmethod
|
||||
def flow(cls):
|
||||
"""Return the auto-generated Prefect flow."""
|
||||
return cls._prefect_flow
|
||||
|
||||
# submit variants --------------------------------------------------- #
|
||||
@classmethod
|
||||
def submit(cls, payload: FlowArtifact[InT]) -> FlowArtifact[OutT]:
|
||||
async def _runner():
|
||||
art = await cls._prefect_flow(payload) # type: ignore[arg-type]
|
||||
return art
|
||||
|
||||
return anyio.run(_runner)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# abstract #
|
||||
# ------------------------------------------------------------------ #
|
||||
async def __run__(self, payload: InT) -> OutT: ...
|
||||
|
||||
|
||||
# Should be overridden by the worker
|
||||
async def _to_markdown(self, data: OutT) -> str:
|
||||
md_table = pd.DataFrame([data.dict()]).to_markdown(index=False)
|
||||
return md_table
|
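To make the `Worker` contract concrete, a hedged sketch of a minimal subclass; the models and the echo logic are invented for illustration, and running it requires a working Prefect environment:

```python
# Hypothetical minimal worker built on the base class above (placeholder models and logic).
from pydantic import BaseModel
from librarian_core.workers.base import FlowArtifact, Worker


class EchoIn(BaseModel):
    text: str


class EchoOut(BaseModel):
    text: str
    length: int


class EchoWorker(Worker[EchoIn, EchoOut]):
    input_model = EchoIn
    output_model = EchoOut

    async def __run__(self, payload: EchoIn) -> EchoOut:
        # files produced here could be registered via self.stage(...)
        return EchoOut(text=payload.text, length=len(payload.text))


# The metaclass wraps __run__ in a Prefect flow; submit() runs it to completion.
artifact = EchoWorker.submit(FlowArtifact.new(data=EchoIn(text="hello")))
print(artifact.data)
```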
librarian/librarian-core/uv.lock (generated, new file, 1197 lines)
File diff suppressed because it is too large.
librarian/plugins/librarian-chunker/README.md (new file, 21 lines)
@@ -0,0 +1,21 @@
# Chunker

Extract text, chunk it, and save images from a PDF.

`chunks` is a `List[str]` of ~800-token strings (100-token overlap).
Outputs (text files and images) are written under `extracted_content/<pdf_basename>/`.

## Usage

```python
from chunker import Chunker

chunker = Chunker("path/to/file.pdf")
chunks = chunker.run()
```

Setup:

```
pip install -r requirements.txt
python -m spacy download xx_ent_wiki_sm
```
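Because the class is also registered as a `librarian.workers` entry point and derives from `librarian_core.workers.base.Worker`, it can alternatively be driven through the flow interface; a hedged sketch (the empty `ExtractData` payload is only there to show the call shape and assumes an empty term list is accepted):

```python
from librarian_chunker import Chunker
from librarian_core.workers.base import FlowArtifact
from librarian_extractor.models.extract_data import ExtractData

payload = FlowArtifact.new(data=ExtractData(terms=[]))  # placeholder payload
result = Chunker.submit(payload)                        # returns a FlowArtifact[ChunkData]
print(result.data)
```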
librarian/plugins/librarian-chunker/pyproject.toml (new file, 40 lines)
@@ -0,0 +1,40 @@
[project]
name = "librarian-chunker"
version = "0.1.0"
description = "Chunker for Librarian"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "pdfplumber",
    "pymupdf",
    "tiktoken",
    "spacy",
    "sentence-transformers",
    "pydantic",
    "prefect",
    "librarian-core",
    "python-pptx",
    "python-docx",
]

[build-system]
requires = ["hatchling>=1.21"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/librarian_chunker"]

[tool.hatch.metadata]
allow-direct-references = true

[tool.uv.sources]
#librarian-core = { git = "https://github.com/DotNaos/librarian-core", rev = "dev" }

[project.entry-points."librarian.workers"]
chunker = "librarian_chunker.chunker:Chunker"

# ───────── optional: dev / test extras ─────────
[project.optional-dependencies]
dev = ["ruff", "pytest", "mypy"]
librarian/plugins/librarian-chunker/src/librarian_chunker/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .chunker import Chunker

__all__ = ["Chunker"]
librarian/plugins/librarian-chunker/src/librarian_chunker/chunker.py (new file, 217 lines)
@@ -0,0 +1,217 @@
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
import pdfplumber
|
||||
import pymupdf
|
||||
import spacy
|
||||
import tiktoken
|
||||
from librarian_core.utils.path_utils import get_temp_path
|
||||
from librarian_core.workers.base import Worker
|
||||
from librarian_extractor.models.extract_data import ExtractData, ExtractedFile
|
||||
from prefect import get_run_logger, task
|
||||
from prefect.cache_policies import NO_CACHE
|
||||
from prefect.futures import wait
|
||||
|
||||
from librarian_chunker.models.chunk_data import (
|
||||
Chunk,
|
||||
ChunkData,
|
||||
ChunkedCourse,
|
||||
ChunkedTerm,
|
||||
)
|
||||
|
||||
MAX_TOKENS = 800
|
||||
OVERLAP_TOKENS = 100
|
||||
|
||||
|
||||
class Chunker(Worker[ExtractData, ChunkData]):
|
||||
input_model = ExtractData
|
||||
output_model = ChunkData
|
||||
|
||||
async def __run__(self, payload: ExtractData) -> ChunkData: # noqa: D401
|
||||
lg = get_run_logger()
|
||||
lg.info("Chunker started")
|
||||
|
||||
working_dir = get_temp_path()
|
||||
|
||||
# load NLP and tokenizer
|
||||
Chunker.nlp = spacy.load("xx_ent_wiki_sm")
|
||||
Chunker.nlp.add_pipe("sentencizer")
|
||||
Chunker.enc = tiktoken.get_encoding("cl100k_base")
|
||||
|
||||
# chunk parameters
|
||||
Chunker.max_tokens = MAX_TOKENS
|
||||
Chunker.overlap_tokens = OVERLAP_TOKENS
|
||||
|
||||
result = ChunkData(terms=[])
|
||||
|
||||
# Loading files
|
||||
for term in payload.terms:
|
||||
chunked_term = ChunkedTerm(id=term.id, name=term.name)
|
||||
in_term_dir = self.entry / term.name
|
||||
|
||||
out_term_dir = working_dir / term.name
|
||||
out_term_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
for course in term.courses:
|
||||
chunked_course = ChunkedCourse(
|
||||
id=course.id, name=course.name, chunks=[]
|
||||
)
|
||||
in_course_dir = in_term_dir / course.name
|
||||
|
||||
out_course_dir = out_term_dir / course.name
|
||||
out_course_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
futs = []
|
||||
# All chunks are just in the course dir, so no new dir
|
||||
for chap in course.chapters:
|
||||
chapter_path = in_course_dir / chap.name
|
||||
|
||||
for f in chap.content_files:
|
||||
futs.append(
|
||||
self._chunk_file.submit(f, chapter_path, out_course_dir)
|
||||
)
|
||||
wait(futs)
|
||||
for fut in futs:
|
||||
chunks, images = fut.result()
|
||||
chunked_course.chunks.extend(chunks)
|
||||
chunked_course.images.extend(images)
|
||||
|
||||
chunked_term.courses.append(chunked_course)
|
||||
|
||||
# Add the chunked term to the result
|
||||
result.terms.append(chunked_term)
|
||||
self.stage(out_term_dir)
|
||||
|
||||
return result
|
||||
@staticmethod
|
||||
@task(log_prints=True)
|
||||
def _chunk_file(
|
||||
f: ExtractedFile, chapter_path: Path, out_course_dir: Path
|
||||
) -> tuple[list[Chunk], list[str]]:
|
||||
lg = get_run_logger()
|
||||
lg.info(f"Chunking file {f.name}")
|
||||
lg.info(f"Chapter path: {chapter_path}")
|
||||
lg.info(f"Out course dir: {out_course_dir}")
|
||||
|
||||
# Extract the Text
|
||||
file_text = Chunker._extract_text(chapter_path / f.name)
|
||||
|
||||
# Chunk the Text
|
||||
chunks = Chunker._chunk_text(file_text, f.name, out_course_dir)
|
||||
|
||||
images_dir = out_course_dir / "images"
|
||||
images_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Extract the Images
|
||||
images = Chunker._extract_images(chapter_path / f.name, images_dir)
|
||||
|
||||
return chunks, images
|
||||
|
||||
|
||||
@staticmethod
|
||||
def _extract_text(file_path: Path) -> str:
|
||||
if not file_path.suffix == ".pdf":
|
||||
return ""
|
||||
|
||||
extracted_text = ""
|
||||
|
||||
with pdfplumber.open(file_path) as pdf:
|
||||
for i in range(len(pdf.pages)):
|
||||
current_page = pdf.pages[i]
|
||||
text = current_page.extract_text() or ""
|
||||
extracted_text += text
|
||||
|
||||
return extracted_text
|
||||
|
||||
@staticmethod
|
||||
def _chunk_text(text: str, f_name: str, out_course_dir: Path) -> list[Chunk]:
|
||||
lg = get_run_logger()
|
||||
lg.info(f"Chunking text for file {f_name}")
|
||||
# split text into sentences and get tokens
|
||||
nlp_doc = Chunker.nlp(text)
|
||||
sentences = [sent.text.strip() for sent in nlp_doc.sents]
|
||||
sentence_token_counts = [len(Chunker.enc.encode(s)) for s in sentences]
|
||||
lg.info(f"Extracted {len(sentences)} sentences with token counts: {sentence_token_counts}")
|
||||
|
||||
# Buffers
|
||||
chunks: list[Chunk] = []
|
||||
current_chunk = []
|
||||
current_token_total = 0
|
||||
|
||||
chunk_id = 0
|
||||
|
||||
for s, tc in zip(sentences, sentence_token_counts): # Pair sentences and tokens
|
||||
if tc + current_token_total <= MAX_TOKENS: # Check Token limit
|
||||
# Add sentences to chunk
|
||||
current_chunk.append(s)
|
||||
current_token_total += tc
|
||||
else:
|
||||
# Flush Chunk
|
||||
chunk_text = "\n\n".join(current_chunk)
|
||||
|
||||
chunk_name = f"{f_name}_{chunk_id}"
|
||||
with open(
|
||||
out_course_dir / f"{chunk_name}.md", "w", encoding="utf-8"
|
||||
) as f:
|
||||
f.write(chunk_text)
|
||||
chunk_id += 1
|
||||
|
||||
chunks.append(
|
||||
Chunk(
|
||||
id=f"{f_name}_{chunk_id}",
|
||||
name=f"{f_name}_{chunk_id}.md",
|
||||
tokens=len(Chunker.enc.encode(chunk_text)),
|
||||
)
|
||||
)
|
||||
|
||||
# Get Overlap from Chunk
|
||||
token_ids = Chunker.enc.encode(chunk_text)
|
||||
overlap_ids = token_ids[-OVERLAP_TOKENS :]
|
||||
overlap_text = Chunker.enc.decode(overlap_ids)
|
||||
overlap_doc = Chunker.nlp(overlap_text)
|
||||
overlap_sents = [sent.text for sent in overlap_doc.sents]
|
||||
|
||||
# Start new Chunk
|
||||
current_chunk = overlap_sents + [s]
|
||||
current_token_total = sum(
|
||||
len(Chunker.enc.encode(s)) for s in current_chunk
|
||||
)
|
||||
|
||||
if current_chunk:
|
||||
chunk_text = "\n\n".join(current_chunk)
|
||||
chunk_name = f"{f_name}_{chunk_id}"
|
||||
with open(out_course_dir / f"{chunk_name}.md", "w", encoding="utf-8") as f:
|
||||
f.write(chunk_text)
|
||||
chunks.append(
|
||||
Chunk(
|
||||
id=f"{f_name}_{chunk_id}",
|
||||
name=f"{f_name}_{chunk_id}",
|
||||
tokens=len(Chunker.enc.encode(chunk_text)),
|
||||
)
|
||||
)
|
||||
lg.info(f"Created {len(chunks)} chunks for file {f_name}")
|
||||
return chunks
|
||||
|
||||
@staticmethod
|
||||
def _extract_images(file: Path, img_folder: Path) -> list[str]:
|
||||
images_list = []
|
||||
if not file.suffix == ".pdf":
|
||||
return []
|
||||
|
||||
with pymupdf.open(file) as doc:
|
||||
for i in range(len(doc)):
|
||||
images = doc.get_page_images(i)
|
||||
|
||||
for img in images:
|
||||
img_xref = img[0]
|
||||
image = doc.extract_image(img_xref)
|
||||
img_content = image["image"]
|
||||
img_ext = image["ext"]
|
||||
img_name = f"img_page{i + 1}_{img_xref}.{img_ext}"
|
||||
img_file_path = img_folder / img_name
|
||||
|
||||
with open(img_file_path, "wb") as img_file:
|
||||
img_file.write(img_content)
|
||||
images_list.append(img_name)
|
||||
return images_list
|
librarian/plugins/librarian-chunker/src/librarian_chunker/models/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from .chunk_data import Chunk, ChunkedCourse, ChunkedTerm, ChunkData

__all__ = ["Chunk", "ChunkedCourse", "ChunkedTerm", "ChunkData"]
librarian/plugins/librarian-chunker/src/librarian_chunker/models/chunk_data.py (new file, 29 lines)
@@ -0,0 +1,29 @@
from typing import List

from pydantic import BaseModel, Field

# --------------------------------------------------------------------------- #
#                               Output models                                  #
# --------------------------------------------------------------------------- #


class Chunk(BaseModel):
    id: str
    name: str
    tokens: int


class ChunkedCourse(BaseModel):
    id: str
    name: str
    chunks: List[Chunk] = Field(default_factory=list)
    images: List[str] = Field(default_factory=list)


class ChunkedTerm(BaseModel):
    id: str
    name: str
    courses: List[ChunkedCourse] = Field(default_factory=list)


class ChunkData(BaseModel):
    terms: List[ChunkedTerm]
librarian/plugins/librarian-chunker/uv.lock (generated, new file, 3544 lines)
File diff suppressed because it is too large.
librarian/plugins/librarian-extractor/README.md (new file, empty)

librarian/plugins/librarian-extractor/pyproject.toml (new file, 40 lines)
@@ -0,0 +1,40 @@
[project]
name = "librarian-extractor"
version = "0.1.0"
description = "Librarian extractor plugin"
readme = "README.md"
authors = [
    { name = "DotNaos", email = "schuetzoliver00@gmail.com" },
]
requires-python = ">=3.10"
dependencies = [
    "librarian-core",
    "importlib_metadata; python_version<'3.10'",
    "ollama>=0.4.8",
    "parsel>=1.10.0",
    "prefect>=3.4.1",
    "openai>=1.78.1",
]

#[tool.uv.sources]
#librarian-core = { git = "https://github.com/DotNaos/librarian-core", rev = "main" }

[build-system]
requires = ["hatchling>=1.21"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/librarian_extractor/"]

[tool.hatch.metadata]
allow-direct-references = true

# ───────── optional: dev / test extras ─────────
[project.optional-dependencies]
dev = ["ruff", "pytest", "mypy"]

[project.entry-points."librarian.workers"]
extractor = "librarian_extractor.extractor:Extractor"
ai_sanitizer = "librarian_extractor.ai_sanitizer:AISanitizer"
librarian/plugins/librarian-extractor/src/librarian_extractor/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from librarian_extractor.ai_sanitizer.ai_sanitizer import AISanitizer
from librarian_extractor.extractor.extractor import Extractor

__all__ = ["Extractor", "AISanitizer"]
librarian/plugins/librarian-extractor/src/librarian_extractor/ai_sanitizer/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from librarian_extractor.ai_sanitizer.ai_sanitizer import AISanitizer

__all__ = ["AISanitizer"]
librarian/plugins/librarian-extractor/src/librarian_extractor/ai_sanitizer/ai_sanitizer.py (new file, 215 lines)
@@ -0,0 +1,215 @@
|
||||
"""
|
||||
AI-powered sanitizer
|
||||
====================
|
||||
• in : ExtractData (tree from Extractor)
|
||||
• out : ExtractData (same graph but with prettier names)
|
||||
|
||||
Changes vs. previous revision
|
||||
-----------------------------
|
||||
✓ Media files are resolved from the course-level `media/` folder
✓ Missing sources produce a warning instead of raising
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import openai
|
||||
from prefect import get_run_logger, task
|
||||
from prefect.futures import PrefectFuture, wait
|
||||
from pydantic import ValidationError
|
||||
|
||||
from librarian_core.workers.base import Worker
|
||||
from librarian_extractor.prompts import PROMPT_COURSE
|
||||
from librarian_extractor.models.extract_data import (
|
||||
ExtractData,
|
||||
ExtractedCourse,
|
||||
ExtractedFile,
|
||||
ExtractedTerm,
|
||||
)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# helpers #
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _clean_json(txt: str) -> str:
|
||||
txt = txt.strip()
|
||||
if txt.startswith("```"):
|
||||
txt = txt.lstrip("`")
|
||||
if "\n" in txt:
|
||||
txt = txt.split("\n", 1)[1]
|
||||
if txt.rstrip().endswith("```"):
|
||||
txt = txt.rstrip()[:-3]
|
||||
return txt.strip()
|
||||
|
||||
|
||||
def _safe_json_load(txt: str) -> dict:
|
||||
return json.loads(_clean_json(txt))
|
||||
|
||||
|
||||
def _merge_with_original(src: ExtractedCourse, patch: dict, lg) -> ExtractedCourse:
|
||||
"""Return *patch* merged with *src* so every id is preserved."""
|
||||
try:
|
||||
tgt = ExtractedCourse.model_validate(patch)
|
||||
except ValidationError as err:
|
||||
lg.warning("LLM payload invalid – keeping original (%s)", err)
|
||||
return src
|
||||
|
||||
if not tgt.id:
|
||||
tgt.id = src.id
|
||||
|
||||
for ch_src, ch_tgt in zip(src.chapters, tgt.chapters):
|
||||
if not ch_tgt.name:
|
||||
ch_tgt.name = ch_src.name
|
||||
for f_src, f_tgt in zip(ch_src.content_files, ch_tgt.content_files):
|
||||
if not f_tgt.id:
|
||||
f_tgt.id = f_src.id
|
||||
for f_src, f_tgt in zip(ch_src.media_files, ch_tgt.media_files):
|
||||
if not f_tgt.id:
|
||||
f_tgt.id = f_src.id
|
||||
return tgt
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# OpenAI call – Prefect task #
|
||||
# --------------------------------------------------------------------------- #
|
||||
@task(
|
||||
name="sanitize_course_json",
|
||||
retries=2,
|
||||
retry_delay_seconds=5,
|
||||
log_prints=True,
|
||||
)
|
||||
def sanitize_course_json(course_json: str, model: str, temperature: float) -> dict:
|
||||
rsp = openai.chat.completions.create(
|
||||
model=model,
|
||||
temperature=temperature,
|
||||
messages=[
|
||||
{"role": "system", "content": PROMPT_COURSE},
|
||||
{"role": "user", "content": course_json},
|
||||
],
|
||||
)
|
||||
usage = rsp.usage
|
||||
get_run_logger().info(
|
||||
"LLM tokens – prompt: %s, completion: %s",
|
||||
usage.prompt_tokens,
|
||||
usage.completion_tokens,
|
||||
)
|
||||
return _safe_json_load(rsp.choices[0].message.content or "{}")
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Worker #
|
||||
# --------------------------------------------------------------------------- #
|
||||
class AISanitizer(Worker[ExtractData, ExtractData]):
|
||||
input_model = ExtractData
|
||||
output_model = ExtractData
|
||||
|
||||
def __init__(self, model_name: str | None = None, temperature: float = 0.0):
|
||||
super().__init__()
|
||||
self.model_name = model_name or os.getenv("OPENAI_MODEL", "gpt-4o-mini")
|
||||
self.temperature = temperature
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
def __run__(self, data: ExtractData) -> ExtractData:
|
||||
lg = get_run_logger()
|
||||
|
||||
futures: List[PrefectFuture] = []
|
||||
originals: List[ExtractedCourse] = []
|
||||
|
||||
# 1) submit all courses to the LLM
|
||||
for term in data.terms:
|
||||
for course in term.courses:
|
||||
futures.append(
|
||||
sanitize_course_json.submit(
|
||||
json.dumps(course.model_dump(), ensure_ascii=False),
|
||||
self.model_name,
|
||||
self.temperature,
|
||||
)
|
||||
)
|
||||
originals.append(course)
|
||||
|
||||
wait(futures)
|
||||
|
||||
# 2) build new graph with merged results
|
||||
terms_out: List[ExtractedTerm] = []
|
||||
idx = 0
|
||||
for term in data.terms:
|
||||
new_courses: List[ExtractedCourse] = []
|
||||
for _ in term.courses:
|
||||
clean_dict = futures[idx].result()
|
||||
merged = _merge_with_original(originals[idx], clean_dict, lg)
|
||||
new_courses.append(merged)
|
||||
idx += 1
|
||||
terms_out.append(
|
||||
ExtractedTerm(id=term.id, name=term.name, courses=new_courses)
|
||||
)
|
||||
|
||||
renamed = ExtractData(terms=terms_out)
|
||||
|
||||
# 3) stage files with their new names
|
||||
self._export_with_new_names(data, renamed, lg)
|
||||
|
||||
return renamed
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# staging helpers #
|
||||
# ------------------------------------------------------------------ #
|
||||
def _stage_or_warn(self, src: Path, dst: Path, lg):
|
||||
"""Copy *src* → *dst* (via self.stage). Warn if src missing."""
|
||||
if not src.exists():
|
||||
lg.warning("Source missing – skipped %s", src)
|
||||
return
|
||||
self.stage(src, new_name=str(dst), sanitize=False)
|
||||
lg.debug("Stage %s → %s", src.name, dst)
|
||||
|
||||
def _export_with_new_names(
|
||||
self,
|
||||
original: ExtractData,
|
||||
renamed: ExtractData,
|
||||
lg,
|
||||
):
|
||||
entry = Path(self.entry)
|
||||
|
||||
for term_old, term_new in zip(original.terms, renamed.terms):
|
||||
for course_old, course_new in zip(term_old.courses, term_new.courses):
|
||||
# ---------- content files (per chapter) -----------------
|
||||
for chap_old, chap_new in zip(course_old.chapters, course_new.chapters):
|
||||
n = min(len(chap_old.content_files), len(chap_new.content_files))
|
||||
for i in range(n):
|
||||
fo = chap_old.content_files[i]
|
||||
fn = chap_new.content_files[i]
|
||||
src = (
|
||||
entry
|
||||
/ term_old.name
|
||||
/ course_old.name
|
||||
/ chap_old.name
|
||||
/ fo.name
|
||||
)
|
||||
dst = (
|
||||
Path(term_new.name)
|
||||
/ course_new.name
|
||||
/ chap_new.name
|
||||
/ fn.name
|
||||
)
|
||||
self._stage_or_warn(src, dst, lg)
|
||||
|
||||
# ---------- media files (course-level “media”) ----------
|
||||
src_media_dir = (
|
||||
entry / term_old.name / course_old.name / "media"
|
||||
)  # course-level media directory
|
||||
dst_media_dir = Path(term_new.name) / course_new.name / "media"
|
||||
if not src_media_dir.is_dir():
|
||||
continue
|
||||
|
||||
# build a flat list of (old, new) media filenames
|
||||
media_pairs: List[tuple[ExtractedFile, ExtractedFile]] = []
|
||||
for ch_o, ch_n in zip(course_old.chapters, course_new.chapters):
|
||||
media_pairs.extend(zip(ch_o.media_files, ch_n.media_files))
|
||||
|
||||
for fo, fn in media_pairs:
|
||||
src = src_media_dir / fo.name
|
||||
dst = dst_media_dir / fn.name
|
||||
self._stage_or_warn(src, dst, lg)
|
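For reference, a minimal sketch of what the JSON-cleaning helpers above do to a typical fenced LLM reply. The import path librarian_extractor.ai_sanitizer is an assumption – the sanitizer module's file name is not visible in this diff.

# Hypothetical import path – the sanitizer module's real name is not shown here.
from librarian_extractor.ai_sanitizer import _clean_json, _safe_json_load

raw_reply = """```json
{"id": "42", "name": "Mathematik_1", "chapters": []}
```"""

# _clean_json strips the Markdown fence, _safe_json_load parses the remainder.
assert _clean_json(raw_reply).startswith("{")
course_patch = _safe_json_load(raw_reply)
print(course_patch["name"])  # -> Mathematik_1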
@ -0,0 +1,66 @@
|
||||
"""
|
||||
Shared lists and prompts
|
||||
"""
|
||||
|
||||
# -------------------------------------------------------------------- #
|
||||
# file selection – keep only real documents we can show / convert #
|
||||
# -------------------------------------------------------------------- #
|
||||
CONTENT_FILE_EXTENSIONS = [
|
||||
"*.pdf",
|
||||
"*.doc",
|
||||
"*.docx",
|
||||
"*.ppt",
|
||||
"*.pptx",
|
||||
"*.txt",
|
||||
"*.rtf",
|
||||
]
|
||||
|
||||
MEDIA_FILE_EXTENSIONS = [
|
||||
"*.jpg",
|
||||
"*.jpeg",
|
||||
"*.png",
|
||||
"*.gif",
|
||||
"*.svg",
|
||||
"*.mp4",
|
||||
"*.mov",
|
||||
"*.mp3",
|
||||
]
|
||||
|
||||
# -------------------------------------------------------------------- #
|
||||
# naming rules #
|
||||
# -------------------------------------------------------------------- #
|
||||
SANITIZE_REGEX = {
|
||||
"base": [r"\s*\(\d+\)$"],
|
||||
"course": [
|
||||
r"^\d+\.\s*",
|
||||
r"\s*\([^)]*\)",
|
||||
r"\s*(?:FS|HS)\d{2}$",
|
||||
],
|
||||
"chapter": [
|
||||
r"^\d+\.?\s*",
|
||||
r"\s*SW_\d+\s*(?:___)?\s*KW_\d+\s*",
|
||||
r"\bKapitel[_\s]*\d+\b",
|
||||
],
|
||||
"file": [
|
||||
r",", # ← new : drop commas
|
||||
r",?\s*inkl\.?\s*",
|
||||
r"\(File\)",
|
||||
r"```json",
|
||||
],
|
||||
}
|
||||
|
||||
BLACKLIST_REGEX = {
|
||||
"chapter": [r"^allgemeine informationen$"],
|
||||
"ressource_types": [
|
||||
"(Forum)",
|
||||
"(URL)",
|
||||
"(External tool)",
|
||||
"(Text and media area)",
|
||||
],
|
||||
}
|
||||
|
||||
RESSOURCE_TYPES = BLACKLIST_REGEX["ressource_types"]
|
||||
BASE_BLACKLIST_REGEX = SANITIZE_REGEX["base"]
|
||||
|
||||
MAX_FILENAME_LENGTH = 100
|
||||
|
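As a quick illustration of the "course" patterns above, a minimal sketch that applies them directly with re.sub; the course title is made up.

import re

course_patterns = [
    r"^\d+\.\s*",           # leading enumeration, e.g. "3. "
    r"\s*\([^)]*\)",        # parenthesised additions, e.g. "(Gruppe A)"
    r"\s*(?:FS|HS)\d{2}$",  # trailing semester code, e.g. "FS24"
]

name = "3. Software Engineering (Gruppe A) FS24"
for rx in course_patterns:
    name = re.sub(rx, "", name, flags=re.IGNORECASE)
print(name)  # -> "Software Engineering"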
@ -0,0 +1,3 @@
|
||||
from librarian_extractor.extractor.extractor import Extractor
|
||||
|
||||
__all__ = ["Extractor"]
|
@ -0,0 +1,301 @@
|
||||
"""
|
||||
Extractor Worker – resilient version
|
||||
------------------------------------
|
||||
* Finds the real payload even when the link goes to
|
||||
File_…/index.html first.
|
||||
* No `iterdir` on non-directories.
|
||||
* Keeps all earlier features: id parsing, allowed-suffix filter,
|
||||
media-folder, sanitising.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
import shutil
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Tuple
|
||||
|
||||
import lxml.html
|
||||
import parsel
|
||||
from librarian_core.utils.path_utils import get_temp_path
|
||||
from librarian_core.workers.base import Worker
|
||||
from librarian_scraper.models.download_data import DownloadData
|
||||
from prefect import get_run_logger, task
|
||||
from prefect.futures import wait
|
||||
|
||||
from librarian_extractor.constants import (
|
||||
CONTENT_FILE_EXTENSIONS,
|
||||
MEDIA_FILE_EXTENSIONS,
|
||||
)
|
||||
from librarian_extractor.models.extract_data import (
|
||||
ExtractData,
|
||||
ExtractedChapter,
|
||||
ExtractedCourse,
|
||||
ExtractedFile,
|
||||
ExtractedTerm,
|
||||
)
|
||||
from librarian_extractor.sanitizers import (
|
||||
annotate_chapter_name,
|
||||
is_chapter_allowed,
|
||||
sanitize_chapter_name,
|
||||
sanitize_course_name,
|
||||
sanitize_file_name,
|
||||
)
|
||||
|
||||
CONTENT_EXTS = {Path(p).suffix.lower() for p in CONTENT_FILE_EXTENSIONS}
|
||||
MEDIA_EXTS = {Path(p).suffix.lower() for p in MEDIA_FILE_EXTENSIONS}
|
||||
ALL_EXTS = CONTENT_EXTS | MEDIA_EXTS
|
||||
|
||||
_id_rx = re.compile(r"\.(\d{4,})[./]") # 1172180 from “..._.1172180/index.html”
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# helpers #
|
||||
# --------------------------------------------------------------------------- #
|
||||
def _hash_id(fname: str) -> str:
|
||||
return hashlib.sha1(fname.encode()).hexdigest()[:10]
|
||||
|
||||
|
||||
def _html_stub_target(html_file: Path) -> Path | None:
|
||||
"""Parse a Moodle *index.html* stub and return the first file link."""
|
||||
try:
|
||||
tree = lxml.html.parse(html_file) # type: ignore[arg-type]
|
||||
hrefs = tree.xpath("//ul/li/a/@href")
|
||||
for h in hrefs:
|
||||
h = h.split("?")[0].split("#")[0]
|
||||
p = html_file.parent / h
|
||||
if p.exists():
|
||||
return p
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def _best_payload(node: Path) -> Path | None: # noqa: C901
|
||||
"""
|
||||
Return the real document given *node* which may be:
|
||||
• the actual file → return it
|
||||
• File_xxx/dir → search inside /content or dir itself
|
||||
• File_xxx/index.html stub → parse to find linked file
|
||||
"""
|
||||
# 1) immediate hit
|
||||
if node.is_file() and node.suffix.lower() in ALL_EXTS:
|
||||
return node
|
||||
|
||||
# 2) if html stub try to parse inner link
|
||||
if node.is_file() and node.suffix.lower() in {".html", ".htm"}:
|
||||
hinted = _html_stub_target(node)
|
||||
if hinted:
|
||||
return _best_payload(hinted) # recurse
|
||||
|
||||
# 3) directories to search
|
||||
roots: list[Path] = []
|
||||
if node.is_dir():
|
||||
roots.append(node)
|
||||
elif node.is_file():
|
||||
roots.append(node.parent)
|
||||
|
||||
for r in list(roots):
|
||||
if r.is_dir() and (r / "content").is_dir():
|
||||
roots.insert(0, r / "content") # prefer content folder
|
||||
|
||||
for r in roots:
|
||||
if not r.is_dir():
|
||||
continue
|
||||
files = [p for p in r.iterdir() if p.is_file() and p.suffix.lower() in ALL_EXTS]
|
||||
if len(files) == 1:
|
||||
return files[0]
|
||||
return None
|
||||
|
||||
|
||||
def _file_id_from_href(href: str) -> str:
|
||||
m = _id_rx.search(href)
|
||||
return m.group(1) if m else ""
|
||||
|
||||
|
||||
def task_(**kw):
|
||||
kw.setdefault("log_prints", True)
|
||||
return task(**kw)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# Worker #
|
||||
# --------------------------------------------------------------------------- #
|
||||
class Extractor(Worker[DownloadData, ExtractData]):
|
||||
input_model = DownloadData
|
||||
output_model = ExtractData
|
||||
|
||||
async def __run__(self, downloads: DownloadData) -> ExtractData:
|
||||
lg = get_run_logger()
|
||||
work_root = Path(get_temp_path()) / "extract"
|
||||
work_root.mkdir(parents=True, exist_ok=True)
|
||||
self.out_dir = work_root
|
||||
|
||||
result = ExtractData()
|
||||
futs = []
|
||||
entry_dir = self.entry
|
||||
|
||||
for t in downloads.terms:
|
||||
(work_root / t.name).mkdir(exist_ok=True)
|
||||
result.terms.append(ExtractedTerm(id=t.id, name=t.name))
|
||||
for c in t.courses:
|
||||
futs.append(
|
||||
self._extract_course.submit(t.name, c.id, work_root, entry_dir)
|
||||
)
|
||||
|
||||
done, _ = wait(futs)
|
||||
for fut in done:
|
||||
term, meta = fut.result()
|
||||
if meta:
|
||||
next(t for t in result.terms if t.name == term).courses.append(meta)
|
||||
|
||||
for term in result.terms:
|
||||
self.stage(
|
||||
work_root / term.name, new_name=term.name, sanitize=False, move=True
|
||||
)
|
||||
lg.info("Extractor finished – %d terms", len(result.terms))
|
||||
return result
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
@staticmethod
|
||||
@task_()
|
||||
def _extract_course( # noqa: C901
|
||||
term: str, cid: str, out_root: Path, entry_dir: Path
|
||||
) -> Tuple[str, ExtractedCourse | None]:
|
||||
lg = get_run_logger()
|
||||
z = entry_dir / term / f"{cid}.zip"
|
||||
if not z.is_file():
|
||||
lg.warning("ZIP missing %s", z)
|
||||
return term, None
|
||||
|
||||
tmp = Path(get_temp_path()) / f"u{cid}"
|
||||
tmp.mkdir(exist_ok=True)
|
||||
try:
|
||||
with zipfile.ZipFile(z) as zf:
|
||||
zf.extractall(tmp)
|
||||
|
||||
html, root = Extractor._index_html(tmp)
|
||||
if not html:
|
||||
lg.warning("index.html missing for %s", cid)
|
||||
return term, None
|
||||
|
||||
cname = Extractor._course_name(html) or cid
|
||||
c_meta = ExtractedCourse(id=cid, name=cname)
|
||||
media_dir = out_root / term / cname / "media"
|
||||
|
||||
structure = Extractor._outline(html)
|
||||
if not structure:
|
||||
Extractor._copy_all(
|
||||
root, out_root / term / cname, c_meta, media_dir, lg # type: ignore
|
||||
)
|
||||
return term, c_meta
|
||||
|
||||
chap_no = 0
|
||||
for title, links in structure:
|
||||
if not is_chapter_allowed(title):
|
||||
continue
|
||||
chap_no += 1
|
||||
chap_name = annotate_chapter_name(sanitize_chapter_name(title), chap_no)
|
||||
chap_dir = out_root / term / cname / chap_name
|
||||
chap_dir.mkdir(parents=True, exist_ok=True)
|
||||
chap_meta = ExtractedChapter(name=chap_name)
|
||||
|
||||
for text, href in links:
|
||||
target = _best_payload(root / href.lstrip("./"))
|
||||
if not target:
|
||||
lg.debug("payload not found %s", href)
|
||||
continue
|
||||
|
||||
base = sanitize_file_name(text)
|
||||
if not Path(base).suffix:
|
||||
base += target.suffix # ensure extension
|
||||
|
||||
dst = (
|
||||
media_dir / base
|
||||
if target.suffix.lower() in MEDIA_EXTS
|
||||
else chap_dir / base
|
||||
)
|
||||
dst.parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.copy2(target, dst)
|
||||
|
||||
fid = _file_id_from_href(href) or _hash_id(dst.name)
|
||||
meta_obj = ExtractedFile(id=fid, name=dst.name)
|
||||
(
|
||||
chap_meta.media_files
|
||||
if dst.is_relative_to(media_dir)
|
||||
else chap_meta.content_files
|
||||
).append(meta_obj)
|
||||
|
||||
if chap_meta.content_files or chap_meta.media_files:
|
||||
c_meta.chapters.append(chap_meta)
|
||||
|
||||
if c_meta.chapters:
|
||||
lg.info("Extracted %s (%d chap.)", cname, len(c_meta.chapters))
|
||||
return term, c_meta
|
||||
return term, None
|
||||
finally:
|
||||
shutil.rmtree(tmp, ignore_errors=True)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# internal helpers #
|
||||
# ------------------------------------------------------------------ #
|
||||
@staticmethod
|
||||
def _copy_all(
|
||||
root: Path, dst_root: Path, c_meta: ExtractedCourse, media_dir: Path, lg
|
||||
):
|
||||
chap = ExtractedChapter(name="Everything")
|
||||
dst_root.mkdir(parents=True, exist_ok=True)
|
||||
for fp in root.rglob("*"):
|
||||
if fp.is_file() and fp.suffix.lower() in ALL_EXTS:
|
||||
dst = (
|
||||
media_dir if fp.suffix.lower() in MEDIA_EXTS else dst_root
|
||||
) / fp.name
|
||||
dst.parent.mkdir(parents=True, exist_ok=True)
|
||||
shutil.copy2(fp, dst)
|
||||
chap.content_files.append(
|
||||
ExtractedFile(id=_hash_id(fp.name), name=dst.name)
|
||||
)
|
||||
if chap.content_files:
|
||||
c_meta.chapters.append(chap)
|
||||
lg.info("Fallback copy %d files", len(chap.content_files))
|
||||
|
||||
@staticmethod
|
||||
def _index_html(root: Path) -> Tuple[str, Path | None]:
|
||||
for idx in root.rglob("index.html"):
|
||||
try:
|
||||
return idx.read_text("utf-8", errors="ignore"), idx.parent
|
||||
except Exception:
|
||||
continue
|
||||
return "", None
|
||||
|
||||
@staticmethod
|
||||
def _course_name(html: str) -> str:
|
||||
sel = parsel.Selector(text=html)
|
||||
return sanitize_course_name(sel.css("h1 a::text").get(default="").strip())
|
||||
|
||||
@staticmethod
|
||||
def _outline(html: str):
|
||||
t = lxml.html.fromstring(html)
|
||||
res = []
|
||||
for h3 in t.xpath("//h3"):
|
||||
title = h3.text_content().strip()
|
||||
ul = next((s for s in h3.itersiblings() if s.tag == "ul"), None)
|
||||
if ul is None:
|
||||
continue
|
||||
links = []
|
||||
for a in ul.findall(".//a"):
|
||||
if "(File)" in (a.text_content() or ""):
|
||||
sel = parsel.Selector(
|
||||
text=lxml.html.tostring(a, encoding="unicode") # type: ignore
|
||||
)
|
||||
links.append(
|
||||
(
|
||||
sel.css("::text").get().strip(), # type: ignore
|
||||
sel.css("::attr(href)").get().strip(), # type: ignore
|
||||
)
|
||||
)
|
||||
if links:
|
||||
res.append((title, links))
|
||||
return res
|
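A short sketch of the two id helpers defined at the top of this module, imported via librarian_extractor.extractor.extractor – the path used by the package __init__ above; the example href is made up but follows the format noted next to _id_rx.

from librarian_extractor.extractor.extractor import _file_id_from_href, _hash_id

# Moodle export links carry the numeric resource id right before "/index.html".
print(_file_id_from_href("Skript_.1172180/index.html"))  # -> "1172180"

# When no id can be parsed from the href, a 10-character SHA-1 prefix of the
# file name is used as a stable fallback id.
print(_hash_id("Intro.pdf"))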
@ -0,0 +1,30 @@
|
||||
from typing import List
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ExtractedFile(BaseModel):
|
||||
id: str
|
||||
name: str # Name of the file, relative to ExtractedChapter.name
|
||||
|
||||
|
||||
class ExtractedChapter(BaseModel):
|
||||
name: str # Name of the chapter directory, relative to ExtractedCourse.name
|
||||
content_files: List[ExtractedFile] = Field(default_factory=list)
|
||||
media_files: List[ExtractedFile] = Field(default_factory=list)
|
||||
|
||||
|
||||
class ExtractedCourse(BaseModel):
|
||||
id: str
|
||||
name: str # Name of the course directory, relative to ExtractedTerm.name
|
||||
chapters: List[ExtractedChapter] = Field(default_factory=list)
|
||||
|
||||
|
||||
class ExtractedTerm(BaseModel):
|
||||
id: str
|
||||
name: str # Name of the term directory, relative to ExtractMeta.dir
|
||||
courses: List[ExtractedCourse] = Field(default_factory=list)
|
||||
|
||||
|
||||
class ExtractData(BaseModel):
|
||||
terms: List[ExtractedTerm] = Field(default_factory=list)
|
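To make the shape of this graph concrete, a minimal sketch that builds one term → course → chapter → file by hand and serialises it; the ids and names are made up.

from librarian_extractor.models.extract_data import (
    ExtractData,
    ExtractedChapter,
    ExtractedCourse,
    ExtractedFile,
    ExtractedTerm,
)

data = ExtractData(
    terms=[
        ExtractedTerm(
            id="7",
            name="FS24",
            courses=[
                ExtractedCourse(
                    id="1234",
                    name="Software_Engineering",
                    chapters=[
                        ExtractedChapter(
                            name="1_SW_01",
                            content_files=[ExtractedFile(id="9876", name="Intro.pdf")],
                        )
                    ],
                )
            ],
        )
    ]
)
print(data.model_dump_json(indent=2))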
@ -0,0 +1,29 @@
|
||||
# -------------------------------------------------------------------- #
|
||||
# LLM prompts #
|
||||
# -------------------------------------------------------------------- #
|
||||
|
||||
PROMPT_COURSE = """
|
||||
General naming rules
|
||||
====================
|
||||
* Use underscores instead of spaces.
|
||||
* Keep meaningful numbers / IDs.
|
||||
* Remove date information unless it is absolutely necessary.
|
||||
* If a date must stay, normalize it ("Februar" → "02").
|
||||
* Remove redundant semester / university codes (e.g. FS24, HS, FHGR, CDS).
|
||||
* Remove redundancy in general (DRY - Don't Repeat Yourself).
|
||||
* Trim superfluous parts like duplicate week information ("1_SW_01_KW_08" → "SW_01").
|
||||
* Only keep one enumerator at a time, so "1_SW_01" → "SW_01".
|
||||
* Preserve file extensions!
|
||||
* Avoid repeated dots and illegal filesystem characters (colon, slash, …).
|
||||
|
||||
The most important rule is to keep everything as consistent as possible.
|
||||
|
||||
Important – DO NOT:
|
||||
* change the JSON structure,
|
||||
* change or reorder any `id`,
|
||||
* add any keys.
|
||||
|
||||
Return **only** the modified JSON for the course you receive.
|
||||
|
||||
Everything should be in English after sanitization.
|
||||
""".strip()
|
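An illustrative, made-up before/after of the transformation this prompt asks for – the structure and every id stay untouched, only the human-readable names are normalised (not actual model output):

# Made-up example payloads; not actual model output.
before = {
    "id": "1234",
    "name": "3. Software Engineering (Gruppe A) FS24",
    "chapters": [
        {"name": "1_SW_01_KW_08 Einfuehrung", "content_files": [], "media_files": []}
    ],
}
after = {
    "id": "1234",
    "name": "Software_Engineering",
    "chapters": [
        {"name": "SW_01_Introduction", "content_files": [], "media_files": []}
    ],
}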
@ -0,0 +1,71 @@
|
||||
"""
|
||||
Name-sanitising helpers
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from librarian_extractor.constants import (
|
||||
BASE_BLACKLIST_REGEX,
|
||||
BLACKLIST_REGEX,
|
||||
MAX_FILENAME_LENGTH,
|
||||
RESSOURCE_TYPES,
|
||||
SANITIZE_REGEX,
|
||||
)
|
||||
|
||||
_INVALID_FS_CHARS = re.compile(r'[\\/:*?"<>|]')
|
||||
_WS = re.compile(r"\s+")
|
||||
_DUP_DOTS = re.compile(r"\.\.+")
|
||||
_TRAILING_NUM = re.compile(r"_\(\d+\)$")
|
||||
|
||||
|
||||
def _sanitize_name(name: str, extra_patterns: list[str]) -> str:
|
||||
original = name
|
||||
for rt in RESSOURCE_TYPES:
|
||||
name = name.replace(rt, "")
|
||||
for rx in BASE_BLACKLIST_REGEX + extra_patterns:
|
||||
name = re.sub(rx, "", name, flags=re.IGNORECASE)
|
||||
name = _INVALID_FS_CHARS.sub("_", name)
|
||||
name = _DUP_DOTS.sub(".", name)
|
||||
name = _WS.sub(" ", name).replace(" ", "_")
|
||||
name = re.sub(r"_+", "_", name).strip("_")
|
||||
base, dot, ext = name.rpartition(".")
|
||||
if dot:
|
||||
base = _TRAILING_NUM.sub("", base)
|
||||
dup = re.compile(rf"(?i)[._]{re.escape(ext)}$")
|
||||
base = dup.sub("", base)
|
||||
name = f"{base}.{ext}" if base else f".{ext}"
|
||||
else:
|
||||
name = _TRAILING_NUM.sub("", name)
|
||||
name = name.strip("_.")
|
||||
if len(name) > MAX_FILENAME_LENGTH:
|
||||
if dot and len(ext) < 10:
|
||||
avail = MAX_FILENAME_LENGTH - len(ext) - 1
|
||||
name = f"{base[:avail]}.{ext}"
|
||||
else:
|
||||
name = name[:MAX_FILENAME_LENGTH].rstrip("_")
|
||||
if not name or name == ".":
|
||||
name = re.sub(_INVALID_FS_CHARS, "_", original)[:MAX_FILENAME_LENGTH] or "file"
|
||||
return name
|
||||
|
||||
|
||||
def sanitize_course_name(name: str) -> str:
|
||||
return _sanitize_name(name, SANITIZE_REGEX["course"])
|
||||
|
||||
|
||||
def sanitize_chapter_name(name: str) -> str:
|
||||
return _sanitize_name(name, SANITIZE_REGEX["chapter"])
|
||||
|
||||
|
||||
def sanitize_file_name(name: str) -> str:
|
||||
return _sanitize_name(name, SANITIZE_REGEX["file"])
|
||||
|
||||
|
||||
def annotate_chapter_name(name: str, idx: Optional[int] = None) -> str:
|
||||
return f"{idx}_{name}" if idx is not None else name
|
||||
|
||||
|
||||
def is_chapter_allowed(name: str) -> bool:
|
||||
return not any(re.fullmatch(rx, name.strip(), flags=re.IGNORECASE) for rx in BLACKLIST_REGEX["chapter"])
|
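A small usage sketch of the helpers above; the exact outputs depend on the regex tables in constants.py, so they are only indicated in the comments.

from librarian_extractor.sanitizers import (
    annotate_chapter_name,
    sanitize_chapter_name,
    sanitize_file_name,
)

# Chapter: leading enumeration and SW/KW week codes are stripped, then the
# chapter gets a clean running index as prefix.
chapter = sanitize_chapter_name("1. SW_01 ___ KW_08 Einfuehrung")
print(annotate_chapter_name(chapter, 1))

# File: commas, "inkl." fragments and the "(File)" marker are removed while
# the extension is preserved.
print(sanitize_file_name("Skript, inkl. Loesungen (File).pdf"))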
215
librarian/plugins/librarian-extractor/uv.lock
generated
Normal file
@ -0,0 +1,215 @@
|
||||
version = 1
|
||||
revision = 1
|
||||
requires-python = ">=3.10"
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
version = "0.4.6"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "exceptiongroup"
|
||||
version = "1.2.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/09/35/2495c4ac46b980e4ca1f6ad6db102322ef3ad2410b79fdde159a4b0f3b92/exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc", size = 28883 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iniconfig"
|
||||
version = "2.1.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "librarian-core"
|
||||
version = "0.1.0"
|
||||
source = { git = "https://github.com/DotNaos/librarian-core?rev=main#a564a04ad1019cb196af1ee11d654b77839a469b" }
|
||||
|
||||
[[package]]
|
||||
name = "librarian-scraper"
|
||||
version = "0.1.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "librarian-core" },
|
||||
]
|
||||
|
||||
[package.optional-dependencies]
|
||||
dev = [
|
||||
{ name = "mypy" },
|
||||
{ name = "pytest" },
|
||||
{ name = "ruff" },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "importlib-metadata", marker = "python_full_version < '3.10'" },
|
||||
{ name = "librarian-core", git = "https://github.com/DotNaos/librarian-core?rev=main" },
|
||||
{ name = "mypy", marker = "extra == 'dev'" },
|
||||
{ name = "pytest", marker = "extra == 'dev'" },
|
||||
{ name = "ruff", marker = "extra == 'dev'" },
|
||||
]
|
||||
provides-extras = ["dev"]
|
||||
|
||||
[[package]]
|
||||
name = "mypy"
|
||||
version = "1.15.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "mypy-extensions" },
|
||||
{ name = "tomli", marker = "python_full_version < '3.11'" },
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/ce/43/d5e49a86afa64bd3839ea0d5b9c7103487007d728e1293f52525d6d5486a/mypy-1.15.0.tar.gz", hash = "sha256:404534629d51d3efea5c800ee7c42b72a6554d6c400e6a79eafe15d11341fd43", size = 3239717 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/68/f8/65a7ce8d0e09b6329ad0c8d40330d100ea343bd4dd04c4f8ae26462d0a17/mypy-1.15.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:979e4e1a006511dacf628e36fadfecbcc0160a8af6ca7dad2f5025529e082c13", size = 10738433 },
|
||||
{ url = "https://files.pythonhosted.org/packages/b4/95/9c0ecb8eacfe048583706249439ff52105b3f552ea9c4024166c03224270/mypy-1.15.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c4bb0e1bd29f7d34efcccd71cf733580191e9a264a2202b0239da95984c5b559", size = 9861472 },
|
||||
{ url = "https://files.pythonhosted.org/packages/84/09/9ec95e982e282e20c0d5407bc65031dfd0f0f8ecc66b69538296e06fcbee/mypy-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:be68172e9fd9ad8fb876c6389f16d1c1b5f100ffa779f77b1fb2176fcc9ab95b", size = 11611424 },
|
||||
{ url = "https://files.pythonhosted.org/packages/78/13/f7d14e55865036a1e6a0a69580c240f43bc1f37407fe9235c0d4ef25ffb0/mypy-1.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c7be1e46525adfa0d97681432ee9fcd61a3964c2446795714699a998d193f1a3", size = 12365450 },
|
||||
{ url = "https://files.pythonhosted.org/packages/48/e1/301a73852d40c241e915ac6d7bcd7fedd47d519246db2d7b86b9d7e7a0cb/mypy-1.15.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2e2c2e6d3593f6451b18588848e66260ff62ccca522dd231cd4dd59b0160668b", size = 12551765 },
|
||||
{ url = "https://files.pythonhosted.org/packages/77/ba/c37bc323ae5fe7f3f15a28e06ab012cd0b7552886118943e90b15af31195/mypy-1.15.0-cp310-cp310-win_amd64.whl", hash = "sha256:6983aae8b2f653e098edb77f893f7b6aca69f6cffb19b2cc7443f23cce5f4828", size = 9274701 },
|
||||
{ url = "https://files.pythonhosted.org/packages/03/bc/f6339726c627bd7ca1ce0fa56c9ae2d0144604a319e0e339bdadafbbb599/mypy-1.15.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2922d42e16d6de288022e5ca321cd0618b238cfc5570e0263e5ba0a77dbef56f", size = 10662338 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e2/90/8dcf506ca1a09b0d17555cc00cd69aee402c203911410136cd716559efe7/mypy-1.15.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2ee2d57e01a7c35de00f4634ba1bbf015185b219e4dc5909e281016df43f5ee5", size = 9787540 },
|
||||
{ url = "https://files.pythonhosted.org/packages/05/05/a10f9479681e5da09ef2f9426f650d7b550d4bafbef683b69aad1ba87457/mypy-1.15.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:973500e0774b85d9689715feeffcc980193086551110fd678ebe1f4342fb7c5e", size = 11538051 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e9/9a/1f7d18b30edd57441a6411fcbc0c6869448d1a4bacbaee60656ac0fc29c8/mypy-1.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a95fb17c13e29d2d5195869262f8125dfdb5c134dc8d9a9d0aecf7525b10c2c", size = 12286751 },
|
||||
{ url = "https://files.pythonhosted.org/packages/72/af/19ff499b6f1dafcaf56f9881f7a965ac2f474f69f6f618b5175b044299f5/mypy-1.15.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1905f494bfd7d85a23a88c5d97840888a7bd516545fc5aaedff0267e0bb54e2f", size = 12421783 },
|
||||
{ url = "https://files.pythonhosted.org/packages/96/39/11b57431a1f686c1aed54bf794870efe0f6aeca11aca281a0bd87a5ad42c/mypy-1.15.0-cp311-cp311-win_amd64.whl", hash = "sha256:c9817fa23833ff189db061e6d2eff49b2f3b6ed9856b4a0a73046e41932d744f", size = 9265618 },
|
||||
{ url = "https://files.pythonhosted.org/packages/98/3a/03c74331c5eb8bd025734e04c9840532226775c47a2c39b56a0c8d4f128d/mypy-1.15.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:aea39e0583d05124836ea645f412e88a5c7d0fd77a6d694b60d9b6b2d9f184fd", size = 10793981 },
|
||||
{ url = "https://files.pythonhosted.org/packages/f0/1a/41759b18f2cfd568848a37c89030aeb03534411eef981df621d8fad08a1d/mypy-1.15.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2f2147ab812b75e5b5499b01ade1f4a81489a147c01585cda36019102538615f", size = 9749175 },
|
||||
{ url = "https://files.pythonhosted.org/packages/12/7e/873481abf1ef112c582db832740f4c11b2bfa510e829d6da29b0ab8c3f9c/mypy-1.15.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ce436f4c6d218a070048ed6a44c0bbb10cd2cc5e272b29e7845f6a2f57ee4464", size = 11455675 },
|
||||
{ url = "https://files.pythonhosted.org/packages/b3/d0/92ae4cde706923a2d3f2d6c39629134063ff64b9dedca9c1388363da072d/mypy-1.15.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8023ff13985661b50a5928fc7a5ca15f3d1affb41e5f0a9952cb68ef090b31ee", size = 12410020 },
|
||||
{ url = "https://files.pythonhosted.org/packages/46/8b/df49974b337cce35f828ba6fda228152d6db45fed4c86ba56ffe442434fd/mypy-1.15.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1124a18bc11a6a62887e3e137f37f53fbae476dc36c185d549d4f837a2a6a14e", size = 12498582 },
|
||||
{ url = "https://files.pythonhosted.org/packages/13/50/da5203fcf6c53044a0b699939f31075c45ae8a4cadf538a9069b165c1050/mypy-1.15.0-cp312-cp312-win_amd64.whl", hash = "sha256:171a9ca9a40cd1843abeca0e405bc1940cd9b305eaeea2dda769ba096932bb22", size = 9366614 },
|
||||
{ url = "https://files.pythonhosted.org/packages/6a/9b/fd2e05d6ffff24d912f150b87db9e364fa8282045c875654ce7e32fffa66/mypy-1.15.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:93faf3fdb04768d44bf28693293f3904bbb555d076b781ad2530214ee53e3445", size = 10788592 },
|
||||
{ url = "https://files.pythonhosted.org/packages/74/37/b246d711c28a03ead1fd906bbc7106659aed7c089d55fe40dd58db812628/mypy-1.15.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:811aeccadfb730024c5d3e326b2fbe9249bb7413553f15499a4050f7c30e801d", size = 9753611 },
|
||||
{ url = "https://files.pythonhosted.org/packages/a6/ac/395808a92e10cfdac8003c3de9a2ab6dc7cde6c0d2a4df3df1b815ffd067/mypy-1.15.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:98b7b9b9aedb65fe628c62a6dc57f6d5088ef2dfca37903a7d9ee374d03acca5", size = 11438443 },
|
||||
{ url = "https://files.pythonhosted.org/packages/d2/8b/801aa06445d2de3895f59e476f38f3f8d610ef5d6908245f07d002676cbf/mypy-1.15.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c43a7682e24b4f576d93072216bf56eeff70d9140241f9edec0c104d0c515036", size = 12402541 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c7/67/5a4268782eb77344cc613a4cf23540928e41f018a9a1ec4c6882baf20ab8/mypy-1.15.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:baefc32840a9f00babd83251560e0ae1573e2f9d1b067719479bfb0e987c6357", size = 12494348 },
|
||||
{ url = "https://files.pythonhosted.org/packages/83/3e/57bb447f7bbbfaabf1712d96f9df142624a386d98fb026a761532526057e/mypy-1.15.0-cp313-cp313-win_amd64.whl", hash = "sha256:b9378e2c00146c44793c98b8d5a61039a048e31f429fb0eb546d93f4b000bedf", size = 9373648 },
|
||||
{ url = "https://files.pythonhosted.org/packages/09/4e/a7d65c7322c510de2c409ff3828b03354a7c43f5a8ed458a7a131b41c7b9/mypy-1.15.0-py3-none-any.whl", hash = "sha256:5469affef548bd1895d86d3bf10ce2b44e33d86923c29e4d675b3e323437ea3e", size = 2221777 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mypy-extensions"
|
||||
version = "1.1.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "packaging"
|
||||
version = "25.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pluggy"
|
||||
version = "1.5.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pytest"
|
||||
version = "8.3.5"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
||||
{ name = "exceptiongroup", marker = "python_full_version < '3.11'" },
|
||||
{ name = "iniconfig" },
|
||||
{ name = "packaging" },
|
||||
{ name = "pluggy" },
|
||||
{ name = "tomli", marker = "python_full_version < '3.11'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ruff"
|
||||
version = "0.11.7"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/5b/89/6f9c9674818ac2e9cc2f2b35b704b7768656e6b7c139064fc7ba8fbc99f1/ruff-0.11.7.tar.gz", hash = "sha256:655089ad3224070736dc32844fde783454f8558e71f501cb207485fe4eee23d4", size = 4054861 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/b4/ec/21927cb906c5614b786d1621dba405e3d44f6e473872e6df5d1a6bca0455/ruff-0.11.7-py3-none-linux_armv6l.whl", hash = "sha256:d29e909d9a8d02f928d72ab7837b5cbc450a5bdf578ab9ebee3263d0a525091c", size = 10245403 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e2/af/fec85b6c2c725bcb062a354dd7cbc1eed53c33ff3aa665165871c9c16ddf/ruff-0.11.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:dd1fb86b168ae349fb01dd497d83537b2c5541fe0626e70c786427dd8363aaee", size = 11007166 },
|
||||
{ url = "https://files.pythonhosted.org/packages/31/9a/2d0d260a58e81f388800343a45898fd8df73c608b8261c370058b675319a/ruff-0.11.7-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d3d7d2e140a6fbbc09033bce65bd7ea29d6a0adeb90b8430262fbacd58c38ada", size = 10378076 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c2/c4/9b09b45051404d2e7dd6d9dbcbabaa5ab0093f9febcae664876a77b9ad53/ruff-0.11.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4809df77de390a1c2077d9b7945d82f44b95d19ceccf0c287c56e4dc9b91ca64", size = 10557138 },
|
||||
{ url = "https://files.pythonhosted.org/packages/5e/5e/f62a1b6669870a591ed7db771c332fabb30f83c967f376b05e7c91bccd14/ruff-0.11.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f3a0c2e169e6b545f8e2dba185eabbd9db4f08880032e75aa0e285a6d3f48201", size = 10095726 },
|
||||
{ url = "https://files.pythonhosted.org/packages/45/59/a7aa8e716f4cbe07c3500a391e58c52caf665bb242bf8be42c62adef649c/ruff-0.11.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:49b888200a320dd96a68e86736cf531d6afba03e4f6cf098401406a257fcf3d6", size = 11672265 },
|
||||
{ url = "https://files.pythonhosted.org/packages/dd/e3/101a8b707481f37aca5f0fcc3e42932fa38b51add87bfbd8e41ab14adb24/ruff-0.11.7-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:2b19cdb9cf7dae00d5ee2e7c013540cdc3b31c4f281f1dacb5a799d610e90db4", size = 12331418 },
|
||||
{ url = "https://files.pythonhosted.org/packages/dd/71/037f76cbe712f5cbc7b852e4916cd3cf32301a30351818d32ab71580d1c0/ruff-0.11.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:64e0ee994c9e326b43539d133a36a455dbaab477bc84fe7bfbd528abe2f05c1e", size = 11794506 },
|
||||
{ url = "https://files.pythonhosted.org/packages/ca/de/e450b6bab1fc60ef263ef8fcda077fb4977601184877dce1c59109356084/ruff-0.11.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bad82052311479a5865f52c76ecee5d468a58ba44fb23ee15079f17dd4c8fd63", size = 13939084 },
|
||||
{ url = "https://files.pythonhosted.org/packages/0e/2c/1e364cc92970075d7d04c69c928430b23e43a433f044474f57e425cbed37/ruff-0.11.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7940665e74e7b65d427b82bffc1e46710ec7f30d58b4b2d5016e3f0321436502", size = 11450441 },
|
||||
{ url = "https://files.pythonhosted.org/packages/9d/7d/1b048eb460517ff9accd78bca0fa6ae61df2b276010538e586f834f5e402/ruff-0.11.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:169027e31c52c0e36c44ae9a9c7db35e505fee0b39f8d9fca7274a6305295a92", size = 10441060 },
|
||||
{ url = "https://files.pythonhosted.org/packages/3a/57/8dc6ccfd8380e5ca3d13ff7591e8ba46a3b330323515a4996b991b10bd5d/ruff-0.11.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:305b93f9798aee582e91e34437810439acb28b5fc1fee6b8205c78c806845a94", size = 10058689 },
|
||||
{ url = "https://files.pythonhosted.org/packages/23/bf/20487561ed72654147817885559ba2aa705272d8b5dee7654d3ef2dbf912/ruff-0.11.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:a681db041ef55550c371f9cd52a3cf17a0da4c75d6bd691092dfc38170ebc4b6", size = 11073703 },
|
||||
{ url = "https://files.pythonhosted.org/packages/9d/27/04f2db95f4ef73dccedd0c21daf9991cc3b7f29901a4362057b132075aa4/ruff-0.11.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:07f1496ad00a4a139f4de220b0c97da6d4c85e0e4aa9b2624167b7d4d44fd6b6", size = 11532822 },
|
||||
{ url = "https://files.pythonhosted.org/packages/e1/72/43b123e4db52144c8add336581de52185097545981ff6e9e58a21861c250/ruff-0.11.7-py3-none-win32.whl", hash = "sha256:f25dfb853ad217e6e5f1924ae8a5b3f6709051a13e9dad18690de6c8ff299e26", size = 10362436 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c5/a0/3e58cd76fdee53d5c8ce7a56d84540833f924ccdf2c7d657cb009e604d82/ruff-0.11.7-py3-none-win_amd64.whl", hash = "sha256:0a931d85959ceb77e92aea4bbedfded0a31534ce191252721128f77e5ae1f98a", size = 11566676 },
|
||||
{ url = "https://files.pythonhosted.org/packages/68/ca/69d7c7752bce162d1516e5592b1cc6b6668e9328c0d270609ddbeeadd7cf/ruff-0.11.7-py3-none-win_arm64.whl", hash = "sha256:778c1e5d6f9e91034142dfd06110534ca13220bfaad5c3735f6cb844654f6177", size = 10677936 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tomli"
|
||||
version = "2.2.1"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/18/87/302344fed471e44a87289cf4967697d07e532f2421fdaf868a303cbae4ff/tomli-2.2.1.tar.gz", hash = "sha256:cd45e1dc79c835ce60f7404ec8119f2eb06d38b1deba146f07ced3bbc44505ff", size = 17175 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/43/ca/75707e6efa2b37c77dadb324ae7d9571cb424e61ea73fad7c56c2d14527f/tomli-2.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:678e4fa69e4575eb77d103de3df8a895e1591b48e740211bd1067378c69e8249", size = 131077 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c7/16/51ae563a8615d472fdbffc43a3f3d46588c264ac4f024f63f01283becfbb/tomli-2.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:023aa114dd824ade0100497eb2318602af309e5a55595f76b626d6d9f3b7b0a6", size = 123429 },
|
||||
{ url = "https://files.pythonhosted.org/packages/f1/dd/4f6cd1e7b160041db83c694abc78e100473c15d54620083dbd5aae7b990e/tomli-2.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece47d672db52ac607a3d9599a9d48dcb2f2f735c6c2d1f34130085bb12b112a", size = 226067 },
|
||||
{ url = "https://files.pythonhosted.org/packages/a9/6b/c54ede5dc70d648cc6361eaf429304b02f2871a345bbdd51e993d6cdf550/tomli-2.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6972ca9c9cc9f0acaa56a8ca1ff51e7af152a9f87fb64623e31d5c83700080ee", size = 236030 },
|
||||
{ url = "https://files.pythonhosted.org/packages/1f/47/999514fa49cfaf7a92c805a86c3c43f4215621855d151b61c602abb38091/tomli-2.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c954d2250168d28797dd4e3ac5cf812a406cd5a92674ee4c8f123c889786aa8e", size = 240898 },
|
||||
{ url = "https://files.pythonhosted.org/packages/73/41/0a01279a7ae09ee1573b423318e7934674ce06eb33f50936655071d81a24/tomli-2.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8dd28b3e155b80f4d54beb40a441d366adcfe740969820caf156c019fb5c7ec4", size = 229894 },
|
||||
{ url = "https://files.pythonhosted.org/packages/55/18/5d8bc5b0a0362311ce4d18830a5d28943667599a60d20118074ea1b01bb7/tomli-2.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e59e304978767a54663af13c07b3d1af22ddee3bb2fb0618ca1593e4f593a106", size = 245319 },
|
||||
{ url = "https://files.pythonhosted.org/packages/92/a3/7ade0576d17f3cdf5ff44d61390d4b3febb8a9fc2b480c75c47ea048c646/tomli-2.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:33580bccab0338d00994d7f16f4c4ec25b776af3ffaac1ed74e0b3fc95e885a8", size = 238273 },
|
||||
{ url = "https://files.pythonhosted.org/packages/72/6f/fa64ef058ac1446a1e51110c375339b3ec6be245af9d14c87c4a6412dd32/tomli-2.2.1-cp311-cp311-win32.whl", hash = "sha256:465af0e0875402f1d226519c9904f37254b3045fc5084697cefb9bdde1ff99ff", size = 98310 },
|
||||
{ url = "https://files.pythonhosted.org/packages/6a/1c/4a2dcde4a51b81be3530565e92eda625d94dafb46dbeb15069df4caffc34/tomli-2.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2d0f2fdd22b02c6d81637a3c95f8cd77f995846af7414c5c4b8d0545afa1bc4b", size = 108309 },
|
||||
{ url = "https://files.pythonhosted.org/packages/52/e1/f8af4c2fcde17500422858155aeb0d7e93477a0d59a98e56cbfe75070fd0/tomli-2.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4a8f6e44de52d5e6c657c9fe83b562f5f4256d8ebbfe4ff922c495620a7f6cea", size = 132762 },
|
||||
{ url = "https://files.pythonhosted.org/packages/03/b8/152c68bb84fc00396b83e7bbddd5ec0bd3dd409db4195e2a9b3e398ad2e3/tomli-2.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8d57ca8095a641b8237d5b079147646153d22552f1c637fd3ba7f4b0b29167a8", size = 123453 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c8/d6/fc9267af9166f79ac528ff7e8c55c8181ded34eb4b0e93daa767b8841573/tomli-2.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e340144ad7ae1533cb897d406382b4b6fede8890a03738ff1683af800d54192", size = 233486 },
|
||||
{ url = "https://files.pythonhosted.org/packages/5c/51/51c3f2884d7bab89af25f678447ea7d297b53b5a3b5730a7cb2ef6069f07/tomli-2.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db2b95f9de79181805df90bedc5a5ab4c165e6ec3fe99f970d0e302f384ad222", size = 242349 },
|
||||
{ url = "https://files.pythonhosted.org/packages/ab/df/bfa89627d13a5cc22402e441e8a931ef2108403db390ff3345c05253935e/tomli-2.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40741994320b232529c802f8bc86da4e1aa9f413db394617b9a256ae0f9a7f77", size = 252159 },
|
||||
{ url = "https://files.pythonhosted.org/packages/9e/6e/fa2b916dced65763a5168c6ccb91066f7639bdc88b48adda990db10c8c0b/tomli-2.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:400e720fe168c0f8521520190686ef8ef033fb19fc493da09779e592861b78c6", size = 237243 },
|
||||
{ url = "https://files.pythonhosted.org/packages/b4/04/885d3b1f650e1153cbb93a6a9782c58a972b94ea4483ae4ac5cedd5e4a09/tomli-2.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:02abe224de6ae62c19f090f68da4e27b10af2b93213d36cf44e6e1c5abd19fdd", size = 259645 },
|
||||
{ url = "https://files.pythonhosted.org/packages/9c/de/6b432d66e986e501586da298e28ebeefd3edc2c780f3ad73d22566034239/tomli-2.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:b82ebccc8c8a36f2094e969560a1b836758481f3dc360ce9a3277c65f374285e", size = 244584 },
|
||||
{ url = "https://files.pythonhosted.org/packages/1c/9a/47c0449b98e6e7d1be6cbac02f93dd79003234ddc4aaab6ba07a9a7482e2/tomli-2.2.1-cp312-cp312-win32.whl", hash = "sha256:889f80ef92701b9dbb224e49ec87c645ce5df3fa2cc548664eb8a25e03127a98", size = 98875 },
|
||||
{ url = "https://files.pythonhosted.org/packages/ef/60/9b9638f081c6f1261e2688bd487625cd1e660d0a85bd469e91d8db969734/tomli-2.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:7fc04e92e1d624a4a63c76474610238576942d6b8950a2d7f908a340494e67e4", size = 109418 },
|
||||
{ url = "https://files.pythonhosted.org/packages/04/90/2ee5f2e0362cb8a0b6499dc44f4d7d48f8fff06d28ba46e6f1eaa61a1388/tomli-2.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f4039b9cbc3048b2416cc57ab3bda989a6fcf9b36cf8937f01a6e731b64f80d7", size = 132708 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c0/ec/46b4108816de6b385141f082ba99e315501ccd0a2ea23db4a100dd3990ea/tomli-2.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:286f0ca2ffeeb5b9bd4fcc8d6c330534323ec51b2f52da063b11c502da16f30c", size = 123582 },
|
||||
{ url = "https://files.pythonhosted.org/packages/a0/bd/b470466d0137b37b68d24556c38a0cc819e8febe392d5b199dcd7f578365/tomli-2.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92ef1a44547e894e2a17d24e7557a5e85a9e1d0048b0b5e7541f76c5032cb13", size = 232543 },
|
||||
{ url = "https://files.pythonhosted.org/packages/d9/e5/82e80ff3b751373f7cead2815bcbe2d51c895b3c990686741a8e56ec42ab/tomli-2.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9316dc65bed1684c9a98ee68759ceaed29d229e985297003e494aa825ebb0281", size = 241691 },
|
||||
{ url = "https://files.pythonhosted.org/packages/05/7e/2a110bc2713557d6a1bfb06af23dd01e7dde52b6ee7dadc589868f9abfac/tomli-2.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e85e99945e688e32d5a35c1ff38ed0b3f41f43fad8df0bdf79f72b2ba7bc5272", size = 251170 },
|
||||
{ url = "https://files.pythonhosted.org/packages/64/7b/22d713946efe00e0adbcdfd6d1aa119ae03fd0b60ebed51ebb3fa9f5a2e5/tomli-2.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ac065718db92ca818f8d6141b5f66369833d4a80a9d74435a268c52bdfa73140", size = 236530 },
|
||||
{ url = "https://files.pythonhosted.org/packages/38/31/3a76f67da4b0cf37b742ca76beaf819dca0ebef26d78fc794a576e08accf/tomli-2.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:d920f33822747519673ee656a4b6ac33e382eca9d331c87770faa3eef562aeb2", size = 258666 },
|
||||
{ url = "https://files.pythonhosted.org/packages/07/10/5af1293da642aded87e8a988753945d0cf7e00a9452d3911dd3bb354c9e2/tomli-2.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a198f10c4d1b1375d7687bc25294306e551bf1abfa4eace6650070a5c1ae2744", size = 243954 },
|
||||
{ url = "https://files.pythonhosted.org/packages/5b/b9/1ed31d167be802da0fc95020d04cd27b7d7065cc6fbefdd2f9186f60d7bd/tomli-2.2.1-cp313-cp313-win32.whl", hash = "sha256:d3f5614314d758649ab2ab3a62d4f2004c825922f9e370b29416484086b264ec", size = 98724 },
|
||||
{ url = "https://files.pythonhosted.org/packages/c7/32/b0963458706accd9afcfeb867c0f9175a741bf7b19cd424230714d722198/tomli-2.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:a38aa0308e754b0e3c67e344754dff64999ff9b513e691d0e786265c93583c69", size = 109383 },
|
||||
{ url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typing-extensions"
|
||||
version = "4.13.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/f6/37/23083fcd6e35492953e8d2aaaa68b860eb422b34627b13f2ce3eb6106061/typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef", size = 106967 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/8b/54/b1ae86c0973cc6f0210b53d508ca3641fb6d0c56823f288d108bc7ab3cc8/typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c", size = 45806 },
|
||||
]
|
1
librarian/plugins/librarian-scraper/README.md
Normal file
@ -0,0 +1 @@
|
||||
# Librarian Scraper
|
41
librarian/plugins/librarian-scraper/pyproject.toml
Normal file
@ -0,0 +1,41 @@
|
||||
[project]
|
||||
name = "librarian-scraper"
|
||||
version = "0.2.1"
|
||||
description = "FastAPI gateway and runtime pipeline for Librarian"
|
||||
readme = "README.md"
|
||||
authors = [{ name = "DotNaos", email = "schuetzoliver00@gmail.com" }]
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"importlib_metadata; python_version<'3.10'",
|
||||
"playwright>=1.51.0",
|
||||
"dotenv>=0.9.9",
|
||||
"parsel>=1.10.0",
|
||||
"librarian-core",
|
||||
"httpx>=0.28.1",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling>=1.21"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["src/librarian_scraper"]
|
||||
|
||||
[tool.hatch.metadata]
|
||||
allow-direct-references = true
|
||||
|
||||
[tool.uv.sources]
|
||||
#librarian-core = { git = "https://github.com/DotNaos/librarian-core", rev = "dev" }
|
||||
|
||||
[project.entry-points."librarian.workers"]
|
||||
crawler = "librarian_scraper.crawler:Crawler"
|
||||
downloader = "librarian_scraper.downloader:Downloader"
|
||||
|
||||
|
||||
# ───────── optional: dev / test extras ─────────
|
||||
[project.optional-dependencies]
|
||||
dev = ["ruff", "pytest", "mypy"]
|
||||
|
||||
[project.scripts]
|
||||
example = "examples.app:app"
|
@ -0,0 +1,12 @@
|
||||
from .crawler import (
|
||||
Crawler,
|
||||
)
|
||||
from .downloader import (
|
||||
Downloader,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"Crawler",
|
||||
"Downloader",
|
||||
"Extractor",
|
||||
]
|
@ -0,0 +1,29 @@
|
||||
"""
|
||||
URLs used by the scraper.
|
||||
URLs under PUBLIC_URLS can be accessed without authentication.
URLs under PRIVATE_URLS require an authenticated session.
|
||||
"""
|
||||
|
||||
BASE_URL = "https://moodle.fhgr.ch"
|
||||
|
||||
CRAWLER = {
|
||||
"DELAY_SLOW": 2.0,
|
||||
"DELAY_FAST": 0.5,
|
||||
"BATCH_SLOW": 2,
|
||||
"BATCH_FAST": 8,
|
||||
}
|
||||
|
||||
class PUBLIC_URLS:
|
||||
base_url = BASE_URL
|
||||
login = f"{BASE_URL}/login/index.php"
|
||||
index = f"{BASE_URL}/course/index.php"
|
||||
degree_program = lambda degree_program_id: f"{BASE_URL}/course/index.php?categoryid={degree_program_id}"
|
||||
category = lambda category_id: f"{BASE_URL}/course/index.php?categoryid={category_id}"
|
||||
term = lambda term_id: f"{BASE_URL}/course/index.php?categoryid={term_id}"
|
||||
|
||||
class PRIVATE_URLS:
|
||||
user_courses = f"{BASE_URL}/my/courses.php"
|
||||
dashboard = f"{BASE_URL}/my/"
|
||||
course = lambda course_id: f"{BASE_URL}/course/view.php?id={course_id}"
|
||||
files = lambda context_id: f"{BASE_URL}/course/downloadcontent.php?contextid={context_id}"
|
||||
file = lambda file_id: f"{BASE_URL}/mod/resource/view.php?id={file_id}"
|
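A quick sketch of how these constants are consumed elsewhere in the scraper; the ids are made up.

from librarian_scraper.constants import CRAWLER, PRIVATE_URLS, PUBLIC_URLS

# Category and term listings are public pages.
print(PUBLIC_URLS.term(42))       # .../course/index.php?categoryid=42

# Course pages and content downloads need an authenticated session.
print(PRIVATE_URLS.course(1234))  # .../course/view.php?id=1234

# Throttling presets the Crawler picks depending on SCRAPER_RELAXED.
print(CRAWLER["DELAY_SLOW"], CRAWLER["BATCH_SLOW"])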
@ -0,0 +1,7 @@
|
||||
from librarian_scraper.crawler.cookie_crawler import CookieCrawler
|
||||
from librarian_scraper.crawler.crawler import Crawler
|
||||
|
||||
__all__ = [
|
||||
"CookieCrawler",
|
||||
"Crawler",
|
||||
]
|
@ -0,0 +1,138 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from typing import List, Optional
|
||||
|
||||
from httpx import Cookies
|
||||
from playwright.async_api import Browser, Cookie, Page, async_playwright
|
||||
|
||||
from librarian_scraper.constants import PRIVATE_URLS, PUBLIC_URLS
|
||||
|
||||
|
||||
class CookieCrawler:
|
||||
"""
|
||||
Retrieve Moodle session cookies + sesskey via Playwright.
|
||||
|
||||
Usage
|
||||
-----
|
||||
>>> crawler = CookieCrawler()
|
||||
>>> cookies, sesskey = await crawler.crawl() # inside async code
|
||||
# or
|
||||
>>> cookies, sesskey = CookieCrawler.crawl_sync() # plain scripts
|
||||
"""
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# construction #
|
||||
# ------------------------------------------------------------------ #
|
||||
def __init__(self, *, headless: bool = True) -> None:
|
||||
self.headless = headless
|
||||
self.cookies: Optional[List[Cookie]] = None
|
||||
self.sesskey: str = ""
|
||||
|
||||
self.username: str = os.getenv("MOODLE_USERNAME", "")
|
||||
self.password: str = os.getenv("MOODLE_PASSWORD", "")
|
||||
if not self.username or not self.password:
|
||||
raise ValueError(
|
||||
"Set MOODLE_USERNAME and MOODLE_PASSWORD as environment variables."
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# public API #
|
||||
# ------------------------------------------------------------------ #
|
||||
async def crawl(self) -> tuple[Cookies, str]:
|
||||
"""
|
||||
Async entry-point – await this inside FastAPI / Prefect etc.
|
||||
"""
|
||||
async with async_playwright() as p:
|
||||
browser: Browser = await p.chromium.launch(headless=self.headless)
|
||||
page = await browser.new_page()
|
||||
await page.goto(PUBLIC_URLS.login)
|
||||
logging.info("Login page loaded: %s", page.url)
|
||||
|
||||
await self._login(page)
|
||||
await browser.close()
|
||||
|
||||
if not self.cookies:
|
||||
raise RuntimeError("Login failed – no cookies retrieved.")
|
||||
|
||||
return self._to_cookiejar(self.cookies), self.sesskey
|
||||
|
||||
@classmethod
|
||||
def crawl_sync(cls, **kwargs) -> tuple[Cookies, str]:
|
||||
"""
|
||||
Synchronous helper for CLI / notebooks.
|
||||
|
||||
Detects whether an event loop is already running. If so, it
|
||||
schedules the coroutine and waits; otherwise it starts a fresh loop.
|
||||
"""
|
||||
self = cls(**kwargs)
|
||||
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
except RuntimeError: # no loop running → safe to create one
|
||||
return asyncio.run(self.crawl())
|
||||
|
||||
# An event loop is already running – run_until_complete() would raise
# here, so run the coroutine on a fresh loop in a worker thread instead.
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=1) as pool:
    return pool.submit(asyncio.run, self.crawl()).result()
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# internal helpers #
|
||||
# ------------------------------------------------------------------ #
|
||||
async def _login(self, page: Page) -> None:
|
||||
"""Fill the SSO form and extract cookies + sesskey."""
|
||||
|
||||
# Select organisation / IdP
|
||||
await page.click("#wayf_submit_button")
|
||||
|
||||
# Wait for the credential form
|
||||
await page.wait_for_selector("form[method='post']", state="visible")
|
||||
|
||||
# Credentials
|
||||
await page.fill("input[id='username']", self.username)
|
||||
await page.fill("input[id='password']", self.password)
|
||||
await page.click("button[class='aai_login_button']")
|
||||
|
||||
# Wait for the redirect to the /my/ (dashboard) page, which indicates the login is complete
|
||||
await page.wait_for_url(PRIVATE_URLS.dashboard)
|
||||
await page.wait_for_selector("body", state="attached")
|
||||
|
||||
# Navigate to personal course overview
|
||||
await page.goto(PRIVATE_URLS.user_courses)
|
||||
await page.wait_for_selector("body", state="attached")
|
||||
|
||||
# Collect session cookies
|
||||
self.cookies = await page.context.cookies()
|
||||
|
||||
# Extract sesskey from injected Moodle config
|
||||
try:
|
||||
self.sesskey = await page.evaluate(
|
||||
"() => window.M && M.cfg && M.cfg.sesskey"
|
||||
)
|
||||
except Exception as exc:
|
||||
raise RuntimeError("sesskey not found via JS evaluation") from exc
|
||||
|
||||
if not self.sesskey:
|
||||
raise RuntimeError("sesskey is empty after evaluation.")
|
||||
|
||||
logging.debug("sesskey: %s", self.sesskey)
|
||||
logging.debug("cookies: %s", self.cookies)
|
||||
|
||||
# Dev convenience
|
||||
if not self.headless:
|
||||
await page.wait_for_timeout(5000)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# cookie conversion #
|
||||
# ------------------------------------------------------------------ #
|
||||
def _to_cookiejar(self, raw: List[Cookie]) -> Cookies:
|
||||
jar = Cookies()
|
||||
for c in raw:
|
||||
jar.set(
|
||||
name=c.get("name", ""),
|
||||
value=c.get("value", ""),
|
||||
domain=c.get("domain", "").lstrip("."),
|
||||
path=c.get("path", "/"),
|
||||
)
|
||||
return jar
|
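# Usage sketch (illustrative, not part of the class above): fetch an
# authenticated cookie jar plus sesskey once, then reuse them with httpx.
# Assumes MOODLE_USERNAME / MOODLE_PASSWORD are exported and that the
# constructor accepts a `headless` flag (as suggested by `self.headless`).
#
#     cookies, sesskey = CookieCrawler.crawl_sync(headless=True)
#     client = httpx.Client(cookies=cookies, follow_redirects=True)
#     html = client.get(PRIVATE_URLS.user_courses).text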
@ -0,0 +1,264 @@
|
||||
"""
|
||||
librarian_scraper.crawler.crawler
|
||||
---------------------------------
|
||||
Scrapes Moodle degree programmes into CrawlData.
|
||||
• Hero images
|
||||
• Polite throttling / batching
|
||||
• Term-filter: only the latest two terms (dev)
|
||||
• USER_SPECIFIC flag to keep / drop inaccessible courses
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from datetime import timedelta
|
||||
from typing import List, Tuple
|
||||
import sys
|
||||
import asyncio
|
||||
|
||||
if sys.platform == "win32":
|
||||
# Switch from Selector to Proactor so asyncio.subprocess works
|
||||
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
|
||||
|
||||
import httpx
|
||||
import parsel
|
||||
from librarian_core.utils.path_utils import get_cache_root
|
||||
from librarian_core.workers.base import Worker
|
||||
from prefect import get_run_logger, task
|
||||
from prefect.futures import wait
|
||||
|
||||
from librarian_scraper.constants import CRAWLER, PRIVATE_URLS, PUBLIC_URLS
|
||||
from librarian_scraper.crawler.cookie_crawler import CookieCrawler
|
||||
from librarian_scraper.models.crawl_data import (
|
||||
CrawlCourse,
|
||||
CrawlData,
|
||||
CrawlProgram,
|
||||
CrawlTerm,
|
||||
)
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# module-level shared items for static task #
|
||||
# --------------------------------------------------------------------------- #
|
||||
_COOKIE_JAR: httpx.Cookies | None = None
|
||||
_DELAY: float = 0.0
|
||||
|
||||
CACHE_FILE = get_cache_root() / "librarian_no_access_cache.json"
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# utility #
|
||||
# --------------------------------------------------------------------------- #
|
||||
def looks_like_enrol(resp: httpx.Response) -> bool:
|
||||
txt = resp.text.lower()
|
||||
return (
|
||||
"login" in str(resp.url).lower()
|
||||
or "#page-enrol" in txt
|
||||
or "you need to enrol" in txt
|
||||
)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# main worker #
|
||||
# --------------------------------------------------------------------------- #
|
||||
class Crawler(Worker[CrawlProgram, CrawlData]):
|
||||
input_model = CrawlProgram
|
||||
output_model = CrawlData
|
||||
|
||||
# toggles (env overrides)
|
||||
RELAXED: bool
|
||||
USER_SPECIFIC: bool
|
||||
CLEAR_CACHE: bool
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# flow entry-point #
|
||||
# ------------------------------------------------------------------ #
|
||||
async def __run__(self, program: CrawlProgram) -> CrawlData:
|
||||
global _COOKIE_JAR, _DELAY
|
||||
lg = get_run_logger()
|
||||
|
||||
self.RELAXED = os.getenv("SCRAPER_RELAXED", "true").lower() == "true"
|
||||
self.USER_SPECIFIC = os.getenv("SCRAPER_USER_SPECIFIC", "true").lower() == "true"
|
||||
self.CLEAR_CACHE = os.getenv("SCRAPER_CLEAR_CACHE", "false").lower() == "true"
|
||||
|
||||
_DELAY = CRAWLER["DELAY_SLOW"] if self.RELAXED else CRAWLER["DELAY_FAST"]
|
||||
batch = CRAWLER["BATCH_SLOW"] if self.RELAXED else CRAWLER["BATCH_FAST"]
|
||||
lg.info(
|
||||
"Mode=%s user_specific=%s delay=%.1fs batch=%s",
|
||||
"RELAXED" if self.RELAXED else "FAST",
|
||||
self.USER_SPECIFIC,
|
||||
_DELAY,
|
||||
batch,
|
||||
)
|
||||
|
||||
# --------------------------- login
|
||||
cookies, _ = await CookieCrawler().crawl()
|
||||
_COOKIE_JAR = cookies
|
||||
self._client = httpx.Client(cookies=cookies, follow_redirects=True)
|
||||
|
||||
if not self._logged_in():
|
||||
lg.error("Guest session detected – aborting crawl.")
|
||||
raise RuntimeError("Login failed")
|
||||
|
||||
# --------------------------- cache
|
||||
no_access: set[str] = set() if self.CLEAR_CACHE else self._load_cache()
|
||||
|
||||
# --------------------------- scrape terms (first two for dev)
|
||||
terms = self._crawl_terms(program.id)[:2]
|
||||
lg.info("Terms discovered: %d", len(terms))
|
||||
|
||||
# --------------------------- scrape courses
|
||||
for term in terms:
|
||||
courses = self._crawl_courses(term.id)
|
||||
lg.info("[%s] raw courses: %d", term.name, len(courses))
|
||||
|
||||
for i in range(0, len(courses), batch):
|
||||
futs = [
|
||||
self._crawl_course_task.submit(course.id)
|
||||
for course in courses[i : i + batch]
|
||||
]
|
||||
done, _ = wait(futs)
|
||||
|
||||
for fut in done:
|
||||
cid, res_id = fut.result()
|
||||
if res_id:
|
||||
next(
|
||||
c for c in courses if c.id == cid
|
||||
).content_ressource_id = res_id
|
||||
else:
|
||||
no_access.add(cid)
|
||||
|
||||
term.courses = (
|
||||
[c for c in courses if c.content_ressource_id]
|
||||
if self.USER_SPECIFIC
|
||||
else courses
|
||||
)
|
||||
lg.info("[%s] kept: %d", term.name, len(term.courses))
|
||||
|
||||
# --------------------------- persist cache
|
||||
self._save_cache(no_access)
|
||||
|
||||
return CrawlData(
|
||||
degree_program=CrawlProgram(
|
||||
id=program.id,
|
||||
name=program.name,
|
||||
terms=[t for t in terms if t.courses],
|
||||
)
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# static task inside class #
|
||||
# ------------------------------------------------------------------ #
|
||||
@staticmethod
|
||||
@task(
|
||||
name="crawl_course",
|
||||
retries=2,
|
||||
retry_delay_seconds=5,
|
||||
log_prints=True,
|
||||
cache_expiration=timedelta(days=1),
|
||||
)
|
||||
def _crawl_course_task(course_id: str) -> Tuple[str, str]:
|
||||
"""
|
||||
Returns (course_id, content_resource_id or "").
|
||||
Never raises; logs reasons instead.
|
||||
"""
|
||||
lg = get_run_logger()
|
||||
assert _COOKIE_JAR is not None
|
||||
|
||||
url = PRIVATE_URLS.course(course_id)
|
||||
for attempt in (1, 2):
|
||||
try:
|
||||
r = httpx.get(
|
||||
url, cookies=_COOKIE_JAR, follow_redirects=True, timeout=30
|
||||
)
|
||||
r.raise_for_status()
|
||||
time.sleep(_DELAY)
|
||||
break
|
||||
except Exception as exc:
|
||||
lg.warning("GET %s failed (%s) attempt %d/2", url, exc, attempt)
|
||||
time.sleep(_DELAY)
|
||||
else:
|
||||
lg.warning("Course %s unreachable.", course_id)
|
||||
return course_id, ""
|
||||
|
||||
if looks_like_enrol(r):
|
||||
lg.info("No access to course %s (enrol / login page).", course_id)
|
||||
return course_id, ""
|
||||
|
||||
href = (
|
||||
parsel.Selector(r.text)
|
||||
.css('a[data-downloadcourse="1"]::attr(href)')
|
||||
.get("")
|
||||
)
|
||||
if not href:
|
||||
lg.info("Course %s has no downloadable content.", course_id)
|
||||
return course_id, ""
|
||||
|
||||
return course_id, href.split("=")[-1]
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# helpers #
|
||||
# ------------------------------------------------------------------ #
|
||||
def _logged_in(self) -> bool:
|
||||
html = self._get_html(PUBLIC_URLS.index)
|
||||
return not parsel.Selector(text=html).css("div.usermenu span.login a")
|
||||
|
||||
def _crawl_terms(self, dp_id: str) -> List[CrawlTerm]:
|
||||
html = self._get_html(PUBLIC_URLS.degree_program(dp_id))
|
||||
sel = parsel.Selector(text=html)
|
||||
out = []
|
||||
for a in sel.css("div.category h3.categoryname a"):
|
||||
name = a.xpath("text()").get("").strip()
|
||||
if re.match(r"^(FS|HS)\d{2}$", name):
|
||||
out.append(
|
||||
CrawlTerm(name=name, id=a.xpath("@href").get("").split("=")[-1])
|
||||
)
|
||||
order = {"FS": 0, "HS": 1}
|
||||
return sorted(
|
||||
out, key=lambda t: (2000 + int(t.name[2:]), order[t.name[:2]]), reverse=True
|
||||
)
|
||||
|
||||
def _crawl_courses(self, term_id: str) -> List[CrawlCourse]:
|
||||
html = self._get_html(PUBLIC_URLS.term(term_id))
|
||||
sel = parsel.Selector(text=html)
|
||||
courses = []
|
||||
for box in sel.css("div.coursebox"):
|
||||
anchor = box.css("h3.coursename a")
|
||||
if not anchor:
|
||||
continue
|
||||
cid = anchor.attrib.get("href", "").split("=")[-1]
|
||||
raw = anchor.xpath("text()").get("").strip()
|
||||
name = re.sub(r"\s*(FS|HS)\d{2}\s*", "", raw)
|
||||
name = re.sub(r"\s*\(.*?\)\s*", "", name).strip()
|
||||
hero = box.css("div.courseimage img::attr(src)").get("") or ""
|
||||
courses.append(CrawlCourse(id=cid, name=name, hero_image=hero))
|
||||
return courses
|
||||
|
||||
def _get_html(self, url: str) -> str:
|
||||
try:
|
||||
r = self._client.get(url, timeout=30)
|
||||
r.raise_for_status()
|
||||
time.sleep(_DELAY)
|
||||
return r.text
|
||||
except Exception as exc:
|
||||
get_run_logger().warning("GET %s failed (%s)", url, exc)
|
||||
return ""
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# cache helpers #
|
||||
# ------------------------------------------------------------------ #
|
||||
@staticmethod
|
||||
def _load_cache() -> set[str]:
|
||||
try:
|
||||
return set(json.loads(CACHE_FILE.read_text()))
|
||||
except Exception:
|
||||
return set()
|
||||
|
||||
@staticmethod
|
||||
def _save_cache(cache: set[str]) -> None:
|
||||
try:
|
||||
CACHE_FILE.write_text(json.dumps(sorted(cache), indent=2))
|
||||
except Exception as exc:
|
||||
get_run_logger().warning("Could not save cache: %s", exc)
|
@ -0,0 +1,357 @@
|
||||
import concurrent.futures
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
import parsel
|
||||
from librarian_core.model import Course, DegreeProgram, FileEntry, MoodleIndex, Semester
|
||||
|
||||
from . import URLs
|
||||
|
||||
CACHE_FILENAME = "librarian_no_access_cache.json"
|
||||
NO_ACCESS_CACHE_FILE = Path(tempfile.gettempdir()) / CACHE_FILENAME
|
||||
|
||||
|
||||
class IndexCrawler:
|
||||
def __init__(self, degree_program: DegreeProgram, cookies: httpx.Cookies, debug: bool = False, *, max_workers: int = 8) -> None:
|
||||
self.degree_program = degree_program
|
||||
self.debug = debug
|
||||
self.client = httpx.Client(cookies=cookies, follow_redirects=True)
|
||||
self.max_workers = max_workers
|
||||
|
||||
# When True the cached “no-access” set is ignored for this run
|
||||
self._ignore_cache: bool = False
|
||||
|
||||
# Load persisted cache of course-IDs the user cannot access
|
||||
if NO_ACCESS_CACHE_FILE.exists():
|
||||
try:
|
||||
self._no_access_cache: set[str] = set(json.loads(NO_ACCESS_CACHE_FILE.read_text()))
|
||||
except Exception:
|
||||
logging.warning("Failed to read no-access cache, starting fresh.")
|
||||
self._no_access_cache = set()
|
||||
else:
|
||||
self._no_access_cache = set()
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.client.close()
|
||||
|
||||
def __del__(self):
|
||||
# Fallback in case the context manager isn’t used
|
||||
if not self.client.is_closed:
|
||||
self.client.close()
|
||||
|
||||
"""
|
||||
Crawl a single instance of MoodleIndex.
|
||||
This returns a MoodleIndex object populated with data.
|
||||
"""
|
||||
|
||||
def crawl_index(self, userSpecific: bool = True, *, use_cache: bool = True) -> MoodleIndex:
|
||||
"""
|
||||
Build and return a `MoodleIndex`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
userSpecific : bool
|
||||
When True, include only courses that expose a downloadable content resource.
|
||||
use_cache : bool, default True
|
||||
If False, bypass the persisted “no-access” cache so every course is probed
|
||||
afresh. Newly discovered “no-access” courses are still written back to the
|
||||
cache at the end of the crawl.
|
||||
"""
|
||||
# Set runtime flag for has_user_access()
|
||||
self._ignore_cache = not use_cache
|
||||
|
||||
        # Get all semesters, then the course id and name for each course.
        semesters = self.crawl_semesters()
|
||||
# Crawl only the latest two semesters to reduce load (remove once caching is implemented)
|
||||
for semester in semesters[:2]:
|
||||
courses = self.crawl_courses(semester)
|
||||
|
||||
# Crawl courses in parallel to speed things up
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as pool:
|
||||
list(pool.map(self.crawl_course, courses))
|
||||
|
||||
# Filter courses once all have been processed
|
||||
for course in courses:
|
||||
if userSpecific:
|
||||
if course.content_ressource_id:
|
||||
semester.courses.append(course)
|
||||
else:
|
||||
semester.courses.append(course)
|
||||
|
||||
# Only add semesters that have at least one course
|
||||
# Filter out semesters that ended up with no courses after crawling
|
||||
semesters: list[Semester] = [
|
||||
semester for semester in semesters if semester.courses
|
||||
]
|
||||
|
||||
created_index = MoodleIndex(
|
||||
degree_program=DegreeProgram(
|
||||
name=self.degree_program.name,
|
||||
id=self.degree_program.id,
|
||||
semesters=semesters,
|
||||
),
|
||||
)
|
||||
# Persist any newly discovered no-access courses
|
||||
self._save_no_access_cache()
|
||||
|
||||
# Restore default behaviour for subsequent calls
|
||||
self._ignore_cache = False
|
||||
|
||||
return created_index
|
||||
|
||||
# --------------------------------------------------------------------- #
|
||||
# High-level crawling helpers
|
||||
# --------------------------------------------------------------------- #
|
||||
def crawl_semesters(self) -> list[Semester]:
|
||||
"""
|
||||
Crawl the semesters from the Moodle index page.
|
||||
"""
|
||||
url = URLs.get_degree_program_url(self.degree_program.id)
|
||||
res = self.get_with_retries(url)
|
||||
|
||||
if res.status_code == 200:
|
||||
semesters = self.extract_semesters(res.text)
|
||||
logging.debug(f"Found semesters: {semesters}")
|
||||
return semesters
|
||||
|
||||
return []
|
||||
|
||||
def crawl_courses(self, semester: Semester) -> list[Course]:
|
||||
"""
|
||||
Crawl the courses from the Moodle index page.
|
||||
"""
|
||||
url = URLs.get_semester_url(semester_id=semester.id)
|
||||
res = self.get_with_retries(url)
|
||||
|
||||
if res.status_code == 200:
|
||||
courses = self.extract_courses(res.text)
|
||||
logging.debug(f"Found courses: {courses}")
|
||||
return courses
|
||||
|
||||
return []
|
||||
|
||||
def crawl_course(self, course: Course) -> None:
|
||||
"""
|
||||
Crawl a single Moodle course page.
|
||||
"""
|
||||
|
||||
hasAccess = self.has_user_access(course)
|
||||
|
||||
if not hasAccess:
|
||||
return
|
||||
|
||||
        # No-access results are cached by has_user_access(), so repeated
        # requests for inaccessible courses are already avoided.
|
||||
|
||||
course.content_ressource_id = self.crawl_content_ressource_id(course)
|
||||
course.files = self.crawl_course_files(course)
|
||||
|
||||
# --------------------------------------------------------------------- #
|
||||
# Networking utilities
|
||||
# --------------------------------------------------------------------- #
|
||||
def get_with_retries(self, url: str, retries: int = 3, delay: int = 1) -> httpx.Response:
|
||||
"""
|
||||
Simple GET with retries and exponential back-off.
|
||||
"""
|
||||
for attempt in range(1, retries + 1):
|
||||
try:
|
||||
response = self.client.get(url)
|
||||
|
||||
response.raise_for_status()
|
||||
|
||||
return response
|
||||
except Exception as e:
|
||||
logging.warning(f"Request to {url} failed ({e}), attempt {attempt}/{retries}")
|
||||
if attempt < retries:
|
||||
time.sleep(delay * (2 ** (attempt - 1)))
|
||||
raise Exception(f"Failed to GET {url} after {retries} attempts")
|
||||
|
||||
def save_html(self, url: str, response: httpx.Response) -> None:
|
||||
"""
|
||||
Persist raw HTML locally for debugging.
|
||||
"""
|
||||
filename = url.split("/")[-1] + ".html"
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
f.write(response.text)
|
||||
logging.info(f"Saved HTML to {filename}")
|
||||
|
||||
# --------------------------------------------------------------------- #
|
||||
# Extractors
|
||||
# --------------------------------------------------------------------- #
|
||||
def extract_semesters(self, html: str) -> list[Semester]:
|
||||
selector = parsel.Selector(text=html)
|
||||
|
||||
logging.info("Extracting semesters from the HTML content.")
|
||||
|
||||
semesters: list[Semester] = []
|
||||
|
||||
# Each semester sits in a collapsed container
|
||||
semester_containers = selector.css("div.category.notloaded.with_children.collapsed")
|
||||
|
||||
for container in semester_containers:
|
||||
anchor = container.css("h3.categoryname.aabtn a")
|
||||
if not anchor:
|
||||
continue
|
||||
|
||||
anchor = anchor[0]
|
||||
semester_name = (
|
||||
anchor.xpath("text()").get("").replace("\n", "").replace("\t", "").strip()
|
||||
)
|
||||
semester_id = anchor.attrib.get("href", "").split("=")[-1]
|
||||
|
||||
# Only keep semesters labeled FS or HS
|
||||
if "FS" not in semester_name and "HS" not in semester_name:
|
||||
continue
|
||||
|
||||
semesters.append(Semester(name=semester_name, id=semester_id))
|
||||
|
||||
semester_order = {
|
||||
"FS": 0, # Frühjahrs‐/Spring Semester
|
||||
"HS": 1, # Herbst‐/Fall Semester
|
||||
}
|
||||
# Sort by year and then by FS before HS
|
||||
sorted_semesters = sorted(
|
||||
semesters,
|
||||
key=lambda s: (
|
||||
2000 + int(s.name[2:]), # parse "25" → int 25, add 2000 → 2025
|
||||
semester_order[s.name[:2]] # map "FS" → 0, "HS" → 1
|
||||
),
|
||||
reverse=True,
|
||||
)
|
||||
return sorted_semesters
|
||||
|
||||
def extract_courses(self, html: str) -> list[Course]:
|
||||
"""
|
||||
Parse courses and capture optional “hero_image” (overview image) if present.
|
||||
"""
|
||||
selector = parsel.Selector(text=html)
|
||||
|
||||
logging.info("Extracting courses from the HTML content.")
|
||||
|
||||
courses: list[Course] = []
|
||||
|
||||
for header in selector.css("h3.coursename"):
|
||||
anchor = header.css("a")
|
||||
if not anchor:
|
||||
logging.warning("No course anchor found in the course header.")
|
||||
continue
|
||||
|
||||
anchor = anchor[0]
|
||||
course_name = (
|
||||
anchor.xpath("text()").get("").replace("\n", "").replace("\t", "").strip()
|
||||
)
|
||||
course_id = anchor.attrib.get("href", "").split("=")[-1]
|
||||
|
||||
# Remove trailing semester tag and code patterns
|
||||
course_name = re.sub(r"\s*(FS|HS)\d{2}\s*", "", course_name)
|
||||
course_name = re.sub(r"\s*\(.*?\)\s*", "", course_name).strip()
|
||||
|
||||
# Try to locate a hero/overview image that belongs to this course box
|
||||
# Traverse up to the containing course box, then look for <div class="courseimage"><img ...>
|
||||
course_container = header.xpath('./ancestor::*[contains(@class,"coursebox")][1]')
|
||||
hero_src = (
|
||||
course_container.css("div.courseimage img::attr(src)").get("")
|
||||
if course_container else ""
|
||||
)
|
||||
|
||||
courses.append(
|
||||
Course(
|
||||
id=course_id,
|
||||
name=course_name,
|
||||
activity_type="", # TODO: Make optional
|
||||
hero_image=hero_src or ""
|
||||
)
|
||||
)
|
||||
|
||||
logging.info(f"{len(courses)} courses extracted.")
|
||||
return courses
|
||||
|
||||
def has_user_access(self, course: Course) -> bool:
|
||||
"""
|
||||
Return True only if the authenticated user can access the course (result cached).
|
||||
(i.e. the response is HTTP 200 **and** is not a redirected login/enrol page).
|
||||
"""
|
||||
if not self._ignore_cache and course.id in self._no_access_cache:
|
||||
return False
|
||||
|
||||
url = URLs.get_course_url(course.id)
|
||||
res = self.get_with_retries(url)
|
||||
|
||||
if res.status_code != 200:
|
||||
self._no_access_cache.add(course.id)
|
||||
return False
|
||||
|
||||
# Detect Moodle redirection to a login or enrolment page
|
||||
final_url = str(res.url).lower()
|
||||
if "login" in final_url or "enrol" in final_url:
|
||||
self._no_access_cache.add(course.id)
|
||||
return False
|
||||
|
||||
# Some enrolment pages still return 200; look for HTML markers
|
||||
if "#page-enrol" in res.text or "you need to enrol" in res.text.lower():
|
||||
self._no_access_cache.add(course.id)
|
||||
return False
|
||||
|
||||
        # All checks passed – the user has access to this course.
|
||||
return True
|
||||
|
||||
def crawl_content_ressource_id(self, course: Course) -> str:
|
||||
course_id = course.id
|
||||
url = URLs.get_course_url(course_id)
|
||||
res = self.get_with_retries(url)
|
||||
psl = parsel.Selector(res.text)
|
||||
|
||||
try:
|
||||
logging.info("Searching for 'Download course content' link.")
|
||||
# Use parsel CSS selector to find the anchor tag with the specific data attribute
|
||||
download_link_selector = psl.css('a[data-downloadcourse="1"]')
|
||||
if not download_link_selector:
|
||||
raise ValueError("Download link not found.")
|
||||
|
||||
# Extract the href attribute from the first matching element
|
||||
href = download_link_selector[0].attrib.get("href")
|
||||
if not href:
|
||||
raise ValueError("Href attribute not found on the download link.")
|
||||
|
||||
context_id = href.split("=")[-1]
|
||||
course.content_ressource_id = context_id
|
||||
|
||||
return context_id
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
f"Error extracting content resource ID for course '{course.name}': {e}",
|
||||
exc_info=False,
|
||||
)
|
||||
logging.debug("Debugging info: Error accessing course content.", exc_info=True)
|
||||
return ''
|
||||
|
||||
def crawl_course_files(self, course: Course) -> list[FileEntry]:
|
||||
"""
|
||||
Crawl the course files from the Moodle course page.
|
||||
"""
|
||||
url = URLs.get_course_url(course.id)
|
||||
res = self.get_with_retries(url)
|
||||
|
||||
if res.status_code == 200:
|
||||
files = [] # TODO: either implement this or remove, because files are extracted from the .zip file
|
||||
logging.debug(f"Found files: {files}")
|
||||
return files
|
||||
|
||||
return []
|
||||
|
||||
# ----------------------------------------------------------------- #
|
||||
# Cache persistence helpers
|
||||
# ----------------------------------------------------------------- #
|
||||
def _save_no_access_cache(self) -> None:
|
||||
try:
|
||||
NO_ACCESS_CACHE_FILE.write_text(json.dumps(sorted(self._no_access_cache)))
|
||||
except Exception as exc:
|
||||
logging.warning(f"Could not persist no-access cache: {exc}")
|
@ -0,0 +1,59 @@
|
||||
# TODO: Move to librarian-core
|
||||
"""
|
||||
All URLs used in the crawler.
|
||||
Functions marked as PUBLIC can be accessed without authentication.
|
||||
Functions marked as PRIVATE require authentication.
|
||||
"""
|
||||
class URLs:
|
||||
base_url = "https://moodle.fhgr.ch"
|
||||
|
||||
@classmethod
|
||||
def get_base_url(cls):
|
||||
"""PUBLIC"""
|
||||
return cls.base_url
|
||||
|
||||
# ------------------------- Moodle URLs -------------------------
|
||||
@classmethod
|
||||
def get_login_url(cls):
|
||||
"""PUBLIC"""
|
||||
return f"{cls.base_url}/login/index.php"
|
||||
|
||||
@classmethod
|
||||
def get_index_url(cls):
|
||||
"""PUBLIC"""
|
||||
return f"{cls.base_url}/course/index.php"
|
||||
|
||||
@classmethod
|
||||
def get_degree_program_url(cls, degree_program_id):
|
||||
"""PUBLIC"""
|
||||
return f"{cls.base_url}/course/index.php?categoryid={degree_program_id}"
|
||||
|
||||
@classmethod
|
||||
def get_category_url(cls, category_id):
|
||||
"""PUBLIC"""
|
||||
return f"{cls.base_url}/course/index.php?categoryid={category_id}"
|
||||
|
||||
@classmethod
|
||||
def get_semester_url(cls, semester_id):
|
||||
"""PUBLIC"""
|
||||
return f"{cls.base_url}/course/index.php?categoryid={semester_id}"
|
||||
|
||||
@classmethod
|
||||
def get_user_courses_url(cls):
|
||||
"""PRIVATE"""
|
||||
return f"{cls.base_url}/my/courses.php"
|
||||
|
||||
@classmethod
|
||||
def get_course_url(cls, course_id):
|
||||
"""PRIVATE"""
|
||||
return f"{cls.base_url}/course/view.php?id={course_id}"
|
||||
|
||||
@classmethod
|
||||
def get_files_url(cls, context_id):
|
||||
"""PRIVATE"""
|
||||
return f"{cls.base_url}/course/downloadcontent.php?contextid={context_id}"
|
||||
|
||||
@classmethod
|
||||
def get_file_url(cls, file_id):
|
||||
"""PRIVATE"""
|
||||
return f"{cls.base_url}/mod/resource/view.php?id={file_id}"
|
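# Usage sketch: these helpers only build URL strings, e.g.
#
#     URLs.get_course_url("18240")
#     # → "https://moodle.fhgr.ch/course/view.php?id=18240"
#
# Endpoints marked PRIVATE still need an authenticated session (cookies)
# when the URL is actually requested.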
@ -0,0 +1,5 @@
|
||||
from .downloader import *
|
||||
|
||||
__all__ = [
|
||||
"Downloader",
|
||||
]
|
@ -0,0 +1,151 @@
|
||||
"""
|
||||
Downloader Worker
|
||||
=================
|
||||
Input : CrawlData (from the crawler)
|
||||
Output : DownloadData (metadata only; files staged)
|
||||
|
||||
Folder tree after run
|
||||
---------------------
|
||||
export_dir/
|
||||
└─ {TERM_NAME}/
|
||||
├─ {course_id}.zip
|
||||
└─ …
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
import httpx
|
||||
from librarian_core.utils.path_utils import get_temp_path
|
||||
from librarian_core.workers.base import Worker
|
||||
from prefect import get_run_logger, task
|
||||
from prefect.futures import wait
|
||||
|
||||
from librarian_scraper.constants import CRAWLER
|
||||
from librarian_scraper.crawler.cookie_crawler import CookieCrawler
|
||||
from librarian_scraper.models.crawl_data import CrawlData
|
||||
from librarian_scraper.models.download_data import (
|
||||
DownloadCourse,
|
||||
DownloadData,
|
||||
DownloadTerm,
|
||||
)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# helper decorator #
|
||||
# --------------------------------------------------------------------------- #
|
||||
def task_(**kw):
|
||||
kw.setdefault("log_prints", True)
|
||||
kw.setdefault("retries", 2)
|
||||
kw.setdefault("retry_delay_seconds", 5)
|
||||
return task(**kw)
|
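# The helper above simply pre-fills Prefect task defaults; e.g. `@task_()`
# behaves like `@task(log_prints=True, retries=2, retry_delay_seconds=5)`
# unless the caller overrides any of those keyword arguments.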
||||
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# shared state for static task #
|
||||
# --------------------------------------------------------------------------- #
|
||||
_COOKIE_JAR: httpx.Cookies | None = None
|
||||
_SESSKEY: str = ""
|
||||
_LIMIT: int = 2
|
||||
_DELAY: float = 0.0
|
||||
|
||||
|
||||
class Downloader(Worker[CrawlData, DownloadData]):
|
||||
DOWNLOAD_URL = "https://moodle.fhgr.ch/course/downloadcontent.php"
|
||||
|
||||
# tuning
|
||||
CONCURRENCY = 8
|
||||
RELAXED = True # False → faster
|
||||
|
||||
input_model = CrawlData
|
||||
output_model = DownloadData
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
async def __run__(self, crawl: CrawlData) -> DownloadData:
|
||||
global _COOKIE_JAR, _SESSKEY, _LIMIT, _DELAY
|
||||
lg = get_run_logger()
|
||||
|
||||
# ------------ login
|
||||
cookies, sesskey = await CookieCrawler().crawl()
|
||||
_COOKIE_JAR, _SESSKEY = cookies, sesskey
|
||||
|
||||
# ------------ tuning
|
||||
_LIMIT = 1 if self.RELAXED else max(1, min(self.CONCURRENCY, 8))
|
||||
_DELAY = CRAWLER["DELAY_SLOW"] if self.RELAXED else CRAWLER["DELAY_FAST"]
|
||||
|
||||
# ------------ working dir
|
||||
work_root = Path(get_temp_path()) / f"dl_{int(time.time())}"
|
||||
work_root.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
result = DownloadData()
|
||||
futures = []
|
||||
term_dirs: List[Tuple[str, Path]] = []
|
||||
|
||||
# schedule downloads
|
||||
for term in crawl.degree_program.terms:
|
||||
term_dir = work_root / term.name
|
||||
term_dir.mkdir(parents=True, exist_ok=True)
|
||||
term_dirs.append((term.name, term_dir))
|
||||
|
||||
dl_term = DownloadTerm(id=term.id, name=term.name)
|
||||
result.terms.append(dl_term)
|
||||
|
||||
for course in term.courses:
|
||||
dest = term_dir / f"{course.id}.zip"
|
||||
dl_term.courses.append(DownloadCourse(id=course.id, name=course.name))
|
||||
futures.append(
|
||||
self._download_task.submit(course.content_ressource_id, dest)
|
||||
)
|
||||
|
||||
wait(futures) # block for all downloads
|
||||
|
||||
# stage term directories
|
||||
for name, dir_path in term_dirs:
|
||||
self.stage(dir_path, new_name=name, sanitize=False, move=True)
|
||||
|
||||
lg.info("Downloader finished – staged %d term folders", len(term_dirs))
|
||||
return result
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# static task #
|
||||
# ------------------------------------------------------------------ #
|
||||
@staticmethod
|
||||
@task_()
|
||||
def _download_task(context_id: str, dest: Path) -> None:
|
||||
lg = get_run_logger()
|
||||
if not context_id:
|
||||
lg.info("Skip (no context id) → %s", dest.name)
|
||||
return
|
||||
|
||||
async def fetch() -> bool:
|
||||
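            # Note: this semaphore is created per task invocation, so it only
            # throttles the single request made by this call; concurrency
            # across submitted tasks is governed by Prefect's task runner.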
sem = asyncio.Semaphore(_LIMIT)
|
||||
|
||||
async with sem:
|
||||
data = {"sesskey": _SESSKEY, "download": 1, "contextid": context_id}
|
||||
async with httpx.AsyncClient(cookies=_COOKIE_JAR) as cli:
|
||||
try:
|
||||
async with cli.stream(
|
||||
"POST", Downloader.DOWNLOAD_URL, data=data, timeout=60
|
||||
) as r:
|
||||
r.raise_for_status()
|
||||
with dest.open("wb") as fh:
|
||||
async for chunk in r.aiter_bytes():
|
||||
fh.write(chunk)
|
||||
lg.info("Downloaded %s", dest)
|
||||
return True
|
||||
except httpx.HTTPStatusError as exc:
|
||||
lg.warning(
|
||||
"HTTP %s for %s", exc.response.status_code, dest.name
|
||||
)
|
||||
except Exception as exc:
|
||||
lg.warning("Error downloading %s (%s)", dest.name, exc)
|
||||
return False
|
||||
|
||||
ok = asyncio.run(fetch())
|
||||
if not ok and dest.exists():
|
||||
dest.unlink(missing_ok=True)
|
||||
time.sleep(_DELAY)
|
@ -0,0 +1,24 @@
|
||||
from librarian_scraper.models.crawl_data import (
|
||||
CrawlCourse,
|
||||
CrawlData,
|
||||
CrawlFile,
|
||||
CrawlProgram,
|
||||
CrawlTerm,
|
||||
)
|
||||
|
||||
from librarian_scraper.models.download_data import (
|
||||
DownloadCourse,
|
||||
DownloadData,
|
||||
DownloadTerm,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"CrawlData",
|
||||
"CrawlCourse",
|
||||
"CrawlFile",
|
||||
"CrawlProgram",
|
||||
"CrawlTerm",
|
||||
"DownloadData",
|
||||
"DownloadCourse",
|
||||
"DownloadTerm",
|
||||
]
|
@ -0,0 +1,188 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
"""
|
||||
Example of a MoodleIndex (JSON):
|
||||
MoodleIndex: {
|
||||
degree_program: {
|
||||
id: '1157',
|
||||
name: 'Computational and Data Science',
|
||||
terms: [
|
||||
{
|
||||
id: '1745',
|
||||
name: 'FS25',
|
||||
courses: [
|
||||
{
|
||||
id: '18863',
|
||||
name: 'Programmierung und Prompt Engineering II',
|
||||
activity_type: '',
|
||||
hero_image:
|
||||
'https://moodle.fhgr.ch/pluginfile.php/1159522/course/overviewfiles/PythonBooks.PNG',
|
||||
content_ressource_id: '1159522',
|
||||
files: [],
|
||||
},
|
||||
{
|
||||
id: '18240',
|
||||
name: 'Effiziente Algorithmen',
|
||||
activity_type: '',
|
||||
hero_image: '',
|
||||
content_ressource_id: '1125554',
|
||||
files: [],
|
||||
},
|
||||
{
|
||||
id: '18237',
|
||||
name: 'Mathematik II',
|
||||
activity_type: '',
|
||||
hero_image:
|
||||
'https://moodle.fhgr.ch/pluginfile.php/1125458/course/overviewfiles/Integration_Differential_b.png',
|
||||
content_ressource_id: '1125458',
|
||||
files: [],
|
||||
},
|
||||
{
|
||||
id: '18236',
|
||||
name: '2025 FS FHGR CDS Numerische Methoden',
|
||||
activity_type: '',
|
||||
hero_image: '',
|
||||
content_ressource_id: '1125426',
|
||||
files: [],
|
||||
},
|
||||
{
|
||||
id: '18228',
|
||||
name: 'Datenbanken und Datenverarbeitung',
|
||||
activity_type: '',
|
||||
hero_image: '',
|
||||
content_ressource_id: '1125170',
|
||||
files: [],
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
id: '1746',
|
||||
name: 'HS24',
|
||||
courses: [
|
||||
{
|
||||
id: '18030',
|
||||
name: 'Bootcamp Wissenschaftliches Arbeiten',
|
||||
activity_type: '',
|
||||
hero_image: '',
|
||||
content_ressource_id: '1090544',
|
||||
files: [],
|
||||
},
|
||||
{
|
||||
id: '17527',
|
||||
name: 'Einführung in Data Science',
|
||||
activity_type: '',
|
||||
hero_image:
|
||||
'https://moodle.fhgr.ch/pluginfile.php/1059194/course/overviewfiles/cds1010.jpg',
|
||||
content_ressource_id: '1059194',
|
||||
files: [],
|
||||
},
|
||||
{
|
||||
id: '17526',
|
||||
name: 'Einführung in Computational Science',
|
||||
activity_type: '',
|
||||
hero_image:
|
||||
'https://moodle.fhgr.ch/pluginfile.php/1059162/course/overviewfiles/cds_intro_sim.jpg',
|
||||
content_ressource_id: '1059162',
|
||||
files: [],
|
||||
},
|
||||
{
|
||||
id: '17525',
|
||||
name: 'Mathematik I',
|
||||
activity_type: '',
|
||||
hero_image:
|
||||
'https://moodle.fhgr.ch/pluginfile.php/1059130/course/overviewfiles/AdobeStock_452512134.png',
|
||||
content_ressource_id: '1059130',
|
||||
files: [],
|
||||
},
|
||||
{
|
||||
id: '17507',
|
||||
name: 'Programmierung und Prompt Engineering',
|
||||
activity_type: '',
|
||||
hero_image:
|
||||
'https://moodle.fhgr.ch/pluginfile.php/1058554/course/overviewfiles/10714013_33861.jpg',
|
||||
content_ressource_id: '1058554',
|
||||
files: [],
|
||||
},
|
||||
{
|
||||
id: '17505',
|
||||
name: 'Algorithmen und Datenstrukturen',
|
||||
activity_type: '',
|
||||
hero_image:
|
||||
'https://moodle.fhgr.ch/pluginfile.php/1058490/course/overviewfiles/Bild1.png',
|
||||
content_ressource_id: '1058490',
|
||||
files: [],
|
||||
},
|
||||
{
|
||||
id: '17503',
|
||||
name: 'Computer Science',
|
||||
activity_type: '',
|
||||
hero_image:
|
||||
'https://moodle.fhgr.ch/pluginfile.php/1058426/course/overviewfiles/Titelbild.jpg',
|
||||
content_ressource_id: '1058426',
|
||||
files: [],
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
},
|
||||
timestamp: '2025-04-27T14:20:11.354825+00:00',
|
||||
};
|
||||
"""
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Base Model
|
||||
# ---------------------------------------------------------------------------
|
||||
class CrawlData(BaseModel):
|
||||
degree_program: CrawlProgram = Field(
|
||||
default_factory=lambda: CrawlProgram(id="", name="")
|
||||
)
|
||||
timestamp: str = Field(
|
||||
default_factory=lambda: datetime.now(timezone.utc).isoformat()
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Degree Program
|
||||
# ---------------------------------------------------------------------------
|
||||
class CrawlProgram(BaseModel):
|
||||
id: str = Field("1157", description="Unique identifier for the degree program.")
|
||||
name: str = Field("Computational and Data Science", description="Name of the degree program.")
|
||||
terms: list[CrawlTerm] = Field(default_factory=list)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Term
|
||||
# ---------------------------------------------------------------------------
|
||||
_TERM_RE = re.compile(r"^(HS|FS)\d{2}$") # HS24 / FS25 …
|
||||
|
||||
|
||||
class CrawlTerm(BaseModel):
|
||||
id: str
|
||||
name: str = Field(..., pattern=_TERM_RE.pattern) # e.g. “HS24”
|
||||
courses: list[CrawlCourse] = Field(default_factory=list)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Course
|
||||
# ---------------------------------------------------------------------------
|
||||
class CrawlCourse(BaseModel):
|
||||
id: str
|
||||
name: str
|
||||
hero_image: str = ""
|
||||
content_ressource_id: str = ""
|
||||
files: list[CrawlFile] = Field(default_factory=list)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Files
|
||||
# ---------------------------------------------------------------------------
|
||||
class CrawlFile(BaseModel):
|
||||
id: str
|
||||
res_id: str
|
||||
name: str
|
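# Construction sketch (mirrors the JSON example at the top of this module;
# all ids are illustrative):
#
#     data = CrawlData(
#         degree_program=CrawlProgram(
#             id="1157",
#             name="Computational and Data Science",
#             terms=[
#                 CrawlTerm(
#                     id="1745",
#                     name="FS25",  # must match ^(HS|FS)\d{2}$
#                     courses=[
#                         CrawlCourse(id="18863", name="Programmierung und Prompt Engineering II"),
#                     ],
#                 )
#             ],
#         )
#     )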
@ -0,0 +1,18 @@
|
||||
from typing import List
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class DownloadCourse(BaseModel):
|
||||
id: str
|
||||
name: str # Stores the name of the zip file inside the term directory
|
||||
|
||||
|
||||
class DownloadTerm(BaseModel):
|
||||
id: str
|
||||
name: str # Stores the name of the term directory inside DownloadMeta.dir
|
||||
courses: List[DownloadCourse] = Field(default_factory=list)
|
||||
|
||||
|
||||
class DownloadData(BaseModel):
|
||||
terms: List[DownloadTerm] = Field(default_factory=list)
|
1426
librarian/plugins/librarian-scraper/uv.lock
generated
Normal file
File diff suppressed because it is too large
5
librarian/plugins/librarian-vspace/README.md
Normal file
@ -0,0 +1,5 @@
# UV Update

```shell
uv lock --upgrade
uv sync
```
@ -0,0 +1,47 @@
|
||||
import os
|
||||
|
||||
def chunk_file(input_file, output_dir=None, start_num=1, padding=2):
|
||||
"""
|
||||
Split a file into chunks and save each chunk as a separate file.
|
||||
|
||||
Args:
|
||||
input_file (str): Path to the input file
|
||||
output_dir (str, optional): Directory to save chunk files. Defaults to current directory.
|
||||
start_num (int, optional): Starting number for the chunk files. Defaults to 1.
|
||||
padding (int, optional): Number of digits to pad the incremental numbers. Defaults to 2.
|
||||
"""
|
||||
if output_dir and not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
|
||||
with open(input_file) as f:
|
||||
content = f.read()
|
||||
chunks = content.split("---")
|
||||
|
||||
chunk_count = start_num
|
||||
for chunk in chunks:
|
||||
        chunk = chunk.strip()  # split("---") already removed the delimiters
|
||||
if not chunk: # Skip empty chunks
|
||||
continue
|
||||
|
||||
# Define output path with padded incremental number
|
||||
file_name = f'chunk_{chunk_count:0{padding}d}.md'
|
||||
if output_dir:
|
||||
outfile_path = os.path.join(output_dir, file_name)
|
||||
else:
|
||||
outfile_path = file_name
|
||||
|
||||
with open(outfile_path, 'w') as outfile:
|
||||
outfile.write(chunk)
|
||||
|
||||
chunk_count += 1
|
||||
|
||||
return chunk_count - start_num # Return the number of chunks written
|
||||
|
||||
# Example usage
|
||||
if __name__ == "__main__":
|
||||
#input_file = "/home/gra/PycharmProjects/librarian_vspace/examples/chunks/knowledge_chunks_detailed.md"
|
||||
input_file = "/home/gra/PycharmProjects/librarian_vspace/examples/chunks/knowledge_chunks_1500.md"
|
||||
# You can specify an output directory or omit it to use the current directory
|
||||
output_dir = "/examples/chunks/chunk_md_x"
|
||||
chunk_file(input_file, output_dir)
|
||||
|
@ -0,0 +1,43 @@
|
||||
|
||||
#!/usr/bin/env python3
|
||||
"""examples/demo_run_cluster_export.py
|
||||
|
||||
Launch ClusterExportWorker via FlowArtifact wrapper, mirroring the embedder demo.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
from librarian_vspace.vquery.cluster_export_worker import ClusterExportWorker, ClusterExportInput
|
||||
from librarian_core.workers.base import FlowArtifact
|
||||
|
||||
COURSE_ID = 15512 # example id
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def _load_env(path: Path) -> None:
|
||||
if not path.is_file():
|
||||
return
|
||||
for line in path.read_text().splitlines():
|
||||
if line.strip() and not line.startswith("#") and "=" in line:
|
||||
k, v = [p.strip() for p in line.split("=", 1)]
|
||||
os.environ.setdefault(k, v)
|
||||
|
||||
async def _main() -> None:
|
||||
payload = ClusterExportInput(course_id=COURSE_ID)
|
||||
|
||||
worker = ClusterExportWorker()
|
||||
art = FlowArtifact.new(run_id="", dir=Path.cwd(), data=payload)
|
||||
result_artifact = await worker.flow()(art) # FlowArtifact
|
||||
|
||||
output = result_artifact.data # ClusterExportOutput
|
||||
logger.info("✅ Worker finished – output directory: %s", output.output_dir)
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
|
||||
APP_DIR = Path(__file__).resolve().parent
|
||||
_load_env(APP_DIR / ".env")
|
||||
asyncio.run(_main())
|
@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Dict
|
||||
import json
|
||||
|
||||
from librarian_vspace.vecembed.embedder_worker import EmbedderWorker, EmbedderInput
|
||||
from librarian_core.workers.base import FlowArtifact
|
||||
from librarian_core.temp_payloads.chunk_data import ChunkData
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# Configuration
|
||||
# ------------------------------------------------------------------ #
|
||||
# Folder with the small sample dataset (3 × .md files)
|
||||
DEMO_PATH: Path = Path("/home/gra/PycharmProjects/librarian_vspace/examples/chunks/moodle_chunks/51cd7cf6-e782-4f17-af00-30852cdcd5fc/51cd7cf6-e782-4f17-af00-30852cdcd5fc/data/FS25/Effiziente_Algorithmen").expanduser()
|
||||
#DEMO_PATH: Path = Path("/home/gra/PycharmProjects/librarian_vspace/examples/chunks/chunk_md").expanduser()
|
||||
|
||||
# Where to write the concatenated text file
|
||||
# (one level above the dataset folder keeps things tidy)
|
||||
COURSE_ID_POOL = [16301, 16091, 17505, 18239, 17503, 15512]
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
INPUT_MODEL=Path("/home/gra/PycharmProjects/librarian_vspace/examples/chunks/moodle_chunks/51cd7cf6-e782-4f17-af00-30852cdcd5fc/51cd7cf6-e782-4f17-af00-30852cdcd5fc/result.json")
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
def _load_env(path: Path) -> None:
|
||||
"""Load KEY=VALUE pairs from a .env file if present."""
|
||||
if not path.is_file():
|
||||
return
|
||||
for line in path.read_text().splitlines():
|
||||
if line.strip() and not line.startswith("#") and "=" in line:
|
||||
k, v = [p.strip() for p in line.split("=", 1)]
|
||||
os.environ.setdefault(k, v)
|
||||
|
||||
|
||||
def discover_chunks(root: Path) -> List[Path]:
|
||||
"""Return all markdown files in the dataset folder."""
|
||||
return sorted(root.glob("*.md"))
|
||||
|
||||
|
||||
def build_course(root: Path) -> Dict[str, Any]:
|
||||
"""Minimal dict that satisfies EmbedderWorker's `chunk_course`."""
|
||||
files = [
|
||||
{"file_name": p.name, "file_id": str(random.getrandbits(24))}
|
||||
for p in discover_chunks(root)
|
||||
]
|
||||
if not files:
|
||||
raise FileNotFoundError(f"No .md files found in {root}")
|
||||
return {
|
||||
"path": str(root),
|
||||
"files": files,
|
||||
#"course_id": str(random.choice(COURSE_ID_POOL)),
|
||||
"course_id": "18240"
|
||||
}
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
async def _main() -> None:
|
||||
course = build_course(DEMO_PATH)
|
||||
concat_path = DEMO_PATH
|
||||
|
||||
with open(INPUT_MODEL, 'r') as file:
|
||||
json_data = json.load(file)
|
||||
|
||||
#payload = EmbedderInput(chunk_course=course, concat_path=concat_path)
|
||||
    # json.load() above already produced a dict, so validate the dict directly
    # (model_validate_json expects a raw JSON string, not a parsed object).
    payload = ChunkData.model_validate(json_data)
|
||||
worker = EmbedderWorker()
|
||||
logger.info("🔨 Launching EmbedderWorker …")
|
||||
art = FlowArtifact.new(run_id="", dir=concat_path, data=payload)
|
||||
result = await worker.flow()(art) # type: ignore[arg-type]
|
||||
|
||||
logger.info("✅ Worker finished: %s", result)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
if __name__ == "__main__":
|
||||
APP_DIR = Path(__file__).resolve().parent
|
||||
_load_env(APP_DIR / ".env")
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
|
||||
asyncio.run(_main())
|
@ -0,0 +1,66 @@
|
||||
|
||||
#!/usr/bin/env python3
|
||||
"""examples/demo_run_query.py
|
||||
|
||||
Runs QueryWorker via FlowArtifact wrapper (mirrors cluster export demo).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from librarian_vspace.vquery.query_worker import QueryWorker, QueryInput
|
||||
from librarian_vspace.models.query_model import VectorSearchRequest
|
||||
from librarian_core.workers.base import FlowArtifact
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# Config
|
||||
# ------------------------------------------------------------------ #
|
||||
SEARCH_STRING = "integration"
|
||||
COURSE_FILTER_GT = 900 # adjust if needed
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def _load_env(path: Path) -> None:
|
||||
if not path.is_file():
|
||||
return
|
||||
for line in path.read_text().splitlines():
|
||||
if line.strip() and not line.startswith("#") and "=" in line:
|
||||
k, v = [p.strip() for p in line.split("=", 1)]
|
||||
os.environ.setdefault(k, v)
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
async def _main() -> None:
|
||||
# Vector search request
|
||||
vs_req = VectorSearchRequest(
|
||||
interface_name=os.getenv("EMBED_INTERFACE", "ollama"),
|
||||
model_name=os.getenv("EMBED_MODEL", "snowflake-arctic-embed2"),
|
||||
search_string=SEARCH_STRING,
|
||||
filters={"file_id": ("gt", COURSE_FILTER_GT)},
|
||||
top_k=10,
|
||||
)
|
||||
|
||||
payload = QueryInput(
|
||||
request=vs_req,
|
||||
db_schema=os.getenv("VECTOR_SCHEMA", "librarian"),
|
||||
rpc_function=os.getenv("VECTOR_FUNCTION", "pdf_chunking"),
|
||||
embed_model=os.getenv("EMBED_MODEL", "snowflake-arctic-embed2"),
|
||||
)
|
||||
|
||||
worker = QueryWorker()
|
||||
art = FlowArtifact.new(run_id="", dir=Path.cwd(), data=payload)
|
||||
result_artifact = await worker.flow()(art) # FlowArtifact
|
||||
|
||||
response = result_artifact.data # VectorSearchResponse
|
||||
logger.info("✅ Worker finished – received %s results", response.total)
|
||||
for idx, ck in enumerate(response.results, 1):
|
||||
logger.info("• %s: %s", idx, ck.chunk[:80] + ("…" if len(ck.chunk or '') > 80 else ""))
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
|
||||
APP_DIR = Path(__file__).resolve().parent
|
||||
_load_env(APP_DIR / ".env")
|
||||
asyncio.run(_main())
|
@ -0,0 +1,43 @@
|
||||
|
||||
#!/usr/bin/env python3
|
||||
"""examples/demo_run_tsne_export.py
|
||||
|
||||
Launch TsneExportWorker via FlowArtifact wrapper.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
from librarian_vspace.vecview.tsne_export_worker import TsneExportWorker, TsneExportInput
|
||||
from librarian_core.workers.base import FlowArtifact
|
||||
|
||||
COURSE_ID = 18240 # choose a course with embeddings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
def _load_env(path: Path) -> None:
|
||||
if not path.is_file():
|
||||
return
|
||||
for line in path.read_text().splitlines():
|
||||
if line.strip() and not line.startswith("#") and "=" in line:
|
||||
k, v = [p.strip() for p in line.split("=", 1)]
|
||||
os.environ.setdefault(k, v)
|
||||
|
||||
async def _main() -> None:
|
||||
payload = TsneExportInput(course_id=COURSE_ID)
|
||||
|
||||
worker = TsneExportWorker()
|
||||
art = FlowArtifact.new(run_id="", dir=Path.cwd(), data=payload)
|
||||
result_artifact = await worker.flow()(art) # FlowArtifact
|
||||
|
||||
output = result_artifact.data # TsneExportOutput
|
||||
logger.info("✅ Worker finished – JSON file: %s", output.json_path)
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s")
|
||||
APP_DIR = Path(__file__).resolve().parent
|
||||
_load_env(APP_DIR / ".env")
|
||||
asyncio.run(_main())
|
@ -0,0 +1,4 @@
|
||||
from librarian_vspace.vutils.parallelism_advisor import recommended_workers
|
||||
print(recommended_workers(kind="cpu"))
|
||||
print(recommended_workers(kind="io"))
|
||||
print(recommended_workers(kind="gpu"))
|
141
librarian/plugins/librarian-vspace/examples/run_visualizer.py
Normal file
@ -0,0 +1,141 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Loads vector data using vecmap.loader, reduces dimensions via t-SNE,
|
||||
and launches an interactive 3D visualization using vecmap.visualizer (Dash/Plotly).
|
||||
|
||||
Configuration is primarily driven by environment variables.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import pathlib
|
||||
import sys
|
||||
import pandas as pd
|
||||
|
||||
# Define application directory relative to this script file
|
||||
APP_DIR = pathlib.Path(__file__).resolve().parent
|
||||
# Define the source directory containing vecmap, vutils, etc.
|
||||
SRC_DIR = APP_DIR.parent / "src"
|
||||
# Define path to .env file relative to APP_DIR
|
||||
DOTENV_PATH = APP_DIR / ".env"
|
||||
|
||||
# --- Explicitly Manage sys.path ---
|
||||
app_dir_str = str(APP_DIR)
|
||||
src_dir_str = str(SRC_DIR)
|
||||
if app_dir_str in sys.path:
    try:
        sys.path.remove(app_dir_str)
    except ValueError:
        pass
if src_dir_str not in sys.path:
    sys.path.insert(0, src_dir_str)
elif sys.path[0] != src_dir_str:
    try:
        sys.path.remove(src_dir_str)
    except ValueError:
        pass
    sys.path.insert(0, src_dir_str)
|
||||
print(f"[DEBUG] sys.path start: {sys.path[:3]}")
|
||||
|
||||
# --- .env Loader ---
|
||||
def _load_env_file(path: pathlib.Path) -> None:
|
||||
print(f"Attempting to load .env file from: {path}")
|
||||
    if not path.is_file():
        print(f".env file not found at {path}, skipping.")
        return
    loaded, skipped = 0, 0
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith("#") or "=" not in line:
                    continue
                key, val = line.split("=", 1)
                key, val = key.strip(), val.strip()
                if key not in os.environ:
                    os.environ[key] = val
                    loaded += 1
                else:
                    skipped += 1
        print(f"Loaded {loaded} new vars, skipped {skipped} existing vars from .env")
    except Exception as e:
        print(f"Error reading .env file at {path}: {e}")
|
||||
_load_env_file(DOTENV_PATH)
|
||||
|
||||
# --- Logging Setup ---
|
||||
log_level_str = os.getenv("VECMAP_DEBUG", "false").lower()
|
||||
log_level = logging.DEBUG if log_level_str in ("true", "1") else logging.INFO
|
||||
logging.basicConfig(level=log_level, format='[%(asctime)s] [%(levelname)s] [%(name)s:%(lineno)d] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
|
||||
if log_level > logging.DEBUG:
|
||||
    for logger_name in ["urllib3", "httpx", "supabase"]:
        logging.getLogger(logger_name).setLevel(logging.WARNING)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Imports ---
|
||||
try:
|
||||
from librarian_vspace.vecmap.loader import VectorLoader, VectorLoaderError
|
||||
from librarian_vspace.vecmap.visualizer import VectorVisualizer # Removed DEFAULT_N_CLUSTERS import
|
||||
import librarian_vspace.vutils
|
||||
import librarian_vspace.vecembed
|
||||
logger.debug("Successfully imported components.")
|
||||
except ImportError as e:
|
||||
logger.error(f"Failed to import necessary modules: {e}", exc_info=True)
|
||||
sys.exit(1)
|
||||
|
||||
# --- Main Logic ---
|
||||
def main() -> None:
|
||||
logger.info("--- Starting VecMap Visualizer ---")
|
||||
|
||||
# --- Configuration ---
|
||||
db_schema = os.getenv("VECTOR_SCHEMA", "librarian")
|
||||
db_function = os.getenv("VECTOR_FUNCTION", "pdf_chunking")
|
||||
model_name = os.getenv("EMBED_MODEL", "snowflake-arctic-embed2")
|
||||
interface_name = os.getenv("EMBED_INTERFACE", "ollama")
|
||||
embedding_column = os.getenv("EMBEDDING_COLUMN", "embedding")
|
||||
    try:
        limit_str = os.getenv("VECMAP_LIMIT")
        data_limit = int(limit_str) if limit_str else None
    except ValueError:
        logger.warning("Invalid VECMAP_LIMIT. Ignoring.")
        data_limit = None
    try:
        tsne_perplexity = float(os.getenv("VECMAP_PERPLEXITY", "30.0"))
    except ValueError:
        logger.warning("Invalid VECMAP_PERPLEXITY. Using 30.0.")
        tsne_perplexity = 30.0
|
||||
|
||||
# n_clusters configuration removed
|
||||
|
||||
dash_host = os.getenv("VECMAP_HOST", "127.0.0.1")
|
||||
    try:
        dash_port = int(os.getenv("VECMAP_PORT", "8050"))
    except ValueError:
        logger.warning("Invalid VECMAP_PORT. Using 8050.")
        dash_port = 8050
|
||||
dash_debug = log_level == logging.DEBUG
|
||||
|
||||
logger.info("Effective Configuration:")
|
||||
logger.info(f" Database: schema={db_schema}, function={db_function}")
|
||||
logger.info(f" Model/Interface: model={model_name}, interface={interface_name}")
|
||||
logger.info(f" Data Params: column={embedding_column}, limit={data_limit}")
|
||||
logger.info(f" Processing: perplexity={tsne_perplexity} (n_clusters is now dynamic)") # Updated log
|
||||
logger.info(f" Server: host={dash_host}, port={dash_port}, debug={dash_debug}")
|
||||
|
||||
# --- 1. Initial Load and Reduce ---
|
||||
initial_df_reduced = pd.DataFrame()
|
||||
try:
|
||||
logger.info("Performing initial data load and processing...")
|
||||
loader = VectorLoader(schema=db_schema, function=db_function, model=model_name, embedding_column=embedding_column)
|
||||
tsne_params = {"perplexity": tsne_perplexity}
|
||||
initial_df_reduced = loader.load_and_reduce(limit=data_limit, tsne_params=tsne_params)
|
||||
        if initial_df_reduced.empty:
            logger.warning("Initial data load resulted in an empty dataset.")
        else:
            logger.info(f"Successfully loaded and reduced {len(initial_df_reduced)} vectors initially.")
    except VectorLoaderError as e:
        logger.error(f"Initial data load failed: {e}", exc_info=dash_debug)
    except Exception as e:
        logger.error(f"Unexpected error during initial data load: {e}", exc_info=dash_debug)
|
||||
|
||||
# --- 2. Initialize and Start Visualization ---
|
||||
try:
|
||||
logger.info("Initializing VectorVisualizer...")
|
||||
visualizer = VectorVisualizer(
|
||||
initial_data=initial_df_reduced,
|
||||
db_schema=db_schema,
|
||||
db_function=db_function,
|
||||
interface_name=interface_name,
|
||||
model_name=model_name,
|
||||
embedding_column=embedding_column,
|
||||
initial_limit=data_limit,
|
||||
initial_perplexity=tsne_perplexity
|
||||
# n_clusters argument removed
|
||||
)
|
||||
logger.info("Launching visualizer...")
|
||||
visualizer.run(host=dash_host, port=dash_port, debug=dash_debug)
|
||||
except TypeError as te:
|
||||
logger.error(f"TypeError during VectorVisualizer initialization: {te}", exc_info=True)
|
||||
sys.exit(1)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize or run visualizer: {e}", exc_info=dash_debug)
|
||||
sys.exit(1)
|
||||
|
||||
logger.info("--- VecMap Visualizer finished ---")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
52
librarian/plugins/librarian-vspace/pyproject.toml
Normal file
@ -0,0 +1,52 @@
[project]
name = "librarian-vspace"
version = "0.1.0"
description = "Vector-space tooling for the Librarian monorepo: embedding generation, vector search, and t-SNE visualization"
readme = "README.md"
authors = [
    { name = "TheOriginalGraLargeShrimpakaReaper", email = "graber-michael@hotmail.com" }
]
requires-python = ">=3.10"
dependencies = [
    "librarian-core",
    "importlib_metadata; python_version<'3.10'",
    "dotenv>=0.9.9",
    "psycopg2-binary>=2.9.10",
    "python-dotenv>=1.1.0",
    "requests>=2.32.3",
    "supabase>=2.15.0",
    "numpy>=2.2.5",
    "dash>=3.0.4",
    "scikit-learn>=1.6.1",
    "plotly>=6.0.1",
    "pandas>=2.2.3",
    "pathlib>=1.0.1",
    "prefect>=3.4.1",
]

[tool.uv.sources]
librarian-core = { git = "https://github.com/DotNaos/librarian-core", rev = "dev" }

[build-system]
requires = ["hatchling>=1.21"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src/librarian_vspace"]

[tool.hatch.metadata]
allow-direct-references = true

# ───────── optional: dev / test extras ─────────
[project.optional-dependencies]
dev = ["ruff", "pytest", "mypy"]

[project.entry-points."librarian.workers"]
embedder = "librarian_vspace.vecembed:EmbedderWorker"
clusterexporter = "librarian_vspace.vquery:ClusterExportWorker"
tsneexport = "librarian_vspace.vecview:TsneExportWorker"
vectorquerying = "librarian_vspace.vquery:QueryWorker"
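The [project.entry-points."librarian.workers"] table is how a host application can discover these workers without importing the plugin explicitly. Below is a minimal discovery sketch using only the standard-library importlib.metadata; how librarian-core actually consumes this group is not shown in this commit.

from importlib.metadata import entry_points

def discover_librarian_workers() -> dict[str, type]:
    """Map entry-point name -> loaded worker class for the 'librarian.workers' group."""
    workers: dict[str, type] = {}
    for ep in entry_points(group="librarian.workers"):
        workers[ep.name] = ep.load()  # e.g. imports librarian_vspace.vecembed:EmbedderWorker
    return workers

if __name__ == "__main__":
    for name, cls in discover_librarian_workers().items():
        print(f"{name}: {cls.__module__}.{cls.__qualname__}")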
@ -0,0 +1,22 @@
"""Embedding‑related helpers."""
import pkgutil
import importlib

__all__ = []

# Iterate over all modules in this package
for finder, module_name, is_pkg in pkgutil.iter_modules(__path__):
    # import the sub-module
    module = importlib.import_module(f"{__name__}.{module_name}")

    # decide which names to re-export:
    # use module.__all__ if it exists, otherwise every non-private attribute
    public_names = getattr(
        module, "__all__", [n for n in dir(module) if not n.startswith("_")]
    )

    # bring each name into the package namespace
    for name in public_names:
        globals()[name] = getattr(module, name)
        __all__.append(name)  # type: ignore
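For orientation, a hypothetical illustration of what this auto-import loop buys; the package and module names below are invented purely to show the effect.

# Hypothetical layout, assuming this __init__.py sits in a package named "vecview":
#
#   vecview/
#       __init__.py        <- the auto-import loop above
#       tsne_export.py     <- defines TsneExportWorker
#
# After the loop runs, public names from submodules are importable from the package root:
from vecview import TsneExportWorker  # instead of vecview.tsne_export.TsneExportWorker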
@ -0,0 +1,38 @@
"""Pydantic models for vector search requests and responses."""

from __future__ import annotations

from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field


class VectorSearchRequest(BaseModel):
    """Input payload for a vector search."""

    interface_name: str = Field(..., description="Name of the embedding interface")
    model_name: str = Field(..., description="Name of the embedding model")
    search_string: str = Field(..., description="The natural language query to embed and search for")
    filters: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Optional key/value filters applied server‑side",
    )
    top_k: int = Field(10, ge=1, le=100, description="Number of matches to return")
    embedding_column: str = Field(
        "embedding",
        description="Name of the embedding column in the database table",
    )


class Chunklet(BaseModel):
    """Single result row returned by the database RPC."""

    chunk: Optional[str] = None
    file_id: Optional[str | int] = None


class VectorSearchResponse(BaseModel):
    """Output payload wrapping vector‑search results."""

    total: int
    results: List[Chunklet]
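A small usage sketch for these models, assuming Pydantic v2; the import path is assumed, since the file's location inside the package is not visible in this diff.

from pydantic import ValidationError
# from librarian_vspace.vquery.models import VectorSearchRequest, VectorSearchResponse  # path assumed

req = VectorSearchRequest(
    interface_name="ollama",
    model_name="snowflake-arctic-embed2",
    search_string="how does backpropagation work?",
    top_k=5,
)
print(req.model_dump())  # payload ready for whatever endpoint or RPC consumes it

raw = {"total": 2, "results": [{"chunk": "some text", "file_id": 17}, {"chunk": None, "file_id": "abc"}]}
try:
    resp = VectorSearchResponse.model_validate(raw)  # Pydantic v2 API
    print(resp.results[0].file_id)
except ValidationError as exc:
    print(f"Bad payload: {exc}")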
@ -0,0 +1,31 @@
"""Data models for t‑SNE exports.

These models are used by *vecview* and any endpoint that needs to return or
validate t‑SNE projection data.
"""

from __future__ import annotations

from typing import List, Optional
from pydantic import BaseModel


class TSNEPoint(BaseModel):
    """A single point in a 3‑D t‑SNE projection."""

    x: float
    y: float
    z: float
    file_id: str
    chunk: str
    cluster: Optional[str] = None
    hover_text: Optional[str] = None


class TSNEData(BaseModel):
    """Container returned to callers requesting a t‑SNE view."""

    course_id: Optional[int] = None
    total: int
    points: List[TSNEPoint]
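One plausible way to build this payload from the reduced DataFrame produced by the loader later in this commit; the helper itself is not part of the commit, and the column names simply mirror the loader's output.

import pandas as pd

def dataframe_to_tsne_data(df: pd.DataFrame, course_id: int | None = None) -> TSNEData:
    """Wrap rows with x/y/z/file_id/chunk (and optional cluster) columns into TSNEData."""
    has_cluster = "cluster" in df.columns
    points = [
        TSNEPoint(
            x=float(row.x), y=float(row.y), z=float(row.z),
            file_id=str(row.file_id), chunk=str(row.chunk),
            cluster=str(row.cluster) if has_cluster else None,
        )
        for row in df.itertuples(index=False)
    ]
    return TSNEData(course_id=course_id, total=len(points), points=points)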
@ -0,0 +1,9 @@
"""Embedding‑related helpers."""
from __future__ import annotations

from .vector_inserter import VectorInserter
from .embedding_generator import EmbeddingGenerator
from .embedding_workflow import EmbeddingWorkflow

__all__ = ["VectorInserter", "EmbeddingGenerator", "EmbeddingWorkflow"]
@ -0,0 +1,155 @@
|
||||
|
||||
"""Parallel‑aware embedding helpers.
|
||||
|
||||
* **embed_single_file()** – embed one file (sync).
|
||||
* **run_embedder()** – embed all files in a course (async, kept for back‑compat).
|
||||
* **_create_hnsw_index()** – helper to (re)build PGVector HNSW index.
|
||||
|
||||
This file contains no Prefect code; it’s pure embedding logic.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Any, List, Union
|
||||
|
||||
from postgrest import APIResponse
|
||||
|
||||
from librarian_core.temp_payloads.chunk_data import ChunkCourse, ChunkFile
|
||||
from librarian_vspace.vecembed.embedding_generator import EmbeddingGenerator
|
||||
from librarian_vspace.vecembed.vector_inserter import VectorInserter
|
||||
from librarian_vspace.vecembed.embedding_workflow import EmbeddingWorkflow
|
||||
from librarian_vspace.vutils.supabase_singleton import MySupabase
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def _autodiscover_pg_conn():
|
||||
supa = MySupabase.get_client() # type: ignore
|
||||
if supa is None:
|
||||
raise RuntimeError("MySupabase.get_client() returned None – no DB connection.")
|
||||
return supa
|
||||
|
||||
|
||||
def _create_hnsw_index(
|
||||
supa,
|
||||
table_fqn: str,
|
||||
*,
|
||||
column_name: str = "embedding",
|
||||
query_operator: str = "<=>",
|
||||
m: int = 16,
|
||||
ef: int = 64,
|
||||
) -> None:
|
||||
if "." not in table_fqn:
|
||||
raise ValueError("table_fqn must be schema.table")
|
||||
schema, table = table_fqn.split(".", 1)
|
||||
try:
|
||||
supa.schema(schema).rpc(
|
||||
"create_or_reindex_hnsw",
|
||||
dict(
|
||||
p_schema=schema,
|
||||
p_table=table,
|
||||
p_column=column_name,
|
||||
p_operator=query_operator,
|
||||
p_m=m,
|
||||
p_ef=ef,
|
||||
),
|
||||
).execute()
|
||||
except Exception:
|
||||
logger.exception("Failed to run create_or_reindex_hnsw")
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
# single file #
|
||||
# --------------------------------------------------------------------------- #
|
||||
def embed_single_file(
|
||||
*,
|
||||
course_id: str,
|
||||
file_entry: dict | ChunkFile | SimpleNamespace,
|
||||
concat_path: Union[str, Path],
|
||||
db_schema: str = "librarian",
|
||||
db_function: str = "pdf_chunking",
|
||||
interface_name: str = "ollama",
|
||||
model_name: str = "snowflake-arctic-embed2",
|
||||
file_type: str = "md",
|
||||
) -> Path | None:
|
||||
|
||||
if isinstance(file_entry, (dict, SimpleNamespace)):
|
||||
file_name = file_entry["file_name"] if isinstance(file_entry, dict) else file_entry.file_name
|
||||
file_id = file_entry["file_id"] if isinstance(file_entry, dict) else file_entry.file_id
|
||||
else:
|
||||
file_name, file_id = file_entry.file_name, file_entry.file_id
|
||||
|
||||
chunk_path = Path(concat_path) / file_name
|
||||
if not chunk_path.exists():
|
||||
logger.warning("Missing chunk file %s – skipping", chunk_path)
|
||||
return None
|
||||
|
||||
generator = EmbeddingGenerator()
|
||||
inserter = VectorInserter(schema=db_schema, function=db_function, model=model_name)
|
||||
|
||||
wf = EmbeddingWorkflow(
|
||||
chunk_path=chunk_path,
|
||||
course_id=course_id,
|
||||
file_id=file_id,
|
||||
file_type=file_type,
|
||||
interface_name=interface_name,
|
||||
model_name=model_name,
|
||||
generator=generator,
|
||||
inserter=inserter,
|
||||
)
|
||||
wf.process()
|
||||
return chunk_path
|
||||
|
||||
# --------------------------------------------------------------------------- #
|
||||
async def run_embedder(
|
||||
course: ChunkCourse,
|
||||
concat_path: Union[str, Path],
|
||||
*,
|
||||
db_schema: str = "librarian",
|
||||
db_function: str = "pdf_chunking",
|
||||
interface_name: str = "ollama",
|
||||
model_name: str = "snowflake-arctic-embed2",
|
||||
file_type: str = "md",
|
||||
vector_column: str = "embedding",
|
||||
query_operator: str = "<=>",
|
||||
hnsw_m: int = 16,
|
||||
hnsw_ef: int = 64,
|
||||
max_parallel_files: int | None = None,
|
||||
) -> Path:
|
||||
|
||||
supa_client = _autodiscover_pg_conn()
|
||||
root = Path(concat_path)
|
||||
sem = asyncio.Semaphore(max_parallel_files or len(course.files) or 1)
|
||||
|
||||
async def _wrapper(cf):
|
||||
async with sem:
|
||||
return await asyncio.to_thread(
|
||||
embed_single_file,
|
||||
course_id=course.course_id,
|
||||
file_entry=cf,
|
||||
concat_path=root,
|
||||
db_schema=db_schema,
|
||||
db_function=db_function,
|
||||
interface_name=interface_name,
|
||||
model_name=model_name,
|
||||
file_type=file_type,
|
||||
)
|
||||
|
||||
await asyncio.gather(*[asyncio.create_task(_wrapper(cf)) for cf in course.files])
|
||||
|
||||
inserter = VectorInserter(schema=db_schema, function=db_function, model=model_name)
|
||||
_create_hnsw_index(
|
||||
supa_client,
|
||||
inserter.table_fqn(),
|
||||
column_name=vector_column,
|
||||
query_operator=query_operator,
|
||||
m=hnsw_m,
|
||||
ef=hnsw_ef,
|
||||
)
|
||||
return root
|
||||
|
||||
__all__ = ["embed_single_file", "run_embedder", "_create_hnsw_index", "_autodiscover_pg_conn"]
|
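A hedged usage sketch for the synchronous path above. It assumes a configured Supabase client (via MySupabase) and a reachable Ollama server; the course id, file entry, and directory below are illustrative only.

from pathlib import Path

from librarian_vspace.vecembed.embedder import embed_single_file

result = embed_single_file(
    course_id="demo-course",
    file_entry={"file_name": "lecture_01.md", "file_id": 42},  # dict form is accepted alongside ChunkFile
    concat_path=Path("/tmp/librarian/concat"),  # directory that already holds lecture_01.md
    interface_name="ollama",
    model_name="snowflake-arctic-embed2",
)
print("embedded:", result)  # None means the chunk file was missing and the file was skipped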
@ -0,0 +1,67 @@
|
||||
|
||||
"""EmbedderWorker – Prefect‑mapped per file."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Any, List
|
||||
|
||||
from prefect import get_run_logger, task, unmapped
|
||||
from pydantic import BaseModel, Field
|
||||
from librarian_core.workers.base import Worker
|
||||
|
||||
@task(name="embed_file", retries=2, retry_delay_seconds=5, log_prints=True, tags=["embed_file"])
|
||||
def embed_file_task(course_dict: dict | SimpleNamespace, file_entry: dict, concat_path: Path) -> Path | None:
|
||||
from librarian_vspace.vecembed.embedder import embed_single_file
|
||||
cid = course_dict["course_id"] if isinstance(course_dict, dict) else course_dict.course_id
|
||||
return embed_single_file(course_id=cid, file_entry=file_entry, concat_path=concat_path)
|
||||
|
||||
class EmbedderInput(BaseModel):
|
||||
chunk_courses: List[Any] = Field(default_factory=list, alias="chunk_courses")
|
||||
concat_path: Path
|
||||
chunk_course: Any | None = None
|
||||
def model_post_init(self, _):
|
||||
if not self.chunk_courses and self.chunk_course is not None:
|
||||
self.chunk_courses = [self.chunk_course]
|
||||
model_config = dict(populate_by_name=True, extra="allow")
|
||||
|
||||
class EmbedderOutput(BaseModel):
|
||||
result_paths: List[Path]
|
||||
|
||||
class EmbedderWorker(Worker[EmbedderInput, EmbedderOutput]):
|
||||
input_model = EmbedderInput
|
||||
output_model = EmbedderOutput
|
||||
|
||||
async def __run__(self, payload: EmbedderInput) -> EmbedderOutput:
|
||||
log = get_run_logger()
|
||||
total_files = sum(len(c["files"]) if isinstance(c, dict) else len(c.files) for c in payload.chunk_courses)
|
||||
log.info("Embedding %d files", total_files)
|
||||
|
||||
result_paths: List[Path] = []
|
||||
|
||||
# constants – could be parameterised later
|
||||
schema = "librarian"
|
||||
func = "pdf_chunking"
|
||||
model_name = "snowflake-arctic-embed2"
|
||||
|
||||
for course in payload.chunk_courses:
|
||||
files = course["files"] if isinstance(course, dict) else course.files
|
||||
futures = embed_file_task.map(unmapped(course), files, unmapped(payload.concat_path))
|
||||
for fut in futures:
|
||||
path = fut.result()
|
||||
if path:
|
||||
result_paths.append(path)
|
||||
|
||||
# rebuild index once per course
|
||||
from librarian_vspace.vecembed.embedder import _create_hnsw_index, _autodiscover_pg_conn
|
||||
from librarian_vspace.vecembed.vector_inserter import VectorInserter
|
||||
|
||||
supa = _autodiscover_pg_conn()
|
||||
inserter = VectorInserter(schema=schema, function=func, model=model_name)
|
||||
_create_hnsw_index(supa, inserter.table_fqn())
|
||||
|
||||
for p in result_paths:
|
||||
self.stage(p, new_name=p.name)
|
||||
|
||||
return EmbedderOutput(result_paths=result_paths)
|
@ -0,0 +1,21 @@
"""Factory for embedding back‑ends."""
import logging
from typing import Any, List, Optional, Tuple, Dict, Type

from librarian_vspace.vecembed.embedding_interface import EmbeddingInterface
from librarian_vspace.vecembed.ollama_embedder import OllamaEmbedder

logger = logging.getLogger(__name__)


class EmbeddingGenerator:
    _registry: Dict[str, Type[EmbeddingInterface]] = {
        "ollama": OllamaEmbedder,
    }

    def generate_embedding(self, interface_name: str, model_name: str, text_to_embed: str, identifier: Any) -> Tuple[str, Optional[List[float]], Any]:
        cls = self._registry.get(interface_name.lower())
        if not cls:
            raise ValueError(f"Unsupported embedding interface: {interface_name}")
        embedder = cls(model_name=model_name)
        return embedder.embed(text_to_embed, identifier)
@ -0,0 +1,14 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, List, Optional, Tuple


class EmbeddingInterface(ABC):
    """Contract for any embedding service implementation."""

    def __init__(self, model_name: str, **kwargs: Any) -> None:
        self.model_name = model_name

    @abstractmethod
    def embed(self, text_or_chunk: str, identifier: Any) -> Tuple[str, Optional[List[float]], Any]:
        """Return (original_text, embedding, identifier) — embedding may be None on failure."""
        pass
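Because EmbeddingGenerator dispatches on its _registry mapping, adding a back-end means subclassing EmbeddingInterface and registering the class. Here is a minimal sketch with a deterministic fake embedder (handy for tests); the hashing scheme and the "fake" key are purely illustrative, and mutating _registry from outside is a shortcut rather than an official extension point.

import hashlib
from typing import Any, List, Optional, Tuple

from librarian_vspace.vecembed.embedding_interface import EmbeddingInterface
from librarian_vspace.vecembed.embedding_generator import EmbeddingGenerator


class FakeEmbedder(EmbeddingInterface):
    """Deterministic stand-in that never touches the network."""

    def embed(self, text_or_chunk: str, identifier: Any) -> Tuple[str, Optional[List[float]], Any]:
        digest = hashlib.sha256(text_or_chunk.encode("utf-8")).digest()
        vector = [b / 255.0 for b in digest[:16]]  # 16-dimensional toy vector
        return text_or_chunk, vector, identifier


# Register under a new interface name and use it through the normal factory path.
EmbeddingGenerator._registry["fake"] = FakeEmbedder
text, vec, ident = EmbeddingGenerator().generate_embedding("fake", "any-model", "hello world", identifier=1)
print(len(vec), ident)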
@ -0,0 +1,92 @@
|
||||
"""Orchestrates loading, embedding, and storing a text chunk."""
|
||||
from __future__ import annotations
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, Dict, Union
|
||||
|
||||
# Import the worker classes for type hinting
|
||||
from librarian_vspace.vecembed.embedding_generator import EmbeddingGenerator
|
||||
from librarian_vspace.vecembed.vector_inserter import VectorInserter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class EmbeddingWorkflow:
|
||||
# Accept generator and inserter instances in __init__
|
||||
def __init__(self,
|
||||
chunk_path: Union[str, Path],
|
||||
course_id: Any,
|
||||
file_id: Any,
|
||||
file_type: str,
|
||||
interface_name: str, # Still needed for generate_embedding method
|
||||
model_name: str, # Still needed for generate_embedding method
|
||||
generator: EmbeddingGenerator, # Accept pre-instantiated generator
|
||||
inserter: VectorInserter, # Accept pre-instantiated inserter
|
||||
# db_schema and db_function are now implicit via the inserter
|
||||
# db_schema: str = "librarian",
|
||||
# db_function: str = "pdf_chunking",
|
||||
):
|
||||
self.chunk_path = Path(chunk_path)
|
||||
self.course_id = course_id
|
||||
self.file_id = file_id
|
||||
self.file_type = file_type
|
||||
# Keep interface_name and model_name as they are passed to the generator's method
|
||||
self.interface_name = interface_name
|
||||
self.model_name = model_name
|
||||
|
||||
# Assign the passed instances instead of creating new ones
|
||||
self.generator = generator
|
||||
self.inserter = inserter
|
||||
|
||||
# No need to store db_schema/db_function here if inserter handles it
|
||||
|
||||
# ---------------- helpers ----------------
|
||||
def _load_chunk(self) -> Optional[str]:
|
||||
try:
|
||||
text = self.chunk_path.read_text(encoding="utf-8").strip()
|
||||
if not text:
|
||||
logger.warning("Chunk %s is empty", self.chunk_path)
|
||||
return None
|
||||
return text
|
||||
except Exception as exc:
|
||||
logger.error("Failed to read %s: %s", self.chunk_path, exc)
|
||||
return None
|
||||
|
||||
def process(self) -> bool:
|
||||
chunk_text = self._load_chunk()
|
||||
if chunk_text is None:
|
||||
return False
|
||||
|
||||
# Use the shared generator instance
|
||||
original_text, vector, _ = self.generator.generate_embedding(
|
||||
interface_name=self.interface_name, # Pass parameters to the method
|
||||
model_name=self.model_name, # Pass parameters to the method
|
||||
text_to_embed=chunk_text,
|
||||
identifier=self.file_id,
|
||||
)
|
||||
|
||||
if vector is None:
|
||||
# Log failure within generator if not already done, or here
|
||||
logger.error(f"Failed to generate embedding for {self.chunk_path}")
|
||||
return False
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
"chunk": original_text,
|
||||
"course_id": self.course_id,
|
||||
"file_id": self.file_id,
|
||||
"file_type": self.file_type,
|
||||
"embedding": vector,
|
||||
}
|
||||
|
||||
# Use the shared inserter instance
|
||||
insert_result = self.inserter.insert_vector(payload)
|
||||
|
||||
if insert_result is None:
|
||||
logger.error(f"Failed to insert vector for {self.chunk_path}")
|
||||
return False
|
||||
|
||||
logger.debug(f"Successfully processed and inserted {self.chunk_path}")
|
||||
return True # Indicate success
|
||||
|
||||
|
||||
# Keep __all__ if needed
|
||||
# __all__ = ["EmbeddingWorkflow"]
|
@ -0,0 +1,44 @@
"""Ollama-based embedding implementation (env handled at application layer)."""
from __future__ import annotations

import logging
import os
from typing import Any, List, Optional, Tuple

import requests

from librarian_vspace.vecembed.embedding_interface import EmbeddingInterface

logger = logging.getLogger(__name__)


class OllamaEmbedder(EmbeddingInterface):
    def __init__(self, model_name: str, **kwargs: Any) -> None:
        super().__init__(model_name=model_name)
        self.base_url = os.getenv("OLLAMA_BASE_URL")
        if not self.base_url:
            raise ValueError("OLLAMA_BASE_URL not configured – ensure env is set in the examples layer")
        self.api_endpoint = f"{self.base_url.rstrip('/')}/api/embeddings"

    def embed(self, text_or_chunk: str, identifier: Any) -> Tuple[str, Optional[List[float]], Any]:
        payload = {"model": self.model_name, "prompt": text_or_chunk}
        vector: Optional[List[float]] = None
        try:
            logger.debug("Requesting embedding for id=%s", identifier)
            resp = requests.post(self.api_endpoint, json=payload, timeout=3600, headers={"Content-Type": "application/json"})
            resp.raise_for_status()
            data = resp.json()
            if isinstance(data.get("embedding"), list):
                vector = data["embedding"]
                logger.debug("Received embedding dim=%d for id=%s", len(vector), identifier)
            else:
                logger.error("Invalid response from Ollama: %s", data)
        except requests.exceptions.Timeout:
            logger.error("Timeout contacting Ollama at %s", self.api_endpoint)
        except requests.exceptions.RequestException as exc:
            logger.error("HTTP error contacting Ollama: %s", exc)
        except Exception:
            logger.exception("Unexpected error during embed for id=%s", identifier)
        return text_or_chunk, vector, identifier
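For completeness, a usage sketch. OLLAMA_BASE_URL must point at a running Ollama instance (the URL below is the usual local default, not something this commit sets), and the model must already be pulled.

import os

from librarian_vspace.vecembed.ollama_embedder import OllamaEmbedder

os.environ.setdefault("OLLAMA_BASE_URL", "http://localhost:11434")  # assumed local default

embedder = OllamaEmbedder(model_name="snowflake-arctic-embed2")
text, vector, ident = embedder.embed("a short test sentence", identifier="demo")
if vector is None:
    print("embedding failed - check the Ollama server and model name")
else:
    print(f"got a {len(vector)}-dimensional embedding for {ident!r}")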
@ -0,0 +1,23 @@
from __future__ import annotations
import logging
from typing import Any, Dict, List, Optional
from librarian_vspace.vutils.vector_class import BaseVectorOperator

logger = logging.getLogger(__name__)


class VectorInserter(BaseVectorOperator):
    """High-level write helper for embeddings."""

    def insert_vector(self, data: Dict[str, Any]) -> Optional[List[Dict[str, Any]]]:
        if not self.table:
            logger.error("Table resolution failed earlier")
            return None
        preview = {k: (f"<vector,len={len(v)}>" if k == "embedding" else v) for k, v in data.items()}
        logger.debug("Insert → %s.%s :: %s", self.schema, self.table, preview)
        try:
            resp = self.spc.schema(self.schema).table(self.table).insert(data).execute()
            return resp.data if isinstance(resp.data, list) else []
        except Exception:
            logger.exception("Insert failed for %s", self.table_fqn())
            return None
@ -0,0 +1,2 @@
def hello() -> str:
    return "Hello from librarian_vspace!"
@ -0,0 +1,264 @@
|
||||
"""Loads vectors from Supabase, reduces dimensions using t-SNE."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import json # Import json for parsing
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.manifold import TSNE
|
||||
|
||||
# Assuming vutils is installed or in the python path
|
||||
try:
|
||||
from librarian_vspace.vutils.vector_class import BaseVectorOperator
|
||||
except ImportError as e:
|
||||
logging.error(f"Failed to import vutils: {e}. Ensure vutils package is installed.")
|
||||
raise
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class VectorLoaderError(Exception):
|
||||
"""Custom exception for loader errors."""
|
||||
pass
|
||||
|
||||
|
||||
class VectorLoader:
|
||||
"""Fetches vectors and applies t-SNE."""
|
||||
|
||||
DEFAULT_TSNE_PARAMS = {
|
||||
"n_components": 3,
|
||||
"perplexity": 30.0, # Adjust based on dataset size (5-50 typically)
|
||||
"n_iter": 1000, # Minimum recommended iterations
|
||||
"learning_rate": "auto", # Usually a good default
|
||||
"init": "pca", # PCA initialization is often faster and more stable
|
||||
"random_state": 42, # For reproducibility
|
||||
"n_jobs": -1, # Use all available CPU cores
|
||||
"verbose": 1, # Log progress (controls scikit-learn's verbosity)
|
||||
}
|
||||
|
||||
def __init__(self, schema: str, function: str, model: str, embedding_column: str = "embedding"):
|
||||
"""
|
||||
Initializes the loader.
|
||||
(Constructor remains the same)
|
||||
"""
|
||||
logger.info(f"Initializing VectorLoader for {schema=}, {function=}, {model=}")
|
||||
try:
|
||||
self.operator = BaseVectorOperator(schema=schema, function=function, model=model)
|
||||
self.embedding_column = embedding_column
|
||||
if not self.operator.table:
|
||||
raise VectorLoaderError("BaseVectorOperator failed to resolve table.")
|
||||
logger.info(f"Target table resolved to: {self.operator.table_fqn()}")
|
||||
except (ImportError, ValueError, RuntimeError) as e:
|
||||
logger.exception("Failed to initialize BaseVectorOperator.")
|
||||
raise VectorLoaderError(f"Failed to initialize BaseVectorOperator: {e}") from e
|
||||
|
||||
|
||||
def _parse_vector_string(self, vector_str: Any) -> Optional[List[float]]:
|
||||
"""Safely parses the string representation of a vector into a list of floats."""
|
||||
if not isinstance(vector_str, str):
|
||||
# If it's already a list (less likely now, but safe check), return it if valid
|
||||
if isinstance(vector_str, list) and all(isinstance(n, (int, float)) for n in vector_str):
|
||||
return vector_str # Assume it's already correctly parsed
|
||||
logger.debug(f"Unexpected type for vector parsing: {type(vector_str)}. Skipping.")
|
||||
return None
|
||||
try:
|
||||
# Use json.loads which correctly handles [...] syntax
|
||||
parsed_list = json.loads(vector_str)
|
||||
if isinstance(parsed_list, list) and all(isinstance(n, (int, float)) for n in parsed_list):
|
||||
return [float(n) for n in parsed_list] # Ensure elements are floats
|
||||
else:
|
||||
logger.warning(f"Parsed vector string '{vector_str[:50]}...' but result is not a list of numbers.")
|
||||
return None
|
||||
except json.JSONDecodeError:
|
||||
logger.warning(f"Failed to JSON decode vector string: '{vector_str[:50]}...'")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error parsing vector string '{vector_str[:50]}...': {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
|
||||
def fetch_all_vectors(self, limit: Optional[int] = None) -> pd.DataFrame:
|
||||
"""
|
||||
Fetches all vectors and metadata from the resolved table.
|
||||
Parses string representations of vectors into lists.
|
||||
|
||||
Args:
|
||||
limit: Optional limit on the number of rows to fetch (for large tables).
|
||||
|
||||
Returns:
|
||||
A pandas DataFrame with columns like 'file_id', 'chunk', 'embedding' (as list).
|
||||
|
||||
Raises:
|
||||
VectorLoaderError: If fetching fails or no data is found.
|
||||
"""
|
||||
if not self.operator.table or not self.operator.schema:
|
||||
raise VectorLoaderError("Operator not initialized, table name or schema is unknown.")
|
||||
|
||||
table_name = self.operator.table
|
||||
schema_name = self.operator.schema
|
||||
select_columns = f"file_id, chunk, {self.embedding_column}"
|
||||
|
||||
logger.info(f"Fetching data from {schema_name}.{table_name} (columns: {select_columns})...")
|
||||
try:
|
||||
query = self.operator.spc.schema(schema_name).table(table_name).select(select_columns)
|
||||
if limit:
|
||||
logger.info(f"Applying limit: {limit}")
|
||||
query = query.limit(limit)
|
||||
response = query.execute()
|
||||
|
||||
if not response.data:
|
||||
logger.warning(f"No data found in table {self.operator.table_fqn()}.")
|
||||
return pd.DataFrame(columns=['file_id', 'chunk', self.embedding_column])
|
||||
|
||||
logger.info(f"Fetched {len(response.data)} rows.")
|
||||
df = pd.DataFrame(response.data)
|
||||
|
||||
# --- FIX: Parse the embedding string into a list ---
|
||||
logger.info(f"Parsing string representation in '{self.embedding_column}' column...")
|
||||
parsed_embeddings = df[self.embedding_column].apply(self._parse_vector_string)
|
||||
# Overwrite the original string column with the parsed list (or None if parsing failed)
|
||||
df[self.embedding_column] = parsed_embeddings
|
||||
logger.debug(f"Sample '{self.embedding_column}' data after parsing (first 5 rows):\n{df[[self.embedding_column]].head()}")
|
||||
# --- END FIX ---
|
||||
|
||||
|
||||
# === Enhanced Debugging for Embedding Column (Now checks the parsed list) ===
|
||||
logger.info(f"Checking validity of parsed '{self.embedding_column}' column...")
|
||||
if self.embedding_column not in df.columns:
|
||||
raise VectorLoaderError(f"Required embedding column '{self.embedding_column}' missing after processing.")
|
||||
|
||||
# 1. Check for NULLs (includes rows where parsing failed and returned None)
|
||||
initial_count = len(df)
|
||||
null_mask = df[self.embedding_column].isnull()
|
||||
null_count = null_mask.sum()
|
||||
if null_count > 0:
|
||||
logger.warning(f"Found {null_count} rows with NULL or unparsable vectors in '{self.embedding_column}'.")
|
||||
|
||||
df_no_nulls = df.dropna(subset=[self.embedding_column])
|
||||
count_after_null_drop = len(df_no_nulls)
|
||||
logger.debug(f"{count_after_null_drop} rows remaining after dropping NULLs/unparsable.")
|
||||
|
||||
# 2. Check for non-empty list type (This check might be slightly redundant now if parsing worked, but keep for safety)
|
||||
if not df_no_nulls.empty:
|
||||
def is_valid_list(x):
|
||||
# Check should pass if parsing was successful
|
||||
return isinstance(x, list) and len(x) > 0
|
||||
|
||||
valid_list_mask = df_no_nulls[self.embedding_column].apply(is_valid_list)
|
||||
invalid_list_count = len(df_no_nulls) - valid_list_mask.sum()
|
||||
|
||||
if invalid_list_count > 0:
|
||||
# This indicates an issue with the parsing logic or unexpected data format
|
||||
logger.error(f"Found {invalid_list_count} rows where '{self.embedding_column}' is not a non-empty list *after parsing*. This should not happen.")
|
||||
invalid_entries = df_no_nulls[~valid_list_mask][self.embedding_column]
|
||||
for i, entry in enumerate(invalid_entries.head(5)):
|
||||
logger.debug(f" Problematic entry example {i+1}: Type={type(entry)}, Value='{str(entry)[:100]}...'")
|
||||
|
||||
df_filtered = df_no_nulls[valid_list_mask].copy()
|
||||
else:
|
||||
df_filtered = df_no_nulls
|
||||
|
||||
final_count = len(df_filtered)
|
||||
# === End Enhanced Debugging ===
|
||||
|
||||
if final_count < initial_count:
|
||||
logger.warning(f"Filtered out {initial_count - final_count} rows total due to missing/invalid '{self.embedding_column}'.")
|
||||
|
||||
if df_filtered.empty:
|
||||
logger.warning(f"No valid embedding data found after filtering. Check data in table {self.operator.table_fqn()} and parsing logic.")
|
||||
return pd.DataFrame(columns=['file_id', 'chunk', self.embedding_column])
|
||||
|
||||
logger.info(f"Proceeding with {final_count} valid rows.")
|
||||
|
||||
# Validate and potentially add placeholder metadata columns AFTER filtering
|
||||
if 'file_id' not in df_filtered.columns:
|
||||
logger.warning("'file_id' column missing, using index instead.")
|
||||
df_filtered['file_id'] = df_filtered.index
|
||||
if 'chunk' not in df_filtered.columns:
|
||||
logger.warning("'chunk' column missing, hover text will be limited.")
|
||||
df_filtered['chunk'] = "N/A"
|
||||
|
||||
return df_filtered
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Failed to fetch data from {self.operator.table_fqn()}.")
|
||||
if 'relation' in str(e) and 'does not exist' in str(e):
|
||||
raise VectorLoaderError(f"Table/Relation not found error: {e}. Check schema/table name and permissions.") from e
|
||||
else:
|
||||
raise VectorLoaderError(f"Database query failed: {e}") from e
|
||||
|
||||
# reduce_dimensions and load_and_reduce methods remain the same as the previous version
|
||||
# (they expect df with a valid list in the embedding column)
|
||||
|
||||
def reduce_dimensions(self, df: pd.DataFrame, tsne_params: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
|
||||
"""
|
||||
Applies t-SNE to reduce embedding dimensions to 3D.
|
||||
(Code remains the same as previous correct version)
|
||||
"""
|
||||
if df.empty:
|
||||
logger.warning("Input DataFrame for reduce_dimensions is empty. Returning empty DataFrame.")
|
||||
empty_df_with_cols = df.copy()
|
||||
for col in ['x', 'y', 'z']:
|
||||
if col not in empty_df_with_cols:
|
||||
empty_df_with_cols[col] = pd.Series(dtype=float)
|
||||
return empty_df_with_cols
|
||||
|
||||
if self.embedding_column not in df.columns:
|
||||
raise VectorLoaderError(f"Embedding column '{self.embedding_column}' missing in DataFrame passed to reduce_dimensions.")
|
||||
|
||||
try:
|
||||
embeddings = np.array(df[self.embedding_column].tolist(), dtype=float)
|
||||
except ValueError as ve:
|
||||
logger.error(f"Failed to convert embedding list to numeric numpy array: {ve}")
|
||||
raise VectorLoaderError(f"Data in '{self.embedding_column}' could not be converted to numeric vectors.") from ve
|
||||
|
||||
if embeddings.ndim != 2:
|
||||
raise VectorLoaderError(f"Embedding data has unexpected dimensions: {embeddings.ndim} (expected 2). Shape: {embeddings.shape}")
|
||||
|
||||
n_samples = embeddings.shape[0]
|
||||
|
||||
if n_samples < 2:
|
||||
logger.warning(f"Found only {n_samples} valid vector(s). t-SNE requires at least 2. Assigning default 3D coordinates.")
|
||||
default_coords = [[0.0, 0.0, 0.0]] * n_samples
|
||||
df[['x', 'y', 'z']] = default_coords
|
||||
return df
|
||||
|
||||
logger.info(f"Applying t-SNE to {n_samples} vectors of dimension {embeddings.shape[1]}...")
|
||||
|
||||
current_tsne_params = self.DEFAULT_TSNE_PARAMS.copy()
|
||||
if tsne_params:
|
||||
current_tsne_params.update(tsne_params)
|
||||
logger.info(f"Using custom t-SNE params: {tsne_params}")
|
||||
|
||||
if n_samples <= current_tsne_params['perplexity']:
|
||||
new_perplexity = max(5.0, float(n_samples - 1))
|
||||
logger.warning(f"Adjusting t-SNE perplexity from {current_tsne_params['perplexity']:.1f} "
|
||||
f"to {new_perplexity:.1f} due to low sample count ({n_samples}).")
|
||||
current_tsne_params['perplexity'] = new_perplexity
|
||||
|
||||
if n_samples * embeddings.shape[1] > 100000 and current_tsne_params['max_iter'] < 1000:
logger.warning(f"Dataset size seems large, increasing t-SNE max_iter from {current_tsne_params['max_iter']} to 1000 for better convergence.")
current_tsne_params['max_iter'] = 1000
|
||||
|
||||
try:
|
||||
logger.debug(f"Final t-SNE parameters: {current_tsne_params}")
|
||||
tsne = TSNE(**current_tsne_params)
|
||||
reduced_embeddings = tsne.fit_transform(embeddings)
|
||||
|
||||
df[['x', 'y', 'z']] = reduced_embeddings
|
||||
logger.info("t-SNE reduction complete.")
|
||||
return df
|
||||
|
||||
except Exception as e:
|
||||
logger.exception("t-SNE dimensionality reduction failed.")
|
||||
raise VectorLoaderError(f"t-SNE failed: {e}") from e
|
||||
|
||||
|
||||
def load_and_reduce(self, limit: Optional[int] = None, tsne_params: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
|
||||
"""Orchestrates fetching vectors and reducing dimensions."""
|
||||
logger.info("Starting vector load and reduction process...")
|
||||
df_raw_filtered = self.fetch_all_vectors(limit=limit)
|
||||
df_reduced = self.reduce_dimensions(df_raw_filtered, tsne_params=tsne_params)
|
||||
logger.info("Vector load and reduction process finished.")
|
||||
return df_reduced
|
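A usage sketch matching how the visualizer entry point earlier in this commit drives this class. The schema/function/model values are the defaults used elsewhere in the plugin, the module path is assumed, and BaseVectorOperator needs a configured Supabase connection.

import logging

from librarian_vspace.vecview.vector_loader import VectorLoader, VectorLoaderError  # module path assumed; not shown in this diff

logging.basicConfig(level=logging.INFO)

try:
    loader = VectorLoader(
        schema="librarian",
        function="pdf_chunking",
        model="snowflake-arctic-embed2",
        embedding_column="embedding",
    )
    df = loader.load_and_reduce(limit=500, tsne_params={"perplexity": 25.0})
    print(df[["file_id", "x", "y", "z"]].head())
except VectorLoaderError as exc:
    print(f"loading failed: {exc}")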
@ -0,0 +1,776 @@
|
||||
# --- START OF FILE visualizer.py ---
|
||||
|
||||
"""Dash/Plotly based 3D visualizer for vector embeddings with tabs, clustering, filtering, and centroid click interaction."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from io import StringIO
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import dash
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import plotly.graph_objects as go
|
||||
import plotly.express as px
|
||||
from dash import dcc, html, ctx # Import ctx
|
||||
from dash.dependencies import Input, Output, State
|
||||
from dash.exceptions import PreventUpdate
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
# --- Imports ---
|
||||
try:
|
||||
from librarian_vspace.vecembed.embedding_generator import EmbeddingGenerator
|
||||
except ImportError as e:
|
||||
logging.error(f"Import vecembed failed: {e}. Using Dummy.")
|
||||
|
||||
|
||||
# Define dummy class correctly indented
|
||||
class EmbeddingGenerator:
|
||||
"""Dummy class if vecembed import fails."""
|
||||
|
||||
def generate_embedding(*args, **kwargs) -> Tuple[
|
||||
str, None, Any]: # Match expected output type Optional[List[float]]
|
||||
logging.error("Dummy EmbeddingGenerator called.")
|
||||
text_to_embed = kwargs.get("text_to_embed", args[3] if len(args) > 3 else "unknown")
|
||||
identifier = kwargs.get("identifier", args[4] if len(args) > 4 else "unknown")
|
||||
logger.debug(f"Dummy generate_embedding called for text='{text_to_embed}', id='{identifier}'")
|
||||
# Return None for the vector part to match expected type
|
||||
return text_to_embed, None, identifier
|
||||
|
||||
try:
|
||||
from librarian_vspace.vutils.vector_query_loader import VectorQueryLoader as VectorLoader, VectorQueryLoaderError as VectorLoaderError
|
||||
except ImportError as e:
|
||||
logging.error(f"Import loader failed: {e}. Using Dummy.")
|
||||
|
||||
|
||||
# Define dummy classes correctly indented
|
||||
class VectorLoader:
|
||||
"""Dummy class if loader import fails."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
logging.error("Dummy VectorLoader initialized.")
|
||||
pass
|
||||
|
||||
def load_and_reduce(self, *args, **kwargs) -> pd.DataFrame:
|
||||
logging.error("Dummy VectorLoader load_and_reduce called.")
|
||||
return pd.DataFrame() # Return empty DataFrame
|
||||
|
||||
|
||||
class VectorLoaderError(Exception):
|
||||
"""Dummy exception if loader import fails."""
|
||||
pass
|
||||
# --- End Imports ---
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
DEFAULT_N_CLUSTERS = 8
|
||||
|
||||
# Opacity constants
|
||||
OPACITY_DEFAULT = 0.8
|
||||
OPACITY_SEARCH_DIMMED = 0.1 # Dimmed opacity for points outside the current search results (lowered from the original 0.6)
|
||||
OPACITY_SELECTED_CLUSTER = 0.9
|
||||
|
||||
|
||||
class VectorVisualizer:
|
||||
def __init__(self, initial_data: pd.DataFrame,
|
||||
db_schema: str, db_function: str,
|
||||
interface_name: str, model_name: str,
|
||||
embedding_column: str = "embedding",
|
||||
initial_limit: Optional[int] = None,
|
||||
initial_perplexity: float = 30.0,
|
||||
n_clusters: int = DEFAULT_N_CLUSTERS
|
||||
):
|
||||
required_cols = ['x', 'y', 'z', 'file_id', 'chunk', embedding_column]
|
||||
processed_data_json: Optional[str] = None
|
||||
processed_color_map: Dict = {}
|
||||
processed_original_embeddings: np.ndarray = np.array([])
|
||||
processed_cluster_centroids: Dict[str, List[float]] = {}
|
||||
|
||||
self.embedding_column = embedding_column
|
||||
self.n_clusters = n_clusters
|
||||
self.db_schema = db_schema
|
||||
self.db_function = db_function
|
||||
self.model_name = model_name
|
||||
self.limit = initial_limit
|
||||
self.perplexity = initial_perplexity
|
||||
self.interface_name = interface_name
|
||||
# Use the correctly defined EmbeddingGenerator (either real or dummy)
|
||||
self.app = dash.Dash(__name__, suppress_callback_exceptions=True)
|
||||
self.embedding_generator = EmbeddingGenerator() # Instantiated here
|
||||
|
||||
if initial_data.empty or not all(col in initial_data.columns for col in required_cols):
|
||||
logger.warning("Initial DataFrame empty/invalid.")
|
||||
base_cols = required_cols + ['cluster', 'hover_text']
|
||||
initial_df_processed = pd.DataFrame(columns=base_cols)
|
||||
else:
|
||||
try:
|
||||
logger.info("Processing initial data...")
|
||||
df_copy = initial_data.copy()
|
||||
df_after_kmeans, kmeans_color_map = self._run_kmeans(df_copy, self.n_clusters)
|
||||
if not isinstance(df_after_kmeans, pd.DataFrame): raise TypeError("KMeans failed.")
|
||||
processed_color_map = kmeans_color_map
|
||||
df_after_prepare = self._prepare_plot_data(df_after_kmeans)
|
||||
if not isinstance(df_after_prepare, pd.DataFrame): raise TypeError("Prep data failed.")
|
||||
initial_df_processed = df_after_prepare
|
||||
if not initial_df_processed.empty and all(
|
||||
c in initial_df_processed for c in ['x', 'y', 'z', 'cluster']):
|
||||
processed_cluster_centroids = self._calculate_centroids(initial_df_processed)
|
||||
else:
|
||||
logger.warning("Could not calculate initial centroids.")
|
||||
if not initial_df_processed.empty:
|
||||
processed_data_json = initial_df_processed.to_json(date_format='iso', orient='split')
|
||||
else:
|
||||
logger.warning("DataFrame empty after processing.")
|
||||
if not initial_df_processed.empty and self.embedding_column in initial_df_processed.columns:
|
||||
try:
|
||||
emb = initial_df_processed[self.embedding_column].iloc[0]
|
||||
if isinstance(emb, np.ndarray):
|
||||
processed_original_embeddings = np.stack(initial_df_processed[self.embedding_column].values)
|
||||
elif isinstance(emb, list):
|
||||
processed_original_embeddings = np.array(
|
||||
initial_df_processed[self.embedding_column].tolist(), dtype=float)
|
||||
else:
|
||||
raise TypeError("Unsupported embedding type.")
|
||||
except Exception as emb_err:
|
||||
logger.error(f"Embed processing error: {emb_err}"); processed_original_embeddings = np.array([])
|
||||
else:
|
||||
logger.warning("Could not extract original embeddings.")
|
||||
except Exception as e:
|
||||
logger.error(f"Initial processing error: {e}", exc_info=True)
|
||||
processed_data_json, processed_color_map, processed_original_embeddings, processed_cluster_centroids = None, {}, np.array(
|
||||
[]), {}
|
||||
initial_df_processed = pd.DataFrame()
|
||||
|
||||
self.initial_data_json = processed_data_json
|
||||
self.initial_cluster_color_map = processed_color_map
|
||||
self.initial_cluster_centroids = processed_cluster_centroids
|
||||
self.original_embeddings = processed_original_embeddings
|
||||
# Determine slider limits and elbow‑based default
|
||||
self.max_clusters = max(1, len(initial_data))
|
||||
try:
|
||||
self.optimal_clusters = self._estimate_optimal_clusters(processed_original_embeddings,
|
||||
max_k=min(10, self.max_clusters))
|
||||
except Exception:
|
||||
self.optimal_clusters = self.n_clusters
|
||||
# Use elbow result as the current cluster count
|
||||
self.n_clusters = self.optimal_clusters
|
||||
self._build_layout();
|
||||
self._register_callbacks()
|
||||
|
||||
def _run_kmeans(self, df: pd.DataFrame, n_clusters: int) -> Tuple[pd.DataFrame, Dict[str, str]]:
|
||||
"""Runs K-Means, assigns string cluster labels."""
|
||||
default_map = {"-1": "grey"}
|
||||
if df.empty or self.embedding_column not in df.columns: df['cluster'] = "-1"; return df, default_map
|
||||
try:
|
||||
emb_col = df[self.embedding_column]
|
||||
if isinstance(emb_col.iloc[0], np.ndarray):
|
||||
embeddings = np.stack(emb_col.values)
|
||||
elif isinstance(emb_col.iloc[0], list):
|
||||
embeddings = np.array(emb_col.tolist(), dtype=float)
|
||||
else:
|
||||
raise TypeError("Unsupported embedding type.")
|
||||
if embeddings.ndim != 2: raise ValueError("Embeddings must be 2D.")
|
||||
|
||||
eff_clusters = min(n_clusters, embeddings.shape[0])
|
||||
if embeddings.shape[0] < 2 or eff_clusters < 1:
|
||||
lbl = "0" if embeddings.shape[0] > 0 else "-1";
|
||||
df['cluster'] = lbl
|
||||
colors = px.colors.qualitative.Plotly;
|
||||
return df, {lbl: colors[0 % len(colors)]} if lbl == "0" else default_map
|
||||
if eff_clusters == 1: df['cluster'] = "0"; colors = px.colors.qualitative.Plotly; return df, {
|
||||
"0": colors[0 % len(colors)]}
|
||||
|
||||
kmeans = KMeans(n_clusters=eff_clusters, random_state=42, n_init='auto')
|
||||
df['cluster'] = kmeans.fit_predict(embeddings).astype(str)
|
||||
unique_labels = sorted(df['cluster'].unique())
|
||||
colors = px.colors.qualitative.Plotly
|
||||
color_map = {lbl: colors[i % len(colors)] for i, lbl in enumerate(unique_labels)}
|
||||
return df, color_map
|
||||
except (TypeError, ValueError) as e:
|
||||
logger.error(f"KMeans input error: {e}"); df['cluster'] = "-1"; return df, default_map
|
||||
except Exception as e:
|
||||
logger.exception("KMeans failed."); df['cluster'] = "-1"; return df, default_map
|
||||
|
||||
def _prepare_plot_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Prepares hover text."""
|
||||
if df.empty: return df
|
||||
if 'cluster' not in df.columns: df['cluster'] = 'N/A'
|
||||
df_copy = df.copy()
|
||||
|
||||
def gen_hover(row):
|
||||
try:
|
||||
return f"ID: {row.get('file_id', 'N/A')}<br>Cluster: {str(row.get('cluster', 'N/A'))}<br>Chunk: {str(row.get('chunk', ''))[:200]}{'...' if len(str(row.get('chunk', ''))) > 200 else ''}"
|
||||
except Exception:
|
||||
return "Hover gen error"
|
||||
|
||||
try:
|
||||
df_copy['hover_text'] = df_copy.apply(gen_hover, axis=1); return df_copy
|
||||
except Exception as e:
|
||||
logger.error(f"Hover gen failed: {e}"); return df
|
||||
|
||||
def _calculate_centroids(self, df: pd.DataFrame) -> Dict[str, List[float]]:
|
||||
"""Calculates 3D centroids."""
|
||||
centroids = {}
|
||||
required = ['x', 'y', 'z', 'cluster'];
|
||||
numeric_cols = ['x', 'y', 'z']
|
||||
if df.empty or not all(col in df.columns for col in required): return centroids
|
||||
df_copy = df.copy();
|
||||
df_copy['cluster'] = df['cluster'].astype(str)
|
||||
for col in numeric_cols:
|
||||
if not pd.api.types.is_numeric_dtype(df_copy[col]):
|
||||
try:
|
||||
df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')
|
||||
except Exception:
|
||||
logger.error(f"Centroid calc: conv error '{col}'"); return {}
|
||||
if df_copy[col].isnull().any(): logger.warning(f"Centroid calc: NaNs in '{col}'")
|
||||
try:
|
||||
# Calculate mean, drop rows where ALL numeric_cols are NaN, then drop rows where the resulting mean is NaN
|
||||
centroid_data = df_copy.dropna(subset=numeric_cols, how='all').groupby('cluster')[
|
||||
numeric_cols].mean().dropna()
|
||||
return {str(idx): row.tolist() for idx, row in centroid_data.iterrows()}
|
||||
except Exception as e:
|
||||
logger.exception("Centroid calc failed."); return {}
|
||||
|
||||
def _create_base_figure(self) -> go.Figure:
|
||||
"""Creates base Plotly figure."""
|
||||
fig = go.Figure()
|
||||
fig.update_layout(title='3D t-SNE', margin=dict(l=0, r=0, b=0, t=40),
|
||||
scene_camera_eye=dict(x=1.5, y=1.5, z=0.5),
|
||||
scene=dict(xaxis_title='TSNE-1', yaxis_title='TSNE-2', zaxis_title='TSNE-3',
|
||||
aspectmode='data'),
|
||||
legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01, bgcolor='rgba(255,255,255,0.7)'),
|
||||
hovermode='closest')
|
||||
return fig
|
||||
|
||||
def _build_layout(self) -> None:
|
||||
"""Builds the Dash layout."""
|
||||
self.app.layout = html.Div([
|
||||
dcc.Store(id='stored-data', data=self.initial_data_json),
|
||||
dcc.Store(id='cluster-color-map-store', data=self.initial_cluster_color_map),
|
||||
dcc.Store(id='cluster-centroids-store', data=self.initial_cluster_centroids),
|
||||
dcc.Store(id='search-results-store', data=None),
|
||||
dcc.Store(id='selected-cluster-store', data=None), # Store for click state
|
||||
html.H1("Vector Embedding Visualizer"),
|
||||
dcc.Tabs(id="main-tabs", value='tab-vis', children=[
|
||||
dcc.Tab(label='Visualization', value='tab-vis', children=[
|
||||
html.Div([ # Controls
|
||||
html.Div(
|
||||
[html.Button('Reload Data', id='reload-button', n_clicks=0, style={'marginRight': '10px'}),
|
||||
dcc.Input(id='search-input', type='text', placeholder='Search term...', debounce=True,
|
||||
style={'width': '40%', 'marginRight': '5px'}),
|
||||
html.Button('Search', id='search-button', n_clicks=0)],
|
||||
style={'padding': '10px', 'display': 'flex'}),
|
||||
html.Div([html.Label("Similarity:", style={'marginRight': '10px'}),
|
||||
dcc.Slider(id='similarity-slider', min=0, max=1, step=0.01, value=0.0,
|
||||
marks={i / 10: f'{i / 10:.1f}' for i in range(11)},
|
||||
tooltip={"placement": "bottom", "always_visible": True}, disabled=True)],
|
||||
id='slider-container', style={'display': 'none', 'padding': '10px 20px'}),
|
||||
html.Div([
|
||||
html.Label("Clusters:", style={'marginRight': '10px'}),
|
||||
dcc.Slider(
|
||||
id='cluster-slider',
|
||||
min=1,
|
||||
max=self.max_clusters,
|
||||
step=1,
|
||||
value=self.optimal_clusters,
|
||||
marks=self._cluster_marks(),
|
||||
tooltip={'placement': 'bottom', 'always_visible': True}
|
||||
)
|
||||
], style={'padding': '10px 20px'}),
|
||||
html.Div(id='status-output', style={'padding': '10px', 'color': 'blue', 'minHeight': '20px'}),
|
||||
dcc.Loading(id="loading-graph", type="circle",
|
||||
children=dcc.Graph(id='vector-graph', style={'height': '70vh'}))
|
||||
])
|
||||
]),
|
||||
dcc.Tab(label='Settings', value='tab-settings', children=[
|
||||
html.Div([html.H3("Settings"), html.Div([html.Label("Marker Size:", style={'marginRight': '10px'}),
|
||||
dcc.Slider(id='size-slider', min=1, max=15, step=1,
|
||||
value=4,
|
||||
marks={i: str(i) for i in range(1, 16)},
|
||||
tooltip={'placement': 'bottom',
|
||||
'always_visible': True})],
|
||||
style={'padding': '10px 20px'})], style={'padding': '20px'})
|
||||
]),
|
||||
]),
|
||||
])
|
||||
|
||||
def _cosine_similarity(self, vec1: np.ndarray, vec2: np.ndarray) -> np.ndarray | float:
|
||||
"""Calculates cosine similarity."""
|
||||
if not isinstance(vec1, np.ndarray): vec1 = np.array(vec1, dtype=float)
|
||||
if not isinstance(vec2, np.ndarray): vec2 = np.array(vec2, dtype=float)
|
||||
if vec1.ndim == 1: vec1 = vec1.reshape(1, -1)
|
||||
if vec2.ndim == 1: vec2 = vec2.reshape(1, -1)
|
||||
if vec1.shape[1] != vec2.shape[1]: raise ValueError("Vector dimension mismatch")
|
||||
norm1 = np.linalg.norm(vec1, axis=1, keepdims=True);
|
||||
norm2 = np.linalg.norm(vec2, axis=1, keepdims=True)
|
||||
z1 = (norm1 == 0).flatten();
|
||||
z2 = (norm2 == 0).flatten()
|
||||
# Handle potential division by zero for zero vectors
|
||||
norm1[z1] = 1.0;
|
||||
norm2[z2] = 1.0
|
||||
sim = np.dot(vec1 / norm1, (vec2 / norm2).T)
|
||||
# Ensure zero vectors result in zero similarity
|
||||
sim[z1, :] = 0.0  # zero similarity for rows whose query vector had zero norm
sim[:, z2] = 0.0  # and for columns whose stored vector had zero norm
|
||||
sim = np.clip(sim, -1.0, 1.0)
|
||||
return sim.item() if sim.size == 1 else sim.flatten()
|
||||
|
||||
def _find_neighbors(self, search_vector: List[float], k: int = 10) -> Optional[Tuple[np.ndarray, np.ndarray]]:
|
||||
"""Finds k nearest neighbors."""
|
||||
if not search_vector or not isinstance(search_vector, list): return None
|
||||
if self.original_embeddings is None or self.original_embeddings.size == 0: return None
|
||||
try:
|
||||
vec = np.array(search_vector, dtype=float)
|
||||
if vec.ndim != 1: raise ValueError("Search vector != 1D.")
|
||||
if self.original_embeddings.ndim != 2: raise ValueError("Embeddings != 2D.")
|
||||
if self.original_embeddings.shape[1] != vec.shape[0]: raise ValueError("Dimension mismatch.")
|
||||
sims = self._cosine_similarity(vec, self.original_embeddings)
|
||||
if not isinstance(sims, np.ndarray) or sims.ndim != 1 or sims.shape[0] != self.original_embeddings.shape[
|
||||
0]: raise TypeError("Similarity calc failed.")
|
||||
k_actual = min(k, len(sims));
|
||||
if k_actual <= 0: return None
|
||||
idx = np.argpartition(sims, -k_actual)[-k_actual:] # Get indices of top k
|
||||
sorted_idx = idx[np.argsort(sims[idx])][::-1] # Sort top k indices by similarity
|
||||
return sorted_idx, sims[sorted_idx]
|
||||
except (ValueError, TypeError) as e:
|
||||
logger.error(f"Neighbor input error: {e}"); return None
|
||||
except Exception as e:
|
||||
logger.exception(f"Neighbor search error: {e}"); return None
|
||||
|
||||
# --- Callbacks ---
|
||||
def _register_callbacks(self) -> None:
|
||||
"""Sets up Dash callbacks."""
|
||||
|
||||
# --- Callback 1: Reload Button ---
|
||||
@self.app.callback(
|
||||
Output('stored-data', 'data', allow_duplicate=True), Output('cluster-color-map-store', 'data'),
|
||||
Output('cluster-centroids-store', 'data'),
|
||||
Output('status-output', 'children'), Output('search-results-store', 'data', allow_duplicate=True),
|
||||
Output('selected-cluster-store', 'data', allow_duplicate=True),
|
||||
Input('reload-button', 'n_clicks'), prevent_initial_call=True)
|
||||
def handle_reload(n_clicks: int) -> Tuple[Optional[str], Dict, Dict, str, None, None]:
|
||||
if n_clicks == 0: raise PreventUpdate
|
||||
logger.info("Reload triggered...")
|
||||
status = "Reloading...";
|
||||
color_map, centroids, data_json = {}, {}, None;
|
||||
self.original_embeddings = np.array([])
|
||||
try:
|
||||
# Ensure VectorLoader is properly imported or defined (dummy used if import fails)
|
||||
loader = VectorLoader(self.db_schema, self.db_function, self.model_name, self.embedding_column)
|
||||
reduced_data = loader.load_and_reduce(limit=self.limit, tsne_params={"perplexity": self.perplexity})
|
||||
if not isinstance(reduced_data, pd.DataFrame) or reduced_data.empty: raise VectorLoaderError("No data.")
|
||||
|
||||
df_clustered, color_map = self._run_kmeans(reduced_data.copy(), self.n_clusters)
|
||||
if not isinstance(df_clustered, pd.DataFrame): raise TypeError(
|
||||
"KMeans failed post-reload.") # Add check
|
||||
df_final = self._prepare_plot_data(df_clustered)
|
||||
if not isinstance(df_final, pd.DataFrame): raise TypeError(
|
||||
"Prepare plot failed post-reload.") # Add check
|
||||
|
||||
if not df_final.empty and all(c in df_final for c in ['x', 'y', 'z', 'cluster']):
|
||||
centroids = self._calculate_centroids(df_final)
|
||||
else:
|
||||
logger.warning("Could not calculate centroids after reload (missing cols or empty).")
|
||||
|
||||
if not reduced_data.empty and self.embedding_column in reduced_data.columns:
|
||||
try:
|
||||
emb_col = reduced_data[self.embedding_column]
|
||||
# Check type of first element before processing
|
||||
if not emb_col.empty:
|
||||
first_emb = emb_col.iloc[0]
|
||||
if isinstance(first_emb, np.ndarray):
|
||||
self.original_embeddings = np.stack(emb_col.values)
|
||||
elif isinstance(first_emb, list):
|
||||
self.original_embeddings = np.array(emb_col.tolist(), dtype=float)
|
||||
else:
|
||||
raise TypeError(f"Unsupported reloaded embed type: {type(first_emb)}")
|
||||
logger.info(f"Stored reloaded embeddings (shape: {self.original_embeddings.shape}).")
|
||||
else:
|
||||
logger.warning("Embedding column empty during reload storage.")
|
||||
except Exception as e:
|
||||
logger.error(f"Store embed fail: {e}"); self.original_embeddings = np.array([])
|
||||
else:
|
||||
logger.warning("Embedding column missing or df empty during reload storage.")
|
||||
|
||||
if not df_final.empty:
|
||||
data_json = df_final.to_json(date_format='iso',
|
||||
orient='split'); status = f"Reloaded ({len(df_final)} pts)."
|
||||
else:
|
||||
status = "Warning: Reload empty post-process."
|
||||
except (VectorLoaderError, TypeError, Exception) as e:
|
||||
logger.exception(
|
||||
f"Reload error: {e}"); status = f"Error: {e}"; data_json, color_map, centroids = None, {}, {}; self.original_embeddings = np.array(
|
||||
[])
|
||||
return data_json, color_map, centroids, status, None, None
|
||||
|
||||
# --- Callback 1 b: Cluster‑count Slider ---
|
||||
@self.app.callback(
|
||||
Output('stored-data', 'data', allow_duplicate=True),
|
||||
Output('cluster-color-map-store', 'data', allow_duplicate=True),
|
||||
Output('cluster-centroids-store', 'data', allow_duplicate=True),
|
||||
Output('status-output', 'children', allow_duplicate=True),
|
||||
Input('cluster-slider', 'value'),
|
||||
State('stored-data', 'data'),
|
||||
prevent_initial_call=True
|
||||
)
|
||||
def update_n_clusters(k: int, stored_json: str):
|
||||
if not stored_json:
|
||||
raise PreventUpdate
|
||||
|
||||
# Update the visualizer state
|
||||
self.n_clusters = k
|
||||
|
||||
try:
|
||||
df = pd.read_json(StringIO(stored_json), orient='split')
|
||||
df, color_map = self._run_kmeans(df, k)
|
||||
df = self._prepare_plot_data(df)
|
||||
centroids = self._calculate_centroids(df)
|
||||
|
||||
status = f"Cluster count set to {k}."
|
||||
return (df.to_json(date_format='iso', orient='split'),
|
||||
color_map,
|
||||
centroids,
|
||||
status)
|
||||
except Exception as err:
|
||||
logger.error(f"Clustering update failed: {err}")
|
||||
raise PreventUpdate
|
||||
|
||||
# --- Callback 2: Search Button ---
|
||||
@self.app.callback(
|
||||
Output('search-results-store', 'data', allow_duplicate=True),
|
||||
Output('status-output', 'children', allow_duplicate=True),
|
||||
Input('search-button', 'n_clicks'), State('search-input', 'value'), prevent_initial_call=True)
|
||||
def handle_search(n_clicks: int, term: str) -> Tuple[Optional[Dict], str]:
|
||||
if n_clicks == 0 or not term: return None, "Enter search term."
|
||||
logger.info(f"Search: '{term}'");
|
||||
status = f"Embedding '{term}'..."
|
||||
try:
|
||||
if self.original_embeddings is None or self.original_embeddings.size == 0: return None, "Error: No data."
|
||||
_, vec, _ = self.embedding_generator.generate_embedding(self.interface_name, self.model_name, term,
|
||||
"search")
|
||||
if vec is None: return None, f"Error: Embed failed."
|
||||
status = f"Finding neighbors...";
|
||||
neighbors = self._find_neighbors(vec, k=20)
|
||||
if neighbors is None: return None, f"No neighbors found."
|
||||
idx, sims = neighbors;
|
||||
results = {"indices": idx.tolist(), "similarities": sims.tolist(), "term": term}
|
||||
status = f"Found {len(idx)} neighbors.";
|
||||
return results, status
|
||||
except Exception as e:
|
||||
logger.exception("Search error."); return None, f"Error: {e}"
|
||||
|
||||
# --- Callback 3: Slider Visibility ---
|
||||
@self.app.callback(
|
||||
Output('slider-container', 'style'), Output('similarity-slider', 'disabled'),
|
||||
Output('similarity-slider', 'value'),
|
||||
Input('search-results-store', 'data'), prevent_initial_call=True)
|
||||
def update_slider_visibility(res: Optional[Dict]) -> Tuple[Dict, bool, float]:
|
||||
show = res and isinstance(res, dict) and "indices" in res
|
||||
style = {'display': 'block' if show else 'none', 'padding': '10px 20px'}
|
||||
return style, not show, 0.0
|
||||
|
||||
# --- Callback 4: Graph Update (Main Logic with clickData fix and logging) ---
|
||||
@self.app.callback(
|
||||
Output('vector-graph', 'figure'), Output('selected-cluster-store', 'data'),
|
||||
Output('status-output', 'children', allow_duplicate=True),
|
||||
Input('stored-data', 'data'), Input('cluster-color-map-store', 'data'),
|
||||
Input('cluster-centroids-store', 'data'),
|
||||
Input('search-results-store', 'data'), Input('similarity-slider', 'value'), Input('size-slider', 'value'),
|
||||
Input('vector-graph', 'clickData'), # Input for clicks
|
||||
State('selected-cluster-store', 'data'), # Get current selection
|
||||
prevent_initial_call='initial_duplicate' # Allow initial run
|
||||
)
|
||||
def update_graph(stored_data_json: Optional[str], cluster_color_map: Optional[Dict],
|
||||
cluster_centroids: Optional[Dict[str, List[float]]],
|
||||
search_results: Optional[Dict], similarity_threshold: float, size_value: int,
|
||||
click_data: Optional[Dict],
|
||||
current_selected_cluster: Optional[str]) -> Tuple[go.Figure, Optional[str], str]:
|
||||
|
||||
fig = self._create_base_figure();
|
||||
status_msg = "";
|
||||
new_selected_cluster = current_selected_cluster
|
||||
trigger = ctx.triggered_id if ctx.triggered else "Initial"
|
||||
logger.debug(f"--- Graph Update | Trigger: {trigger} | CurrentSel: {current_selected_cluster} ---")
|
||||
|
||||
# --- Data Load & Validation ---
|
||||
if not stored_data_json: return fig, None, "No data."
|
||||
try:
|
||||
df = pd.read_json(StringIO(stored_data_json), orient='split')
|
||||
if df.empty: return fig, None, "Empty data."
|
||||
required = ['x', 'y', 'z', 'cluster', 'hover_text']
missing = [col for col in required if col not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")
|
||||
df['cluster'] = df['cluster'].astype(str)
|
||||
color_map = cluster_color_map if isinstance(cluster_color_map, dict) else {}
|
||||
centroids = cluster_centroids if isinstance(cluster_centroids, dict) else {}
|
||||
if not color_map:
|
||||
logger.warning("Missing color map, generating default.")
|
||||
unique_clusters = df['cluster'].unique()
colors = px.colors.qualitative.Plotly
color_map = {str(c): colors[i % len(colors)] for i, c in enumerate(unique_clusters)} or {'0': 'grey'}
|
||||
|
||||
# Calculate overall data range (handle potential NaNs/Infs in full data)
|
||||
df_finite = df[['x', 'y', 'z']].replace([np.inf, -np.inf], np.nan).dropna()
|
||||
if not df_finite.empty:
|
||||
overall_x_min, overall_x_max = df_finite['x'].min(), df_finite['x'].max()
|
||||
overall_y_min, overall_y_max = df_finite['y'].min(), df_finite['y'].max()
|
||||
overall_z_min, overall_z_max = df_finite['z'].min(), df_finite['z'].max()
|
||||
logger.debug(
|
||||
f"Overall Finite Range: X=[{overall_x_min:.2f}, {overall_x_max:.2f}], Y=[{overall_y_min:.2f}, {overall_y_max:.2f}], Z=[{overall_z_min:.2f}, {overall_z_max:.2f}]")
|
||||
else:
|
||||
logger.warning("No finite data points found in the dataset to calculate overall range.")
|
||||
overall_x_min, overall_x_max = -10, 10 # Default ranges if no finite data
|
||||
overall_y_min, overall_y_max = -10, 10
|
||||
overall_z_min, overall_z_max = -10, 10
|
||||
|
||||
except Exception as e:
|
||||
logger.exception("Graph data error."); return fig, current_selected_cluster, f"Error: {e}"
|
||||
|
||||
# --- Click Processing ---
|
||||
if trigger == 'vector-graph':
|
||||
logger.debug(f"Click Data Received: {click_data}")
|
||||
if click_data and 'points' in click_data and click_data['points']:
|
||||
point_data = click_data['points'][0]
|
||||
clicked_customdata = point_data.get('customdata');
|
||||
clicked_text = point_data.get('text', '')
|
||||
logger.debug(f"Clicked Point Customdata: {clicked_customdata}");
|
||||
logger.debug(f"Clicked Point Text: '{clicked_text}'")
|
||||
is_centroid_click = False;
|
||||
clicked_cluster_id = None
|
||||
if isinstance(clicked_customdata, list) and len(clicked_customdata) > 0: clicked_customdata = \
|
||||
clicked_customdata[0]
|
||||
if isinstance(clicked_customdata, (str, int)): # Accept string or int cluster IDs
|
||||
is_centroid_click = True;
|
||||
clicked_cluster_id = str(clicked_customdata);
|
||||
logger.info(f"Centroid Click Parsed via customdata: Cluster '{clicked_cluster_id}'")
|
||||
elif isinstance(clicked_text, str) and clicked_text.startswith("Centroid: Cluster "):
|
||||
try:
|
||||
    clicked_cluster_id = clicked_text.split("Centroid: Cluster ")[1]
    is_centroid_click = True
    logger.info(f"Centroid Click Parsed via text: Cluster '{clicked_cluster_id}'")
|
||||
except Exception as parse_err:
|
||||
logger.warning(f"Failed text parse: {parse_err}")
|
||||
if is_centroid_click and clicked_cluster_id is not None:
|
||||
if current_selected_cluster == clicked_cluster_id:
new_selected_cluster = None
status_msg = "Cluster view reset."
logger.info("Deselecting.")
else:
new_selected_cluster = clicked_cluster_id
status_msg = f"Showing Cluster {new_selected_cluster}."
logger.info(f"Selecting {new_selected_cluster}.")
elif not is_centroid_click and current_selected_cluster is not None:
new_selected_cluster = None
status_msg = "Cluster view reset."
logger.info("Deselecting.")
else:  # Click background
if current_selected_cluster is not None:
    new_selected_cluster = None
    status_msg = "Cluster view reset."
    logger.info("Deselecting.")
|
||||
logger.debug(f"Click Result: new_selected_cluster = {new_selected_cluster}")
|
||||
else:
|
||||
logger.debug("No click trigger.")
|
||||
|
||||
# --- Data Filtering ---
|
||||
active_selection_id = new_selected_cluster
|
||||
df_to_plot = df.copy();
|
||||
centroids_to_plot = centroids.copy()
|
||||
logger.debug(f"Filtering based on active_selection_id: {active_selection_id}")
|
||||
if active_selection_id is not None:
|
||||
df_to_plot = df_to_plot[df_to_plot['cluster'] == active_selection_id]
|
||||
centroids_to_plot = {cid: coords for cid, coords in centroids_to_plot.items() if
|
||||
cid == active_selection_id}
|
||||
logger.debug(f"Filtered DF rows: {len(df_to_plot)}")
|
||||
if not df_to_plot.empty:
|
||||
logger.debug(f"Coordinates of filtered points:\n{df_to_plot[['x', 'y', 'z']]}")
|
||||
else:
|
||||
logger.warning("Filtered DataFrame is empty.")
|
||||
|
||||
# --- Search Highlighting ---
|
||||
search_highlight_mask = np.zeros(len(df_to_plot), dtype=bool)
|
||||
search_term = None;
|
||||
is_search_active = False;
|
||||
highlight_sims = {}
|
||||
if search_results and isinstance(search_results, dict) and "indices" in search_results:
|
||||
is_search_active = True;
|
||||
search_term = search_results.get("term", "N/A")
|
||||
orig_indices = search_results.get("indices", []);
|
||||
orig_sims = search_results.get("similarities", [])
|
||||
if not df_to_plot.empty:
|
||||
orig_to_current_map = {orig_idx: current_idx for current_idx, orig_idx in
|
||||
enumerate(df_to_plot.index)}
|
||||
current_indices_hl = [
    orig_to_current_map[oi]
    for i, oi in enumerate(orig_indices)
    if i < len(orig_sims) and orig_sims[i] >= similarity_threshold and oi in orig_to_current_map
]
|
||||
if current_indices_hl:
|
||||
search_highlight_mask[current_indices_hl] = True
|
||||
for i, orig_idx in enumerate(orig_indices):
|
||||
if i < len(orig_sims) and orig_sims[i] >= similarity_threshold and orig_idx in orig_to_current_map:
|
||||
highlight_sims[orig_to_current_map[orig_idx]] = orig_sims[i]
|
||||
else:
|
||||
logger.warning("Cannot apply search highlighting - filtered df empty.")
|
||||
|
||||
# --- Plotting ---
|
||||
df_search_hl = df_to_plot[search_highlight_mask];
|
||||
df_normal = df_to_plot[~search_highlight_mask]
|
||||
base_size = size_value;
|
||||
normal_op = OPACITY_SELECTED_CLUSTER if active_selection_id else (
|
||||
OPACITY_SEARCH_DIMMED if is_search_active else OPACITY_DEFAULT)
|
||||
|
||||
# --- Add Dummy Points if needed ---
|
||||
num_points_to_plot = len(df_normal) + len(df_search_hl)
|
||||
if active_selection_id is not None and num_points_to_plot <= 2:
|
||||
logger.info(
|
||||
f"Adding dummy invisible points to aid auto-ranging for cluster {active_selection_id} (points={num_points_to_plot}).")
|
||||
# Use overall range calculated earlier
|
||||
dummy_x = [overall_x_min, overall_x_max]
|
||||
dummy_y = [overall_y_min, overall_y_max]
|
||||
dummy_z = [overall_z_min, overall_z_max]
|
||||
# Ensure dummy points are valid numbers (in case overall range calc failed)
|
||||
if np.isfinite(dummy_x + dummy_y + dummy_z).all():
|
||||
fig.add_trace(go.Scatter3d(
|
||||
x=dummy_x, y=dummy_y, z=dummy_z,
|
||||
mode='markers', marker=dict(size=1, opacity=0), # Invisible
|
||||
hoverinfo='skip', showlegend=False, name='_dummy_'
|
||||
))
|
||||
else:
|
||||
logger.warning("Could not add dummy points because overall range contained non-finite values.")
|
||||
|
||||
# Plot Normal Points
|
||||
if not df_normal.empty:
|
||||
finite_mask_normal = np.isfinite(df_normal[['x', 'y', 'z']]).all(axis=1)
|
||||
df_normal_finite = df_normal[finite_mask_normal]
|
||||
if not df_normal_finite.empty:
|
||||
logger.debug(f"Plotting df_normal (len={len(df_normal_finite)}).")
|
||||
colors = df_normal_finite['cluster'].map(color_map).fillna('darkgrey')
|
||||
name = 'Embeddings' if active_selection_id is None else f'Cluster {active_selection_id}'
|
||||
fig.add_trace(
|
||||
go.Scatter3d(x=df_normal_finite['x'], y=df_normal_finite['y'], z=df_normal_finite['z'],
|
||||
mode='markers',
|
||||
marker=dict(color=colors, size=base_size, opacity=normal_op, line=dict(width=0.5)),
|
||||
text=df_normal_finite['hover_text'], hoverinfo='text', name=name))
|
||||
else:
|
||||
logger.warning("No finite normal points to plot.")
|
||||
|
||||
# Plot Search Highlighted Points
|
||||
if not df_search_hl.empty:
|
||||
finite_mask_search = np.isfinite(df_search_hl[['x', 'y', 'z']]).all(axis=1)
|
||||
df_search_hl_finite = df_search_hl[finite_mask_search]
|
||||
if not df_search_hl_finite.empty:
|
||||
hl_size = max(base_size * 1.5, base_size + 2);
|
||||
hl_texts = []
|
||||
# Need mapping from df_search_hl_finite index back to df_to_plot positional index for sims
|
||||
positions_in_df_to_plot = df_to_plot.index.get_indexer_for(df_search_hl_finite.index)
|
||||
|
||||
for i, (global_index, row) in enumerate(df_search_hl_finite.iterrows()):
|
||||
pos = positions_in_df_to_plot[i] # Get original position in df_to_plot
|
||||
sim = highlight_sims.get(pos, float('nan'))
|
||||
sim_txt = f"{sim:.4f}" if not np.isnan(sim) else "N/A"
|
||||
hl_texts.append(f"{row['hover_text']}<br><b>Sim: {sim_txt}</b>")
|
||||
|
||||
fig.add_trace(
|
||||
go.Scatter3d(x=df_search_hl_finite['x'], y=df_search_hl_finite['y'], z=df_search_hl_finite['z'],
|
||||
mode='markers',
|
||||
marker=dict(color='red', size=hl_size, opacity=1.0, symbol='diamond',
|
||||
line=dict(color='black', width=1)), text=hl_texts, hoverinfo='text',
|
||||
name=f'Search Neighbors'))
|
||||
if not df_search_hl_finite[['x', 'y', 'z']].isnull().values.any(): # Search Centroid
|
||||
try:
|
||||
sc = df_search_hl_finite[['x', 'y', 'z']].mean().values
fig.add_trace(go.Scatter3d(
    x=[sc[0]], y=[sc[1]], z=[sc[2]], mode='markers',
    marker=dict(color='magenta', size=max(hl_size, 10), symbol='cross', line=dict(width=1)),
    text=f"Search: '{search_term}' Centroid", hoverinfo='text', name='Search Centroid'))
|
||||
except Exception as e:
|
||||
logger.warning(f"Search centroid plot fail: {e}")
|
||||
else:
|
||||
logger.warning("No finite search highlighted points to plot.")
|
||||
|
||||
# Plot Centroids (filtered)
|
||||
if centroids_to_plot:
|
||||
cent_size = base_size + 1;
|
||||
logger.debug(f"Plotting centroids: {list(centroids_to_plot.keys())}")
|
||||
for cid, coords in centroids_to_plot.items():
|
||||
if isinstance(coords, list) and len(coords) == 3:
|
||||
logger.debug(f"Plotting Centroid {cid} at coords: {coords}")
|
||||
if np.isnan(coords).any() or np.isinf(coords).any():
    logger.error(f"!!! Centroid {cid} NaN/Inf coords !!!")
    continue
|
||||
color = color_map.get(str(cid), 'grey');
|
||||
name = f"Centroid {cid}";
|
||||
hover_txt = f"Centroid: Cluster {cid}"
|
||||
fig.add_trace(go.Scatter3d(
|
||||
x=[coords[0]], y=[coords[1]], z=[coords[2]], mode='markers',
|
||||
marker=dict(color=color, size=cent_size, symbol='circle', opacity=0.9,
|
||||
line=dict(color='black', width=1.5)),
|
||||
customdata=[str(cid)], text=hover_txt, hoverinfo='text', name=name,
|
||||
legendgroup="centroids", showlegend=True
|
||||
))
|
||||
else:
|
||||
logger.warning(f"Invalid centroid data for {cid}")
|
||||
|
||||
# --- Final Layout & Status ---
|
||||
title = f"3D t-SNE ({len(df)} points)"
|
||||
if active_selection_id is not None:
|
||||
title = f"Cluster {active_selection_id} ({len(df_to_plot)} points)"
|
||||
elif is_search_active:
|
||||
title = f"3D t-SNE - Search: '{search_term}'"
|
||||
if active_selection_id and is_search_active: title += f" - Search: '{search_term}'"
|
||||
base_layout = self._create_base_figure().layout
|
||||
fig.update_layout(
|
||||
title=title, legend_title_text='Legend', legend=base_layout.legend,
|
||||
scene=base_layout.scene # Use base scene settings (includes aspectmode='data')
|
||||
# Rely on auto-ranging (potentially helped by dummy points if added)
|
||||
)
|
||||
|
||||
final_status = status_msg
|
||||
if not final_status: # Default status
|
||||
base = f"{len(df_to_plot)} points shown."
|
||||
if active_selection_id: base = f"Cluster {active_selection_id}: {len(df_to_plot)} points."
|
||||
final_status = base;
|
||||
if is_search_active: final_status += f" (Search: '{search_term}')"
|
||||
|
||||
return fig, new_selected_cluster, final_status
|
||||
|
||||
def run(self, host: str = "127.0.0.1", port: int = 8050, debug: bool = False) -> None:
|
||||
"""Starts the Dash server."""
|
||||
logger.info(f"Starting Dash server on http://{host}:{port}")
|
||||
try:
|
||||
self.app.run(host=host, port=port, debug=debug)
|
||||
except OSError as e:
|
||||
logger.error(f"Server start failed: {e}. Port {port} busy?")
|
||||
except Exception as e:
|
||||
logger.exception(f"Server error: {e}")
|
||||
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
# >>> Helpers for automatic cluster‐count selection <<<
|
||||
# ──────────────────────────────────────────────────────────────
|
||||
def _estimate_optimal_clusters(self, embeddings: np.ndarray, max_k: int = 10) -> int:
|
||||
"""
|
||||
Estimate an optimal number of clusters using a quick elbow heuristic.
|
||||
Computes K‑means inertia for k = 1…max_k and picks the k that is farthest
|
||||
from the straight line connecting (1, inertia₁) and (max_k, inertiaₘₐₓ).
|
||||
"""
|
||||
if embeddings is None or embeddings.size == 0:
|
||||
return 1
|
||||
n_samples = embeddings.shape[0]
|
||||
if n_samples < 3:
|
||||
return 1
|
||||
|
||||
max_k = min(max_k, n_samples)
|
||||
inertias = []
|
||||
for k in range(1, max_k + 1):
|
||||
km = KMeans(n_clusters=k, random_state=42, n_init="auto").fit(embeddings)
|
||||
inertias.append(km.inertia_)
|
||||
|
||||
# distance from each point to the line between first and last
|
||||
x = np.arange(1, max_k + 1)
|
||||
x1, y1 = 1, inertias[0]
|
||||
x2, y2 = max_k, inertias[-1]
|
||||
numerator = np.abs((y2 - y1) * x - (x2 - x1) * np.array(inertias) + x2 * y1 - y2 * x1)
|
||||
denominator = np.sqrt((y2 - y1) ** 2 + (x2 - x1) ** 2)
|
||||
elbow_idx = int(np.argmax(numerator / denominator))
|
||||
return elbow_idx + 1 # since k starts at 1
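# Hedged worked example (illustrative numbers, not from the original source):
# with hypothetical inertias [100, 60, 30, 25, 22, 20] for k = 1..6, the line runs
# from (1, 100) to (6, 20); the numerator above evaluates to
# [0, 120, 190, 135, 70, 0], so argmax lands on index 2, i.e. k = 3.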
|
||||
|
||||
def _cluster_marks(self) -> Dict[int, str]:
|
||||
"""Generate tick marks for the cluster-count slider."""
|
||||
if self.max_clusters <= 15:
|
||||
return {i: str(i) for i in range(1, self.max_clusters + 1)}
|
||||
# Show first, optimal, and max for large data sets
|
||||
return {1: "1", self.optimal_clusters: str(self.optimal_clusters), self.max_clusters: str(self.max_clusters)}
|
||||
|
||||
# --- END OF FILE visualizer.py ---
|
@ -0,0 +1,2 @@
|
||||
def hello() -> str:
|
||||
return "Hello from librarian_vspace!"
|
@ -0,0 +1,93 @@
|
||||
|
||||
"""TsneExportWorker – Prefect worker that generates a t‑SNE JSON export.
|
||||
|
||||
It wraps Vspace.get_tnse → vecview.get_tsne_json, writes the JSON to a file,
|
||||
stages it, and returns the file path.
|
||||
|
||||
Minimal Pydantic payload models are defined locally to avoid extra deps.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from prefect import get_run_logger
|
||||
from pydantic import BaseModel
|
||||
from librarian_core.workers.base import Worker
|
||||
|
||||
from librarian_vspace.vecview.vecview import get_tsne_json
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
def _safe_get_logger(name: str):
|
||||
try:
|
||||
return get_run_logger()
|
||||
except Exception:
|
||||
return logging.getLogger(name)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
# Pydantic payloads
|
||||
# ------------------------------------------------------------------ #
|
||||
class TsneExportInput(BaseModel):
|
||||
course_id: int
|
||||
limit: Optional[int] = None
|
||||
perplexity: float = 30.0
|
||||
db_schema: str = "librarian"
|
||||
rpc_function: str = "pdf_chunking"
|
||||
embed_model: str = "snowflake-arctic-embed2"
|
||||
embedding_column: str = "embedding"
|
||||
base_output_dir: Optional[Path] = None # where to place JSON file
|
||||
|
||||
|
||||
class TsneExportOutput(BaseModel):
|
||||
json_path: Path
|
||||
|
||||
|
||||
# ------------------------------------------------------------------ #
|
||||
class TsneExportWorker(Worker[TsneExportInput, TsneExportOutput]):
|
||||
"""Runs the t‑SNE export inside a Prefect worker.""" # noqa: D401
|
||||
|
||||
input_model = TsneExportInput
|
||||
output_model = TsneExportOutput
|
||||
|
||||
async def __run__(self, payload: TsneExportInput) -> TsneExportOutput:
|
||||
logger = _safe_get_logger(self.worker_name)
|
||||
logger.info("🔨 %s startet (payload=%r)", self.worker_name, payload)
|
||||
|
||||
# Run get_tsne_json in a thread
|
||||
data_json = await asyncio.to_thread(
|
||||
get_tsne_json,
|
||||
db_schema=payload.db_schema,
|
||||
db_function=payload.rpc_function,
|
||||
model_name=payload.embed_model,
|
||||
limit=payload.limit,
|
||||
course_id=payload.course_id,
|
||||
perplexity=payload.perplexity,
|
||||
embedding_column=payload.embedding_column,
|
||||
)
|
||||
|
||||
# Determine output file
|
||||
if payload.base_output_dir:
|
||||
out_dir = Path(payload.base_output_dir).expanduser()
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
json_path = out_dir / f"{payload.course_id}_tsne.json"
|
||||
else:
|
||||
tf = tempfile.NamedTemporaryFile(
|
||||
mode="w+", suffix="_tsne.json", prefix="vspace_", delete=False
|
||||
)
|
||||
json_path = Path(tf.name)
|
||||
|
||||
# Write JSON to file
|
||||
json_path.write_text(data_json, encoding="utf-8")
|
||||
|
||||
# Stage file for Prefect
|
||||
self.stage(json_path, new_name=json_path.name)
|
||||
|
||||
result = TsneExportOutput(json_path=json_path)
|
||||
logger.info("✅ %s fertig: %r", self.worker_name, result)
|
||||
return result
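# Hedged usage sketch (field values are hypothetical; how the worker is actually
# scheduled depends on the librarian_core / Prefect wiring outside this module):
#
#     payload = TsneExportInput(course_id=42, limit=500, perplexity=30.0)
#     # Running TsneExportWorker with this payload yields a TsneExportOutput whose
#     # json_path points at the staged export, e.g. "<course_id>_tsne.json" when
#     # base_output_dir is set.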
|
@ -0,0 +1,104 @@
|
||||
|
||||
"""Utility functions to fetch vectors from Supabase, apply t‑SNE, add simple K‑means
|
||||
clustering and hover text – prepared exactly like the `VectorVisualizer` expects.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Optional, List
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
from librarian_vspace.vutils.vector_query_loader import VectorQueryLoader, VectorQueryLoaderError
|
||||
from librarian_vspace.models.tsne_model import TSNEPoint, TSNEData
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DEFAULT_N_CLUSTERS = 8
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- #
|
||||
# Internal helpers (kept minimal – no extra bells & whistles)
|
||||
# --------------------------------------------------------------------- #
|
||||
def _run_kmeans(df: pd.DataFrame, *, embedding_column: str, k: int = DEFAULT_N_CLUSTERS) -> pd.DataFrame:
|
||||
"""Adds a 'cluster' column using K‑means (string labels)."""
|
||||
if df.empty or embedding_column not in df.columns:
|
||||
df['cluster'] = "-1"
|
||||
return df
|
||||
|
||||
embeddings = np.array(df[embedding_column].tolist(), dtype=float)
|
||||
n_samples = embeddings.shape[0]
|
||||
k = max(1, min(k, n_samples)) # ensure 1 ≤ k ≤ n_samples
|
||||
if n_samples < 2:
|
||||
df['cluster'] = "0"
|
||||
return df
|
||||
|
||||
km = KMeans(n_clusters=k, random_state=42, n_init='auto')
|
||||
df['cluster'] = km.fit_predict(embeddings).astype(str)
|
||||
return df
|
||||
|
||||
|
||||
def _add_hover(df: pd.DataFrame) -> pd.DataFrame:
|
||||
if df.empty:
|
||||
return df
|
||||
|
||||
df = df.copy()
|
||||
|
||||
def _hover(row):
|
||||
preview = str(row.get('chunk', ''))[:200]
|
||||
if len(str(row.get('chunk', ''))) > 200:
|
||||
preview += "..."
|
||||
return (
|
||||
f"ID: {row.get('file_id', 'N/A')}<br>"
|
||||
f"Cluster: {row.get('cluster', 'N/A')}<br>"
|
||||
f"Chunk: {preview}"
|
||||
)
|
||||
|
||||
df['hover_text'] = df.apply(_hover, axis=1)
|
||||
return df
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- #
|
||||
# Public helpers
|
||||
# --------------------------------------------------------------------- #
|
||||
def get_tsne_dataframe(
|
||||
db_schema: str,
|
||||
db_function: str,
|
||||
model_name: str,
|
||||
*,
|
||||
limit: Optional[int] = None,
|
||||
course_id: Optional[int] = None,
|
||||
perplexity: float = 30.0,
|
||||
embedding_column: str = "embedding",
|
||||
n_clusters: int = DEFAULT_N_CLUSTERS,
|
||||
) -> pd.DataFrame:
|
||||
"""Returns a pandas DataFrame with tsne (x,y,z) & metadata ready for plotting."""
|
||||
loader = VectorQueryLoader(db_schema, db_function, model_name, embedding_column)
|
||||
df = loader.load_and_reduce(
|
||||
limit=limit,
|
||||
course_id=course_id,
|
||||
tsne_params={"perplexity": perplexity},
|
||||
)
|
||||
|
||||
if df.empty:
|
||||
return df
|
||||
|
||||
df = _run_kmeans(df, embedding_column=embedding_column, k=n_clusters)
|
||||
df = _add_hover(df)
|
||||
return df
|
||||
|
||||
|
||||
def get_tsne_json(**kwargs) -> str:
|
||||
"""Convenience wrapper returning DataFrame as JSON (orient='split')."""
|
||||
df = get_tsne_dataframe(**kwargs)
|
||||
return df.to_json(date_format='iso', orient='split')
|
||||
|
||||
|
||||
def get_tsne_response(**kwargs) -> TSNEData:
|
||||
"""Returns a validated `TSNEResponse` Pydantic model."""
|
||||
df = get_tsne_dataframe(**kwargs)
|
||||
points: List[TSNEPoint] = [TSNEPoint(**row.dropna().to_dict()) for _, row in df.iterrows()]
|
||||
return TSNEData(course_id=kwargs.get('course_id'), total=len(points), points=points)
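# Hedged usage sketch (illustrative; course_id/limit values are hypothetical and
# Supabase credentials are assumed to be configured for VectorQueryLoader):
#
#     df = get_tsne_dataframe(
#         "librarian", "pdf_chunking", "snowflake-arctic-embed2",
#         course_id=123, limit=1000, perplexity=30.0, n_clusters=8,
#     )
#     payload_json = get_tsne_json(
#         db_schema="librarian", db_function="pdf_chunking",
#         model_name="snowflake-arctic-embed2", course_id=123,
#     )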
|
@ -0,0 +1,10 @@
|
||||
"""
|
||||
vquery package for high-level read operations against vector tables.
|
||||
"""
|
||||
from .query import VectorQuery
|
||||
|
||||
__all__ = ["VectorQuery"] # Defines the public interface of the package
|
||||
|
||||
# Optional: Add package-level logging setup if desired, but often handled by the application
|
||||
# import logging
|
||||
# logging.getLogger(__name__).addHandler(logging.NullHandler())
|
@ -0,0 +1,438 @@
|
||||
# --- START OF FILE cluster_export.py (Refactored & Workaround - Import Updated) ---
|
||||
|
||||
"""
|
||||
cluster_export.py – Generate IVFFlat‑equivalent clusters from Supabase/Vectorbase
|
||||
pgvector data and export each cluster’s chunks to Markdown.
|
||||
|
||||
This version fetches vectors filtered by course ID at the database level using
|
||||
VectorQueryLoader, performs k-means clustering, and exports to Markdown.
|
||||
|
||||
Includes automatic k-downsizing.
|
||||
|
||||
Environment variables (used by the script entry point)
|
||||
---------------------
|
||||
* **Vectorbase credentials** (auto‑mapped to Supabase):
|
||||
* `VECTORBASE_URL` → `SUPABASE_URL`
|
||||
* `VECTORBASE_API_KEY` → `SUPABASE_KEY`
|
||||
* `VECTORBASE_USER_UUID` → `SUPABASE_USER_UUID` (optional)
|
||||
* **Embedding/table config**
|
||||
* `VECTOR_SCHEMA` – Postgres schema (default `librarian`)
|
||||
* `VECTOR_FUNCTION` – RPC / Postgres function name (optional)
|
||||
* `EMBED_MODEL` – embedding model label (default `snowflake-arctic-embed2`)
|
||||
* **Clustering hyper‑parameters**
|
||||
* `K` – requested number of clusters / IVFFlat *nlist* (default 128)
|
||||
* `TRAIN_SAMPLE` – how many rows to feed into k‑means (default 20 000, but
|
||||
capped at the table size)
|
||||
* **Export**
|
||||
* `OUTPUT_DIR` – directory for the generated Markdown files (default
|
||||
`./cluster_md`)
|
||||
* `CLUSTER_COURSE_ID` - Optional course ID to filter vectors (used by script)
|
||||
|
||||
|
||||
Usage
|
||||
~~~~~
|
||||
# Via script entry point
|
||||
export VECTORBASE_URL="https://xyz.vectorbase.co"
|
||||
export VECTORBASE_API_KEY="service_role_key"
|
||||
export VECTOR_SCHEMA=librarian
|
||||
export EMBED_MODEL=snowflake-arctic-embed2
|
||||
export CLUSTER_COURSE_ID=123 # Optional filtering
|
||||
export K=64
|
||||
python -m librarian_vspace.vquery.cluster_export
|
||||
|
||||
# As a callable function
|
||||
from librarian_vspace.vquery.cluster_export import run_cluster_export_job
|
||||
output_path = run_cluster_export_job(course_id=456, output_dir="/tmp/clusters_456", ...)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any, Union # Added Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.metrics import pairwise_distances_argmin_min
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Map Vectorbase credential names → Supabase names expected by loader code
|
||||
# ---------------------------------------------------------------------------
|
||||
_ALIAS_ENV_MAP = {
|
||||
"VECTORBASE_URL": "SUPABASE_URL",
|
||||
"VECTORBASE_API_KEY": "SUPABASE_KEY",
|
||||
"VECTORBASE_USER_UUID": "SUPABASE_USER_UUID", # optional
|
||||
}
|
||||
for src, dest in _ALIAS_ENV_MAP.items():
|
||||
if dest not in os.environ and src in os.environ:
|
||||
os.environ[dest] = os.environ[src]
|
||||
|
||||
# Import the NEW data‑loading helper with filtering capabilities
|
||||
try:
|
||||
# --- FIX: Import VectorQueryLoader from vutils ---
|
||||
from librarian_vspace.vutils.vector_query_loader import VectorQueryLoader, VectorQueryLoaderError
|
||||
# VectorLoaderError is now VectorQueryLoaderError
|
||||
# --- END FIX ---
|
||||
except ImportError as e:
|
||||
# Keep the original script's error handling for standalone use
|
||||
sys.stderr.write(
|
||||
"\n[ERROR] Could not import VectorQueryLoader – check PYTHONPATH. "
|
||||
f"Original error: {e}\n"
|
||||
)
|
||||
# For callable use, we should raise an ImportError or custom exception
|
||||
raise ImportError(f"Could not import VectorQueryLoader: {e}") from e
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging setup (used by both script and callable function)
|
||||
# ---------------------------------------------------------------------------
|
||||
# This basicConfig runs when the module is imported.
|
||||
# Callers might want to configure logging before importing.
|
||||
# If logging is already configured, basicConfig does nothing.
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s: %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
logger = logging.getLogger(__name__) # Use __name__ for module-specific logger
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper – JSON dump for centroid in YAML front‑matter
|
||||
# ---------------------------------------------------------------------------
|
||||
def centroid_to_json(vec: np.ndarray) -> str:
|
||||
"""Converts a numpy vector to a JSON string suitable for YAML frontmatter."""
|
||||
return json.dumps([float(x) for x in vec], ensure_ascii=False)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main clustering and export logic as a callable function
|
||||
# ---------------------------------------------------------------------------
|
||||
def run_cluster_export_job(
|
||||
course_id: Optional[int] = None, # Added course_id parameter
|
||||
output_dir: Union[str, Path] = "./cluster_md", # Output directory parameter
|
||||
schema: str = "librarian",
|
||||
rpc_function: str = "pdf_chunking", # Default to actual function name
|
||||
model: str = "snowflake-arctic-embed2",
|
||||
k_clusters: int = 128, # Requested number of clusters (k)
|
||||
train_sample_size: int = 20000, # Sample size for K-means training
|
||||
embedding_column: str = "embedding" # Added embedding column parameter
|
||||
) -> Path:
|
||||
"""
|
||||
Fetches vectors, performs K-means clustering, and exports clustered chunks to Markdown.
|
||||
|
||||
Args:
|
||||
course_id: Optional ID to filter vectors belonging to a specific course.
|
||||
output_dir: Directory path where the cluster Markdown files will be saved.
|
||||
schema: Postgres schema containing the vector table.
|
||||
rpc_function: Optional RPC function name used by VectorQueryLoader (needed for table lookup).
|
||||
model: Embedding model label used by VectorQueryLoader (needed for table lookup).
|
||||
k_clusters: The requested number of clusters (k). Will be downsized if fewer
|
||||
vectors are available.
|
||||
train_sample_size: The maximum number of vectors to use for K-means training.
|
||||
Capped by the total number of vectors fetched.
|
||||
embedding_column: The name of the column containing the vector embeddings.
|
||||
|
||||
Returns:
|
||||
The absolute path to the output directory.
|
||||
|
||||
Raises:
|
||||
VectorQueryLoaderError: If vector loading fails.
|
||||
RuntimeError: If no embeddings are retrieved or training sample is empty after filtering.
|
||||
Exception: For other errors during clustering or export.
|
||||
"""
|
||||
output_path = Path(output_dir).expanduser().resolve() # Resolve path early
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
logger.info("Writing Markdown files to %s", output_path)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fetch embeddings - Now using VectorQueryLoader with filtering
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
# Use parameters for loader config
|
||||
# --- FIX: Instantiate VectorQueryLoader ---
|
||||
loader = VectorQueryLoader(schema=schema, function=rpc_function, model=model, embedding_column=embedding_column)
|
||||
# --- END FIX ---
|
||||
|
||||
# --- FIX: Call fetch_vectors WITH the course_id argument ---
|
||||
# VectorQueryLoader.fetch_vectors handles the DB-level filtering
|
||||
df = loader.fetch_vectors(limit=None, course_id=course_id)
|
||||
# --- END FIX ---
|
||||
|
||||
# --- REMOVE: In-memory filtering logic is no longer needed ---
|
||||
# initial_rows = len(df)
|
||||
# if course_id is not None and not df.empty:
|
||||
# ... (removed filtering code) ...
|
||||
# elif course_id is not None and df.empty:
|
||||
# ... (removed warning) ...
|
||||
# --- END REMOVE ---
|
||||
|
||||
# --- FIX: Catch VectorQueryLoaderError ---
|
||||
except VectorQueryLoaderError as e:
|
||||
logger.error("Vector loading failed: %s", e)
|
||||
raise e # Re-raise the specific exception for the caller
|
||||
# --- END FIX ---
|
||||
except Exception as e:
|
||||
# Catch other unexpected errors during loading
|
||||
logger.exception("An unexpected error occurred during vector loading.")
|
||||
raise RuntimeError(f"An unexpected error occurred during vector loading: {e}") from e
|
||||
|
||||
|
||||
# --- Check if DataFrame is empty *after* fetching (which includes DB filtering) ---
|
||||
if df.empty:
|
||||
logger.error("No embeddings retrieved or found for course_id %s – aborting.", course_id)
|
||||
# Raise a RuntimeError as no clustering can be done
|
||||
raise RuntimeError(f"No embeddings retrieved or found for course_id {course_id} – nothing to cluster.")
|
||||
# ----------------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Use the actual embedding column name from the loader instance
|
||||
# This check is crucial *after* fetching
|
||||
if not hasattr(loader, 'embedding_column') or loader.embedding_column not in df.columns:
|
||||
# This should ideally be caught by VectorQueryLoader's internal checks, but double-check
|
||||
logger.error("Embedding column '%s' not found in fetched data.", embedding_column) # Use the input param name for error msg
|
||||
raise RuntimeError(f"Embedding column '{embedding_column}' not found in fetched data.")
|
||||
|
||||
|
||||
# --- Ensure embeddings are numeric lists before stacking ---
|
||||
# The VectorQueryLoader.fetch_vectors method now handles parsing and dropping invalid rows.
|
||||
# We just need to safely stack the potentially filtered/cleaned data.
|
||||
try:
|
||||
# Ensure data is list of floats before stacking
|
||||
# This check might be redundant if VectorQueryLoader guarantees cleaned data,
|
||||
# but it adds safety.
|
||||
if not all(isinstance(x, list) and all(isinstance(n, float) for n in x) for x in df[embedding_column]):
|
||||
logger.error(f"Data in '{embedding_column}' is not strictly list[float] format after fetching. Attempting conversion.")
|
||||
# This might catch issues the loader missed or unexpected data structures
|
||||
try:
|
||||
# Attempt robust conversion similar to the loader's parse method
|
||||
embeddings_list = []
|
||||
for item in df[embedding_column]:
|
||||
parsed_item = None
|
||||
if isinstance(item, str):
|
||||
try: parsed_item = json.loads(item)
|
||||
except json.JSONDecodeError: pass
|
||||
elif isinstance(item, (list, tuple, np.ndarray)):
|
||||
parsed_item = item
|
||||
elif isinstance(item, dict) and 'vector' in item and isinstance(item['vector'], (list, tuple, np.ndarray)):
|
||||
parsed_item = item['vector']
|
||||
|
||||
if isinstance(parsed_item, (list, tuple, np.ndarray)) and all(isinstance(val, (int, float, np.number)) for val in parsed_item):
|
||||
embeddings_list.append([float(n) for n in parsed_item])
|
||||
else:
|
||||
logger.debug(f"Skipping problematic embedding during secondary clean: {str(item)[:100]}...")
|
||||
|
||||
|
||||
if not embeddings_list:
|
||||
logger.error("No valid embeddings remained after secondary cleaning.")
|
||||
raise ValueError("No valid embeddings for stacking.")
|
||||
embeddings = np.array(embeddings_list, dtype=float)
|
||||
logger.warning("Successfully converted problematic embedding data for stacking.")
|
||||
except Exception as e:
|
||||
logger.exception(f"Failed secondary attempt to convert embeddings for stacking: {e}")
|
||||
raise RuntimeError(f"Failed to process embedding data for stacking: {e}") from e
|
||||
else:
|
||||
# Data is in the expected list of float format, proceed directly
|
||||
embeddings = np.stack(df[embedding_column].to_list()).astype(float)
|
||||
|
||||
logger.info("Prepared %d embeddings for clustering.", embeddings.shape[0])
|
||||
|
||||
|
||||
except ValueError as ve:
|
||||
logger.exception(f"Failed to stack embeddings into a numpy array: {ve}. Ensure '{embedding_column}' contains valid vector data.")
|
||||
raise RuntimeError(f"Failed to process embedding data: {ve}") from ve
|
||||
except Exception as e:
|
||||
logger.exception(f"An unexpected error occurred while processing '{embedding_column}' column for stacking.")
|
||||
raise RuntimeError(f"An unexpected error occurred while processing embedding data for stacking: {e}") from e
|
||||
# -------------------------------------------------------------
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Prepare training sample and determine effective k
|
||||
# ---------------------------------------------------------------------------
|
||||
# Use the parameter train_sample_size
|
||||
train_vecs = embeddings[:train_sample_size]
|
||||
|
||||
if train_vecs.shape[0] == 0:
|
||||
# If course_id filtering resulted in 0 vectors, this check prevents the crash
|
||||
# but the df.empty check earlier should already handle this.
|
||||
# Keep this check for robustness in case train_sample_size is 0 or negative.
|
||||
logger.error("Training sample is empty – nothing to cluster.")
|
||||
raise RuntimeError("Training sample is empty – nothing to cluster.")
|
||||
|
||||
# Use the parameter k_clusters
|
||||
K = min(k_clusters, train_vecs.shape[0])
|
||||
if K < k_clusters:
|
||||
logger.warning(
|
||||
"Requested k=%d but only %d training vectors available; "
|
||||
"using k=%d.",
|
||||
k_clusters,
|
||||
train_vecs.shape[0],
|
||||
K,
|
||||
)
|
||||
# Ensure K is at least 1 if there's any data
|
||||
if K == 0 and train_vecs.shape[0] > 0:
|
||||
K = 1
|
||||
logger.warning("Adjusted k to 1 as requested k resulted in 0 but data exists.")
|
||||
|
||||
if K == 0:
|
||||
# If after adjustments K is still 0 (meaning train_vecs.shape[0] was 0)
|
||||
logger.error("Effective k is 0. Cannot train k-means.")
|
||||
raise RuntimeError("Effective k is 0. Cannot train k-means.")
|
||||
|
||||
|
||||
logger.info("Training k‑means (k=%d) on %d vectors", K, train_vecs.shape[0])
|
||||
|
||||
try:
|
||||
kmeans = KMeans(
|
||||
n_clusters=K,
|
||||
init="k-means++",
|
||||
n_init="auto", # Use 'auto' for better handling of small k/n_samples
|
||||
algorithm="lloyd", # 'lloyd' is the standard, 'elkan' can be faster but has limitations
|
||||
max_iter=300,
|
||||
random_state=0,
|
||||
)
|
||||
kmeans.fit(train_vecs)
|
||||
centroids: np.ndarray = kmeans.cluster_centers_
|
||||
logger.info("K‑means converged in %d iterations", kmeans.n_iter_)
|
||||
except Exception as e:
|
||||
logger.exception("K-means clustering failed.")
|
||||
raise RuntimeError(f"K-means clustering failed: {e}") from e
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Assign every vector to its nearest centroid (full table)
|
||||
# ---------------------------------------------------------------------------
|
||||
logger.info("Assigning vectors to centroids...")
|
||||
try:
|
||||
# Use the determined embedding column for assignment as well
|
||||
labels_full, _ = pairwise_distances_argmin_min(embeddings, centroids, metric="euclidean")
|
||||
df["cluster_id"] = labels_full
|
||||
logger.info("Assigned cluster labels to all embeddings.")
|
||||
except Exception as e:
|
||||
logger.exception("Failed to assign vectors to centroids.")
|
||||
raise RuntimeError(f"Failed to assign vectors to centroids: {e}") from e
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Write one Markdown file per cluster
|
||||
# ---------------------------------------------------------------------------
|
||||
files_written_count = 0
|
||||
|
||||
logger.info("Writing cluster Markdown files to %s", output_path)
|
||||
try:
|
||||
# Only iterate up to the number of actual clusters found by KMeans
|
||||
# KMeans might return fewer clusters than K if there are issues or identical points
|
||||
num_actual_clusters = len(centroids)
|
||||
if num_actual_clusters < K:
|
||||
logger.warning(f"KMeans returned only {num_actual_clusters} centroids, expected {K}. Iterating over actual centroids.")
|
||||
|
||||
|
||||
for cid in range(num_actual_clusters): # Iterate over actual cluster IDs
|
||||
# Find all data points assigned to this cluster ID
|
||||
subset = df[df.cluster_id == cid]
|
||||
|
||||
# Ensure centroid_vec corresponds to the centroid of the *current* cluster ID (cid)
|
||||
# This check is more robust now iterating up to num_actual_clusters
|
||||
if cid < len(centroids):
|
||||
centroid_vec = centroids[cid]
|
||||
else:
|
||||
# This case should theoretically not be reached with the loop range
|
||||
logger.error(f"Centroid for cluster ID {cid} missing! Using zero vector.")
|
||||
centroid_vec = np.zeros(embeddings.shape[1])
|
||||
|
||||
|
||||
# Use .get() and .fillna("") defensively in case 'chunk' column is missing
|
||||
# Ensure chunk column exists - it should if SELECT * worked
|
||||
if 'chunk' not in subset.columns:
|
||||
logger.warning("'chunk' column missing in subset data for cluster %d. Using empty strings.", cid)
|
||||
chunks = [""] * len(subset)
|
||||
else:
|
||||
chunks = subset['chunk'].fillna("").tolist()
|
||||
|
||||
|
||||
md_lines = [
|
||||
#"---",
|
||||
#f"cluster_id: {cid}",
|
||||
#f"centroid: {centroid_to_json(centroid_vec)}",
|
||||
#"---\n", # Separator between frontmatter and content
|
||||
]
|
||||
# Add chunks, ensuring each chunk is on a new line or separated by blank lines
|
||||
md_lines.extend(chunks)
|
||||
|
||||
outfile = output_path / f"cluster_{cid:03d}.md"
|
||||
# Use a different separator for chunks within the file if needed,
|
||||
# currently just joins with newline, but chunks might contain newlines.
|
||||
# Joining with "\n\n" provides separation *between* chunks.
|
||||
try:
|
||||
outfile.write_text("\n\n".join(md_lines), encoding="utf-8")
|
||||
files_written_count += 1
|
||||
logger.debug("Wrote %s (%d chunks)", outfile.name, len(chunks)) # Use debug for per-file
|
||||
except Exception as write_exc:
|
||||
logger.error(f"Failed to write cluster file {outfile}: {write_exc}", exc_info=True)
|
||||
# Decide whether to continue or raise here. Continuing allows other clusters to be saved.
|
||||
# For robustness in script, maybe continue. For library function, maybe raise.
|
||||
# For now, we'll just log and continue.
|
||||
|
||||
|
||||
except Exception as e:
|
||||
logger.exception("Failed during Markdown file writing loop.")
|
||||
raise RuntimeError(f"Failed during Markdown file writing: {e}") from e
|
||||
|
||||
|
||||
logger.info("Done. %d Markdown files created in %s", files_written_count, output_path)
|
||||
return output_path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Script entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
if __name__ == "__main__":
|
||||
# Configuration via environment for script
|
||||
script_output_dir = Path(os.environ.get("OUTPUT_DIR", "./cluster_md")).expanduser()
|
||||
script_schema = os.environ.get("VECTOR_SCHEMA", "librarian")
|
||||
script_rpc_function = os.environ.get("VECTOR_FUNCTION", "pdf_chunking") # Default to actual function name
|
||||
script_model = os.environ.get("EMBED_MODEL", "snowflake-arctic-embed2")
|
||||
script_k_req = int(os.environ.get("K", "128"))
|
||||
script_train_sample = int(os.environ.get("TRAIN_SAMPLE", "20000"))
|
||||
# Added course ID specific to script entry point
|
||||
script_course_id_str = os.environ.get("CLUSTER_COURSE_ID")
|
||||
script_course_id = int(script_course_id_str) if script_course_id_str and script_course_id_str.isdigit() else None # Added isdigit check
|
||||
|
||||
|
||||
# Configure basic logging for the script entry point
|
||||
# (The module-level config above might not run if imported in specific ways)
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s: %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
)
|
||||
# Re-get the logger after basicConfig to ensure it's configured
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
try:
|
||||
logger.info("Starting cluster export script...")
|
||||
final_output_path = run_cluster_export_job(
|
||||
course_id=script_course_id,
|
||||
output_dir=script_output_dir,
|
||||
schema=script_schema,
|
||||
rpc_function=script_rpc_function,
|
||||
model=script_model,
|
||||
k_clusters=script_k_req,
|
||||
train_sample_size=script_train_sample,
|
||||
# embedding_column defaults to 'embedding' in the function
|
||||
)
|
||||
logger.info("Script finished successfully. Output in %s", final_output_path)
|
||||
sys.exit(0) # Explicit success exit
|
||||
# --- FIX: Catch VectorQueryLoaderError ---
|
||||
except (VectorQueryLoaderError, RuntimeError) as e: # Catch the new error type
|
||||
# --- END FIX ---
|
||||
# Specific errors we raised
|
||||
logger.error("Script failed: %s", e)
|
||||
sys.exit(1) # Indicate failure
|
||||
except Exception as e:
|
||||
# Catch any other unexpected errors
|
||||
logger.exception("An unhandled error occurred during script execution.")
|
||||
sys.exit(1) # Indicate failure
|
@ -0,0 +1,73 @@
|
||||
|
||||
"""ClusterExportWorker – Prefect worker that wraps run_cluster_export_job."""
|
||||
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import logging
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from prefect import get_run_logger
|
||||
from pydantic import BaseModel
|
||||
from librarian_core.workers.base import Worker
|
||||
|
||||
from librarian_vspace.vquery.cluster_export import run_cluster_export_job
|
||||
|
||||
def _safe_get_logger(name: str):
|
||||
try:
|
||||
return get_run_logger()
|
||||
except Exception:
|
||||
return logging.getLogger(name)
|
||||
|
||||
class ClusterExportInput(BaseModel):
|
||||
course_id: int
|
||||
k_clusters: int = 128
|
||||
train_sample_size: int = 20_000
|
||||
db_schema: str = "librarian"
|
||||
rpc_function: str = "pdf_chunking"
|
||||
model: str = "snowflake-arctic-embed2"
|
||||
embedding_column: str = "embedding"
|
||||
base_output_dir: Optional[Path] = None
|
||||
|
||||
class ClusterExportOutput(BaseModel):
|
||||
output_dir: Path
|
||||
|
||||
class ClusterExportWorker(Worker[ClusterExportInput, ClusterExportOutput]):
|
||||
input_model = ClusterExportInput
|
||||
output_model = ClusterExportOutput
|
||||
|
||||
async def __run__(self, payload: ClusterExportInput) -> ClusterExportOutput:
|
||||
logger = _safe_get_logger(self.worker_name)
|
||||
logger.info("🔨 %s startet (payload=%r)", self.worker_name, payload)
|
||||
|
||||
# Prepare output directory
|
||||
if payload.base_output_dir:
|
||||
base_dir = Path(payload.base_output_dir).expanduser()
|
||||
base_dir.mkdir(parents=True, exist_ok=True)
|
||||
tmp_base = tempfile.mkdtemp(dir=base_dir)
|
||||
else:
|
||||
tmp_base = tempfile.mkdtemp()
|
||||
|
||||
output_dir = Path(tmp_base) / str(payload.course_id)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
logger.debug("Output directory: %s", output_dir)
|
||||
|
||||
final_dir = await asyncio.to_thread(
|
||||
run_cluster_export_job,
|
||||
course_id=payload.course_id,
|
||||
output_dir=output_dir,
|
||||
schema=payload.db_schema,
|
||||
rpc_function=payload.rpc_function,
|
||||
model=payload.model,
|
||||
k_clusters=payload.k_clusters,
|
||||
train_sample_size=payload.train_sample_size,
|
||||
embedding_column=payload.embedding_column,
|
||||
)
|
||||
|
||||
self.stage(final_dir, new_name=final_dir.name)
|
||||
|
||||
result = ClusterExportOutput(output_dir=final_dir)
|
||||
logger.info("✅ %s fertig: %r", self.worker_name, result)
|
||||
return result
|
@ -0,0 +1,127 @@
|
||||
|
||||
"""VectorQuery – helper for vector searches against chunklet tables.
|
||||
|
||||
This module provides:
|
||||
* A Pydantic‑powered request / response API (see ``librarian_vspace.models.query_model``).
|
||||
* A single public method :py:meth:`VectorQuery.search` that returns a
|
||||
:class:`~librarian_vspace.models.query_model.VectorSearchResponse`.
|
||||
* A thin legacy wrapper ``get_chucklets_by_vector`` that produces the
|
||||
historical ``List[Dict[str, Any]]`` format, built on top of ``search``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
try:
|
||||
from librarian_vspace.vutils.vector_class import BaseVectorOperator
|
||||
from librarian_vspace.vecembed.embedding_generator import EmbeddingGenerator
|
||||
except ImportError as exc: # pragma: no cover
|
||||
logging.error(
|
||||
"Failed to import vutils or vecembed sub‑packages: %s. " "Ensure they are on PYTHONPATH.", exc
|
||||
)
|
||||
|
||||
class BaseVectorOperator: # type: ignore
|
||||
"""Minimal stub if real class is unavailable (runtime error later)."""
|
||||
|
||||
class EmbeddingGenerator: # type: ignore
|
||||
"""Minimal stub; will raise at runtime if used."""
|
||||
|
||||
from librarian_vspace.models.query_model import (
|
||||
VectorSearchRequest,
|
||||
VectorSearchResponse,
|
||||
Chunklet,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# --------------------------------------------------------------------- #
|
||||
# Main helper
|
||||
# --------------------------------------------------------------------- #
|
||||
class VectorQuery(BaseVectorOperator):
|
||||
"""High‑level helper for vector searches via Supabase RPC."""
|
||||
|
||||
# -----------------------------------------------------------------
|
||||
# Public – modern API
|
||||
# -----------------------------------------------------------------
|
||||
def search(self, request: VectorSearchRequest) -> VectorSearchResponse:
|
||||
"""Perform a similarity search and return structured results."""
|
||||
|
||||
if not getattr(self, "table", None):
|
||||
logger.error("VectorQuery: target table not determined (self.table is None).")
|
||||
return VectorSearchResponse(total=0, results=[])
|
||||
|
||||
# 1) Generate query embedding
|
||||
try:
|
||||
_tts, query_vec, _ = EmbeddingGenerator().generate_embedding(
|
||||
interface_name=request.interface_name,
|
||||
model_name=request.model_name,
|
||||
text_to_embed=request.search_string,
|
||||
identifier="query",
|
||||
)
|
||||
if query_vec is None:
|
||||
logger.error("Embedding generation returned None.")
|
||||
return VectorSearchResponse(total=0, results=[])
|
||||
except Exception as exc: # pragma: no cover
|
||||
logger.exception("Embedding generation failed: %s", exc)
|
||||
return VectorSearchResponse(total=0, results=[])
|
||||
|
||||
# 2) Build RPC parameters
|
||||
rpc_params = {
|
||||
"p_query_embedding": query_vec,
|
||||
"p_target_table": self.table,
|
||||
"p_embedding_column": request.embedding_column,
|
||||
"p_match_count": request.top_k,
|
||||
"p_filters": request.filters or {},
|
||||
}
|
||||
|
||||
# 3) Execute RPC
|
||||
try:
|
||||
if not getattr(self, "spc", None):
|
||||
logger.error("Supabase client (self.spc) not available.")
|
||||
return VectorSearchResponse(total=0, results=[])
|
||||
|
||||
resp = (
|
||||
self.spc
|
||||
.schema(self.schema)
|
||||
.rpc("vector_search", rpc_params)
|
||||
.execute()
|
||||
)
|
||||
data = resp.data or []
|
||||
results = [
|
||||
Chunklet(chunk=row.get("chunk"), file_id=row.get("file_id")) if isinstance(row.get("file_id"), str) else Chunklet(chunk=row.get("chunk"), file_id=str(row.get("file_id")))
|
||||
for row in data
|
||||
]
|
||||
return VectorSearchResponse(total=len(results), results=results)
|
||||
|
||||
except Exception as exc: # pragma: no cover
|
||||
logger.exception("RPC 'vector_search' failed: %s", exc)
|
||||
return VectorSearchResponse(total=0, results=[])
|
||||
|
||||
# -----------------------------------------------------------------
|
||||
# Public – legacy compatibility
|
||||
# -----------------------------------------------------------------
|
||||
def get_chucklets_by_vector(
|
||||
self,
|
||||
*,
|
||||
interface_name: str,
|
||||
model_name: str,
|
||||
search_string: str,
|
||||
filters: Optional[Dict[str, Any]] = None,
|
||||
top_k: int = 10,
|
||||
embedding_column: str = "embedding",
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Backward‑compatible wrapper returning ``{'chunk', 'file_id'}`` dicts."""
|
||||
|
||||
req = VectorSearchRequest(
|
||||
interface_name=interface_name,
|
||||
model_name=model_name,
|
||||
search_string=search_string,
|
||||
filters=filters,
|
||||
top_k=top_k,
|
||||
embedding_column=embedding_column,
|
||||
)
|
||||
resp = self.search(req)
|
||||
return [ck.dict() for ck in resp.results]
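# Hedged usage sketch (illustrative; the constructor keywords mirror how QueryWorker
# builds VectorQuery elsewhere in this repo, and the search values are hypothetical):
#
#     vq = VectorQuery(schema="librarian", function="pdf_chunking",
#                      model="snowflake-arctic-embed2", embedding_column="embedding")
#     req = VectorSearchRequest(
#         interface_name="ollama", model_name="snowflake-arctic-embed2",
#         search_string="gradient descent", top_k=5, embedding_column="embedding",
#     )
#     resp = vq.search(req)  # VectorSearchResponse(total=..., results=[Chunklet, ...])
#     legacy = vq.get_chucklets_by_vector(
#         interface_name="ollama", model_name="snowflake-arctic-embed2",
#         search_string="gradient descent", top_k=5,
#     )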
|
@ -0,0 +1,62 @@
|
||||
|
||||
"""QueryWorker – Prefect worker that performs a vector search.
|
||||
|
||||
It instantiates VectorQuery directly (no vspace dependency) and returns the
|
||||
VectorSearchResponse.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from prefect import get_run_logger
|
||||
from pydantic import BaseModel
|
||||
from librarian_core.workers.base import Worker
|
||||
|
||||
from librarian_vspace.vquery.query import VectorQuery
|
||||
from librarian_vspace.models.query_model import VectorSearchRequest, VectorSearchResponse
|
||||
|
||||
def _safe_get_logger(name: str):
|
||||
try:
|
||||
return get_run_logger()
|
||||
except Exception:
|
||||
return logging.getLogger(name)
|
||||
|
||||
|
||||
class QueryInput(BaseModel):
|
||||
request: VectorSearchRequest
|
||||
db_schema: str = "librarian"
|
||||
rpc_function: str = "pdf_chunking"
|
||||
embed_model: str = "snowflake-arctic-embed2"
|
||||
embedding_column: str = "embedding"
|
||||
|
||||
|
||||
class QueryWorker(Worker[QueryInput, VectorSearchResponse]):
|
||||
"""Runs a Supabase vector search via VectorQuery."""
|
||||
|
||||
input_model = QueryInput
|
||||
output_model = VectorSearchResponse
|
||||
|
||||
async def __run__(self, payload: QueryInput) -> VectorSearchResponse:
|
||||
logger = _safe_get_logger(self.worker_name)
|
||||
logger.info("🔨 %s startet (payload=%r)", self.worker_name, payload)
|
||||
|
||||
def _do_search() -> VectorSearchResponse:
|
||||
try:
|
||||
vq = VectorQuery(
|
||||
schema=payload.db_schema,
|
||||
function=payload.rpc_function,
|
||||
model=payload.embed_model,
|
||||
embedding_column=payload.embedding_column,
|
||||
)
|
||||
except TypeError:
|
||||
# fallback to positional signature
|
||||
vq = VectorQuery(payload.db_schema, payload.rpc_function, payload.embed_model)
|
||||
return vq.search(payload.request)
|
||||
|
||||
response = await asyncio.to_thread(_do_search)
|
||||
|
||||
logger.info("✅ %s fertig: %s results", self.worker_name, response.total)
|
||||
return response
|
@ -0,0 +1,2 @@
|
||||
def hello() -> str:
|
||||
return "Hello from librarian_vspace!"
|