2025-05-24 12:15:48 +02:00

141 lines
6.2 KiB
Python

#!/usr/bin/env python3
"""
Loads vector data using vecmap.loader, reduces dimensions via t-SNE,
and launches an interactive 3D visualization using vecmap.visualizer (Dash/Plotly).
Configuration is primarily driven by environment variables.
"""
from __future__ import annotations
import logging
import os
import pathlib
import sys
import pandas as pd
# Locations derived from this script's own path: the directory holding the
# script, the sibling "src" directory (vecmap, vutils, etc.), and the .env file.
APP_DIR = pathlib.Path(__file__).resolve().parent
SRC_DIR = APP_DIR.parent / "src"
DOTENV_PATH = APP_DIR / ".env"

# --- Explicitly Manage sys.path ---
app_dir_str = str(APP_DIR)
src_dir_str = str(SRC_DIR)

# Take the script's own directory off the module search path (first
# occurrence only); nothing happens when it is not listed.
try:
    sys.path.remove(app_dir_str)
except ValueError:
    pass

# Guarantee that the source directory is the very first search entry,
# moving it to the front if it is listed further down.
if sys.path[:1] != [src_dir_str]:
    try:
        sys.path.remove(src_dir_str)
    except ValueError:
        pass
    sys.path.insert(0, src_dir_str)

print(f"[DEBUG] sys.path start: {sys.path[:3]}")
# --- .env Loader ---
def _load_env_file(path: pathlib.Path) -> None:
    """Populate ``os.environ`` from a simple ``KEY=VALUE`` file.

    Variables that already exist in the environment are never overwritten,
    so the real environment takes precedence over the file.  Blank lines,
    lines starting with ``#`` and lines without ``=`` are ignored, as are
    lines whose key is empty.  A leading ``export`` keyword on the key and
    one pair of matching quotes around the value are stripped.

    Progress is reported with ``print`` rather than ``logging``.  Read errors
    are reported and swallowed so a broken file never prevents start-up.

    Args:
        path: Location of the ``.env`` file; a missing file is skipped.
    """
    print(f"Attempting to load .env file from: {path}")
    if not path.is_file():
        print(f".env file not found at {path}, skipping.")
        return

    loaded = skipped = 0
    try:
        # "utf-8-sig" reads plain UTF-8 as well, but drops a leading BOM that
        # would otherwise become part of the first key.
        with open(path, "r", encoding="utf-8-sig") as env_file:
            for raw_line in env_file:
                line = raw_line.strip()
                if not line or line.startswith("#") or "=" not in line:
                    continue

                key, _, val = line.partition("=")
                key = key.strip()
                # Accept shell-style "export KEY=value" lines.
                key_parts = key.split(None, 1)
                if len(key_parts) == 2 and key_parts[0] == "export":
                    key = key_parts[1].strip()
                if not key:
                    # An empty name cannot be set and would abort the whole load.
                    continue

                val = val.strip()
                if len(val) >= 2 and val[0] == val[-1] and val[0] in "\"'":
                    val = val[1:-1]

                if key in os.environ:
                    skipped += 1
                else:
                    os.environ[key] = val
                    loaded += 1
        print(f"Loaded {loaded} new vars, skipped {skipped} existing vars from .env")
    except (OSError, ValueError) as exc:
        # OSError: unreadable file; ValueError: undecodable bytes or a
        # name/value the platform refuses (e.g. an embedded NUL).
        print(f"Error reading .env file at {path}: {exc}")
# Read the .env file first so that a VECMAP_DEBUG entry in it is already in
# os.environ when the log level is chosen below.
_load_env_file(DOTENV_PATH)

# --- Logging Setup ---
# VECMAP_DEBUG of "true" or "1" (case-insensitive) selects DEBUG, anything else INFO.
log_level_str = os.getenv("VECMAP_DEBUG", "false").lower()
if log_level_str in ("true", "1"):
    log_level = logging.DEBUG
else:
    log_level = logging.INFO

logging.basicConfig(
    level=log_level,
    format='[%(asctime)s] [%(levelname)s] [%(name)s:%(lineno)d] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Outside debug mode, raise the threshold of these third-party loggers to WARNING.
if log_level > logging.DEBUG:
    for logger_name in ("urllib3", "httpx", "supabase"):
        logging.getLogger(logger_name).setLevel(logging.WARNING)

logger = logging.getLogger(__name__)
# --- Imports ---
# Project imports are attempted only after sys.path and logging are set up;
# any ImportError is logged with its traceback and ends the process with
# exit status 1.
try:
    from librarian_vspace.vecmap.loader import VectorLoader, VectorLoaderError
    from librarian_vspace.vecmap.visualizer import VectorVisualizer
    # NOTE(review): the next two modules are not referenced by name in this
    # file; presumably imported to fail fast when they are missing - confirm.
    import librarian_vspace.vutils
    import librarian_vspace.vecembed
except ImportError as import_error:
    logger.exception(f"Failed to import necessary modules: {import_error}")
    sys.exit(1)
else:
    logger.debug("Successfully imported components.")
# --- Main Logic ---
def _env_number(name, cast, default, is_valid=None):
    """Parse a numeric setting from the environment, falling back on bad input.

    Args:
        name: Environment variable to read.
        cast: Callable such as ``int`` or ``float`` that converts the raw
            string and raises ``ValueError`` when it cannot.
        default: Value returned when the variable is unset, blank, not
            convertible by ``cast`` or rejected by ``is_valid``.
        is_valid: Optional predicate applied to the converted value for
            range checks.

    Returns:
        The converted value, or ``default``.
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        # Unset and blank are both "not configured": no warning.
        return default
    try:
        value = cast(raw)
    except ValueError:
        value = None
    if value is None or (is_valid is not None and not is_valid(value)):
        fallback = "Ignoring." if default is None else f"Using {default}."
        logger.warning("Invalid %s=%r. %s", name, raw, fallback)
        return default
    return value


def main() -> None:
    """Load the initial vectors and serve the interactive visualizer.

    Settings are read from environment variables (defaults in parentheses):
    ``VECTOR_SCHEMA`` (librarian), ``VECTOR_FUNCTION`` (pdf_chunking),
    ``EMBED_MODEL`` (snowflake-arctic-embed2), ``EMBED_INTERFACE`` (ollama),
    ``EMBEDDING_COLUMN`` (embedding), ``VECMAP_LIMIT`` (none),
    ``VECMAP_PERPLEXITY`` (30.0), ``VECMAP_HOST`` (127.0.0.1) and
    ``VECMAP_PORT`` (8050).  Unparseable or out-of-range numeric values are
    reported with a warning and replaced by their default.

    A failed initial load is logged and the visualizer is still started with
    an empty DataFrame.  A failure while creating or running the visualizer
    is logged and ends the process with exit status 1.
    """
    logger.info("--- Starting VecMap Visualizer ---")

    # --- Configuration ---
    db_schema = os.getenv("VECTOR_SCHEMA", "librarian")
    db_function = os.getenv("VECTOR_FUNCTION", "pdf_chunking")
    model_name = os.getenv("EMBED_MODEL", "snowflake-arctic-embed2")
    interface_name = os.getenv("EMBED_INTERFACE", "ollama")
    embedding_column = os.getenv("EMBEDDING_COLUMN", "embedding")

    # A negative row limit is treated as invalid rather than handed on.
    data_limit = _env_number("VECMAP_LIMIT", int, None, lambda n: n >= 0)
    # Perplexity must be a positive, finite number (the comparison also rejects NaN).
    tsne_perplexity = _env_number(
        "VECMAP_PERPLEXITY", float, 30.0, lambda p: 0 < p < float("inf")
    )
    dash_host = os.getenv("VECMAP_HOST", "127.0.0.1")
    dash_port = _env_number("VECMAP_PORT", int, 8050, lambda p: 0 <= p <= 65535)
    # Server debug mode follows the log level chosen at module import.
    dash_debug = log_level == logging.DEBUG

    logger.info("Effective Configuration:")
    logger.info(" Database: schema=%s, function=%s", db_schema, db_function)
    logger.info(" Model/Interface: model=%s, interface=%s", model_name, interface_name)
    logger.info(" Data Params: column=%s, limit=%s", embedding_column, data_limit)
    logger.info(" Processing: perplexity=%s (n_clusters is now dynamic)", tsne_perplexity)
    logger.info(" Server: host=%s, port=%s, debug=%s", dash_host, dash_port, dash_debug)

    # --- 1. Initial Load and Reduce ---
    # Start from an empty frame so the visualizer can still come up when the
    # initial load fails.
    initial_df_reduced = pd.DataFrame()
    try:
        logger.info("Performing initial data load and processing...")
        loader = VectorLoader(
            schema=db_schema,
            function=db_function,
            model=model_name,
            embedding_column=embedding_column,
        )
        tsne_params = {"perplexity": tsne_perplexity}
        initial_df_reduced = loader.load_and_reduce(limit=data_limit, tsne_params=tsne_params)
        if initial_df_reduced.empty:
            logger.warning("Initial data load resulted in an empty dataset.")
        else:
            logger.info(
                "Successfully loaded and reduced %d vectors initially.",
                len(initial_df_reduced),
            )
    except VectorLoaderError as e:
        logger.error("Initial data load failed: %s", e, exc_info=dash_debug)
    except Exception as e:
        # Deliberately best-effort: tracebacks are only shown in debug mode.
        logger.error("Unexpected error during initial data load: %s", e, exc_info=dash_debug)

    # --- 2. Initialize and Start Visualization ---
    try:
        logger.info("Initializing VectorVisualizer...")
        visualizer = VectorVisualizer(
            initial_data=initial_df_reduced,
            db_schema=db_schema,
            db_function=db_function,
            interface_name=interface_name,
            model_name=model_name,
            embedding_column=embedding_column,
            initial_limit=data_limit,
            initial_perplexity=tsne_perplexity,
        )
        logger.info("Launching visualizer...")
        visualizer.run(host=dash_host, port=dash_port, debug=dash_debug)
    except TypeError as te:
        # Always include the traceback here, regardless of debug mode.
        logger.error("TypeError during VectorVisualizer initialization: %s", te, exc_info=True)
        sys.exit(1)
    except Exception as e:
        logger.error("Failed to initialize or run visualizer: %s", e, exc_info=dash_debug)
        sys.exit(1)

    logger.info("--- VecMap Visualizer finished ---")


if __name__ == "__main__":
    main()