#!/usr/bin/env python3
"""
Loads vector data using vecmap.loader, reduces dimensions via t-SNE,
and launches an interactive 3D visualization using vecmap.visualizer (Dash/Plotly).

Configuration is primarily driven by environment variables.
"""
from __future__ import annotations

import logging
import os
import pathlib
import sys
from typing import Callable, TypeVar

import pandas as pd

# Define application directory relative to this script file
APP_DIR = pathlib.Path(__file__).resolve().parent
# Define the source directory containing vecmap, vutils, etc.
SRC_DIR = APP_DIR.parent / "src"
# Define path to .env file relative to APP_DIR
DOTENV_PATH = APP_DIR / ".env"

# --- Explicitly Manage sys.path ---
# The application directory is taken off sys.path and SRC_DIR is forced to the
# front so that imports resolve against SRC_DIR first.
app_dir_str = str(APP_DIR)
src_dir_str = str(SRC_DIR)

if app_dir_str in sys.path:
    sys.path.remove(app_dir_str)

if not sys.path or sys.path[0] != src_dir_str:
    # Drop a lower-priority occurrence (if any) before re-inserting at the front.
    if src_dir_str in sys.path:
        sys.path.remove(src_dir_str)
    sys.path.insert(0, src_dir_str)

print(f"[DEBUG] sys.path start: {sys.path[:3]}")


# --- .env Loader ---
def _load_env_file(path: pathlib.Path) -> None:
    """Populate ``os.environ`` from a simple ``KEY=VALUE`` file.

    Blank lines, lines starting with ``#`` and lines without ``=`` are ignored.
    Variables that already exist in the environment are never overwritten.
    A value wrapped in a matching pair of single or double quotes has the
    quotes removed. Malformed entries are reported and skipped individually
    instead of aborting the rest of the file.

    Progress and problems are reported with ``print`` because logging is
    configured only after this function has run.

    Args:
        path: Location of the ``.env`` file. A missing file is not an error.
    """
    print(f"Attempting to load .env file from: {path}")
    if not path.is_file():
        print(f".env file not found at {path}, skipping.")
        return

    loaded = 0
    skipped = 0
    try:
        # "utf-8-sig" drops a leading BOM, which would otherwise become part
        # of the first variable name.
        with open(path, encoding="utf-8-sig") as env_file:
            for raw_line in env_file:
                entry = raw_line.strip()
                if not entry or entry.startswith("#") or "=" not in entry:
                    continue

                key, _, val = entry.partition("=")
                key = key.strip()
                val = val.strip()

                if not key:
                    # A line such as "=value" has no usable variable name.
                    print("Skipping .env entry with an empty variable name.")
                    continue

                # KEY="value" / KEY='value' -> value
                if len(val) >= 2 and val[0] == val[-1] and val[0] in ("'", '"'):
                    val = val[1:-1]

                if key in os.environ:
                    skipped += 1
                    continue

                try:
                    os.environ[key] = val
                except (ValueError, OSError) as exc:
                    # e.g. an embedded NUL byte; keep processing later lines.
                    print(f"Skipping invalid .env entry {key!r}: {exc}")
                else:
                    loaded += 1
        print(f"Loaded {loaded} new vars, skipped {skipped} existing vars from .env")
    except (OSError, UnicodeError) as e:
        print(f"Error reading .env file at {path}: {e}")


_load_env_file(DOTENV_PATH)

# --- Logging Setup ---
log_level_str = os.getenv("VECMAP_DEBUG", "false").strip().lower()
log_level = logging.DEBUG if log_level_str in ("true", "1") else logging.INFO
logging.basicConfig(
    level=log_level,
    format='[%(asctime)s] [%(levelname)s] [%(name)s:%(lineno)d] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
if log_level > logging.DEBUG:
    # Outside debug mode, raise these third-party loggers to WARNING.
    for logger_name in ["urllib3", "httpx", "supabase"]:
        logging.getLogger(logger_name).setLevel(logging.WARNING)
logger = logging.getLogger(__name__)

# --- Imports ---
# Project imports happen only after sys.path has been adjusted above.
try:
    from librarian_vspace.vecmap.loader import VectorLoader, VectorLoaderError
    from librarian_vspace.vecmap.visualizer import VectorVisualizer  # Removed DEFAULT_N_CLUSTERS import
    import librarian_vspace.vutils
    import librarian_vspace.vecembed
    logger.debug("Successfully imported components.")
except ImportError as e:
    logger.error(f"Failed to import necessary modules: {e}", exc_info=True)
    sys.exit(1)


# --- Main Logic ---
_T = TypeVar("_T", int, float)


def _env_number(name: str, cast: Callable[[str], _T], default: _T | None) -> _T | None:
    """Read a numeric environment variable, falling back to *default*.

    An unset or blank variable yields *default* silently. A value that *cast*
    rejects with ``ValueError`` yields *default* and logs a warning that
    includes the rejected text.

    Args:
        name: Environment variable name.
        cast: Converter applied to the raw string (``int`` or ``float``).
        default: Value returned when the variable is unset, blank or invalid.

    Returns:
        The converted value, or *default*.
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        return cast(raw.strip())
    except ValueError:
        fallback = "Ignoring." if default is None else f"Using {default}."
        logger.warning("Invalid %s=%r. %s", name, raw, fallback)
        return default


def main() -> None:
    """Run the visualizer application.

    Reads configuration from environment variables, performs an initial
    load-and-reduce through ``VectorLoader`` and hands the result to
    ``VectorVisualizer``. A failed initial load is logged and the visualizer is
    still started with an empty DataFrame. A failure while creating or running
    the visualizer terminates the process with exit status 1.
    """
    logger.info("--- Starting VecMap Visualizer ---")

    # --- Configuration ---
    db_schema = os.getenv("VECTOR_SCHEMA", "librarian")
    db_function = os.getenv("VECTOR_FUNCTION", "pdf_chunking")
    model_name = os.getenv("EMBED_MODEL", "snowflake-arctic-embed2")
    interface_name = os.getenv("EMBED_INTERFACE", "ollama")
    embedding_column = os.getenv("EMBEDDING_COLUMN", "embedding")

    data_limit = _env_number("VECMAP_LIMIT", int, None)
    tsne_perplexity = _env_number("VECMAP_PERPLEXITY", float, 30.0)
    # n_clusters configuration removed
    dash_host = os.getenv("VECMAP_HOST", "127.0.0.1")
    dash_port = _env_number("VECMAP_PORT", int, 8050)
    # The server's debug flag follows the logging level chosen at start-up.
    dash_debug = log_level == logging.DEBUG

    logger.info("Effective Configuration:")
    logger.info(" Database: schema=%s, function=%s", db_schema, db_function)
    logger.info(" Model/Interface: model=%s, interface=%s", model_name, interface_name)
    logger.info(" Data Params: column=%s, limit=%s", embedding_column, data_limit)
    logger.info(" Processing: perplexity=%s (n_clusters is now dynamic)", tsne_perplexity)  # Updated log
    logger.info(" Server: host=%s, port=%s, debug=%s", dash_host, dash_port, dash_debug)

    # --- 1. Initial Load and Reduce ---
    # Start from an empty frame so the visualizer receives a DataFrame even
    # when the initial load fails.
    initial_df_reduced = pd.DataFrame()
    try:
        logger.info("Performing initial data load and processing...")
        loader = VectorLoader(
            schema=db_schema,
            function=db_function,
            model=model_name,
            embedding_column=embedding_column,
        )
        reduced = loader.load_and_reduce(
            limit=data_limit,
            tsne_params={"perplexity": tsne_perplexity},
        )
        # Inspect the result before committing it: if the check itself fails,
        # the empty placeholder above is kept instead of a half-validated value.
        if reduced is None or reduced.empty:
            logger.warning("Initial data load resulted in an empty dataset.")
        else:
            logger.info("Successfully loaded and reduced %d vectors initially.", len(reduced))
        if reduced is not None:
            initial_df_reduced = reduced
    except VectorLoaderError as e:
        logger.error("Initial data load failed: %s", e, exc_info=dash_debug)
    except Exception as e:
        # Best effort: any failure here is logged and start-up continues.
        logger.error("Unexpected error during initial data load: %s", e, exc_info=dash_debug)

    # --- 2. Initialize and Start Visualization ---
    try:
        logger.info("Initializing VectorVisualizer...")
        visualizer = VectorVisualizer(
            initial_data=initial_df_reduced,
            db_schema=db_schema,
            db_function=db_function,
            interface_name=interface_name,
            model_name=model_name,
            embedding_column=embedding_column,
            initial_limit=data_limit,
            initial_perplexity=tsne_perplexity,
            # n_clusters argument removed
        )
        logger.info("Launching visualizer...")
        visualizer.run(host=dash_host, port=dash_port, debug=dash_debug)
    except TypeError as te:
        # Always log the traceback for TypeError, regardless of debug mode.
        logger.error("TypeError during VectorVisualizer initialization: %s", te, exc_info=True)
        sys.exit(1)
    except Exception as e:
        logger.error("Failed to initialize or run visualizer: %s", e, exc_info=dash_debug)
        sys.exit(1)

    logger.info("--- VecMap Visualizer finished ---")


if __name__ == "__main__":
    main()