141 lines
6.2 KiB
Python
141 lines
6.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
Loads vector data using vecmap.loader, reduces dimensions via t-SNE,
and launches an interactive 3D visualization using vecmap.visualizer (Dash/Plotly).

Configuration is primarily driven by environment variables.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import pathlib
|
|
import sys
|
|
import pandas as pd
|
|
|
|
# Directory that holds this script file.
APP_DIR = pathlib.Path(__file__).resolve().parent
# Sibling "src" directory next to APP_DIR's parent (holds vecmap, vutils, etc.).
SRC_DIR = APP_DIR.parent / "src"
# Optional .env file located alongside this script.
DOTENV_PATH = APP_DIR / ".env"

# --- Explicitly Manage sys.path ---
# Goal: the script directory is dropped from sys.path (first occurrence only)
# and SRC_DIR ends up as the very first entry.
app_dir_str = str(APP_DIR)
src_dir_str = str(SRC_DIR)

if app_dir_str in sys.path:
    sys.path.remove(app_dir_str)

if not sys.path or sys.path[0] != src_dir_str:
    # Either SRC_DIR is missing or it is present but not first; in the
    # latter case its first occurrence is moved to the front.
    if src_dir_str in sys.path:
        sys.path.remove(src_dir_str)
    sys.path.insert(0, src_dir_str)

print(f"[DEBUG] sys.path start: {sys.path[:3]}")
|
|
|
|
# --- .env Loader ---
|
|
def _load_env_file(path: pathlib.Path) -> None:
|
|
print(f"Attempting to load .env file from: {path}")
|
|
if not path.is_file(): print(f".env file not found at {path}, skipping."); return
|
|
loaded, skipped = 0, 0
|
|
try:
|
|
with open(path, 'r', encoding='utf-8') as f:
|
|
for line in f:
|
|
line = line.strip();
|
|
if not line or line.startswith("#") or "=" not in line: continue
|
|
key, val = line.split("=", 1); key, val = key.strip(), val.strip()
|
|
if key not in os.environ: os.environ[key] = val; loaded += 1
|
|
else: skipped += 1
|
|
print(f"Loaded {loaded} new vars, skipped {skipped} existing vars from .env")
|
|
except Exception as e: print(f"Error reading .env file at {path}: {e}")
|
|
# Load the .env file first so the os.getenv lookups below can see its values.
_load_env_file(DOTENV_PATH)

# --- Logging Setup ---
# VECMAP_DEBUG set to "true" or "1" (case-insensitive) enables DEBUG logging;
# any other value, or no value, means INFO.
log_level_str = os.getenv("VECMAP_DEBUG", "false").lower()
if log_level_str in ("true", "1"):
    log_level = logging.DEBUG
else:
    log_level = logging.INFO

logging.basicConfig(
    level=log_level,
    format="[%(asctime)s] [%(levelname)s] [%(name)s:%(lineno)d] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

if log_level > logging.DEBUG:
    # Outside debug mode, raise these third-party loggers to WARNING.
    for logger_name in ("urllib3", "httpx", "supabase"):
        logging.getLogger(logger_name).setLevel(logging.WARNING)

logger = logging.getLogger(__name__)
|
|
|
|
# --- Imports ---
|
|
# Project packages are imported inside a guard so that a missing or broken
# installation is logged with its traceback and the script exits with
# status 1.
try:
    from librarian_vspace.vecmap.loader import VectorLoader, VectorLoaderError
    from librarian_vspace.vecmap.visualizer import VectorVisualizer
    import librarian_vspace.vutils
    import librarian_vspace.vecembed
except ImportError as e:
    # logger.exception logs at ERROR level and attaches the traceback.
    logger.exception(f"Failed to import necessary modules: {e}")
    sys.exit(1)
else:
    logger.debug("Successfully imported components.")
|
|
|
|
# --- Main Logic ---
|
|
def main() -> None:
    """Read settings from the environment, load the data and run the visualizer.

    Steps:
      1. Collect configuration from environment variables, falling back to
         built-in defaults (invalid numeric values are logged and replaced).
      2. Load and reduce the vectors with ``VectorLoader``. Failures are
         logged and the visualizer is still started with whatever
         ``initial_df_reduced`` holds (an empty DataFrame by default).
      3. Construct ``VectorVisualizer`` and call its ``run`` method. Any
         exception in this step is logged and the process exits with
         status 1.
    """
    logger.info("--- Starting VecMap Visualizer ---")

    # --- Configuration ---
    db_schema = os.getenv("VECTOR_SCHEMA", "librarian")
    db_function = os.getenv("VECTOR_FUNCTION", "pdf_chunking")
    model_name = os.getenv("EMBED_MODEL", "snowflake-arctic-embed2")
    interface_name = os.getenv("EMBED_INTERFACE", "ollama")
    embedding_column = os.getenv("EMBEDDING_COLUMN", "embedding")

    # Unset or empty VECMAP_LIMIT means "no limit" (None).
    raw_limit = os.getenv("VECMAP_LIMIT")
    try:
        data_limit = int(raw_limit) if raw_limit else None
    except ValueError:
        logger.warning("Invalid VECMAP_LIMIT. Ignoring.")
        data_limit = None

    raw_perplexity = os.getenv("VECMAP_PERPLEXITY", "30.0")
    try:
        tsne_perplexity = float(raw_perplexity)
    except ValueError:
        logger.warning("Invalid VECMAP_PERPLEXITY. Using 30.0.")
        tsne_perplexity = 30.0

    # n_clusters configuration removed

    dash_host = os.getenv("VECMAP_HOST", "127.0.0.1")
    raw_port = os.getenv("VECMAP_PORT", "8050")
    try:
        dash_port = int(raw_port)
    except ValueError:
        logger.warning("Invalid VECMAP_PORT. Using 8050.")
        dash_port = 8050

    # The server's debug flag follows the module-level log level.
    dash_debug = log_level == logging.DEBUG

    logger.info("Effective Configuration:")
    logger.info(f" Database: schema={db_schema}, function={db_function}")
    logger.info(f" Model/Interface: model={model_name}, interface={interface_name}")
    logger.info(f" Data Params: column={embedding_column}, limit={data_limit}")
    logger.info(f" Processing: perplexity={tsne_perplexity} (n_clusters is now dynamic)")
    logger.info(f" Server: host={dash_host}, port={dash_port}, debug={dash_debug}")

    # --- 1. Initial Load and Reduce ---
    # Start from an empty frame so the visualizer can still be created when
    # the load below fails.
    initial_df_reduced = pd.DataFrame()
    try:
        logger.info("Performing initial data load and processing...")
        loader = VectorLoader(
            schema=db_schema,
            function=db_function,
            model=model_name,
            embedding_column=embedding_column,
        )
        initial_df_reduced = loader.load_and_reduce(
            limit=data_limit,
            tsne_params={"perplexity": tsne_perplexity},
        )
        if not initial_df_reduced.empty:
            logger.info(f"Successfully loaded and reduced {len(initial_df_reduced)} vectors initially.")
        else:
            logger.warning("Initial data load resulted in an empty dataset.")
    except VectorLoaderError as e:
        logger.error(f"Initial data load failed: {e}", exc_info=dash_debug)
    except Exception as e:
        logger.error(f"Unexpected error during initial data load: {e}", exc_info=dash_debug)

    # --- 2. Initialize and Start Visualization ---
    try:
        logger.info("Initializing VectorVisualizer...")
        visualizer = VectorVisualizer(
            initial_data=initial_df_reduced,
            db_schema=db_schema,
            db_function=db_function,
            interface_name=interface_name,
            model_name=model_name,
            embedding_column=embedding_column,
            initial_limit=data_limit,
            initial_perplexity=tsne_perplexity,
            # n_clusters argument removed
        )
        logger.info("Launching visualizer...")
        visualizer.run(host=dash_host, port=dash_port, debug=dash_debug)
    except TypeError as te:
        # Always include the traceback for TypeErrors raised in this step.
        logger.exception(f"TypeError during VectorVisualizer initialization: {te}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Failed to initialize or run visualizer: {e}", exc_info=dash_debug)
        sys.exit(1)

    logger.info("--- VecMap Visualizer finished ---")


if __name__ == "__main__":
    main()