"""Configuration module for RAG system.
Central configuration hub for all system parameters including:
- LLM Model Selection and Parameters: Chat, summarization, temperature settings
- Embedding Model: Vector representation for semantic search
- Chunking Strategy: Document segmentation for retrieval
- Vector Database: Qdrant configuration for similarity search
- Inference Infrastructure: Ollama local LLM serving
- Chat History: Redis backend for multi-turn conversation state
- Evaluation Metrics: DeepEval LLM-as-Judge configuration
- CORS/Security: Frontend origin whitelisting
- Performance: Token streaming and dummy response simulation
All configuration can be overridden via environment variables or direct modification.
For production deployments, review EVALUATION_TIMEOUT, REDIS_URL, and OLLAMA_BASE_URL.
CST Timezone: All timestamps use the America/Chicago timezone (CST/CDT) for consistent logging across deployments.
"""
import os
from typing import Optional
from dotenv import load_dotenv
# Load environment variables from .env file
# override=False: values already set in the real environment take precedence over the .env file
load_dotenv(override=False)
# ============================================================================
# LLM Model Configuration
# ============================================================================
LLM_CHAT_MODEL_NAME: str = "gemma3:latest" # Main chat model (Ollama)
LLM_CHAT_TEMPERATURE: float = 0.75 # Temperature: 0=deterministic, 1=creative
LLM_SUMMARY_MODEL_NAME: str = "gemma3:latest" # For conversation summarization
LLM_SUMMARY_TEMPERATURE: float = 0.5 # Lower temp for consistent summaries
EMB_MODEL_NAME: str = "mxbai-embed-large:latest" # Embedding model for semantic search (deprecated, kept for backward compatibility)
# ============================================================================
# Jina v4 Multi-Modal Embeddings Configuration
# ============================================================================
# Enable Jina v4 for multi-modal embeddings (text + images)
USE_JINA_EMBEDDINGS: bool = os.getenv("USE_JINA_EMBEDDINGS", "true").lower() == "true"
# Jina v4 model settings
JINA_MODEL_NAME: str = "jinaai/jina-embeddings-v4" # HuggingFace model identifier
JINA_TASK: str = "retrieval" # Task-specific adapter: 'retrieval', 'text-matching', 'code'
JINA_EMBEDDING_DIM: int = 2048 # Default dimension (can be truncated to 1024, 512, 256, 128)
JINA_EMBEDDING_DIM_TRUNCATE: int = int(os.getenv("JINA_EMBEDDING_DIM_TRUNCATE", "1024")) # Truncate to save memory
JINA_DEVICE: str = os.getenv("JINA_DEVICE", "cuda") # 'cuda' or 'cpu'
JINA_BATCH_SIZE: int = 32 # Batch size for inference
JINA_MAX_LENGTH: int = 32768 # Max sequence length
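# Illustrative sketch (not invoked anywhere in this module) of how the Jina
# settings above might be wired together. The model is loaded with
# trust_remote_code; the `encode_text` helper name follows the
# jinaai/jina-embeddings-v4 model card and is an assumption here, so verify it
# against the installed version before relying on it.
def _jina_text_embedding_sketch(texts: list) -> list:
    """Embed a list of strings using the Jina v4 settings defined above."""
    from transformers import AutoModel  # local import keeps config importable without torch installed
    model = AutoModel.from_pretrained(JINA_MODEL_NAME, trust_remote_code=True).to(JINA_DEVICE)
    # JINA_TASK selects the task-specific adapter ('retrieval' here).
    vectors = model.encode_text(texts=texts, task=JINA_TASK)
    # Matryoshka-style truncation: keep only the first N dimensions to save memory.
    return [v[:JINA_EMBEDDING_DIM_TRUNCATE] for v in vectors]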
# Image extraction settings for multi-modal documents
EXTRACT_IMAGES_FROM_PDF: bool = True # Extract images from PDFs
IMAGE_OUTPUT_DIR: str = os.getenv("IMAGE_OUTPUT_DIR", "user_uploads/extracted_images") # Where to store extracted images (relative to server dir)
IMAGE_MAX_SIZE: tuple = (1024, 1024) # Resize images to this size
IMAGE_FORMAT: str = "PNG" # Image format (PNG, JPEG, WEBP)
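# Illustrative sketch of the image-extraction settings above, assuming PyMuPDF
# (fitz) and Pillow handle extraction and resizing. Neither library is mandated
# by this config, so treat this as one possible wiring, not the project's code.
def _extract_pdf_images_sketch(pdf_path: str) -> list:
    """Pull embedded images out of a PDF, resize them, and save to IMAGE_OUTPUT_DIR."""
    import io
    import fitz  # PyMuPDF
    from PIL import Image
    os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)
    saved_paths = []
    doc = fitz.open(pdf_path)
    for page_index, page in enumerate(doc):
        for img_index, img in enumerate(page.get_images(full=True)):
            raw = doc.extract_image(img[0])["image"]
            image = Image.open(io.BytesIO(raw)).convert("RGB")
            image.thumbnail(IMAGE_MAX_SIZE)  # downscale in place, keeping aspect ratio
            out_path = os.path.join(
                IMAGE_OUTPUT_DIR, f"p{page_index}_{img_index}.{IMAGE_FORMAT.lower()}"
            )
            image.save(out_path, format=IMAGE_FORMAT)
            saved_paths.append(out_path)
    return saved_paths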
# ============================================================================
# Content & Token Management
# ============================================================================
# Maximum total tokens allowed in context window (chat_history + input + context)
# Adjust based on model capability (Gemma 3: ~14k is a practical budget for a 32k-token context window)
MAX_CONTENT_SIZE: int = 14000
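# Rough illustration of how MAX_CONTENT_SIZE can be enforced with the same
# ~4-characters-per-token heuristic used for chunking below (an approximation,
# not a real tokenizer; the actual budgeting logic may differ).
def _fits_context_budget_sketch(chat_history: str, user_input: str, context: str) -> bool:
    """Return True if the combined prompt stays under MAX_CONTENT_SIZE tokens."""
    estimated_tokens = (len(chat_history) + len(user_input) + len(context)) // 4
    return estimated_tokens <= MAX_CONTENT_SIZE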
# ============================================================================
# Connection Verification
# ============================================================================
# Whether to verify LLM/embedding model availability at startup
# Set to True for Docker/production, False for local development
VERIFY_LLM_CONNECTION: bool = False
VERIFY_EMB_CONNECTION: bool = False
# ============================================================================
# Document Chunking Strategy
# ============================================================================
# Character limit per chunk (larger = fewer chunks but less precision)
DOC_CHAR_LIMIT: int = 2000
# Overlap between chunks for context continuity (prevents semantic breaks)
DOC_OVERLAP_NO: int = 250
# Token-based retrieval calculations (roughly 4 chars per token)
DOC_TOKEN_SIZE: int = DOC_CHAR_LIMIT // 4 # ~500 tokens per chunk
DOCS_NUM_COUNT: int = 3000 // DOC_TOKEN_SIZE # ~6 documents retrieved
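# One common way these chunking numbers are consumed: a character-based splitter
# such as LangChain's RecursiveCharacterTextSplitter (assumed here for
# illustration; this config does not dictate the splitter implementation).
def _build_text_splitter_sketch():
    """Illustrative splitter wired to the chunking settings above."""
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    return RecursiveCharacterTextSplitter(
        chunk_size=DOC_CHAR_LIMIT,      # ~500 tokens per chunk at ~4 chars/token
        chunk_overlap=DOC_OVERLAP_NO,   # overlap preserves context across chunk boundaries
    )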
# ============================================================================
# Infrastructure: Ollama & Qdrant
# ============================================================================
# Ollama Configuration (local LLM inference)
# Use 'ollama' hostname in Docker Compose, 'localhost' for local development
OLLAMA_BASE_URL: str = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
# Vector Database Configuration (Qdrant)
# Use 'qdrant' hostname in Docker Compose, 'localhost' for local development
QDRANT_URL: str = os.getenv("QDRANT_URL", "http://localhost:6333")
QDRANT_COLLECTION_NAME: str = "rag_documents" # Collection stores all document chunks
QDRANT_API_KEY: Optional[str] = os.getenv("QDRANT_API_KEY", None) # Optional for Qdrant Cloud
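# Illustrative client construction from the infrastructure settings above,
# assuming the qdrant-client and langchain-ollama packages are in use
# (a reasonable pairing for an Ollama + Qdrant stack, but not fixed by this file).
def _build_clients_sketch():
    """Return (qdrant_client, chat_llm) built from this config."""
    from qdrant_client import QdrantClient
    from langchain_ollama import ChatOllama
    qdrant = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
    llm = ChatOllama(
        model=LLM_CHAT_MODEL_NAME,
        base_url=OLLAMA_BASE_URL,
        temperature=LLM_CHAT_TEMPERATURE,
    )
    return qdrant, llm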
# ============================================================================
# Response Streaming (Dummy Mode)
# ============================================================================
# Simulate token-by-token streaming in dummy/test mode
TOKENS_PER_SEC: int = 50 # Tokens yielded per second
BATCH_TOKEN_PS: int = 2 # Tokens per batch (for realism)
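# Sketch of how the dummy-streaming knobs above translate into pacing: yield
# BATCH_TOKEN_PS tokens at a time, sleeping so the overall rate is roughly
# TOKENS_PER_SEC. Pure illustration; the real handler may pace differently.
def _dummy_stream_sketch(text: str):
    """Yield the text in small batches to mimic token-by-token streaming."""
    import time
    tokens = text.split()
    delay = BATCH_TOKEN_PS / TOKENS_PER_SEC  # seconds to wait per batch
    for i in range(0, len(tokens), BATCH_TOKEN_PS):
        yield " ".join(tokens[i:i + BATCH_TOKEN_PS]) + " "
        time.sleep(delay)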
# ============================================================================
# Security & CORS
# ============================================================================
# Allowed frontend origins (for browser-based requests)
# Add Streamlit frontend and any external services here
ALLOWED_ORIGINS: list = [
"http://localhost:8501", # Local Streamlit development
"http://127.0.0.1:5500", # Local Live Server
]
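# How ALLOWED_ORIGINS is typically applied, assuming a FastAPI backend (the API
# framework is not pinned down by this config file).
def _apply_cors_sketch(app):
    """Attach a CORS policy restricted to the whitelisted frontend origins."""
    from fastapi.middleware.cors import CORSMiddleware
    app.add_middleware(
        CORSMiddleware,
        allow_origins=ALLOWED_ORIGINS,   # only the frontends listed above
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )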
# ============================================================================
# Chat History & Persistence
# ============================================================================
# Backend for conversation state: 'memory' or 'redis'
# Use 'redis' for production (persistent across restarts)
# Use 'memory' for lightweight testing
HISTORY_BACKEND: str = "redis"
# Redis connection string
# Use 'redis' hostname in Docker Compose, 'localhost' for local development
REDIS_URL: Optional[str] = os.getenv("REDIS_URL", "redis://localhost:6379/0")
# Session TTL in seconds (0 = no expiry, 2592000 = 30 days)
# Set a positive value to auto-expire old sessions and save Redis memory
REDIS_HISTORY_TTL_SECONDS: int = 0
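# Sketch of a Redis-backed history built from the settings above, assuming
# LangChain's RedisChatMessageHistory as the wrapper (one common choice; the
# backend actually used by this project may differ).
def _history_for_session_sketch(session_id: str):
    """Return a persistent message history for one chat session."""
    from langchain_community.chat_message_histories import RedisChatMessageHistory
    return RedisChatMessageHistory(
        session_id=session_id,
        url=REDIS_URL,
        ttl=REDIS_HISTORY_TTL_SECONDS or None,  # 0 -> never expire
    )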
# ============================================================================
# Evaluation & Metrics (DeepEval)
# ============================================================================
# Enable/disable LLM-as-Judge evaluation metrics
# False = faster response times (sub-100ms cache hits are unaffected either way)
# True = adds 5-8s for faithfulness & answer relevancy evaluation
ENABLE_METRICS_EVALUATION: bool = os.getenv("ENABLE_METRICS_EVALUATION", "false").lower() == "true"
# Timeout for complete evaluation suite (seconds)
# Production: 3-5s, Development: 8-10s
EVALUATION_TIMEOUT: float = float(os.getenv("EVALUATION_TIMEOUT", "8.0"))
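# Sketch of how the evaluation flags above could gate a DeepEval run, with
# EVALUATION_TIMEOUT enforced via a worker thread. Judge-model configuration
# for the metrics is omitted here and assumed to live elsewhere; this is an
# illustration, not the project's evaluation pipeline.
def _evaluate_answer_sketch(question: str, answer: str, contexts: list) -> dict:
    """Return {metric_name: score}, or {} when evaluation is disabled or times out."""
    if not ENABLE_METRICS_EVALUATION:
        return {}
    from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout
    from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
    from deepeval.test_case import LLMTestCase
    test_case = LLMTestCase(input=question, actual_output=answer, retrieval_context=contexts)
    metrics = [FaithfulnessMetric(), AnswerRelevancyMetric()]
    def _run() -> dict:
        scores = {}
        for metric in metrics:
            metric.measure(test_case)              # populates metric.score
            scores[metric.__class__.__name__] = metric.score
        return scores
    with ThreadPoolExecutor(max_workers=1) as pool:
        try:
            return pool.submit(_run).result(timeout=EVALUATION_TIMEOUT)
        except FuturesTimeout:
            return {}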