| """Configuration module for RAG system. | |
| Central configuration hub for all system parameters including: | |
| - LLM Model Selection and Parameters: Chat, summarization, temperature settings | |
| - Embedding Model: Vector representation for semantic search | |
| - Chunking Strategy: Document segmentation for retrieval | |
| - Vector Database: Qdrant configuration for similarity search | |
| - Inference Infrastructure: Ollama local LLM serving | |
| - Chat History: Redis backend for multi-turn conversation state | |
| - Evaluation Metrics: DeepEval LLM-as-Judge configuration | |
| - CORS/Security: Frontend origin whitelisting | |
| - Performance: Token streaming and dummy response simulation | |
| All configuration can be overridden via environment variables or direct modification. | |
| For production deployments, review EVALUATION_TIMEOUT, REDIS_URL, and OLLAMA_BASE_URL. | |
| CST Timezone: All timestamps use CST (America/Chicago) for consistent logging across deployment. | |
| """ | |
import os
from typing import Optional

from dotenv import load_dotenv

# Load environment variables from .env file
# override=False ensures environment variables take precedence over .env file
load_dotenv(override=False)
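# Illustrative only: with override=False, a variable exported in the shell wins over
# the same key in .env. Example (hypothetical values, not part of this module):
#   shell:  export OLLAMA_BASE_URL=http://ollama:11434
#   .env:   OLLAMA_BASE_URL=http://localhost:11434
#   result: os.getenv("OLLAMA_BASE_URL") -> "http://ollama:11434"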
# ============================================================================
# LLM Model Configuration
# ============================================================================
LLM_CHAT_MODEL_NAME: str = "gemma3:latest"  # Main chat model (Ollama)
LLM_CHAT_TEMPERATURE: float = 0.75  # Temperature: 0=deterministic, 1=creative
LLM_SUMMARY_MODEL_NAME: str = "gemma3:latest"  # For conversation summarization
LLM_SUMMARY_TEMPERATURE: float = 0.5  # Lower temp for consistent summaries
EMB_MODEL_NAME: str = "mxbai-embed-large:latest"  # Embedding model for semantic search (deprecated, kept for backward compatibility)
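# Minimal sketch of how a caller might consume these settings with LangChain's
# Ollama wrapper (assumes the `langchain-ollama` package; nothing is imported or
# instantiated here, and the variable name `chat_llm` is illustrative):
#   from langchain_ollama import ChatOllama
#   chat_llm = ChatOllama(
#       model=LLM_CHAT_MODEL_NAME,
#       temperature=LLM_CHAT_TEMPERATURE,
#       base_url=OLLAMA_BASE_URL,  # defined in the infrastructure section below
#   )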
# ============================================================================
# Jina v4 Multi-Modal Embeddings Configuration
# ============================================================================
# Enable Jina v4 for multi-modal embeddings (text + images)
USE_JINA_EMBEDDINGS: bool = os.getenv("USE_JINA_EMBEDDINGS", "true").lower() == "true"
# Jina v4 model settings
JINA_MODEL_NAME: str = "jinaai/jina-embeddings-v4"  # HuggingFace model identifier
JINA_TASK: str = "retrieval"  # Task-specific adapter: 'retrieval', 'text-matching', 'code'
JINA_EMBEDDING_DIM: int = 2048  # Default dimension (can be truncated to 1024, 512, 256, 128)
JINA_EMBEDDING_DIM_TRUNCATE: int = int(os.getenv("JINA_EMBEDDING_DIM_TRUNCATE", "1024"))  # Truncate to save memory
JINA_DEVICE: str = os.getenv("JINA_DEVICE", "cuda")  # 'cuda' or 'cpu'
JINA_BATCH_SIZE: int = 32  # Batch size for inference
JINA_MAX_LENGTH: int = 32768  # Max sequence length
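# Sketch of the truncation implied by JINA_EMBEDDING_DIM_TRUNCATE (assumes the
# embedding arrives as a full 2048-dim numpy vector; the helper name and the
# re-normalization step are illustrative, not part of this module):
#   import numpy as np
#   def truncate_embedding(vec: np.ndarray, dim: int = JINA_EMBEDDING_DIM_TRUNCATE) -> np.ndarray:
#       truncated = vec[:dim]  # keep the leading dimensions
#       norm = np.linalg.norm(truncated)
#       return truncated / norm if norm > 0 else truncated  # re-normalize for cosine search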
# Image extraction settings for multi-modal documents
EXTRACT_IMAGES_FROM_PDF: bool = True  # Extract images from PDFs
IMAGE_OUTPUT_DIR: str = os.getenv("IMAGE_OUTPUT_DIR", "user_uploads/extracted_images")  # Where to store extracted images (relative to server dir)
IMAGE_MAX_SIZE: tuple = (1024, 1024)  # Resize images to this size
IMAGE_FORMAT: str = "PNG"  # Image format (PNG, JPEG, WEBP)
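# Sketch of how an ingestion step might apply IMAGE_MAX_SIZE and IMAGE_FORMAT with
# Pillow (assumes the `Pillow` package; the function name and paths are illustrative):
#   from pathlib import Path
#   from PIL import Image
#   def save_extracted_image(img: "Image.Image", name: str) -> str:
#       img.thumbnail(IMAGE_MAX_SIZE)  # resizes in place, preserving aspect ratio
#       out_path = Path(IMAGE_OUTPUT_DIR) / f"{name}.{IMAGE_FORMAT.lower()}"
#       out_path.parent.mkdir(parents=True, exist_ok=True)
#       img.save(out_path, format=IMAGE_FORMAT)
#       return str(out_path)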
# ============================================================================
# Content & Token Management
# ============================================================================
# Maximum total tokens allowed in the context window (chat_history + input + context)
# Adjust based on model capability (Gemma3 has a 32k context; ~14k is a practical budget here)
MAX_CONTENT_SIZE: int = 14000
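# Rough budget check a request handler could apply before calling the model,
# using the same ~4 chars/token heuristic as the chunking section below
# (hypothetical helper, not used by this module):
#   def fits_context(chat_history: str, user_input: str, context: str) -> bool:
#       approx_tokens = (len(chat_history) + len(user_input) + len(context)) // 4
#       return approx_tokens <= MAX_CONTENT_SIZE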
# ============================================================================
# Connection Verification
# ============================================================================
# Whether to verify LLM/embedding model availability at startup
# Set to True for Docker/production, False for local development
VERIFY_LLM_CONNECTION: bool = False
VERIFY_EMB_CONNECTION: bool = False
# ============================================================================
# Document Chunking Strategy
# ============================================================================
# Character limit per chunk (larger = fewer chunks but less precision)
DOC_CHAR_LIMIT: int = 2000
# Overlap between chunks for context continuity (prevents semantic breaks)
DOC_OVERLAP_NO: int = 250
# Token-based retrieval calculations (roughly 4 chars per token)
DOC_TOKEN_SIZE: int = DOC_CHAR_LIMIT // 4  # ~500 tokens per chunk
DOCS_NUM_COUNT: int = 3000 // DOC_TOKEN_SIZE  # ~6 documents retrieved
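# Sketch of a splitter consuming these values (assumes the `langchain-text-splitters`
# package; nothing here is imported or executed by this module):
#   from langchain_text_splitters import RecursiveCharacterTextSplitter
#   splitter = RecursiveCharacterTextSplitter(
#       chunk_size=DOC_CHAR_LIMIT,     # 2000 chars ≈ 500 tokens
#       chunk_overlap=DOC_OVERLAP_NO,  # 250-char overlap between neighboring chunks
#   )
#   chunks = splitter.split_text(document_text)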
# ============================================================================
# Infrastructure: Ollama & Qdrant
# ============================================================================
# Ollama Configuration (local LLM inference)
# Use 'ollama' hostname in Docker Compose, 'localhost' for local development
OLLAMA_BASE_URL: str = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
# Vector Database Configuration (Qdrant)
# Use 'qdrant' hostname in Docker Compose, 'localhost' for local development
QDRANT_URL: str = os.getenv("QDRANT_URL", "http://localhost:6333")
QDRANT_COLLECTION_NAME: str = "rag_documents"  # Collection stores all document chunks
QDRANT_API_KEY: Optional[str] = os.getenv("QDRANT_API_KEY", None)  # Optional for Qdrant Cloud
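# Sketch of a client built from these settings (assumes the `qdrant-client`
# package; kept as a comment so importing this module has no side effects):
#   from qdrant_client import QdrantClient
#   client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
#   client.get_collection(QDRANT_COLLECTION_NAME)  # raises if the collection is missing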
# ============================================================================
# Response Streaming (Dummy Mode)
# ============================================================================
# Simulate token-by-token streaming in dummy/test mode
TOKENS_PER_SEC: int = 50  # Tokens yielded per second
BATCH_TOKEN_PS: int = 2  # Tokens per batch (for realism)
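# Sketch of the pacing these two values imply for a dummy streamer
# (hypothetical generator, not defined or used by this module):
#   import time
#   def stream_dummy_tokens(tokens: list):
#       delay = BATCH_TOKEN_PS / TOKENS_PER_SEC  # 2 / 50 = 0.04s per batch
#       for i in range(0, len(tokens), BATCH_TOKEN_PS):
#           yield "".join(tokens[i:i + BATCH_TOKEN_PS])
#           time.sleep(delay)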
# ============================================================================
# Security & CORS
# ============================================================================
# Allowed frontend origins (for browser-based requests)
# Add the Streamlit frontend and any external services here
ALLOWED_ORIGINS: list = [
    "http://localhost:8501",  # Local Streamlit development
    "http://127.0.0.1:5500",  # Local Live Server
]
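# Sketch of how the API layer might apply this whitelist (assumes FastAPI's
# CORSMiddleware; the `app` object lives outside this module):
#   from fastapi.middleware.cors import CORSMiddleware
#   app.add_middleware(
#       CORSMiddleware,
#       allow_origins=ALLOWED_ORIGINS,
#       allow_credentials=True,
#       allow_methods=["*"],
#       allow_headers=["*"],
#   )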
# ============================================================================
# Chat History & Persistence
# ============================================================================
# Backend for conversation state: 'memory' or 'redis'
# Use 'redis' for production (persistent across restarts)
# Use 'memory' for lightweight testing
HISTORY_BACKEND: str = "redis"
# Redis connection string
# Use 'redis' hostname in Docker Compose, 'localhost' for local development
REDIS_URL: Optional[str] = os.getenv("REDIS_URL", "redis://localhost:6379/0")
# Session TTL in seconds (0 = no expiry, 2592000 = 30 days)
# Set to auto-expire old sessions to save memory
REDIS_HISTORY_TTL_SECONDS: int = 0
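# Sketch of how a Redis-backed history store could honor the TTL (assumes the
# `redis` package; the key naming is hypothetical):
#   import redis
#   r = redis.Redis.from_url(REDIS_URL)
#   r.rpush("chat:session-123", "user: hello")
#   if REDIS_HISTORY_TTL_SECONDS > 0:
#       r.expire("chat:session-123", REDIS_HISTORY_TTL_SECONDS)  # 0 means keep forever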
# ============================================================================
# Evaluation & Metrics (DeepEval)
# ============================================================================
# Enable/disable LLM-as-Judge evaluation metrics
# False = faster response times (cache hits stay <100ms either way)
# True = adds 5-8s for faithfulness & answer relevancy evaluation
ENABLE_METRICS_EVALUATION: bool = os.getenv("ENABLE_METRICS_EVALUATION", "false").lower() == "true"
# Timeout for the complete evaluation suite (seconds)
# Production: 3-5s, Development: 8-10s
EVALUATION_TIMEOUT: float = float(os.getenv("EVALUATION_TIMEOUT", "8.0"))
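# Sketch of how a caller might enforce this budget around the evaluation run
# (hypothetical coroutine name; asyncio.wait_for raises TimeoutError on expiry):
#   import asyncio
#   async def evaluate_with_budget(run_metrics):
#       if not ENABLE_METRICS_EVALUATION:
#           return None
#       try:
#           return await asyncio.wait_for(run_metrics(), timeout=EVALUATION_TIMEOUT)
#       except asyncio.TimeoutError:
#           return None  # skip metrics rather than block the response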