"""Configuration module for RAG system.
Central configuration hub for all system parameters including:
- LLM Model Selection and Parameters: Chat, summarization, temperature settings
- Embedding Model: Vector representation for semantic search
- Chunking Strategy: Document segmentation for retrieval
- Vector Database: Qdrant configuration for similarity search
- Inference Infrastructure: Ollama local LLM serving
- Chat History: Redis backend for multi-turn conversation state
- Evaluation Metrics: DeepEval LLM-as-Judge configuration
- CORS/Security: Frontend origin whitelisting
- Performance: Token streaming and dummy response simulation
All configuration can be overridden via environment variables or direct modification.
For production deployments, review EVALUATION_TIMEOUT, REDIS_URL, and OLLAMA_BASE_URL.
CST Timezone: All timestamps use the America/Chicago timezone (CST/CDT) for consistent logging across deployments.
"""
import os
from typing import Optional
from dotenv import load_dotenv
# Load environment variables from .env file
# override=False: values already set in the real environment take precedence over the .env file
load_dotenv(override=False)
# ============================================================================
# LLM Model Configuration
# ============================================================================
LLM_CHAT_MODEL_NAME: str = "gemma3:latest" # Main chat model (Ollama)
LLM_CHAT_TEMPERATURE: float = 0.75 # Temperature: 0=deterministic, 1=creative
LLM_SUMMARY_MODEL_NAME: str = "gemma3:latest" # For conversation summarization
LLM_SUMMARY_TEMPERATURE: float = 0.5 # Lower temp for consistent summaries
EMB_MODEL_NAME: str = "mxbai-embed-large:latest" # Embedding model for semantic search (deprecated, kept for backward compatibility)
# ============================================================================
# Jina v4 Multi-Modal Embeddings Configuration
# ============================================================================
# Enable Jina v4 for multi-modal embeddings (text + images)
USE_JINA_EMBEDDINGS: bool = os.getenv("USE_JINA_EMBEDDINGS", "true").lower() == "true"
# Jina v4 model settings
JINA_MODEL_NAME: str = "jinaai/jina-embeddings-v4" # HuggingFace model identifier
JINA_TASK: str = "retrieval" # Task-specific adapter: 'retrieval', 'text-matching', 'code'
JINA_EMBEDDING_DIM: int = 2048 # Default dimension (can be truncated to 1024, 512, 256, 128)
JINA_EMBEDDING_DIM_TRUNCATE: int = int(os.getenv("JINA_EMBEDDING_DIM_TRUNCATE", "1024")) # Truncate to save memory
JINA_DEVICE: str = os.getenv("JINA_DEVICE", "cuda") # 'cuda' or 'cpu'
JINA_BATCH_SIZE: int = 32 # Batch size for inference
JINA_MAX_LENGTH: int = 32768 # Max sequence length
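# Illustrative sketch (not invoked anywhere in this module) of how the Jina
# settings above might be wired together. The model is loaded with
# trust_remote_code; the `encode_text` helper name follows the
# jinaai/jina-embeddings-v4 model card and is an assumption here, so verify it
# against the installed version before relying on it.
def _jina_text_embedding_sketch(texts: list) -> list:
    """Embed a list of strings using the Jina v4 settings defined above."""
    from transformers import AutoModel  # local import keeps config importable without torch installed
    model = AutoModel.from_pretrained(JINA_MODEL_NAME, trust_remote_code=True).to(JINA_DEVICE)
    # JINA_TASK selects the task-specific adapter ('retrieval' here).
    vectors = model.encode_text(texts=texts, task=JINA_TASK)
    # Matryoshka-style truncation: keep only the first N dimensions to save memory.
    return [v[:JINA_EMBEDDING_DIM_TRUNCATE] for v in vectors]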
# Image extraction settings for multi-modal documents
EXTRACT_IMAGES_FROM_PDF: bool = True # Extract images from PDFs
IMAGE_OUTPUT_DIR: str = os.getenv("IMAGE_OUTPUT_DIR", "user_uploads/extracted_images") # Where to store extracted images (relative to server dir)
IMAGE_MAX_SIZE: tuple = (1024, 1024) # Resize images to this size
IMAGE_FORMAT: str = "PNG" # Image format (PNG, JPEG, WEBP)
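# Illustrative sketch of the image-extraction settings above, assuming PyMuPDF
# (fitz) and Pillow handle extraction and resizing. Neither library is mandated
# by this config, so treat this as one possible wiring, not the project's code.
def _extract_pdf_images_sketch(pdf_path: str) -> list:
    """Pull embedded images out of a PDF, resize them, and save to IMAGE_OUTPUT_DIR."""
    import io
    import fitz  # PyMuPDF
    from PIL import Image
    os.makedirs(IMAGE_OUTPUT_DIR, exist_ok=True)
    saved_paths = []
    doc = fitz.open(pdf_path)
    for page_index, page in enumerate(doc):
        for img_index, img in enumerate(page.get_images(full=True)):
            raw = doc.extract_image(img[0])["image"]
            image = Image.open(io.BytesIO(raw)).convert("RGB")
            image.thumbnail(IMAGE_MAX_SIZE)  # downscale in place, keeping aspect ratio
            out_path = os.path.join(
                IMAGE_OUTPUT_DIR, f"p{page_index}_{img_index}.{IMAGE_FORMAT.lower()}"
            )
            image.save(out_path, format=IMAGE_FORMAT)
            saved_paths.append(out_path)
    return saved_paths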
# ============================================================================
# Content & Token Management
# ============================================================================
# Maximum total tokens allowed in context window (chat_history + input + context)
# Adjust based on model capability (Gemma 3: ~14k is a practical budget for a 32k-token context window)
MAX_CONTENT_SIZE: int = 14000
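# Rough illustration of how MAX_CONTENT_SIZE can be enforced with the same
# ~4-characters-per-token heuristic used for chunking below (an approximation,
# not a real tokenizer; the actual budgeting logic may differ).
def _fits_context_budget_sketch(chat_history: str, user_input: str, context: str) -> bool:
    """Return True if the combined prompt stays under MAX_CONTENT_SIZE tokens."""
    estimated_tokens = (len(chat_history) + len(user_input) + len(context)) // 4
    return estimated_tokens <= MAX_CONTENT_SIZE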
# ============================================================================
# Connection Verification
# ============================================================================
# Whether to verify LLM/embedding model availability at startup
# Set to True for Docker/production, False for local development
VERIFY_LLM_CONNECTION: bool = False
VERIFY_EMB_CONNECTION: bool = False
# ============================================================================
# Document Chunking Strategy
# ============================================================================
# Character limit per chunk (larger = fewer chunks but less precision)
DOC_CHAR_LIMIT: int = 2000
# Overlap between chunks for context continuity (prevents semantic breaks)
DOC_OVERLAP_NO: int = 250
# Token-based retrieval calculations (roughly 4 chars per token)
DOC_TOKEN_SIZE: int = DOC_CHAR_LIMIT // 4 # ~500 tokens per chunk
DOCS_NUM_COUNT: int = 3000 // DOC_TOKEN_SIZE # ~6 documents retrieved
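# One common way these chunking numbers are consumed: a character-based splitter
# such as LangChain's RecursiveCharacterTextSplitter (assumed here for
# illustration; this config does not dictate the splitter implementation).
def _build_text_splitter_sketch():
    """Illustrative splitter wired to the chunking settings above."""
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    return RecursiveCharacterTextSplitter(
        chunk_size=DOC_CHAR_LIMIT,      # ~500 tokens per chunk at ~4 chars/token
        chunk_overlap=DOC_OVERLAP_NO,   # overlap preserves context across chunk boundaries
    )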
# ============================================================================
# Infrastructure: Ollama & Qdrant
# ============================================================================
# Ollama Configuration (local LLM inference)
# Use 'ollama' hostname in Docker Compose, 'localhost' for local development
OLLAMA_BASE_URL: str = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
# Vector Database Configuration (Qdrant)
# Use 'qdrant' hostname in Docker Compose, 'localhost' for local development
QDRANT_URL: str = os.getenv("QDRANT_URL", "http://localhost:6333")
QDRANT_COLLECTION_NAME: str = "rag_documents" # Collection stores all document chunks
QDRANT_API_KEY: Optional[str] = os.getenv("QDRANT_API_KEY", None) # Optional for Qdrant Cloud
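# Illustrative client construction from the infrastructure settings above,
# assuming the qdrant-client and langchain-ollama packages are in use
# (a reasonable pairing for an Ollama + Qdrant stack, but not fixed by this file).
def _build_clients_sketch():
    """Return (qdrant_client, chat_llm) built from this config."""
    from qdrant_client import QdrantClient
    from langchain_ollama import ChatOllama
    qdrant = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
    llm = ChatOllama(
        model=LLM_CHAT_MODEL_NAME,
        base_url=OLLAMA_BASE_URL,
        temperature=LLM_CHAT_TEMPERATURE,
    )
    return qdrant, llm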
# ============================================================================
# Response Streaming (Dummy Mode)
# ============================================================================
# Simulate token-by-token streaming in dummy/test mode
TOKENS_PER_SEC: int = 50 # Tokens yielded per second
BATCH_TOKEN_PS: int = 2 # Tokens per batch (for realism)
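# Sketch of how the dummy-streaming knobs above translate into pacing: yield
# BATCH_TOKEN_PS tokens at a time, sleeping so the overall rate is roughly
# TOKENS_PER_SEC. Pure illustration; the real handler may pace differently.
def _dummy_stream_sketch(text: str):
    """Yield the text in small batches to mimic token-by-token streaming."""
    import time
    tokens = text.split()
    delay = BATCH_TOKEN_PS / TOKENS_PER_SEC  # seconds to wait per batch
    for i in range(0, len(tokens), BATCH_TOKEN_PS):
        yield " ".join(tokens[i:i + BATCH_TOKEN_PS]) + " "
        time.sleep(delay)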
# ============================================================================
# Security & CORS
# ============================================================================
# Allowed frontend origins (for browser-based requests)
# Add Streamlit frontend and any external services here
ALLOWED_ORIGINS: list = [
"http://localhost:8501", # Local Streamlit development
"http://127.0.0.1:5500", # Local Live Server
]
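# How ALLOWED_ORIGINS is typically applied, assuming a FastAPI backend (the API
# framework is not pinned down by this config file).
def _apply_cors_sketch(app):
    """Attach a CORS policy restricted to the whitelisted frontend origins."""
    from fastapi.middleware.cors import CORSMiddleware
    app.add_middleware(
        CORSMiddleware,
        allow_origins=ALLOWED_ORIGINS,   # only the frontends listed above
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )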
# ============================================================================
# Chat History & Persistence
# ============================================================================
# Backend for conversation state: 'memory' or 'redis'
# Use 'redis' for production (persistent across restarts)
# Use 'memory' for lightweight testing
HISTORY_BACKEND: str = "redis"
# Redis connection string
# Use 'redis' hostname in Docker Compose, 'localhost' for local development
REDIS_URL: Optional[str] = os.getenv("REDIS_URL", "redis://localhost:6379/0")
# Session TTL in seconds (0 = no expiry, 2592000 = 30 days)
# Set a positive value to auto-expire old sessions and save Redis memory
REDIS_HISTORY_TTL_SECONDS: int = 0
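# Sketch of a Redis-backed history built from the settings above, assuming
# LangChain's RedisChatMessageHistory as the wrapper (one common choice; the
# backend actually used by this project may differ).
def _history_for_session_sketch(session_id: str):
    """Return a persistent message history for one chat session."""
    from langchain_community.chat_message_histories import RedisChatMessageHistory
    return RedisChatMessageHistory(
        session_id=session_id,
        url=REDIS_URL,
        ttl=REDIS_HISTORY_TTL_SECONDS or None,  # 0 -> never expire
    )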
# ============================================================================
# Evaluation & Metrics (DeepEval)
# ============================================================================
# Enable/disable LLM-as-Judge evaluation metrics
# False = faster response times (sub-100ms cache hits are unaffected either way)
# True = adds 5-8s for faithfulness & answer relevancy evaluation
ENABLE_METRICS_EVALUATION: bool = os.getenv("ENABLE_METRICS_EVALUATION", "false").lower() == "true"
# Timeout for complete evaluation suite (seconds)
# Production: 3-5s, Development: 8-10s
EVALUATION_TIMEOUT: float = float(os.getenv("EVALUATION_TIMEOUT", "8.0"))
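# Sketch of how the evaluation flags above could gate a DeepEval run, with
# EVALUATION_TIMEOUT enforced via a worker thread. Judge-model configuration
# for the metrics is omitted here and assumed to live elsewhere; this is an
# illustration, not the project's evaluation pipeline.
def _evaluate_answer_sketch(question: str, answer: str, contexts: list) -> dict:
    """Return {metric_name: score}, or {} when evaluation is disabled or times out."""
    if not ENABLE_METRICS_EVALUATION:
        return {}
    from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeout
    from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
    from deepeval.test_case import LLMTestCase
    test_case = LLMTestCase(input=question, actual_output=answer, retrieval_context=contexts)
    metrics = [FaithfulnessMetric(), AnswerRelevancyMetric()]
    def _run() -> dict:
        scores = {}
        for metric in metrics:
            metric.measure(test_case)              # populates metric.score
            scores[metric.__class__.__name__] = metric.score
        return scores
    with ThreadPoolExecutor(max_workers=1) as pool:
        try:
            return pool.submit(_run).result(timeout=EVALUATION_TIMEOUT)
        except FuturesTimeout:
            return {}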