"""Evaluation Module for RAG System using DeepEval
- Provides evaluation metrics using LLM-as-a-Judge approach via DeepEval
- Integrates with local Ollama instance for fast, offline evaluation
- Metrics include: answer_relevancy, faithfulness (reference-free, no ground truth needed)
"""

from typing import List, Dict
from deepeval.models import OllamaModel
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

from llm_system.config import OLLAMA_BASE_URL, LLM_CHAT_MODEL_NAME
from logger import get_logger

log = get_logger(name="core_evaluation_deepeval")


class RAGEvaluator:
    """Evaluates RAG responses using reference-free DeepEval metrics with Ollama backend.
    
    Uses LLM-as-a-Judge approach for accurate evaluation WITHOUT requiring ground truth.
    All metrics are reference-free (do NOT require ground truth).
    
    Metrics (Reference-Free - No Ground Truth Needed):
        - answer_relevancy: How relevant the answer is to the question (0-1)
        - faithfulness: How well the answer is grounded in retrieved documents (0-1)
    """
    
    def __init__(
        self,
        llm_model: str = LLM_CHAT_MODEL_NAME,
        ollama_base_url: str = OLLAMA_BASE_URL,
        temperature: float = 0.0,
    ):
        """Initialize RAGEvaluator with Ollama backend.
        
        Args:
            llm_model: Name of the Ollama model to use (e.g., "gemma3:latest")
            ollama_base_url: Base URL of Ollama server
            temperature: Model temperature for evaluation (0 = deterministic)
        """
        self.llm_model = llm_model
        self.ollama_base_url = ollama_base_url
        self.temperature = temperature
        
        log.info(f"Initializing RAGEvaluator with DeepEval + Ollama")
        log.info(f"  Model: {llm_model}")
        log.info(f"  Ollama URL: {ollama_base_url}")
        
        try:
            # Initialize Ollama model
            self.model = OllamaModel(
                model=llm_model,
                base_url=ollama_base_url,
                temperature=temperature
            )
            
            # Initialize metrics (all reference-free, no ground truth needed)
            self.answer_relevancy_metric = AnswerRelevancyMetric(model=self.model)
            self.faithfulness_metric = FaithfulnessMetric(model=self.model)
            
            log.info("βœ… RAGEvaluator initialized successfully with reference-free DeepEval metrics")
            
        except Exception as e:
            log.error(f"❌ Failed to initialize RAGEvaluator: {e}")
            raise

    def evaluate_response(
        self,
        question: str,
        answer: str,
        contexts: List[str],
    ) -> Dict[str, float]:
        """Evaluate a single RAG response using reference-free DeepEval metrics.
        
        NOTE: No ground truth needed - all metrics are reference-free.
        
        Args:
            question: The user's question
            answer: The generated answer from RAG
            contexts: List of retrieved context chunks
        
        Returns:
            Dictionary with metric names and scores (0-1 range)
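
        Example (illustrative; assumes a running Ollama server with the configured
        chat model pulled; actual scores vary by judge model):
            >>> evaluator = RAGEvaluator()
            >>> scores = evaluator.evaluate_response(
            ...     question="What does the warranty cover?",
            ...     answer="The warranty covers manufacturing defects for two years.",
            ...     contexts=["Our warranty covers manufacturing defects for 24 months."],
            ... )
            >>> # scores -> {"answer_relevancy": <float 0-1>, "faithfulness": <float 0-1>}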
        """
        try:
            log.info(f"Evaluating response for question: '{question[:50]}...'")
            
            # Create test case for DeepEval
            # All metrics are reference-free (no ground truth required)
            test_case = LLMTestCase(
                input=question,
                actual_output=answer,
                retrieval_context=contexts,  # For context-based metrics
            )
            
            scores = {}
            
            # Evaluate Answer Relevancy
            try:
                log.info("Evaluating answer relevancy...")
                self.answer_relevancy_metric.measure(test_case)
                relevancy_score = self.answer_relevancy_metric.score
                scores["answer_relevancy"] = relevancy_score
                log.info(f"  Answer Relevancy: {relevancy_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate answer relevancy: {e}")
                scores["answer_relevancy"] = 0.0
            
            # Evaluate Faithfulness
            try:
                log.info("Evaluating faithfulness...")
                self.faithfulness_metric.measure(test_case)
                faithfulness_score = self.faithfulness_metric.score
                scores["faithfulness"] = faithfulness_score
                log.info(f"  Faithfulness: {faithfulness_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate faithfulness: {e}")
                scores["faithfulness"] = 0.0
            
            log.info(f"βœ… Evaluation complete: {scores}")
            return scores
            
        except Exception as e:
            log.error(f"❌ Evaluation failed: {e}")
            return {
                "answer_relevancy": 0.0,
                "faithfulness": 0.0,
                "error": str(e),
            }
    
    def evaluate_batch(
        self,
        questions: List[str],
        answers: List[str],
        contexts_list: List[List[str]],
    ) -> Dict[str, List[float]]:
        """Evaluate multiple RAG responses in batch using reference-free metrics.
        
        NOTE: No ground truth needed - all metrics are reference-free.
        
        Args:
            questions: List of user questions
            answers: List of generated answers
            contexts_list: List of context lists (one per question)
        
        Returns:
            Dictionary with metric names and lists of scores
        """
        try:
            log.info(f"Evaluating batch of {len(questions)} responses")
            
            all_scores = {
                "answer_relevancy": [],
                "faithfulness": [],
            }
            
            for i, (question, answer, contexts) in enumerate(zip(questions, answers, contexts_list)):
                log.info(f"Evaluating batch item {i+1}/{len(questions)}")
                
                scores = self.evaluate_response(question, answer, contexts)
                
                all_scores["answer_relevancy"].append(scores.get("answer_relevancy", 0.0))
                all_scores["faithfulness"].append(scores.get("faithfulness", 0.0))
            
            log.info(f"βœ… Batch evaluation complete")
            return all_scores
            
        except Exception as e:
            log.error(f"❌ Batch evaluation failed: {e}")
            return {
                "answer_relevancy": [0.0] * len(questions),
                "faithfulness": [0.0] * len(questions),
                "error": str(e),
            }


def create_evaluator() -> RAGEvaluator:
    """Factory function to create a RAGEvaluator instance with DeepEval backend."""
    return RAGEvaluator()
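

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module's API: it assumes the Ollama
    # server at OLLAMA_BASE_URL is running and LLM_CHAT_MODEL_NAME is pulled.
    # The question/answer/contexts below are made-up sample data for a smoke test.
    evaluator = create_evaluator()
    sample_scores = evaluator.evaluate_response(
        question="What is retrieval-augmented generation?",
        answer="RAG retrieves relevant documents and uses them to ground the LLM's answer.",
        contexts=[
            "Retrieval-augmented generation (RAG) combines a document retriever "
            "with a generative LLM so that answers are grounded in retrieved context."
        ],
    )
    log.info(f"Sample evaluation scores: {sample_scores}")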