| """Evaluation Module for RAG System using DeepEval | |
| - Provides evaluation metrics using LLM-as-a-Judge approach via DeepEval | |
| - Integrates with local Ollama instance for fast, offline evaluation | |
| - Metrics include: answer_relevancy, faithfulness (reference-free, no ground truth needed) | |
| """ | |
| from typing import List, Dict | |
| from deepeval.models import OllamaModel | |
| from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric | |
| from deepeval.test_case import LLMTestCase | |
| from llm_system.config import OLLAMA_BASE_URL, LLM_CHAT_MODEL_NAME | |
| from logger import get_logger | |
| log = get_logger(name="core_evaluation_deepeval") | |
class RAGEvaluator:
    """Evaluates RAG responses with reference-free DeepEval metrics on an Ollama backend.

    Uses an LLM-as-a-Judge approach; no ground truth (reference answers) is required.

    Metrics (all reference-free):
        - answer_relevancy: How relevant the answer is to the question (0-1)
        - faithfulness: How well the answer is grounded in the retrieved documents (0-1)
    """
    def __init__(
        self,
        llm_model: str = LLM_CHAT_MODEL_NAME,
        ollama_base_url: str = OLLAMA_BASE_URL,
        temperature: float = 0.0,
    ):
        """Initialize RAGEvaluator with an Ollama backend.

        Args:
            llm_model: Name of the Ollama model to use (e.g., "gemma3:latest")
            ollama_base_url: Base URL of the Ollama server
            temperature: Model temperature for evaluation (0 = deterministic)
        """
        self.llm_model = llm_model
        self.ollama_base_url = ollama_base_url
        self.temperature = temperature

        log.info("Initializing RAGEvaluator with DeepEval + Ollama")
        log.info(f"  Model: {llm_model}")
        log.info(f"  Ollama URL: {ollama_base_url}")

        try:
            # Initialize the Ollama judge model shared by all metrics
            self.model = OllamaModel(
                model=llm_model,
                base_url=ollama_base_url,
                temperature=temperature,
            )

            # Initialize metrics (all reference-free, no ground truth needed)
            self.answer_relevancy_metric = AnswerRelevancyMetric(model=self.model)
            self.faithfulness_metric = FaithfulnessMetric(model=self.model)

            log.info("✅ RAGEvaluator initialized successfully with reference-free DeepEval metrics")
        except Exception as e:
            log.error(f"❌ Failed to initialize RAGEvaluator: {e}")
            raise
    def evaluate_response(
        self,
        question: str,
        answer: str,
        contexts: List[str],
    ) -> Dict[str, float]:
        """Evaluate a single RAG response using reference-free DeepEval metrics.

        NOTE: No ground truth is needed; all metrics are reference-free.

        Args:
            question: The user's question
            answer: The generated answer from the RAG pipeline
            contexts: List of retrieved context chunks

        Returns:
            Dictionary mapping metric names to scores in the 0-1 range
        """
        try:
            log.info(f"Evaluating response for question: '{question[:50]}...'")

            # Create the DeepEval test case (no expected_output: metrics are reference-free)
            test_case = LLMTestCase(
                input=question,
                actual_output=answer,
                retrieval_context=contexts,  # Used by context-based metrics such as faithfulness
            )

            scores = {}

            # Evaluate answer relevancy
            try:
                log.info("Evaluating answer relevancy...")
                self.answer_relevancy_metric.measure(test_case)
                relevancy_score = self.answer_relevancy_metric.score
                scores["answer_relevancy"] = relevancy_score
                log.info(f"  Answer Relevancy: {relevancy_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate answer relevancy: {e}")
                scores["answer_relevancy"] = 0.0

            # Evaluate faithfulness
            try:
                log.info("Evaluating faithfulness...")
                self.faithfulness_metric.measure(test_case)
                faithfulness_score = self.faithfulness_metric.score
                scores["faithfulness"] = faithfulness_score
                log.info(f"  Faithfulness: {faithfulness_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate faithfulness: {e}")
                scores["faithfulness"] = 0.0

            log.info(f"✅ Evaluation complete: {scores}")
            return scores
        except Exception as e:
            log.error(f"❌ Evaluation failed: {e}")
            return {
                "answer_relevancy": 0.0,
                "faithfulness": 0.0,
                "error": str(e),
            }
    def evaluate_batch(
        self,
        questions: List[str],
        answers: List[str],
        contexts_list: List[List[str]],
    ) -> Dict[str, List[float]]:
        """Evaluate multiple RAG responses in batch using reference-free metrics.

        NOTE: No ground truth is needed; all metrics are reference-free.

        Args:
            questions: List of user questions
            answers: List of generated answers
            contexts_list: List of context lists (one per question)

        Returns:
            Dictionary mapping metric names to lists of scores (one per question)
        """
        try:
            log.info(f"Evaluating batch of {len(questions)} responses")

            all_scores = {
                "answer_relevancy": [],
                "faithfulness": [],
            }

            for i, (question, answer, contexts) in enumerate(zip(questions, answers, contexts_list)):
                log.info(f"Evaluating batch item {i + 1}/{len(questions)}")
                scores = self.evaluate_response(question, answer, contexts)
                all_scores["answer_relevancy"].append(scores.get("answer_relevancy", 0.0))
                all_scores["faithfulness"].append(scores.get("faithfulness", 0.0))

            log.info("✅ Batch evaluation complete")
            return all_scores
        except Exception as e:
            log.error(f"❌ Batch evaluation failed: {e}")
            return {
                "answer_relevancy": [0.0] * len(questions),
                "faithfulness": [0.0] * len(questions),
                "error": str(e),
            }
def create_evaluator() -> RAGEvaluator:
    """Factory function to create a RAGEvaluator instance with the DeepEval backend."""
    return RAGEvaluator()
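

# Minimal usage sketch (not part of the module API). The question, answer, and
# context strings below are hypothetical examples, and running this assumes an
# Ollama server is reachable at OLLAMA_BASE_URL and serves LLM_CHAT_MODEL_NAME.
if __name__ == "__main__":
    evaluator = create_evaluator()

    # Single-response evaluation: pass the query, the generated answer,
    # and the retrieved chunks the answer was based on.
    scores = evaluator.evaluate_response(
        question="What is the capital of France?",
        answer="The capital of France is Paris.",
        contexts=["Paris is the capital and largest city of France."],
    )
    print(scores)  # e.g. {"answer_relevancy": 1.0, "faithfulness": 1.0}

    # Batch evaluation: parallel lists, one context list per question.
    batch_scores = evaluator.evaluate_batch(
        questions=["What is the capital of France?"],
        answers=["The capital of France is Paris."],
        contexts_list=[["Paris is the capital and largest city of France."]],
    )
    print(batch_scores)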