"""Evaluation Module for RAG System using DeepEval
- Provides evaluation metrics using LLM-as-a-Judge approach via DeepEval
- Integrates with local Ollama instance for fast, offline evaluation
- Metrics include: answer_relevancy, faithfulness (reference-free, no ground truth needed)
"""
from typing import List, Dict
from deepeval.models import OllamaModel
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase
from llm_system.config import OLLAMA_BASE_URL, LLM_CHAT_MODEL_NAME
from logger import get_logger
log = get_logger(name="core_evaluation_deepeval")


class RAGEvaluator:
    """Evaluates RAG responses using reference-free DeepEval metrics with an Ollama backend.

    Uses an LLM-as-a-Judge approach for accurate evaluation; all metrics are
    reference-free, so no ground truth is required.

    Metrics:
        - answer_relevancy: How relevant the answer is to the question (0-1)
        - faithfulness: How well the answer is grounded in the retrieved documents (0-1)
    """

    def __init__(
        self,
        llm_model: str = LLM_CHAT_MODEL_NAME,
        ollama_base_url: str = OLLAMA_BASE_URL,
        temperature: float = 0.0,
    ):
        """Initialize RAGEvaluator with an Ollama backend.

        Args:
            llm_model: Name of the Ollama model to use (e.g., "gemma3:latest")
            ollama_base_url: Base URL of the Ollama server
            temperature: Model temperature for evaluation (0 = deterministic)
        """
        self.llm_model = llm_model
        self.ollama_base_url = ollama_base_url
        self.temperature = temperature

        log.info("Initializing RAGEvaluator with DeepEval + Ollama")
        log.info(f"  Model: {llm_model}")
        log.info(f"  Ollama URL: {ollama_base_url}")

        try:
            # Initialize the Ollama judge model
            self.model = OllamaModel(
                model=llm_model,
                base_url=ollama_base_url,
                temperature=temperature,
            )
            # Initialize metrics (all reference-free, no ground truth needed)
            self.answer_relevancy_metric = AnswerRelevancyMetric(model=self.model)
            self.faithfulness_metric = FaithfulnessMetric(model=self.model)
            log.info("RAGEvaluator initialized successfully with reference-free DeepEval metrics")
        except Exception as e:
            log.error(f"Failed to initialize RAGEvaluator: {e}")
            raise

    def evaluate_response(
        self,
        question: str,
        answer: str,
        contexts: List[str],
    ) -> Dict[str, float]:
        """Evaluate a single RAG response using reference-free DeepEval metrics.

        NOTE: No ground truth needed - all metrics are reference-free.

        Args:
            question: The user's question
            answer: The generated answer from RAG
            contexts: List of retrieved context chunks

        Returns:
            Dictionary with metric names and scores (0-1 range)
        """
        try:
            log.info(f"Evaluating response for question: '{question[:50]}...'")

            # Create the test case for DeepEval
            # All metrics are reference-free (no ground truth required)
            test_case = LLMTestCase(
                input=question,
                actual_output=answer,
                retrieval_context=contexts,  # For context-based metrics
            )

            scores = {}

            # Evaluate answer relevancy
            try:
                log.info("Evaluating answer relevancy...")
                self.answer_relevancy_metric.measure(test_case)
                relevancy_score = self.answer_relevancy_metric.score
                scores["answer_relevancy"] = relevancy_score
                log.info(f"  Answer Relevancy: {relevancy_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate answer relevancy: {e}")
                scores["answer_relevancy"] = 0.0

            # Evaluate faithfulness
            try:
                log.info("Evaluating faithfulness...")
                self.faithfulness_metric.measure(test_case)
                faithfulness_score = self.faithfulness_metric.score
                scores["faithfulness"] = faithfulness_score
                log.info(f"  Faithfulness: {faithfulness_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate faithfulness: {e}")
                scores["faithfulness"] = 0.0

            log.info(f"Evaluation complete: {scores}")
            return scores
        except Exception as e:
            log.error(f"Evaluation failed: {e}")
            return {
                "answer_relevancy": 0.0,
                "faithfulness": 0.0,
                "error": str(e),
            }

    def evaluate_batch(
        self,
        questions: List[str],
        answers: List[str],
        contexts_list: List[List[str]],
    ) -> Dict[str, List[float]]:
        """Evaluate multiple RAG responses in batch using reference-free metrics.

        NOTE: No ground truth needed - all metrics are reference-free.

        Args:
            questions: List of user questions
            answers: List of generated answers
            contexts_list: List of context lists (one per question)

        Returns:
            Dictionary with metric names and lists of scores
        """
        try:
            log.info(f"Evaluating batch of {len(questions)} responses")
            all_scores = {
                "answer_relevancy": [],
                "faithfulness": [],
            }
            for i, (question, answer, contexts) in enumerate(zip(questions, answers, contexts_list)):
                log.info(f"Evaluating batch item {i + 1}/{len(questions)}")
                scores = self.evaluate_response(question, answer, contexts)
                all_scores["answer_relevancy"].append(scores.get("answer_relevancy", 0.0))
                all_scores["faithfulness"].append(scores.get("faithfulness", 0.0))

            log.info("Batch evaluation complete")
            return all_scores
        except Exception as e:
            log.error(f"Batch evaluation failed: {e}")
            return {
                "answer_relevancy": [0.0] * len(questions),
                "faithfulness": [0.0] * len(questions),
                "error": str(e),
            }


def create_evaluator() -> RAGEvaluator:
    """Factory function to create a RAGEvaluator instance with the DeepEval backend."""
    return RAGEvaluator()
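

# Minimal usage sketch (assumption: an Ollama server is reachable at OLLAMA_BASE_URL
# and the model named by LLM_CHAT_MODEL_NAME has already been pulled locally).
# The question, answer, and context strings below are illustrative placeholders.
if __name__ == "__main__":
    evaluator = create_evaluator()
    example_scores = evaluator.evaluate_response(
        question="What is retrieval-augmented generation?",
        answer=(
            "RAG retrieves relevant documents and passes them to an LLM so the "
            "generated answer is grounded in those documents."
        ),
        contexts=[
            "Retrieval-augmented generation (RAG) combines a document retriever with "
            "a language model: retrieved passages are added to the prompt so the "
            "model can ground its answer in them."
        ],
    )
    # Expected shape: {"answer_relevancy": <0-1 float>, "faithfulness": <0-1 float>}
    print(example_scores)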