"""Evaluation Module for RAG System using DeepEval
- Provides evaluation metrics using LLM-as-a-Judge approach via DeepEval
- Integrates with local Ollama instance for fast, offline evaluation
- Metrics include: answer_relevancy, faithfulness (reference-free, no ground truth needed)
"""

from typing import List, Dict
from deepeval.models import OllamaModel
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval.test_case import LLMTestCase

from llm_system.config import OLLAMA_BASE_URL, LLM_CHAT_MODEL_NAME
from logger import get_logger

log = get_logger(name="core_evaluation_deepeval")


class RAGEvaluator:
    """Evaluates RAG responses using reference-free DeepEval metrics with Ollama backend.
    
    Uses LLM-as-a-Judge approach for accurate evaluation WITHOUT requiring ground truth.
    All metrics are reference-free (do NOT require ground truth).
    
    Metrics (Reference-Free - No Ground Truth Needed):
        - answer_relevancy: How relevant the answer is to the question (0-1)
        - faithfulness: How well the answer is grounded in retrieved documents (0-1)
    """
    
    def __init__(
        self,
        llm_model: str = LLM_CHAT_MODEL_NAME,
        ollama_base_url: str = OLLAMA_BASE_URL,
        temperature: float = 0.0,
    ):
        """Initialize RAGEvaluator with Ollama backend.
        
        Args:
            llm_model: Name of the Ollama model to use (e.g., "gemma3:latest")
            ollama_base_url: Base URL of Ollama server
            temperature: Model temperature for evaluation (0 = deterministic)
        """
        self.llm_model = llm_model
        self.ollama_base_url = ollama_base_url
        self.temperature = temperature
        
        log.info(f"Initializing RAGEvaluator with DeepEval + Ollama")
        log.info(f"  Model: {llm_model}")
        log.info(f"  Ollama URL: {ollama_base_url}")
        
        try:
            # Initialize Ollama model
            self.model = OllamaModel(
                model=llm_model,
                base_url=ollama_base_url,
                temperature=temperature
            )
            
            # Initialize metrics (all reference-free, no ground truth needed)
            self.answer_relevancy_metric = AnswerRelevancyMetric(model=self.model)
            self.faithfulness_metric = FaithfulnessMetric(model=self.model)
            
            log.info("βœ… RAGEvaluator initialized successfully with reference-free DeepEval metrics")
            
        except Exception as e:
            log.error(f"❌ Failed to initialize RAGEvaluator: {e}")
            raise

    def evaluate_response(
        self,
        question: str,
        answer: str,
        contexts: List[str],
    ) -> Dict[str, float]:
        """Evaluate a single RAG response using reference-free DeepEval metrics.
        
        NOTE: No ground truth needed - all metrics are reference-free.
        
        Args:
            question: The user's question
            answer: The generated answer from RAG
            contexts: List of retrieved context chunks
        
        Returns:
            Dictionary with metric names and scores (0-1 range)
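
        Example (illustrative; assumes a running Ollama server with the configured
        chat model pulled; actual scores vary by judge model):
            >>> evaluator = RAGEvaluator()
            >>> scores = evaluator.evaluate_response(
            ...     question="What does the warranty cover?",
            ...     answer="The warranty covers manufacturing defects for two years.",
            ...     contexts=["Our warranty covers manufacturing defects for 24 months."],
            ... )
            >>> # scores -> {"answer_relevancy": <float 0-1>, "faithfulness": <float 0-1>}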
        """
        try:
            log.info(f"Evaluating response for question: '{question[:50]}...'")
            
            # Create test case for DeepEval
            # All metrics are reference-free (no ground truth required)
            test_case = LLMTestCase(
                input=question,
                actual_output=answer,
                retrieval_context=contexts,  # For context-based metrics
            )
            
            scores = {}
            
            # Evaluate Answer Relevancy
            try:
                log.info("Evaluating answer relevancy...")
                self.answer_relevancy_metric.measure(test_case)
                relevancy_score = self.answer_relevancy_metric.score
                scores["answer_relevancy"] = relevancy_score
                log.info(f"  Answer Relevancy: {relevancy_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate answer relevancy: {e}")
                scores["answer_relevancy"] = 0.0
            
            # Evaluate Faithfulness
            try:
                log.info("Evaluating faithfulness...")
                self.faithfulness_metric.measure(test_case)
                faithfulness_score = self.faithfulness_metric.score
                scores["faithfulness"] = faithfulness_score
                log.info(f"  Faithfulness: {faithfulness_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate faithfulness: {e}")
                scores["faithfulness"] = 0.0
            
            log.info(f"βœ… Evaluation complete: {scores}")
            return scores
            
        except Exception as e:
            log.error(f"❌ Evaluation failed: {e}")
            return {
                "answer_relevancy": 0.0,
                "faithfulness": 0.0,
                "error": str(e),
            }
    
    def evaluate_batch(
        self,
        questions: List[str],
        answers: List[str],
        contexts_list: List[List[str]],
    ) -> Dict[str, List[float]]:
        """Evaluate multiple RAG responses in batch using reference-free metrics.
        
        NOTE: No ground truth needed - all metrics are reference-free.
        
        Args:
            questions: List of user questions
            answers: List of generated answers
            contexts_list: List of context lists (one per question)
        
        Returns:
            Dictionary with metric names and lists of scores
        """
        try:
            log.info(f"Evaluating batch of {len(questions)} responses")
            
            all_scores = {
                "answer_relevancy": [],
                "faithfulness": [],
            }
            
            for i, (question, answer, contexts) in enumerate(zip(questions, answers, contexts_list)):
                log.info(f"Evaluating batch item {i+1}/{len(questions)}")
                
                scores = self.evaluate_response(question, answer, contexts)
                
                all_scores["answer_relevancy"].append(scores.get("answer_relevancy", 0.0))
                all_scores["faithfulness"].append(scores.get("faithfulness", 0.0))
            
            log.info(f"βœ… Batch evaluation complete")
            return all_scores
            
        except Exception as e:
            log.error(f"❌ Batch evaluation failed: {e}")
            return {
                "answer_relevancy": [0.0] * len(questions),
                "faithfulness": [0.0] * len(questions),
                "error": str(e),
            }


def create_evaluator() -> RAGEvaluator:
    """Factory function to create a RAGEvaluator instance with DeepEval backend."""
    return RAGEvaluator()
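

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module's API: it assumes the Ollama
    # server at OLLAMA_BASE_URL is running and LLM_CHAT_MODEL_NAME is pulled.
    # The question/answer/contexts below are made-up sample data for a smoke test.
    evaluator = create_evaluator()
    sample_scores = evaluator.evaluate_response(
        question="What is retrieval-augmented generation?",
        answer="RAG retrieves relevant documents and uses them to ground the LLM's answer.",
        contexts=[
            "Retrieval-augmented generation (RAG) combines a document retriever "
            "with a generative LLM so that answers are grounded in retrieved context."
        ],
    )
    log.info(f"Sample evaluation scores: {sample_scores}")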