| """Evaluation Module for RAG System using DeepEval | |
| - Provides evaluation metrics using LLM-as-a-Judge approach via DeepEval | |
| - Integrates with local Ollama instance for fast, offline evaluation | |
| - Metrics include: answer_relevancy, faithfulness (reference-free, no ground truth needed) | |
| """ | |
| from typing import List, Dict | |
| from deepeval.models import OllamaModel | |
| from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric | |
| from deepeval.test_case import LLMTestCase | |
| from llm_system.config import OLLAMA_BASE_URL, LLM_CHAT_MODEL_NAME | |
| from logger import get_logger | |
| log = get_logger(name="core_evaluation_deepeval") | |
class RAGEvaluator:
    """Evaluates RAG responses with reference-free DeepEval metrics on an Ollama backend.

    Uses an LLM-as-a-Judge approach; no ground truth (reference answers) is required.

    Metrics (all reference-free):
        - answer_relevancy: How relevant the answer is to the question (0-1)
        - faithfulness: How well the answer is grounded in the retrieved documents (0-1)
    """
    def __init__(
        self,
        llm_model: str = LLM_CHAT_MODEL_NAME,
        ollama_base_url: str = OLLAMA_BASE_URL,
        temperature: float = 0.0,
    ):
        """Initialize RAGEvaluator with an Ollama backend.

        Args:
            llm_model: Name of the Ollama model to use (e.g., "gemma3:latest")
            ollama_base_url: Base URL of the Ollama server
            temperature: Model temperature for evaluation (0 = deterministic)
        """
        self.llm_model = llm_model
        self.ollama_base_url = ollama_base_url
        self.temperature = temperature

        log.info("Initializing RAGEvaluator with DeepEval + Ollama")
        log.info(f"  Model: {llm_model}")
        log.info(f"  Ollama URL: {ollama_base_url}")

        try:
            # Initialize the Ollama judge model shared by all metrics
            self.model = OllamaModel(
                model=llm_model,
                base_url=ollama_base_url,
                temperature=temperature,
            )

            # Initialize metrics (all reference-free, no ground truth needed)
            self.answer_relevancy_metric = AnswerRelevancyMetric(model=self.model)
            self.faithfulness_metric = FaithfulnessMetric(model=self.model)

            log.info("✅ RAGEvaluator initialized successfully with reference-free DeepEval metrics")
        except Exception as e:
            log.error(f"❌ Failed to initialize RAGEvaluator: {e}")
            raise
    def evaluate_response(
        self,
        question: str,
        answer: str,
        contexts: List[str],
    ) -> Dict[str, float]:
        """Evaluate a single RAG response using reference-free DeepEval metrics.

        NOTE: No ground truth is needed; all metrics are reference-free.

        Args:
            question: The user's question
            answer: The generated answer from the RAG pipeline
            contexts: List of retrieved context chunks

        Returns:
            Dictionary mapping metric names to scores in the 0-1 range
        """
        try:
            log.info(f"Evaluating response for question: '{question[:50]}...'")

            # Create the DeepEval test case (no expected_output: metrics are reference-free)
            test_case = LLMTestCase(
                input=question,
                actual_output=answer,
                retrieval_context=contexts,  # Used by context-based metrics such as faithfulness
            )

            scores = {}

            # Evaluate answer relevancy
            try:
                log.info("Evaluating answer relevancy...")
                self.answer_relevancy_metric.measure(test_case)
                relevancy_score = self.answer_relevancy_metric.score
                scores["answer_relevancy"] = relevancy_score
                log.info(f"  Answer Relevancy: {relevancy_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate answer relevancy: {e}")
                scores["answer_relevancy"] = 0.0

            # Evaluate faithfulness
            try:
                log.info("Evaluating faithfulness...")
                self.faithfulness_metric.measure(test_case)
                faithfulness_score = self.faithfulness_metric.score
                scores["faithfulness"] = faithfulness_score
                log.info(f"  Faithfulness: {faithfulness_score:.3f}")
            except Exception as e:
                log.error(f"Failed to evaluate faithfulness: {e}")
                scores["faithfulness"] = 0.0

            log.info(f"✅ Evaluation complete: {scores}")
            return scores
        except Exception as e:
            log.error(f"❌ Evaluation failed: {e}")
            return {
                "answer_relevancy": 0.0,
                "faithfulness": 0.0,
                "error": str(e),
            }
    def evaluate_batch(
        self,
        questions: List[str],
        answers: List[str],
        contexts_list: List[List[str]],
    ) -> Dict[str, List[float]]:
        """Evaluate multiple RAG responses in batch using reference-free metrics.

        NOTE: No ground truth is needed; all metrics are reference-free.

        Args:
            questions: List of user questions
            answers: List of generated answers
            contexts_list: List of context lists (one per question)

        Returns:
            Dictionary mapping metric names to lists of scores (one per question)
        """
        try:
            log.info(f"Evaluating batch of {len(questions)} responses")

            all_scores = {
                "answer_relevancy": [],
                "faithfulness": [],
            }

            for i, (question, answer, contexts) in enumerate(zip(questions, answers, contexts_list)):
                log.info(f"Evaluating batch item {i + 1}/{len(questions)}")
                scores = self.evaluate_response(question, answer, contexts)
                all_scores["answer_relevancy"].append(scores.get("answer_relevancy", 0.0))
                all_scores["faithfulness"].append(scores.get("faithfulness", 0.0))

            log.info("✅ Batch evaluation complete")
            return all_scores
        except Exception as e:
            log.error(f"❌ Batch evaluation failed: {e}")
            return {
                "answer_relevancy": [0.0] * len(questions),
                "faithfulness": [0.0] * len(questions),
                "error": str(e),
            }
def create_evaluator() -> RAGEvaluator:
    """Factory function to create a RAGEvaluator instance with the DeepEval backend."""
    return RAGEvaluator()
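

# Minimal usage sketch (not part of the module API). The question, answer, and
# context strings below are hypothetical examples, and running this assumes an
# Ollama server is reachable at OLLAMA_BASE_URL and serves LLM_CHAT_MODEL_NAME.
if __name__ == "__main__":
    evaluator = create_evaluator()

    # Single-response evaluation: pass the query, the generated answer,
    # and the retrieved chunks the answer was based on.
    scores = evaluator.evaluate_response(
        question="What is the capital of France?",
        answer="The capital of France is Paris.",
        contexts=["Paris is the capital and largest city of France."],
    )
    print(scores)  # e.g. {"answer_relevancy": 1.0, "faithfulness": 1.0}

    # Batch evaluation: parallel lists, one context list per question.
    batch_scores = evaluator.evaluate_batch(
        questions=["What is the capital of France?"],
        answers=["The capital of France is Paris."],
        contexts_list=[["Paris is the capital and largest city of France."]],
    )
    print(batch_scores)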