LLM Evaluation Frameworks
LLM evaluation frameworks provide structured approaches to measure faithfulness, relevancy, and correctness of language model outputs, enabling systematic quality assurance.
RAGAS Evaluation
from ragas import evaluate
from ragas.metrics import (
faithfulness,
answer_relevancy,
context_precision,
context_recall
)
from datasets import Dataset
# Build the evaluation set column by column: position i in each of the four
# lists belongs to the same sample.
eval_data = dict(
    question=[
        "What is machine learning?",
        "How does neural network work?",
        "What are transformers in AI?",
    ],
    answer=[
        "Machine learning is a subset of AI...",
        "Neural networks are computing systems...",
        "Transformers are deep learning architectures...",
    ],
    # One list of context passages per question.
    contexts=[
        ["Machine learning (ML) is a branch of AI..."],
        ["A neural network consists of layers..."],
        ["The transformer architecture uses self-attention..."],
    ],
    ground_truth=[
        "ML is a subset of AI that enables systems to learn from data.",
        "Neural networks are computing systems inspired by biological brains.",
        "Transformers use self-attention mechanisms for sequence processing.",
    ],
)
dataset = Dataset.from_dict(eval_data)

# Score the dataset on all four imported RAGAS metrics and show the result.
ragas_metrics = [faithfulness, answer_relevancy, context_precision, context_recall]
results = evaluate(dataset=dataset, metrics=ragas_metrics)
print(results)
# Example output:
# {'faithfulness': 0.89, 'answer_relevancy': 0.85,
#  'context_precision': 0.82, 'context_recall': 0.91}
DeepEval Metrics
from deepeval import evaluate
from deepeval.metrics import (
AnswerRelevancyMetric,
FaithfulnessMetric,
ContextualPrecisionMetric
)
from deepeval.test_case import LLMTestCase
# A single test case; the same passage is supplied both as the retrieved
# context and as the reference context.
deep_learning_passage = "Deep learning is a subset of ML that uses neural networks with multiple layers."
test_cases = [
    LLMTestCase(
        input="What is deep learning?",
        actual_output="Deep learning is a subset of machine learning using neural networks.",
        expected_output="Deep learning uses neural networks with multiple layers.",
        retrieval_context=[deep_learning_passage],
        context=[deep_learning_passage],
    ),
]

# Metrics to apply, each configured with its own threshold.
relevancy_metric = AnswerRelevancyMetric(threshold=0.7)
faithfulness_metric = FaithfulnessMetric(threshold=0.8)
selected_metrics = [relevancy_metric, faithfulness_metric]

# Run every metric over every test case and print what comes back.
results = evaluate(test_cases=test_cases, metrics=selected_metrics)
print(results)
Custom Evaluation Pipeline
from dataclasses import dataclass
from typing import List, Callable
import numpy as np
@dataclass
class EvalResult:
    """Outcome of scoring one answer on a single metric.

    Attributes:
        metric_name: Label of the metric that produced this result.
        score: Numeric score assigned for the metric.
        passed: Whether the score reached the metric's pass threshold.
        details: Free-form, metric-specific extras (may be empty).
    """

    metric_name: str
    score: float
    passed: bool
    details: dict
class LLMEvaluator:
    """Score answers for faithfulness, relevancy and correctness with a judge LLM.

    The judge ``llm`` must provide ``invoke(prompt)`` returning an object whose
    ``content`` attribute carries the reply; the reply is parsed as a number.
    """

    # Minimum score for each built-in metric to count as passed.
    FAITHFULNESS_THRESHOLD = 0.7
    RELEVANCY_THRESHOLD = 0.7
    CORRECTNESS_THRESHOLD = 0.8

    def __init__(self, llm):
        """Keep a reference to the judge ``llm`` and start with no custom metrics."""
        self.llm = llm
        # name -> {"evaluator": callable, "threshold": float}
        self.metrics = {}

    def register_metric(self, name: str, evaluator: Callable, threshold: float = 0.5):
        """Record a custom metric callable and its threshold under ``name``.

        The entry is only stored in ``self.metrics``; ``run_evaluation`` does
        not invoke registered metrics.
        """
        self.metrics[name] = {"evaluator": evaluator, "threshold": threshold}

    @staticmethod
    def _parse_score(raw) -> float:
        """Extract a numeric score from the judge's reply.

        A reply that is exactly a number ("0.8") is converted directly. If that
        fails, the first whitespace-separated token that is a number once
        surrounding punctuation is removed is used, so replies such as
        "Score: 0.8." are accepted as well.

        Raises:
            ValueError: If no number can be found in the reply text.
        """
        if not isinstance(raw, str):
            # Non-text content keeps the plain float() conversion.
            return float(raw)
        text = raw.strip()
        try:
            return float(text)
        except ValueError:
            pass
        for token in text.split():
            # Only trailing dots are stripped, so a leading ".5" stays intact.
            candidate = token.lstrip("([{*`'\"").rstrip(".,:;!?)]}*`'\"")
            # Require a digit so words float() understands ("nan", "inf")
            # are not mistaken for a score.
            if not any(ch.isdigit() for ch in candidate):
                continue
            try:
                return float(candidate)
            except ValueError:
                continue
        raise ValueError(f"could not parse a score from LLM reply: {text!r}")

    def _ask_for_score(self, prompt: str) -> float:
        """Send ``prompt`` to the judge LLM and return the parsed score."""
        return self._parse_score(self.llm.invoke(prompt).content)

    def evaluate_faithfulness(self, answer: str, context: str) -> "EvalResult":
        """Ask the judge whether ``answer`` only uses information from ``context``.

        Returns:
            An ``EvalResult`` named "faithfulness" whose details hold the
            context length under "context_len".

        Raises:
            ValueError: If the judge's reply contains no parseable number.
        """
        prompt = (
            "Rate faithfulness 0-1: Does the answer only use info from context?\n"
            f"Context: {context}\n"
            f"Answer: {answer}\n"
            "Score (0-1):"
        )
        score = self._ask_for_score(prompt)
        return EvalResult(
            "faithfulness",
            score,
            score >= self.FAITHFULNESS_THRESHOLD,
            {"context_len": len(context)},
        )

    def evaluate_relevancy(self, question: str, answer: str) -> "EvalResult":
        """Ask the judge whether ``answer`` addresses ``question``.

        Returns:
            An ``EvalResult`` named "relevancy" whose details hold the answer
            length under "answer_len".

        Raises:
            ValueError: If the judge's reply contains no parseable number.
        """
        prompt = (
            "Rate relevancy 0-1: Does the answer address the question?\n"
            f"Question: {question}\n"
            f"Answer: {answer}\n"
            "Score (0-1):"
        )
        score = self._ask_for_score(prompt)
        return EvalResult(
            "relevancy",
            score,
            score >= self.RELEVANCY_THRESHOLD,
            {"answer_len": len(answer)},
        )

    def evaluate_correctness(self, answer: str, ground_truth: str) -> "EvalResult":
        """Ask the judge to compare ``answer`` with ``ground_truth``.

        Returns:
            An ``EvalResult`` named "correctness" with empty details.

        Raises:
            ValueError: If the judge's reply contains no parseable number.
        """
        prompt = (
            "Compare answer to ground truth. Rate 0-1.\n"
            f"Answer: {answer}\n"
            f"Ground Truth: {ground_truth}\n"
            "Score (0-1):"
        )
        score = self._ask_for_score(prompt)
        return EvalResult("correctness", score, score >= self.CORRECTNESS_THRESHOLD, {})

    def run_evaluation(self, test_cases: list) -> dict:
        """Run the three built-in metrics over every test case.

        Args:
            test_cases: Mappings with the keys "question", "answer", "context"
                and "ground_truth".

        Returns:
            ``{"results": [...], "averages": {...}}`` where ``results`` holds one
            dict of ``EvalResult`` per case (keyed by metric name) and
            ``averages`` maps each metric name to its mean score. With no test
            cases every average is NaN.
        """
        results = [
            {
                "faithfulness": self.evaluate_faithfulness(case["answer"], case["context"]),
                "relevancy": self.evaluate_relevancy(case["question"], case["answer"]),
                "correctness": self.evaluate_correctness(case["answer"], case["ground_truth"]),
            }
            for case in test_cases
        ]
        avg_scores = {}
        for metric in ("faithfulness", "relevancy", "correctness"):
            scores = [case_results[metric].score for case_results in results]
            # np.mean([]) yields NaN plus a RuntimeWarning; produce the NaN directly.
            avg_scores[metric] = np.mean(scores) if scores else float("nan")
        return {"results": results, "averages": avg_scores}
A/B Testing Framework
import random
from dataclasses import dataclass
from typing import List
@dataclass
class ABTestResult:
    """Summary of one A/B comparison between two prompt variants.

    Attributes:
        variant_a_score: Mean score obtained by variant A.
        variant_b_score: Mean score obtained by variant B.
        winner: Label of the better-scoring variant.
        confidence: Confidence attached to the comparison.
    """

    variant_a_score: float
    variant_b_score: float
    winner: str
    confidence: float
class PromptABTester:
    """Compare two prompt templates by the relevancy scores of their outputs.

    ``llm`` must provide ``invoke(prompt)`` returning an object with a
    ``content`` attribute; ``evaluator`` must provide
    ``evaluate_relevancy(question, answer)`` returning an object with a
    ``score`` attribute.
    """

    def __init__(self, llm, evaluator):
        """Store the generating ``llm`` and the scoring ``evaluator``."""
        self.llm = llm
        self.evaluator = evaluator
        # Per-variant scores of the most recent run_test call.
        self.results = {"a": [], "b": []}

    @staticmethod
    def _confidence(scores_a, scores_b) -> float:
        """Return ``1 - p`` from an independent two-sample t-test.

        Returns 0.0 when the test is undefined: fewer than two scores on
        either side, or a NaN p-value (for example when neither sample has any
        variance).
        """
        if len(scores_a) < 2 or len(scores_b) < 2:
            return 0.0
        # Imported here so scipy is only required when a test is actually run.
        from scipy import stats

        # NOTE(review): both variants are scored on the same cases, so a paired
        # test (stats.ttest_rel) may be the better fit -- confirm before switching,
        # since it changes the reported confidence.
        _, p_value = stats.ttest_ind(scores_a, scores_b)
        if np.isnan(p_value):
            return 0.0
        return float(1 - p_value)

    def run_test(self, prompt_a: str, prompt_b: str, test_cases: list,
                 sample_size: int = 100) -> "ABTestResult":
        """Score both prompt templates on the same cases and pick a winner.

        Args:
            prompt_a: Template for variant A, filled via ``str.format(**case)``.
            prompt_b: Template for variant B, filled the same way.
            test_cases: Mappings supplying the template fields; each must
                contain a "question" key.
            sample_size: Upper bound on how many leading cases are used.

        Returns:
            An ``ABTestResult`` with both mean scores, the winner ("a" only
            when its mean is strictly higher, otherwise "b") and the
            confidence ``1 - p`` (0.0 when the t-test is undefined).

        Raises:
            ValueError: If no test case is available to evaluate.
        """
        sampled_cases = test_cases[:sample_size]
        if not sampled_cases:
            raise ValueError("run_test needs at least one test case")

        # Start clean so scores from an earlier prompt pair cannot leak into
        # this comparison.
        self.results = {"a": [], "b": []}
        for case in sampled_cases:
            output_a = self.llm.invoke(prompt_a.format(**case)).content
            output_b = self.llm.invoke(prompt_b.format(**case)).content
            score_a = self.evaluator.evaluate_relevancy(case["question"], output_a).score
            score_b = self.evaluator.evaluate_relevancy(case["question"], output_b).score
            self.results["a"].append(score_a)
            self.results["b"].append(score_b)

        mean_a = np.mean(self.results["a"])
        mean_b = np.mean(self.results["b"])
        winner = "a" if mean_a > mean_b else "b"
        confidence = self._confidence(self.results["a"], self.results["b"])
        return ABTestResult(mean_a, mean_b, winner, confidence)
Key Takeaways
- RAGAS provides largely reference-free evaluation for RAG systems; metrics such as context_recall still require a ground-truth answer
- DeepEval offers comprehensive LLM testing with assertions
- Custom metrics enable domain-specific evaluation criteria
- A/B testing compares prompt variants on the same test cases and uses a t-test to judge whether the score difference is significant
- Continuous evaluation catches quality regressions in production