🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services | ℹ️ About | ✉️ Contact | View Pricing Plans from $10

LLM Evaluation Frameworks

🟢 Free Lesson

Advertisement

LLM Evaluation Frameworks

Test Dataset: Questions + Ground Truth, Expected Outputs | LLM System: Model Under Test, RAG Pipeline | Outputs: Generated Answers, Retrieved Context | Faithfulness: Answer grounded in context | Relevancy: Answer addresses question | Context Precision: Retrieval quality | Answer Correctness: Matches ground truth | Results: Score: 0.87, Pass Rate: 92%, Cost: $0.45

LLM evaluation frameworks provide structured approaches to measure faithfulness, relevancy, and correctness of language model outputs, enabling systematic quality assurance.

RAGAS Evaluation

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
)
from datasets import Dataset

# Evaluation samples, one tuple per example:
# (question, generated answer, retrieved context passages, reference answer).
samples = [
    (
        "What is machine learning?",
        "Machine learning is a subset of AI...",
        ["Machine learning (ML) is a branch of AI..."],
        "ML is a subset of AI that enables systems to learn from data.",
    ),
    (
        "How does neural network work?",
        "Neural networks are computing systems...",
        ["A neural network consists of layers..."],
        "Neural networks are computing systems inspired by biological brains.",
    ),
    (
        "What are transformers in AI?",
        "Transformers are deep learning architectures...",
        ["The transformer architecture uses self-attention..."],
        "Transformers use self-attention mechanisms for sequence processing.",
    ),
]

# Transpose the per-sample records into the column-oriented mapping that
# Dataset.from_dict() receives below.
questions, answers, contexts, ground_truths = (
    list(column) for column in zip(*samples)
)
eval_data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": ground_truths,
}

dataset = Dataset.from_dict(eval_data)

# Score the dataset against all four imported RAGAS metrics.
selected_metrics = [faithfulness, answer_relevancy, context_precision, context_recall]
results = evaluate(dataset=dataset, metrics=selected_metrics)

print(results)
# Example output (illustrative values):
# {'faithfulness': 0.89, 'answer_relevancy': 0.85,
#  'context_precision': 0.82, 'context_recall': 0.91}

DeepEval Metrics

from deepeval import evaluate
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    ContextualPrecisionMetric
)
from deepeval.test_case import LLMTestCase

# The same passage serves as both the retrieved context and the reference
# context of the single test case; each field gets its own list object.
source_passage = "Deep learning is a subset of ML that uses neural networks with multiple layers."

deep_learning_case = LLMTestCase(
    input="What is deep learning?",
    actual_output="Deep learning is a subset of machine learning using neural networks.",
    expected_output="Deep learning uses neural networks with multiple layers.",
    retrieval_context=[source_passage],
    context=[source_passage],
)
test_cases = [deep_learning_case]

# Metrics, each constructed with its own threshold value.
relevancy_metric = AnswerRelevancyMetric(threshold=0.7)
faithfulness_metric = FaithfulnessMetric(threshold=0.8)

# Run every metric over every test case and show the outcome.
active_metrics = [relevancy_metric, faithfulness_metric]
results = evaluate(test_cases=test_cases, metrics=active_metrics)

print(results)

Custom Evaluation Pipeline

from dataclasses import dataclass
from typing import List, Callable
import numpy as np

@dataclass
class EvalResult:
    """Outcome of scoring one answer against one metric."""

    metric_name: str  # metric identifier, e.g. "faithfulness"
    score: float  # number parsed from the judge reply (prompts ask for 0-1; not range-checked)
    passed: bool  # True when score met the metric's pass threshold
    details: dict  # metric-specific extras (may be empty)


class LLMEvaluator:
    """Score answers with an LLM acting as judge.

    ``llm`` must expose ``invoke(prompt)`` returning an object with a
    ``content`` attribute holding the judge's reply.

    Three built-in metrics are provided (faithfulness, relevancy,
    correctness). ``register_metric`` stores additional metric definitions in
    ``self.metrics``; ``run_evaluation`` only runs the three built-ins and
    does not consult that registry.
    """

    def __init__(self, llm):
        self.llm = llm
        self.metrics = {}

    def register_metric(self, name: str, evaluator: Callable, threshold: float = 0.5):
        """Record a custom metric callable and its pass threshold under ``name``."""
        self.metrics[name] = {"evaluator": evaluator, "threshold": threshold}

    @staticmethod
    def _parse_score(raw) -> float:
        """Turn a judge reply into a float.

        A reply that is already a bare number is converted directly. Otherwise
        the first number found in a text reply is used, so replies such as
        ``"Score: 0.8"`` or ``"0.8 - mostly grounded"`` still parse.

        Raises:
            ValueError: if no number can be found in the reply.
        """
        import re

        try:
            return float(raw)
        except ValueError:
            if not isinstance(raw, str):
                raise
        # Judges sometimes echo the prompt's "(0-1)" range hint; drop it so the
        # "0" inside the hint is not mistaken for the score.
        cleaned = raw.replace("(0-1)", " ")
        found = re.search(r"\d+(?:\.\d*)?|\.\d+", cleaned)
        if found is None:
            raise ValueError(f"could not parse a numeric score from LLM reply: {raw!r}")
        return float(found.group())

    def _judge(self, prompt: str) -> float:
        """Send ``prompt`` to the LLM and return the parsed numeric score."""
        return self._parse_score(self.llm.invoke(prompt).content)

    def evaluate_faithfulness(self, answer: str, context: str) -> EvalResult:
        """Score whether ``answer`` only uses information from ``context``.

        Passes at a score of 0.7 or higher; ``details`` records the context length.
        """
        prompt = f"""Rate faithfulness 0-1: Does the answer only use info from context?
        Context: {context}
        Answer: {answer}
        Score (0-1):"""
        score = self._judge(prompt)
        return EvalResult("faithfulness", score, score >= 0.7, {"context_len": len(context)})

    def evaluate_relevancy(self, question: str, answer: str) -> EvalResult:
        """Score whether ``answer`` addresses ``question``.

        Passes at a score of 0.7 or higher; ``details`` records the answer length.
        """
        prompt = f"""Rate relevancy 0-1: Does the answer address the question?
        Question: {question}
        Answer: {answer}
        Score (0-1):"""
        score = self._judge(prompt)
        return EvalResult("relevancy", score, score >= 0.7, {"answer_len": len(answer)})

    def evaluate_correctness(self, answer: str, ground_truth: str) -> EvalResult:
        """Score ``answer`` against ``ground_truth``; passes at 0.8 or higher."""
        prompt = f"""Compare answer to ground truth. Rate 0-1.
        Answer: {answer}
        Ground Truth: {ground_truth}
        Score (0-1):"""
        score = self._judge(prompt)
        return EvalResult("correctness", score, score >= 0.8, {})

    def run_evaluation(self, test_cases: list) -> dict:
        """Run the three built-in metrics over every test case.

        Each case is a mapping with the keys ``"question"``, ``"answer"``,
        ``"context"`` and ``"ground_truth"``.

        Returns:
            ``{"results": [...], "averages": {...}}`` where ``results`` holds one
            ``{metric: EvalResult}`` dict per case and ``averages`` maps each
            metric name to its mean score (NaN when there are no cases).
        """
        results = [
            {
                "faithfulness": self.evaluate_faithfulness(case["answer"], case["context"]),
                "relevancy": self.evaluate_relevancy(case["question"], case["answer"]),
                "correctness": self.evaluate_correctness(case["answer"], case["ground_truth"]),
            }
            for case in test_cases
        ]

        avg_scores = {}
        for metric in ("faithfulness", "relevancy", "correctness"):
            scores = [case_result[metric].score for case_result in results]
            # np.mean([]) warns and yields NaN; return NaN quietly for an empty run.
            avg_scores[metric] = float(np.mean(scores)) if scores else float("nan")

        return {"results": results, "averages": avg_scores}

A/B Testing Framework

import random
from dataclasses import dataclass
from typing import List

@dataclass
class ABTestResult:
    """Summary of one A/B comparison between two prompt variants."""

    variant_a_score: float  # mean relevancy score of variant A's outputs
    variant_b_score: float  # mean relevancy score of variant B's outputs
    winner: str  # "a" when A's mean is strictly higher, otherwise "b"
    confidence: float  # 1 - p_value of the two-sample t-test (0.0 when not computable)


class PromptABTester:
    """Compare two prompt templates by the relevancy of the outputs they produce.

    ``llm`` must expose ``invoke(prompt)`` returning an object with a
    ``content`` attribute; ``evaluator`` must expose
    ``evaluate_relevancy(question, answer)`` returning an object with a
    ``score`` attribute.
    """

    def __init__(self, llm, evaluator):
        self.llm = llm
        self.evaluator = evaluator
        # Per-variant scores of the most recent run_test() call.
        self.results = {"a": [], "b": []}

    def run_test(self, prompt_a: str, prompt_b: str, test_cases: list,
                 sample_size: int = 100) -> ABTestResult:
        """Run both prompt templates over the test cases and compare them.

        Args:
            prompt_a: template for variant A, filled with ``str.format(**case)``.
            prompt_b: template for variant B, filled the same way.
            test_cases: mappings supplying the template fields; each must
                contain a ``"question"`` key, which is used for scoring.
            sample_size: at most this many leading cases are used.

        Returns:
            ABTestResult with the per-variant mean scores, the winner, and the
            confidence ``1 - p_value`` from ``scipy.stats.ttest_ind``. The
            winner is "a" only when its mean is strictly higher; ties and
            runs with no cases report "b". Confidence is 0.0 when fewer than
            two cases were scored or the t-test yields NaN.
        """
        # Imported locally so the method works even when the surrounding
        # module has not imported numpy.
        import numpy as np
        from scipy import stats

        # Start from empty lists on every call: scores from an earlier
        # comparison must not leak into this one's means and t-test.
        scores_a, scores_b = [], []
        for case in test_cases[:sample_size]:
            output_a = self.llm.invoke(prompt_a.format(**case)).content
            output_b = self.llm.invoke(prompt_b.format(**case)).content

            scores_a.append(self.evaluator.evaluate_relevancy(case["question"], output_a).score)
            scores_b.append(self.evaluator.evaluate_relevancy(case["question"], output_b).score)

        self.results = {"a": scores_a, "b": scores_b}

        mean_a = float(np.mean(scores_a)) if scores_a else float("nan")
        mean_b = float(np.mean(scores_b)) if scores_b else float("nan")

        # A t-test needs at least two observations per group.
        if len(scores_a) < 2:
            confidence = 0.0
        else:
            _, p_value = stats.ttest_ind(scores_a, scores_b)
            # p_value is NaN when, e.g., both groups are constant and equal.
            confidence = 0.0 if np.isnan(p_value) else float(1 - p_value)

        winner = "a" if mean_a > mean_b else "b"
        return ABTestResult(mean_a, mean_b, winner, confidence)

Key Takeaways

  • RAGAS provides reference-free evaluation for RAG systems
  • DeepEval offers comprehensive LLM testing with assertions
  • Custom metrics enable domain-specific evaluation criteria
  • A/B testing compares prompt variants with statistical rigor
  • Continuous evaluation catches quality regressions in production
⭐

Premium Content

LLM Evaluation Frameworks

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯End-to-end Projects
💼Interview Prep
📜Certificates
🤝Community Access

Already a member? Log in

Need Expert Generative AI Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement