Hallucination Detection
What are Hallucinations?
Hallucinations occur when AI models generate plausible-sounding but factually incorrect or fabricated information. They're a critical challenge in deploying generative AI systems.
Types of Hallucinations
- Factual Errors: Incorrect statements about established facts
- Fabricated Citations: Invented references or sources
- Logical Inconsistencies: Contradictions within the same response
- Temporal Confusion: Incorrect dates or timelines
Detection Methods
class HallucinationDetector:
    """Detect likely hallucinations in a generated response.

    Provides three independent checks: pairwise internal consistency,
    verification of extracted claims against a knowledge base, and a
    confidence estimate derived from token log-probabilities.

    The free functions ``split_into_sentences``, ``extract_claims``,
    ``get_logprobs`` and ``sigmoid`` must be available in the enclosing
    module.
    NOTE(review): ``are_contradictory`` is called by ``check_consistency``
    but is not defined in this class -- presumably supplied by a subclass
    or mixin; confirm.
    """

    def __init__(self, knowledge_base, confidence_threshold=0.7):
        """Store the knowledge base and the confidence cut-off.

        Args:
            knowledge_base: Object exposing ``verify(claim)``; the result
                must be a mapping with ``'is_true'`` and ``'confidence'``
                keys and may carry an optional ``'source'`` key.
            confidence_threshold: Minimum confidence score for
                ``estimate_confidence`` to report ``is_confident``.
        """
        self.kb = knowledge_base
        self.threshold = confidence_threshold

    def check_consistency(self, response):
        """Return every pair of sentences in *response* that contradict.

        Args:
            response: Generated text to inspect.

        Returns:
            A list of ``(earlier_sentence, later_sentence)`` tuples, in
            document order, for which ``self.are_contradictory`` is truthy.
        """
        from itertools import combinations

        sentences = split_into_sentences(response)
        # combinations() yields each unordered pair once, in the same
        # order as the nested "i, then i+1.." loops, without copying a
        # slice of the list on every outer iteration.
        return [
            (first, second)
            for first, second in combinations(sentences, 2)
            if self.are_contradictory(first, second)
        ]

    def verify_facts(self, response):
        """Verify factual claims in *response* against the knowledge base.

        Args:
            response: Generated text to inspect.

        Returns:
            One dict per extracted claim with the keys ``'claim'``,
            ``'verified'``, ``'confidence'`` and ``'source'`` (``None``
            when the knowledge base reports no source).
        """
        results = []
        for claim in extract_claims(response):
            verification = self.kb.verify(claim)
            results.append({
                'claim': claim,
                'verified': verification['is_true'],
                'confidence': verification['confidence'],
                'source': verification.get('source'),
            })
        return results

    def estimate_confidence(self, model, prompt, response):
        """Estimate the model's confidence in its own response.

        Args:
            model: Model handle passed through to ``get_logprobs``.
            prompt: Prompt that produced *response*.
            response: Generated text to score.

        Returns:
            A dict with ``'confidence'`` (``sigmoid`` of the mean token
            log-probability) and ``'is_confident'`` (whether that score
            exceeds the configured threshold). When no log-probabilities
            are available the result is ``0.0`` / ``False``.
        """
        logprobs = get_logprobs(model, prompt, response)
        # No tokens means no evidence: report zero confidence instead of
        # dividing by zero.
        if len(logprobs) == 0:
            return {'confidence': 0.0, 'is_confident': False}

        avg_logprob = sum(logprobs) / len(logprobs)
        confidence = sigmoid(avg_logprob)
        # The threshold is a cut-off on the confidence score, so compare
        # the score itself rather than the raw average log-probability.
        # NOTE(review): if logprobs are natural-log probabilities (<= 0)
        # and sigmoid is the standard logistic function, this score cannot
        # exceed 0.5 -- confirm the intended mapping (exp(avg_logprob) may
        # have been meant).
        return {
            'confidence': confidence,
            'is_confident': confidence > self.threshold,
        }
RAG-based Detection
class RAGHallucinationDetector:
    """Generate a retrieval-grounded answer and flag unsupported claims.

    The free function ``extract_claims`` must be available in the
    enclosing module.
    NOTE(review): ``claim_supported_by_context`` is called by
    ``detect_and_correct`` but is not defined in this class -- presumably
    supplied by a subclass or mixin; confirm.
    """

    def __init__(self, retriever, generator):
        """Store the collaborators.

        Args:
            retriever: Object exposing ``retrieve(query)``; each returned
                document must have a ``content`` string attribute.
            generator: Object exposing ``generate(query, context)``.
        """
        self.retriever = retriever
        self.generator = generator

    def detect_and_correct(self, query):
        """Answer *query* from retrieved context and report unsupported claims.

        Despite the name, this method only detects: the response is
        returned as generated, alongside the claims that the retrieved
        context does not support.

        Args:
            query: User question passed to the retriever and generator.

        Returns:
            A dict with ``'response'`` (the generated text),
            ``'unsupported_claims'`` (claims not supported by the context)
            and ``'confidence'`` (fraction of claims that are supported;
            ``1.0`` when the response contains no claims).
        """
        # Retrieve relevant documents and merge them into one context string.
        docs = self.retriever.retrieve(query)
        context = "\n".join(doc.content for doc in docs)

        # Generate the response grounded in that context.
        response = self.generator.generate(query, context)

        # Keep only the claims the context does not back up.
        claims = extract_claims(response)
        unsupported = [
            claim
            for claim in claims
            if not self.claim_supported_by_context(claim, context)
        ]

        # With no claims there is nothing unsupported; avoid dividing by zero.
        if len(claims) == 0:
            confidence = 1.0
        else:
            confidence = 1 - (len(unsupported) / len(claims))

        return {
            'response': response,
            'unsupported_claims': unsupported,
            'confidence': confidence,
        }
Summary
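Hallucination detection is crucial for building trustworthy AI systems. No single method catches every hallucination, so combine several — internal consistency checks, knowledge-base fact verification, log-probability confidence estimates, and retrieval-grounded claim checking — for robust results.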
Next: We'll explore AI safety and alignment.