Question Answering
Question answering systems extract or generate answers from text given a natural language question, supporting factoid, open-domain, and knowledge-base QA.
Extractive QA with Transformers
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
class ExtractiveQA:
    """Extractive question answering over a single context passage.

    Wraps a Hugging Face tokenizer and span-prediction model. Answers are
    token spans decoded from the encoded ``(question, context)`` pair.
    """

    def __init__(self, model_name: str = "bert-large-uncased-whole-word-masking-finetuned-squad"):
        """Load the tokenizer and QA model identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    def _span_probabilities(self, question: str, context: str):
        """Encode the pair, run the model once, and normalise the logits.

        Returns:
            A tuple ``(input_ids, start_probs, end_probs)`` for the first
            sequence of the batch. Both probability vectors are softmaxed
            over token positions.
        """
        inputs = self.tokenizer(question, context, return_tensors="pt",
                                truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        start_probs = torch.softmax(outputs.start_logits, dim=-1)[0]
        end_probs = torch.softmax(outputs.end_logits, dim=-1)[0]
        return inputs["input_ids"][0], start_probs, end_probs

    def answer(self, question: str, context: str) -> dict:
        """Return the single most probable answer span.

        The start and end positions are chosen jointly so that the span is
        always valid (start <= end). Taking each argmax independently can
        yield an end that precedes the start, which decodes to an empty
        answer while still reporting a non-zero confidence.

        Returns:
            A dict with ``answer`` (decoded text), ``confidence`` (start
            probability times end probability, rounded to 4 places),
            ``start_idx`` (inclusive token index) and ``end_idx``
            (exclusive token index).
        """
        input_ids, start_probs, end_probs = self._span_probabilities(question, context)

        # Entry [s, e] is P(start=s) * P(end=e); the upper triangle keeps
        # only spans whose end is not before their start.
        span_scores = torch.triu(torch.outer(start_probs, end_probs))
        flat_best = torch.argmax(span_scores).item()
        start_idx, last_idx = divmod(flat_best, span_scores.size(1))
        end_idx = last_idx + 1  # exclusive bound, as callers expect

        answer = self.tokenizer.decode(input_ids[start_idx:end_idx])
        confidence = start_probs[start_idx].item() * end_probs[last_idx].item()
        return {
            "answer": answer,
            "confidence": round(confidence, 4),
            "start_idx": start_idx,
            "end_idx": end_idx
        }

    def answer_with_scores(self, question: str, context: str, top_k: int = 3) -> list:
        """Return up to ``top_k`` candidate spans, most confident first.

        Candidates are formed from the ``top_k`` most probable start
        positions paired with the ``top_k`` most probable end positions,
        keeping only pairs with start <= end.

        Returns:
            A list of dicts with ``answer``, ``confidence`` (rounded to 4
            places), ``start`` and ``end``. Note that ``end`` is inclusive
            here, unlike ``end_idx`` returned by :meth:`answer`.
        """
        if top_k <= 0:
            return []

        input_ids, start_probs, end_probs = self._span_probabilities(question, context)

        # topk cannot ask for more entries than there are tokens.
        k = min(top_k, start_probs.numel())
        top_starts = torch.topk(start_probs, k).indices.tolist()
        top_ends = torch.topk(end_probs, k).indices.tolist()

        scored = []
        for start_idx in top_starts:
            for end_idx in top_ends:
                if end_idx < start_idx:
                    continue
                score = start_probs[start_idx].item() * end_probs[end_idx].item()
                scored.append((score, start_idx, end_idx))

        # Rank on the unrounded score so near-ties are ordered correctly,
        # and decode only the spans that are actually returned.
        scored.sort(key=lambda item: item[0], reverse=True)
        return [
            {
                "answer": self.tokenizer.decode(input_ids[start_idx:end_idx + 1]),
                "confidence": round(score, 4),
                "start": start_idx,
                "end": end_idx
            }
            for score, start_idx, end_idx in scored[:top_k]
        ]
# Example: build the QA wrapper with its default model and ask one question.
qa = ExtractiveQA()
result = qa.answer(
    question="What is AI?",
    context="Artificial intelligence is the simulation of human intelligence by machines.",
)
Open-Domain QA Pipeline
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
class OpenDomainQA:
    """Retrieval-augmented QA: fetch passages, then let an LLM answer from them."""

    def __init__(self, retriever, llm):
        """Store the retriever and LLM and build the answer prompt.

        Args:
            retriever: Object exposing ``invoke(query)`` or the legacy
                ``get_relevant_documents(query)``; it must return documents
                that have ``page_content`` and ``metadata`` attributes.
            llm: Model that can be piped after a prompt template.
        """
        self.retriever = retriever
        self.llm = llm
        self.qa_prompt = PromptTemplate.from_template(
            """Answer the question based on the context below.
If the answer is not in the context, say "I don't know".
Context: {context}
Question: {question}
Answer:"""
        )

    def _retrieve(self, question: str, top_k: int) -> list:
        """Return at most ``top_k`` documents for ``question``.

        Prefers ``retriever.invoke`` and falls back to the deprecated
        ``get_relevant_documents`` for retrievers that only offer that.
        """
        fetch = getattr(self.retriever, "invoke", None)
        if fetch is None:
            fetch = self.retriever.get_relevant_documents
        return list(fetch(question))[:top_k]

    def _generate(self, question: str, context: str) -> str:
        """Run the prompt through the LLM and return the answer text."""
        chain = self.qa_prompt | self.llm
        result = chain.invoke({"context": context, "question": question})
        # Chat models return a message object; plain LLMs return a string.
        return getattr(result, "content", result)

    def retrieve_context(self, question: str, top_k: int = 3) -> str:
        """Return the top ``top_k`` passages joined by newlines."""
        return "\n".join([d.page_content for d in self._retrieve(question, top_k)])

    def answer(self, question: str, top_k: int = 3) -> dict:
        """Answer ``question`` from retrieved context.

        Returns:
            A dict with ``question``, ``answer`` and ``context_used`` (the
            first 500 characters of the context given to the model).
        """
        context = self.retrieve_context(question, top_k)
        return {
            "question": question,
            "answer": self._generate(question, context),
            "context_used": context[:500]
        }

    def answer_with_sources(self, question: str, top_k: int = 3) -> dict:
        """Answer ``question`` and report the passages the answer was based on.

        Returns:
            A dict with ``question``, ``answer`` and ``sources``; each source
            holds the first 200 characters of a passage plus its metadata.
        """
        docs = self._retrieve(question, top_k)
        context = "\n".join([d.page_content for d in docs])
        return {
            "question": question,
            "answer": self._generate(question, context),
            "sources": [{"content": d.page_content[:200], "metadata": d.metadata} for d in docs]
        }
# Example: `retriever` and `llm` are expected to be defined by the caller.
qa = OpenDomainQA(retriever=retriever, llm=llm)
result = qa.answer(question="What are the benefits of machine learning?")
Multi-Hop Question Answering
class MultiHopQA:
    """Answer a complex question by decomposing it into sub-questions.

    Each sub-question is answered from retrieved context; the intermediate
    question/answer pairs are then handed to the LLM to produce the final answer.
    """

    def __init__(self, llm, retriever):
        """Store the LLM (must expose ``invoke``) and the document retriever."""
        self.llm = llm
        self.retriever = retriever

    def _retrieve(self, query: str, limit: int) -> list:
        """Return at most ``limit`` documents for ``query``.

        Prefers ``retriever.invoke`` and falls back to the deprecated
        ``get_relevant_documents`` for retrievers that only offer that.
        """
        fetch = getattr(self.retriever, "invoke", None)
        if fetch is None:
            fetch = self.retriever.get_relevant_documents
        return list(fetch(query))[:limit]

    def decompose_question(self, question: str) -> list:
        """Ask the LLM for sub-questions and return them as clean strings.

        Blank lines are dropped and leading list markers such as ``1.``,
        ``2)`` or ``-`` are removed so they do not leak into retrieval queries.
        """
        import re

        prompt = f"""Break this complex question into simpler sub-questions:
Question: {question}
Sub-questions (one per line):"""
        response = self.llm.invoke(prompt).content

        # A marker only counts when followed by whitespace (or end of line),
        # so text like "3.5 percent" or "-5 degrees" is left untouched.
        list_marker = re.compile(r"^(?:[-*\u2022]|\(?\d+[.)])(?:\s+|$)")
        sub_questions = []
        for raw_line in response.split("\n"):
            cleaned = list_marker.sub("", raw_line.strip()).strip()
            if cleaned:
                sub_questions.append(cleaned)
        return sub_questions

    def answer_sub_question(self, sub_question: str) -> str:
        """Answer one sub-question from the top two retrieved passages."""
        docs = self._retrieve(sub_question, 2)
        context = "\n".join([d.page_content for d in docs])
        prompt = f"""Answer based on context:
Context: {context}
Question: {sub_question}
Answer:"""
        return self.llm.invoke(prompt).content

    def multi_hop_answer(self, question: str) -> dict:
        """Decompose, answer each hop, and synthesise a final answer.

        Returns:
            A dict with ``question``, ``sub_questions``,
            ``intermediate_answers`` (a list of ``{"question", "answer"}``
            dicts) and ``final_answer``.
        """
        # If the model produced no usable sub-questions, treat the original
        # question as a single hop so the final prompt still has findings.
        sub_questions = self.decompose_question(question) or [question]

        intermediate_answers = [
            {"question": sq, "answer": self.answer_sub_question(sq)}
            for sq in sub_questions
        ]
        combined_context = "\n".join([
            f"Q: {a['question']}\nA: {a['answer']}" for a in intermediate_answers
        ])
        final_prompt = f"""Using these intermediate findings, answer the final question:
{combined_context}
Final Question: {question}
Final Answer:"""
        final_answer = self.llm.invoke(final_prompt).content
        return {
            "question": question,
            "sub_questions": sub_questions,
            "intermediate_answers": intermediate_answers,
            "final_answer": final_answer
        }
# Example: `llm` and `retriever` are expected to be defined by the caller.
multi_hop = MultiHopQA(llm=llm, retriever=retriever)
result = multi_hop.multi_hop_answer(
    question="Who invented the programming language used to build Python?"
)
Question Answering Evaluator
class QAEvaluator:
    """Exact-match and token-level F1 metrics for QA predictions."""

    def __init__(self):
        """Create an evaluator; it holds no state."""

    def exact_match(self, predicted: str, ground_truth: str) -> bool:
        """Return True when both strings are equal ignoring case and outer whitespace."""
        return predicted.strip().lower() == ground_truth.strip().lower()

    def f1_score(self, predicted: str, ground_truth: str) -> float:
        """Return the token-level F1 between a prediction and its reference.

        Tokens are lower-cased whitespace-separated words. Overlap is counted
        as a multiset, so a token repeated in the prediction is only credited
        as many times as it occurs in the reference.

        Returns:
            A value in ``[0.0, 1.0]``; ``0.0`` when either side has no tokens
            or there is no overlap.
        """
        from collections import Counter

        pred_tokens = predicted.lower().split()
        gt_tokens = ground_truth.lower().split()
        if not pred_tokens or not gt_tokens:
            return 0.0

        # Counter intersection keeps the minimum count of each shared token.
        overlap = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
        if overlap == 0:
            return 0.0

        precision = overlap / len(pred_tokens)
        recall = overlap / len(gt_tokens)
        return 2 * (precision * recall) / (precision + recall)

    def evaluate_dataset(self, predictions: list, ground_truths: list) -> dict:
        """Average exact match and F1 over paired predictions and references.

        Returns:
            A dict with ``exact_match``, ``f1`` and ``total``. An empty
            dataset yields zeros rather than a division error.

        Raises:
            ValueError: If the two lists differ in length; pairing them
                anyway would silently drop examples.
        """
        if len(predictions) != len(ground_truths):
            raise ValueError(
                f"predictions ({len(predictions)}) and ground_truths "
                f"({len(ground_truths)}) must have the same length"
            )
        total = len(predictions)
        if total == 0:
            return {"exact_match": 0.0, "f1": 0.0, "total": 0}

        em_sum = 0
        f1_sum = 0.0
        for predicted, truth in zip(predictions, ground_truths):
            em_sum += self.exact_match(predicted, truth)
            f1_sum += self.f1_score(predicted, truth)
        return {
            "exact_match": em_sum / total,
            "f1": f1_sum / total,
            "total": total
        }
# Usage
evaluator = QAEvaluator()
results = evaluator.evaluate_dataset(
    ["AI", "ML", "DL"],
    ["Artificial Intelligence", "Machine Learning", "Deep Learning"]
)
# The abbreviations share no tokens with their expansions, so both metrics are zero:
# {"exact_match": 0.0, "f1": 0.0, "total": 3}
Key Takeaways
- Extractive QA identifies answer spans within the context
- Generative QA creates new answer text from context
- Multi-hop QA chains reasoning across multiple documents
- Confidence scoring helps identify uncertain answers
- Evaluation uses exact match and F1 metrics