Question Answering

Question answering systems extract or generate answers from text given a natural language question, supporting factoid, open-domain, and knowledge-base QA.

Extractive QA with Transformers

from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

class ExtractiveQA:
    """Extractive question answering on top of a span-prediction model.

    The model scores every input token as a possible answer start and a
    possible answer end; this class turns those scores into decoded answer
    spans. Candidate spans are ranked jointly so that a span's end can never
    precede its start.
    """

    def __init__(self, model_name: str = "bert-large-uncased-whole-word-masking-finetuned-squad"):
        """Load the tokenizer and question-answering model named ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    def _score_tokens(self, question: str, context: str):
        """Tokenize the pair, run the model and return per-token probabilities.

        Returns:
            A ``(input_ids, start_probs, end_probs)`` tuple for the first (and
            only) sequence in the batch. The probabilities are softmaxes of
            the model's start/end logits.
        """
        inputs = self.tokenizer(question, context, return_tensors="pt",
                                truncation=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        start_probs = torch.softmax(outputs.start_logits, dim=-1)[0]
        end_probs = torch.softmax(outputs.end_logits, dim=-1)[0]
        return inputs["input_ids"][0], start_probs, end_probs

    def _rank_spans(self, start_probs, end_probs, top_k: int) -> list:
        """Return up to ``top_k`` ``(start, end, score)`` tuples, best first.

        ``start`` and ``end`` are inclusive token indices with
        ``start <= end``; ``score`` is ``start_probs[start] * end_probs[end]``.
        """
        seq_len = start_probs.shape[0]
        if top_k <= 0 or seq_len == 0:
            return []

        # joint[i, j] is the score of the span starting at i and ending at j.
        joint = start_probs.unsqueeze(1) * end_probs.unsqueeze(0)

        # Spans whose end precedes their start are impossible; push them below
        # every real score (probabilities are >= 0) so topk never picks them.
        positions = torch.arange(seq_len, device=joint.device)
        invalid = positions.unsqueeze(1) > positions.unsqueeze(0)
        joint = joint.masked_fill(invalid, -1.0)

        num_valid = seq_len * (seq_len + 1) // 2
        scores, flat_indices = torch.topk(joint.flatten(), min(top_k, num_valid))
        return [
            divmod(flat_index, seq_len) + (score,)
            for score, flat_index in zip(scores.tolist(), flat_indices.tolist())
        ]

    def _decode_span(self, input_ids, start: int, end: int) -> str:
        """Decode tokens ``start..end`` (inclusive), dropping special tokens."""
        return self.tokenizer.decode(input_ids[start:end + 1], skip_special_tokens=True)

    def answer(self, question: str, context: str) -> dict:
        """Return the single best answer span for ``question`` in ``context``.

        Returns:
            A dict with ``"answer"`` (decoded text), ``"confidence"`` (product
            of the start and end probabilities, rounded to 4 places),
            ``"start_idx"`` (inclusive token index) and ``"end_idx"``
            (exclusive token index).
        """
        input_ids, start_probs, end_probs = self._score_tokens(question, context)
        ranked = self._rank_spans(start_probs, end_probs, top_k=1)
        if not ranked:
            # Only reachable if the tokenizer produced no tokens at all.
            return {"answer": "", "confidence": 0.0, "start_idx": 0, "end_idx": 0}

        start, end, score = ranked[0]
        return {
            "answer": self._decode_span(input_ids, start, end),
            "confidence": round(score, 4),
            "start_idx": start,
            "end_idx": end + 1,
        }

    def answer_with_scores(self, question: str, context: str, top_k: int = 3) -> list:
        """Return the ``top_k`` highest-scoring answer spans, best first.

        Each entry is a dict with ``"answer"``, ``"confidence"`` (rounded to
        4 places), ``"start"`` and ``"end"`` (both inclusive token indices).
        A non-positive ``top_k`` yields an empty list.
        """
        input_ids, start_probs, end_probs = self._score_tokens(question, context)
        return [
            {
                "answer": self._decode_span(input_ids, start, end),
                "confidence": round(score, 4),
                "start": start,
                "end": end,
            }
            for start, end, score in self._rank_spans(start_probs, end_probs, top_k)
        ]

# Usage
# Constructing ExtractiveQA with no arguments loads the default SQuAD-finetuned
# BERT tokenizer and model via from_pretrained (presumably downloading the
# weights on first use -- confirm for your environment).
qa = ExtractiveQA()
# `result` is a dict with "answer", "confidence", "start_idx" and "end_idx" keys.
result = qa.answer("What is AI?", "Artificial intelligence is the simulation of human intelligence by machines.")

Open-Domain QA Pipeline

from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

class OpenDomainQA:
    """Retrieval-augmented QA: fetch documents, then ask an LLM to answer from them.

    ``retriever`` must expose ``invoke(question)`` or the older
    ``get_relevant_documents(question)`` and return documents that carry
    ``page_content`` and ``metadata``. ``llm`` is piped after the prompt
    template and may return either a message object with ``.content`` or a
    plain string.
    """

    def __init__(self, retriever, llm):
        self.retriever = retriever
        self.llm = llm
        self.qa_prompt = PromptTemplate.from_template(
            """Answer the question based on the context below.
            If the answer is not in the context, say "I don't know".

            Context: {context}
            Question: {question}
            Answer:"""
        )

    def _retrieve_docs(self, question: str, top_k: int) -> list:
        """Return at most ``top_k`` documents for ``question`` (none if ``top_k <= 0``)."""
        if top_k <= 0:
            return []
        # Prefer the Runnable-style `invoke`; fall back to the older method so
        # retrievers that only implement `get_relevant_documents` keep working.
        fetch = getattr(self.retriever, "invoke", None)
        if fetch is None:
            fetch = self.retriever.get_relevant_documents
        return list(fetch(question)[:top_k])

    def _generate(self, question: str, context: str) -> str:
        """Run the prompt + LLM chain and return the answer text."""
        chain = self.qa_prompt | self.llm
        result = chain.invoke({"context": context, "question": question})
        # Chat models return a message with `.content`; if the attribute is
        # missing, the result itself is returned unchanged.
        return getattr(result, "content", result)

    def retrieve_context(self, question: str, top_k: int = 3) -> str:
        """Return the text of the top ``top_k`` retrieved documents, newline-joined."""
        return "\n".join(d.page_content for d in self._retrieve_docs(question, top_k))

    def answer(self, question: str, top_k: int = 3) -> dict:
        """Answer ``question`` from the top ``top_k`` retrieved documents.

        Returns:
            A dict with ``"question"``, ``"answer"`` and ``"context_used"``
            (the first 500 characters of the context given to the LLM).
        """
        context = self.retrieve_context(question, top_k)
        return {
            "question": question,
            "answer": self._generate(question, context),
            "context_used": context[:500]
        }

    def answer_with_sources(self, question: str, top_k: int = 3) -> dict:
        """Answer ``question`` and report which documents were used.

        Returns:
            A dict with ``"question"``, ``"answer"`` and ``"sources"``: one
            entry per document used, holding the first 200 characters of its
            content and its metadata.
        """
        docs = self._retrieve_docs(question, top_k)
        context = "\n".join(d.page_content for d in docs)
        return {
            "question": question,
            "answer": self._generate(question, context),
            "sources": [{"content": d.page_content[:200], "metadata": d.metadata} for d in docs]
        }

# Usage
# NOTE: `retriever` and `llm` are not defined in this snippet; construct them
# before running it. This also rebinds the `qa` name used by the earlier example.
qa = OpenDomainQA(retriever, llm)
# `result` is a dict with "question", "answer" and "context_used" keys.
result = qa.answer("What are the benefits of machine learning?")

Multi-Hop Question Answering

class MultiHopQA:
    """Answer a complex question by splitting it into sub-questions.

    Each sub-question is answered independently from retrieved documents, and
    the question/answer pairs are then handed back to the LLM to produce the
    final answer.
    """

    def __init__(self, llm, retriever):
        self.retriever = retriever
        self.llm = llm

    def decompose_question(self, question: str) -> list:
        """Ask the LLM for sub-questions and return its non-blank lines, stripped."""
        decomposition_prompt = f"""Break this complex question into simpler sub-questions:
        Question: {question}
        Sub-questions (one per line):"""
        raw_reply = self.llm.invoke(decomposition_prompt).content
        stripped_lines = (line.strip() for line in raw_reply.split("\n"))
        return [line for line in stripped_lines if line]

    def answer_sub_question(self, sub_question: str) -> str:
        """Answer one sub-question using the first two retrieved documents as context."""
        retrieved = self.retriever.get_relevant_documents(sub_question)
        passages = [doc.page_content for doc in retrieved[:2]]
        context = "\n".join(passages)
        sub_prompt = f"""Answer based on context:
        Context: {context}
        Question: {sub_question}
        Answer:"""
        reply = self.llm.invoke(sub_prompt)
        return reply.content

    def multi_hop_answer(self, question: str) -> dict:
        """Decompose ``question``, answer every part, then synthesize a final answer.

        Returns:
            A dict with ``"question"``, ``"sub_questions"``,
            ``"intermediate_answers"`` (a list of question/answer dicts) and
            ``"final_answer"``.
        """
        sub_questions = self.decompose_question(question)

        # Sub-questions are answered one at a time, in the order the LLM listed them.
        intermediate_answers = [
            {"question": sub_q, "answer": self.answer_sub_question(sub_q)}
            for sub_q in sub_questions
        ]

        findings = []
        for step in intermediate_answers:
            findings.append(f"Q: {step['question']}\nA: {step['answer']}")
        findings_text = "\n".join(findings)

        synthesis_prompt = f"""Using these intermediate findings, answer the final question:
        {findings_text}
        Final Question: {question}
        Final Answer:"""

        return {
            "question": question,
            "sub_questions": sub_questions,
            "intermediate_answers": intermediate_answers,
            "final_answer": self.llm.invoke(synthesis_prompt).content
        }

# Usage
# NOTE: `llm` and `retriever` are not defined in this snippet; construct them
# before running it. Argument order here is (llm, retriever).
multi_hop = MultiHopQA(llm, retriever)
# `result` is a dict with "question", "sub_questions", "intermediate_answers"
# and "final_answer" keys.
result = multi_hop.multi_hop_answer("Who invented the programming language used to build Python?")

Question Answering Evaluator

class QAEvaluator:
    """Exact-match and token-level F1 metrics for QA predictions."""

    def __init__(self):
        pass

    def exact_match(self, predicted: str, ground_truth: str) -> bool:
        """Return True when the two strings are equal ignoring case and outer whitespace."""
        return predicted.strip().lower() == ground_truth.strip().lower()

    def f1_score(self, predicted: str, ground_truth: str) -> float:
        """Return the token-level F1 between ``predicted`` and ``ground_truth``.

        Tokens are lower-cased whitespace-separated words. Overlap is counted
        as a multiset intersection, so a repeated token only counts as often
        as it appears in both strings. If either string has no tokens the
        score is 1.0 when both are empty and 0.0 otherwise, which keeps it in
        agreement with ``exact_match``.
        """
        from collections import Counter

        pred_tokens = predicted.lower().split()
        gt_tokens = ground_truth.lower().split()
        if not pred_tokens or not gt_tokens:
            return float(pred_tokens == gt_tokens)

        overlap = sum((Counter(pred_tokens) & Counter(gt_tokens)).values())
        if overlap == 0:
            return 0.0
        precision = overlap / len(pred_tokens)
        recall = overlap / len(gt_tokens)
        return 2 * (precision * recall) / (precision + recall)

    def evaluate_dataset(self, predictions: list, ground_truths: list) -> dict:
        """Average exact match and F1 over paired predictions and references.

        Returns:
            A dict with ``"exact_match"`` and ``"f1"`` (means over all pairs)
            and ``"total"`` (number of pairs). An empty dataset yields 0.0 for
            both metrics instead of dividing by zero.

        Raises:
            ValueError: if the two lists differ in length. Pairing them with
                ``zip`` would otherwise drop the surplus items silently.
        """
        predictions = list(predictions)
        ground_truths = list(ground_truths)
        if len(predictions) != len(ground_truths):
            raise ValueError(
                f"predictions and ground_truths differ in length: "
                f"{len(predictions)} != {len(ground_truths)}"
            )

        total = len(predictions)
        if total == 0:
            return {"exact_match": 0.0, "f1": 0.0, "total": 0}

        pairs = list(zip(predictions, ground_truths))
        em_total = sum(self.exact_match(p, g) for p, g in pairs)
        f1_total = sum(self.f1_score(p, g) for p, g in pairs)
        return {
            "exact_match": em_total / total,
            "f1": f1_total / total,
            "total": total
        }

# Usage
evaluator = QAEvaluator()
results = evaluator.evaluate_dataset(
    ["AI", "ML", "DL"],
    ["Artificial Intelligence", "Machine Learning", "Deep Learning"]
)
# No abbreviation shares a whitespace token with its expansion, so both
# metrics are zero here:
# {"exact_match": 0.0, "f1": 0.0, "total": 3}

Key Takeaways

Extractive QA identifies answer spans within the context
Generative QA creates new answer text from context
Multi-hop QA chains reasoning across multiple documents
Confidence scoring helps identify uncertain answers
Evaluation uses exact match and F1 metrics

Question Answering

Question Answering

Extractive QA with Transformers

Open-Domain QA Pipeline

Multi-Hop Question Answering

Question Answering Evaluator

Key Takeaways

Premium Content

Need Expert Generative AI Help?