Document QA
Document QA systems extract and answer questions from complex documents including PDFs, scanned images, and documents with tables and figures.
PDF Document Parser
from dataclasses import dataclass
from typing import List
import fitz # PyMuPDF
@dataclass
class DocumentBlock:
text: str
block_type: str # text, table, image
page: int
bbox: tuple = None
class PDFParser:
def __init__(self, pdf_path: str):
self.doc = fitz.open(pdf_path)
def extract_text_blocks(self) -> List[DocumentBlock]:
blocks = []
for page_num in range(len(self.doc)):
page = self.doc[page_num]
page_blocks = page.get_text("dict")["blocks"]
for block in page_blocks:
if block["type"] == 0: # Text
text = "".join([span["text"] for line in block["lines"] for span in line["spans"]])
blocks.append(DocumentBlock(text=text, block_type="text", page=page_num))
elif block["type"] == 1: # Image
blocks.append(DocumentBlock(text="[Image]", block_type="image", page=page_num))
return blocks
def extract_tables(self) -> List[DocumentBlock]:
tables = []
for page_num in range(len(self.doc)):
page = self.doc[page_num]
tabs = page.find_tables()
for tab in tabs:
table_data = tab.extract()
text = "\n".join([" | ".join(row) for row in table_data])
tables.append(DocumentBlock(text=text, block_type="table", page=page_num))
return tables
def extract_all(self) -> List[DocumentBlock]:
blocks = self.extract_text_blocks()
tables = self.extract_tables()
return blocks + tables
# Usage
parser = PDFParser("report.pdf")
blocks = parser.extract_all()
Document QA Pipeline
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
class DocumentQA:
def __init__(self, parser, llm=None):
self.parser = parser
self.llm = llm or ChatOpenAI(model="gpt-4-vision-preview")
self.blocks = []
def load_document(self):
self.blocks = self.parser.extract_all()
def get_relevant_blocks(self, question: str, top_k: int = 5) -> List[DocumentBlock]:
from sentence_transformers import SentenceTransformer
import numpy as np
model = SentenceTransformer("all-MiniLM-L6-v2")
texts = [b.text for b in self.blocks if b.text != "[Image]"]
embeddings = model.encode(texts)
q_embedding = model.encode([question])
scores = np.dot(embeddings, q_embedding.T).flatten()
top_indices = np.argsort(scores)[-top_k:]
return [self.blocks[i] for i in top_indices]
def answer(self, question: str) -> dict:
relevant = self.get_relevant_blocks(question)
context = "\n\n".join([f"[Page {b.page}]: {b.text[:500]}" for b in relevant])
prompt = PromptTemplate.from_template(
"""Answer the question based on the document context.
Include page references.
Context: {context}
Question: {question}
Answer:"""
)
chain = prompt | self.llm
answer = chain.invoke({"context": context, "question": question}).content
return {
"answer": answer,
"sources": [{"page": b.page, "text": b.text[:200]} for b in relevant]
}
# Usage
qa = DocumentQA(PDFParser("report.pdf"))
qa.load_document()
result = qa.answer("What were the Q3 revenue numbers?")
Multi-Modal Document Processing
class MultiModalDocumentQA:
def __init__(self, llm):
self.llm = llm
def process_image(self, image_path: str, question: str) -> str:
from PIL import Image
prompt = f"""Analyze this image and answer: {question}"""
return self.llm.invoke([{"type": "image", "image": image_path}, prompt]).content
def process_table(self, table_text: str, question: str) -> str:
prompt = f"""Answer based on this table:
{table_text}
Question: {question}
Answer:"""
return self.llm.invoke(prompt).content
def process_mixed_content(self, blocks: list, question: str) -> str:
context_parts = []
for block in blocks:
if block.block_type == "text":
context_parts.append(f"[Text p.{block.page}]: {block.text[:300]}")
elif block.block_type == "table":
context_parts.append(f"[Table p.{block.page}]: {block.text[:300]}")
context = "\n".join(context_parts)
prompt = f"""Answer based on this document content:
{context}
Question: {question}
Answer with citations:"""
return self.llm.invoke(prompt).content
# Usage
mm_qa = MultiModalDocumentQA(llm)
answer = mm_qa.process_mixed_content(blocks, "Summarize the financial data")
Key Takeaways
- PDF parsing extracts text, tables, and images from documents
- Layout understanding preserves document structure
- Multi-modal LLMs process images and text together
- Citation tracking provides source references for answers
- Chunking strategies handle long documents effectively