πŸŽ‰ 75% of content is free forever β€” Unlock Premium from $10/mo β†’
CW
Search courses…
πŸ’Ό Servicesℹ️ Aboutβœ‰οΈ ContactView Pricing Plansfrom $10

Document QA

🟒 Free Lesson

Advertisement

Document QA

DocumentsPDF, DOCXScanned ImagesParserLayout DetectionOCR + TablesText BlocksTablesImagesMulti-Modal LLMGPT-4V / ClaudeLayout UnderstandingAnswerWith CitationsPage References

Document QA systems extract and answer questions from complex documents including PDFs, scanned images, and documents with tables and figures.

PDF Document Parser

from dataclasses import dataclass
from typing import List
import fitz  # PyMuPDF

@dataclass
class DocumentBlock:
    text: str
    block_type: str  # text, table, image
    page: int
    bbox: tuple = None

class PDFParser:
    def __init__(self, pdf_path: str):
        self.doc = fitz.open(pdf_path)

    def extract_text_blocks(self) -> List[DocumentBlock]:
        blocks = []
        for page_num in range(len(self.doc)):
            page = self.doc[page_num]
            page_blocks = page.get_text("dict")["blocks"]
            for block in page_blocks:
                if block["type"] == 0:  # Text
                    text = "".join([span["text"] for line in block["lines"] for span in line["spans"]])
                    blocks.append(DocumentBlock(text=text, block_type="text", page=page_num))
                elif block["type"] == 1:  # Image
                    blocks.append(DocumentBlock(text="[Image]", block_type="image", page=page_num))
        return blocks

    def extract_tables(self) -> List[DocumentBlock]:
        tables = []
        for page_num in range(len(self.doc)):
            page = self.doc[page_num]
            tabs = page.find_tables()
            for tab in tabs:
                table_data = tab.extract()
                text = "\n".join([" | ".join(row) for row in table_data])
                tables.append(DocumentBlock(text=text, block_type="table", page=page_num))
        return tables

    def extract_all(self) -> List[DocumentBlock]:
        blocks = self.extract_text_blocks()
        tables = self.extract_tables()
        return blocks + tables

# Usage
parser = PDFParser("report.pdf")
blocks = parser.extract_all()

Document QA Pipeline

from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate

class DocumentQA:
    def __init__(self, parser, llm=None):
        self.parser = parser
        self.llm = llm or ChatOpenAI(model="gpt-4-vision-preview")
        self.blocks = []

    def load_document(self):
        self.blocks = self.parser.extract_all()

    def get_relevant_blocks(self, question: str, top_k: int = 5) -> List[DocumentBlock]:
        from sentence_transformers import SentenceTransformer
        import numpy as np

        model = SentenceTransformer("all-MiniLM-L6-v2")
        texts = [b.text for b in self.blocks if b.text != "[Image]"]
        embeddings = model.encode(texts)
        q_embedding = model.encode([question])
        scores = np.dot(embeddings, q_embedding.T).flatten()
        top_indices = np.argsort(scores)[-top_k:]
        return [self.blocks[i] for i in top_indices]

    def answer(self, question: str) -> dict:
        relevant = self.get_relevant_blocks(question)
        context = "\n\n".join([f"[Page {b.page}]: {b.text[:500]}" for b in relevant])

        prompt = PromptTemplate.from_template(
            """Answer the question based on the document context.
            Include page references.
            Context: {context}
            Question: {question}
            Answer:"""
        )
        chain = prompt | self.llm
        answer = chain.invoke({"context": context, "question": question}).content

        return {
            "answer": answer,
            "sources": [{"page": b.page, "text": b.text[:200]} for b in relevant]
        }

# Usage
qa = DocumentQA(PDFParser("report.pdf"))
qa.load_document()
result = qa.answer("What were the Q3 revenue numbers?")

Multi-Modal Document Processing

class MultiModalDocumentQA:
    def __init__(self, llm):
        self.llm = llm

    def process_image(self, image_path: str, question: str) -> str:
        from PIL import Image
        prompt = f"""Analyze this image and answer: {question}"""
        return self.llm.invoke([{"type": "image", "image": image_path}, prompt]).content

    def process_table(self, table_text: str, question: str) -> str:
        prompt = f"""Answer based on this table:
        {table_text}
        Question: {question}
        Answer:"""
        return self.llm.invoke(prompt).content

    def process_mixed_content(self, blocks: list, question: str) -> str:
        context_parts = []
        for block in blocks:
            if block.block_type == "text":
                context_parts.append(f"[Text p.{block.page}]: {block.text[:300]}")
            elif block.block_type == "table":
                context_parts.append(f"[Table p.{block.page}]: {block.text[:300]}")
        context = "\n".join(context_parts)
        prompt = f"""Answer based on this document content:
        {context}
        Question: {question}
        Answer with citations:"""
        return self.llm.invoke(prompt).content

# Usage
mm_qa = MultiModalDocumentQA(llm)
answer = mm_qa.process_mixed_content(blocks, "Summarize the financial data")

Key Takeaways

  • PDF parsing extracts text, tables, and images from documents
  • Layout understanding preserves document structure
  • Multi-modal LLMs process images and text together
  • Citation tracking provides source references for answers
  • Chunking strategies handle long documents effectively
⭐

Premium Content

Document QA

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯End-to-end Projects
πŸ’ΌInterview Prep
πŸ“œCertificates
🀝Community Access

Already a member? Log in

Need Expert Generative AI Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement