Document Processing with AI

Document Processing Challenges

Documents come in various formats (PDFs, images, scanned documents) with complex layouts, tables, handwriting, and multiple languages. AI-powered document processing addresses these challenges through computer vision and natural language understanding.

OCR with Python

import pytesseract
from PIL import Image
import fitz  # PyMuPDF
from typing import List, Dict

class DocumentOCR:
    """Run Tesseract OCR (through ``pytesseract``) on image files and PDFs."""

    def __init__(self, tesseract_path=None):
        """Optionally point pytesseract at a specific Tesseract executable.

        Args:
            tesseract_path: Path to the ``tesseract`` binary. When falsy, the
                pytesseract setting is left untouched.
        """
        if tesseract_path:
            # This assigns an attribute on the pytesseract module itself, so
            # it is shared by every DocumentOCR instance in the process.
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

    def extract_from_image(self, image_path: str) -> Dict:
        """OCR a single image file.

        Args:
            image_path: Path of the image to open with Pillow.

        Returns:
            A dict with ``"text"`` (the full recognised text) and ``"words"``,
            a list of ``{"text", "confidence", "bbox"}`` dicts where ``bbox``
            is ``(left, top, width, height)``. Entries whose text is blank are
            omitted.
        """
        # Context manager releases the underlying file handle once both OCR
        # passes are done (the original left the image open).
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
            data = pytesseract.image_to_data(
                image, output_type=pytesseract.Output.DICT
            )

        columns = zip(
            data["text"],
            data["conf"],
            data["left"],
            data["top"],
            data["width"],
            data["height"],
        )
        words = [
            {
                "text": word,
                "confidence": conf,
                "bbox": (left, top, width, height),
            }
            for word, conf, left, top, width, height in columns
            if word.strip()
        ]
        return {"text": text, "words": words}

    def extract_from_pdf(self, pdf_path: str, dpi: int = 72) -> List[Dict]:
        """Rasterise every page of a PDF and OCR it.

        Args:
            pdf_path: Path of the PDF to open with PyMuPDF.
            dpi: Render resolution. The default of 72 gives a zoom factor of
                1.0, i.e. the same render as a plain ``get_pixmap()`` call;
                higher values produce larger images for the OCR engine.

        Returns:
            One ``{"page", "text"}`` dict per page, with 1-based page numbers.

        Raises:
            ValueError: If ``dpi`` is not positive.
        """
        if dpi <= 0:
            raise ValueError(f"dpi must be positive, got {dpi!r}")

        zoom = dpi / 72  # PDF user space is 72 units per inch
        matrix = fitz.Matrix(zoom, zoom)
        pages = []

        # Context manager closes the document even if OCR raises part-way.
        with fitz.open(pdf_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                # alpha=False keeps the pixmap at 3 bytes per pixel so the
                # sample buffer matches Pillow's "RGB" mode.
                pix = page.get_pixmap(matrix=matrix, alpha=False)
                # Image.frombytes takes the size as ONE (width, height) tuple;
                # passing width and height separately shifts every argument.
                image = Image.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )
                pages.append({
                    "page": page_number,
                    "text": pytesseract.image_to_string(image),
                })

        return pages

# Example usage: OCR one image file and report how many non-blank word
# entries were returned in result["words"].
ocr = DocumentOCR()
result = ocr.extract_from_image("document.png")
print(f"Extracted {len(result['words'])} words")

Layout Analysis

from transformers import AutoProcessor, AutoModelForObjectDetection
import torch

class LayoutAnalyzer:
    """Detect layout regions in a page image and group them by label."""

    def __init__(self, model_name: str = "microsoft/layoutlmv3-base"):
        """Load the processor and detection model for ``model_name``.

        Args:
            model_name: Checkpoint passed to both ``from_pretrained`` calls.

        NOTE(review): the default is a LayoutLMv3 base checkpoint; confirm it
        actually loads under ``AutoModelForObjectDetection``. If it does not,
        pass a checkpoint that was trained for object detection.
        """
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModelForObjectDetection.from_pretrained(model_name)

    def analyze_layout(self, image, threshold: float = 0.5, target_sizes=None):
        """Run the detector on ``image`` and return the detections it keeps.

        Args:
            image: Image handed to the processor (``images=image``).
            threshold: Score threshold forwarded to the processor's
                ``post_process_object_detection``.
            target_sizes: Optional sizes forwarded to the same call, used to
                rescale boxes to the image. Omitted from the call when
                ``None``, which matches the original behaviour.

        Returns:
            A list of ``{"label", "score", "bbox"}`` dicts for the first (and
            only) image in the batch. ``label`` is resolved through the model
            config's ``id2label`` mapping.
        """
        inputs = self.processor(images=image, return_tensors="pt")

        # Inference only: skip gradient tracking.
        with torch.no_grad():
            outputs = self.model(**inputs)

        post_kwargs = {"threshold": threshold}
        if target_sizes is not None:
            post_kwargs["target_sizes"] = target_sizes
        results = self.processor.post_process_object_detection(
            outputs, **post_kwargs
        )[0]

        id2label = self.model.config.id2label
        return [
            {
                "label": id2label[label.item()],
                "score": score.item(),
                "bbox": box.tolist(),
            }
            for score, label, box in zip(
                results["scores"], results["labels"], results["boxes"]
            )
        ]

    def categorize_elements(self, elements):
        """Bucket detections into the fixed set of known categories.

        Args:
            elements: Iterable of dicts that each have a ``"label"`` key.

        Returns:
            A dict with the keys ``text``, ``table``, ``figure``, ``title`` and
            ``list``, each mapping to the elements whose label matches that
            key. Matching ignores case, so ``"Table"`` lands in ``"table"``.
            Elements with any other label are left out.
        """
        categories = {
            "text": [],
            "table": [],
            "figure": [],
            "title": [],
            "list": [],
        }

        for elem in elements:
            # Checkpoints differ in label casing; normalise before the lookup
            # so detections are not silently dropped.
            key = str(elem["label"]).lower()
            if key in categories:
                categories[key].append(elem)

        return categories

# Example usage: detect layout regions, then bucket them by label.
# NOTE(review): `image` is not defined in this snippet; it has to be loaded
# before this runs (presumably a PIL image of the page, to be confirmed).
analyzer = LayoutAnalyzer()
elements = analyzer.analyze_layout(image)
categorized = analyzer.categorize_elements(elements)

Information Extraction with GenAI

import openai
from typing import Dict, List
import json

class DocumentExtractor:
    """Extract, classify and summarise document text with OpenAI chat models."""

    # Character limits applied to the document text before it is sent.
    MAX_EXTRACT_CHARS = 3000
    MAX_CLASSIFY_CHARS = 2000
    MAX_SUMMARY_CHARS = 4000

    def __init__(
        self,
        api_key: str,
        extraction_model: str = "gpt-4-turbo",
        chat_model: str = "gpt-3.5-turbo",
    ):
        """Create the API client and record which models to use.

        Args:
            api_key: OpenAI API key.
            extraction_model: Model used by ``extract_entities``. It must
                support JSON mode, because that method sends
                ``response_format={"type": "json_object"}``. The previous
                hard-coded ``"gpt-4"`` is, to my knowledge, rejected for that
                parameter, hence the different default.
            chat_model: Model used for classification and summarisation.
        """
        self.client = openai.OpenAI(api_key=api_key)
        self.extraction_model = extraction_model
        self.chat_model = chat_model

    def extract_entities(self, text: str, doc_type: str = "general") -> Dict:
        """Ask the model for structured data and parse its JSON reply.

        Args:
            text: Document text; only the first ``MAX_EXTRACT_CHARS``
                characters are sent.
            doc_type: Document kind named in the system prompt.

        Returns:
            The parsed JSON object.

        Raises:
            ValueError: If the reply has no content or is not valid JSON
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        system_prompt = (
            f"Extract structured information from {doc_type} documents.\n"
            "Return valid JSON with extracted entities."
        )
        excerpt = text[:self.MAX_EXTRACT_CHARS]
        response = self.client.chat.completions.create(
            model=self.extraction_model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Extract key information from:\n\n{excerpt}"},
            ],
            temperature=0,
            response_format={"type": "json_object"},
        )

        content = response.choices[0].message.content
        if not content:
            # json.loads(None) would raise a bare TypeError; fail with a
            # message that says what actually went wrong.
            raise ValueError("Model returned no content to parse as JSON")
        return json.loads(content)

    def classify_document(self, text: str, categories: List[str]) -> str:
        """Classify ``text`` into one of ``categories``.

        Args:
            text: Document text; only the first ``MAX_CLASSIFY_CHARS``
                characters are sent.
            categories: Candidate category names.

        Returns:
            The matching entry of ``categories`` when the reply names one
            (ignoring case, surrounding quotes and a trailing period);
            otherwise the stripped reply unchanged.
        """
        response = self.client.chat.completions.create(
            model=self.chat_model,
            messages=[
                {
                    "role": "system",
                    "content": (
                        f"Classify document into: {', '.join(categories)}. "
                        "Respond with only the category name."
                    ),
                },
                {"role": "user", "content": text[:self.MAX_CLASSIFY_CHARS]},
            ],
            temperature=0,
        )

        answer = (response.choices[0].message.content or "").strip()
        return self._match_category(answer, categories)

    @staticmethod
    def _match_category(answer: str, categories: List[str]) -> str:
        """Map a free-text reply back onto the caller's category spelling."""
        cleaned = answer.strip().strip("\"'`. ").casefold()
        lookup = {category.strip().casefold(): category for category in categories}
        return lookup.get(cleaned, answer)

    def summarize_document(self, text: str, max_words: int = 100) -> str:
        """Summarise ``text`` in roughly ``max_words`` words.

        Args:
            text: Document text; only the first ``MAX_SUMMARY_CHARS``
                characters are sent.
            max_words: Word budget stated in the system prompt. It is an
                instruction to the model, not a limit enforced here.

        Returns:
            The content of the model's reply.
        """
        response = self.client.chat.completions.create(
            model=self.chat_model,
            messages=[
                {"role": "system", "content": f"Summarize in {max_words} words."},
                {"role": "user", "content": text[:self.MAX_SUMMARY_CHARS]},
            ],
            temperature=0.3,
        )

        return response.choices[0].message.content

# Example usage of DocumentExtractor.
# "your-api-key" is a placeholder string; substitute a real key before running.
extractor = DocumentExtractor(api_key="your-api-key")

# NOTE(review): invoice_text, document_text and long_report_text are not
# defined in this snippet; the caller must supply them (presumably plain-text
# document contents, to be confirmed).
invoice_data = extractor.extract_entities(invoice_text, doc_type="invoice")
category = extractor.classify_document(
    document_text, 
    categories=["invoice", "receipt", "contract", "report"]
)
summary = extractor.summarize_document(long_report_text)

Table Extraction

import pandas as pd
from typing import List

class TableExtractor:
    """Extract tables as DataFrames, via Camelot for PDFs or OCR otherwise."""

    def __init__(self):
        # Placeholder attribute; no method of this class reads it.
        self.detector = None

    def extract_tables_from_image(self, image) -> List[pd.DataFrame]:
        """Extract tables from ``image``.

        Camelot is used only when ``image`` is a path ending in ``.pdf`` and
        the ``camelot`` package can be imported; ``camelot.read_pdf`` is a PDF
        reader, so the original unconditional call raised on image inputs
        instead of falling back. Every other case goes through the OCR
        heuristic.

        Args:
            image: A file path or any image object accepted by
                ``pytesseract.image_to_data``.

        Returns:
            A list of DataFrames, possibly empty.
        """
        import os

        if isinstance(image, (str, os.PathLike)):
            path = os.fspath(image)
            if isinstance(path, str) and path.lower().endswith(".pdf"):
                try:
                    import camelot
                except ImportError:
                    # Camelot not installed: keep the original fallback.
                    pass
                else:
                    tables = camelot.read_pdf(path, pages="1")
                    return [table.df for table in tables]

        return self._extract_with_ocr(image)

    def _extract_with_ocr(self, image, row_tolerance: int = 10) -> List[pd.DataFrame]:
        """Rebuild a table from OCR word positions.

        Words are grouped into rows by their ``top`` coordinate. The first
        row becomes the column header and shorter rows are padded with empty
        strings. Each OCR word is one cell.

        Args:
            image: Input accepted by ``pytesseract.image_to_data``.
            row_tolerance: Maximum difference in ``top`` between consecutive
                words for them to stay on the same row.

        Returns:
            A one-element list holding the DataFrame, or ``[]`` when no
            non-blank words were recognised.
        """
        import pytesseract

        data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
        rows = self._group_words_into_rows(data["text"], data["top"], row_tolerance)
        if not rows:
            return []

        max_cols = max(len(row) for row in rows)
        padded_rows = [row + [""] * (max_cols - len(row)) for row in rows]
        header, *body = padded_rows
        return [pd.DataFrame(body, columns=header)]

    @staticmethod
    def _group_words_into_rows(words, tops, row_tolerance):
        """Split non-blank words into rows wherever ``top`` jumps by more than the tolerance."""
        rows = []
        current_row = []
        last_top = None

        for word, top in zip(words, tops):
            if not word.strip():
                continue
            # Compared against the previous word, not the row's first word.
            if last_top is not None and abs(top - last_top) > row_tolerance:
                if current_row:
                    rows.append(current_row)
                current_row = []
            current_row.append(word)
            last_top = top

        if current_row:
            rows.append(current_row)
        return rows

# Example usage: extract tables from an image file and print the first one.
# The guard is needed because the extractor can return an empty list.
extractor = TableExtractor()
tables = extractor.extract_tables_from_image("table_image.png")
if tables:
    print(tables[0].to_string())

Document Classification

from transformers import pipeline
from typing import List

class DocumentClassifier:
    """Label documents against caller-supplied categories via a zero-shot pipeline."""

    def __init__(self, model_name="facebook/bart-large-mnli"):
        """Build the ``zero-shot-classification`` pipeline for ``model_name``."""
        self.classifier = pipeline("zero-shot-classification", model=model_name)

    def classify(self, text: str, candidate_labels: List[str]) -> Dict:
        """Classify one text.

        Args:
            text: The text to classify.
            candidate_labels: Labels handed to the pipeline.

        Returns:
            A dict with ``"label"`` and ``"score"`` (the first entries of the
            pipeline's ``labels`` and ``scores`` lists) and ``"all_scores"``,
            which maps every returned label to its score.
        """
        prediction = self.classifier(text, candidate_labels)
        labels = prediction["labels"]
        scores = prediction["scores"]

        summary = {"label": labels[0], "score": scores[0]}
        summary["all_scores"] = {
            name: value for name, value in zip(labels, scores)
        }
        return summary

    def batch_classify(self, texts: List[str], candidate_labels: List[str]) -> List[Dict]:
        """Classify each text in turn and return the results in input order."""
        outcomes = []
        for document in texts:
            outcomes.append(self.classify(document, candidate_labels))
        return outcomes

# Example usage: classify a short invoice-like string against four candidate
# labels and print the top label with its score to two decimals.
classifier = DocumentClassifier()
result = classifier.classify(
    "Invoice #12345 dated 01/15/2024 for $5,000",
    candidate_labels=["invoice", "receipt", "contract", "letter"]
)
print(f"Classification: {result['label']} ({result['score']:.2f})")

Best Practices

Pre-process images for better OCR accuracy
Handle multi-page documents with proper pagination
Implement confidence scoring for extracted data
Use human-in-the-loop for validation
Version control document templates
Monitor extraction accuracy over time

Document Processing with AI

Document Processing with AI

Document Processing Challenges

OCR with Python

Layout Analysis

Information Extraction with GenAI

Table Extraction

Document Classification

Best Practices

Premium Content

Need Expert Generative AI Help?