🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services · ℹ️ About · ✉️ Contact · View Pricing Plans from $10

Document Processing with AI

🟢 Free Lesson

Advertisement

Document Processing with AI

Document AI Pipeline

Stages: Input (Documents), OCR (Text Extract), Layout (Analysis), NLU (Understanding), Extract (Key Info), Classify (Document), Store

OCR Technologies
  • Tesseract (Open Source)
  • Google Cloud Vision
  • AWS Textract
  • Azure Computer Vision
  • TrOCR (Transformer-based)

Document Understanding
  • LayoutLM (Microsoft)
  • DocFormer
  • Donut (OCR-free)
  • Pix2Struct
  • GPT-4 Vision

Document Processing Challenges

Documents come in various formats (PDFs, images, scanned documents) with complex layouts, tables, handwriting, and multiple languages. AI-powered document processing addresses these challenges through computer vision and natural language understanding.

OCR with Python

import pytesseract
from PIL import Image
import fitz  # PyMuPDF
from typing import List, Dict

class DocumentOCR:
    """Thin wrapper around Tesseract OCR for image files and PDF documents.

    Relies on the module-level ``pytesseract``, ``PIL.Image`` and ``fitz``
    (PyMuPDF) imports.
    """

    def __init__(self, tesseract_path=None):
        """Optionally point pytesseract at a specific Tesseract executable.

        Args:
            tesseract_path: Path to the ``tesseract`` binary. When falsy,
                pytesseract's current setting is left untouched.
        """
        if tesseract_path:
            # This assigns a module attribute, so it affects every user of
            # pytesseract in the process, not just this instance.
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

    @staticmethod
    def _words_from_data(data: Dict) -> List[Dict]:
        """Turn pytesseract's column-oriented dict into per-word records.

        Entries whose text is empty or whitespace-only are skipped.
        """
        return [
            {
                "text": word,
                "confidence": conf,
                "bbox": (left, top, width, height),
            }
            for word, conf, left, top, width, height in zip(
                data["text"],
                data["conf"],
                data["left"],
                data["top"],
                data["width"],
                data["height"],
            )
            if word.strip()
        ]

    def extract_from_image(self, image_path: str) -> Dict:
        """Run OCR on a single image file.

        Args:
            image_path: Path of the image to read.

        Returns:
            A dict with ``"text"`` (the full recognised text) and ``"words"``
            (a list of dicts with ``"text"``, ``"confidence"`` and a
            ``"bbox"`` tuple of ``(left, top, width, height)``).
        """
        # The context manager releases the file handle once OCR is done.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
            data = pytesseract.image_to_data(
                image, output_type=pytesseract.Output.DICT
            )

        return {"text": text, "words": self._words_from_data(data)}

    def extract_from_pdf(self, pdf_path: str, zoom: float = 1.0) -> List[Dict]:
        """Rasterise every page of a PDF and OCR it.

        Args:
            pdf_path: Path of the PDF to read.
            zoom: Scale factor applied in both directions when rendering a
                page. The default of 1.0 keeps PyMuPDF's native resolution;
                larger values render more pixels for Tesseract to work with.

        Returns:
            One dict per page with ``"page"`` (1-based) and ``"text"``.
        """
        pages = []
        matrix = fitz.Matrix(zoom, zoom)

        # Closing the document releases the underlying file.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                pix = page.get_pixmap(matrix=matrix)
                # Pillow expects the size as a single (width, height) tuple.
                image = Image.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )
                pages.append({
                    "page": page_num,
                    "text": pytesseract.image_to_string(image),
                })

        return pages

# Example usage: OCR one image file and report how many word entries came back.
ocr = DocumentOCR()
result = ocr.extract_from_image("document.png")
print(f"Extracted {len(result['words'])} words")

Layout Analysis

from transformers import AutoProcessor, AutoModelForObjectDetection
import torch

class LayoutAnalyzer:
    """Detect layout regions in a page image with a Hugging Face detector.

    Relies on the module-level ``AutoProcessor``,
    ``AutoModelForObjectDetection`` and ``torch`` imports.
    """

    # Buckets returned by categorize_elements, keyed by lower-case label.
    CATEGORY_NAMES = ("text", "table", "figure", "title", "list")

    def __init__(self, model_name="microsoft/layoutlmv3-base"):
        """Load the processor and detection model for ``model_name``.

        Args:
            model_name: Hugging Face checkpoint used for both the processor
                and the model.
        """
        # NOTE(review): the default checkpoint is a LayoutLMv3 encoder and
        # does not appear to be an object-detection checkpoint, so loading it
        # through AutoModelForObjectDetection is expected to fail. Pass a
        # detection checkpoint trained on a layout dataset — confirm.
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModelForObjectDetection.from_pretrained(model_name)

    def analyze_layout(self, image, threshold=0.5):
        """Run the detector on ``image`` and return the detected regions.

        Args:
            image: Image passed to the processor as ``images=``.
            threshold: Score threshold forwarded to the processor's
                ``post_process_object_detection``.

        Returns:
            A list of dicts with ``"label"`` (looked up in the model's
            ``id2label``), ``"score"`` (float) and ``"bbox"`` (list).
        """
        inputs = self.processor(images=image, return_tensors="pt")

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = self.model(**inputs)

        # NOTE(review): no target_sizes is passed, so boxes are presumably
        # not rescaled to the input image's pixel size — confirm.
        results = self.processor.post_process_object_detection(
            outputs, threshold=threshold
        )[0]

        id2label = self.model.config.id2label
        return [
            {
                "label": id2label[label.item()],
                "score": score.item(),
                "bbox": box.tolist(),
            }
            for score, label, box in zip(
                results["scores"], results["labels"], results["boxes"]
            )
        ]

    def categorize_elements(self, elements):
        """Group detected elements into the fixed set of layout categories.

        Labels are compared case-insensitively and with surrounding
        whitespace removed, so ``"Table"`` lands in the ``"table"`` bucket.
        Elements whose label matches no category are omitted.

        Args:
            elements: Iterable of dicts that each have a ``"label"`` key.

        Returns:
            A dict mapping each name in ``CATEGORY_NAMES`` to the list of
            matching elements (possibly empty).
        """
        categories = {name: [] for name in self.CATEGORY_NAMES}

        for elem in elements:
            bucket = categories.get(str(elem["label"]).strip().lower())
            if bucket is not None:
                bucket.append(elem)

        return categories

# Example usage: detect layout regions, then bucket them by label.
analyzer = LayoutAnalyzer()
# NOTE(review): `image` is not defined in this snippet — presumably a page
# image loaded by the caller; confirm before running.
elements = analyzer.analyze_layout(image)
categorized = analyzer.categorize_elements(elements)

Information Extraction with GenAI

import openai
from typing import Dict, List
import json

class DocumentExtractor:
    """LLM-backed entity extraction, classification and summarisation.

    Relies on the module-level ``openai`` and ``json`` imports.
    """

    # JSON mode (response_format={"type": "json_object"}) is rejected by the
    # base "gpt-4" model, so extraction uses a model that supports it.
    EXTRACTION_MODEL = "gpt-4-turbo"
    # Model used for the plain-text classification and summary calls.
    CHAT_MODEL = "gpt-3.5-turbo"

    def __init__(self, api_key: str):
        """Create the OpenAI client used by every method.

        Args:
            api_key: OpenAI API key.
        """
        self.client = openai.OpenAI(api_key=api_key)

    @staticmethod
    def _message_text(response) -> str:
        """Return the first choice's message text, or "" when it is None."""
        return response.choices[0].message.content or ""

    def extract_entities(self, text: str, doc_type: str = "general") -> Dict:
        """Ask the model for structured data and parse it as JSON.

        Args:
            text: Document text; only the first 3000 characters are sent.
            doc_type: Document kind named in the system prompt.

        Returns:
            The parsed JSON object returned by the model.

        Raises:
            ValueError: If the reply is empty or is not valid JSON
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        response = self.client.chat.completions.create(
            model=self.EXTRACTION_MODEL,
            messages=[
                {"role": "system", "content": f"""Extract structured information from {doc_type} documents.
Return valid JSON with extracted entities."""},
                {"role": "user", "content": f"Extract key information from:\n\n{text[:3000]}"}
            ],
            temperature=0,
            response_format={"type": "json_object"}
        )

        content = self._message_text(response)
        if not content.strip():
            # Fail with a clear message instead of a TypeError from json.loads.
            raise ValueError("Model returned an empty response; expected a JSON object")
        return json.loads(content)

    def classify_document(self, text: str, categories: List[str]) -> str:
        """Ask the model to pick a category for the document.

        Args:
            text: Document text; only the first 2000 characters are sent.
            categories: Candidate category names listed in the system prompt.

        Returns:
            The model's reply with surrounding whitespace removed ("" when
            the model returns no content). The reply is not checked against
            ``categories``.
        """
        response = self.client.chat.completions.create(
            model=self.CHAT_MODEL,
            messages=[
                {"role": "system", "content": f"Classify document into: {', '.join(categories)}"},
                {"role": "user", "content": text[:2000]}
            ],
            temperature=0
        )

        return self._message_text(response).strip()

    def summarize_document(self, text: str, max_words: int = 100) -> str:
        """Ask the model for a summary of roughly ``max_words`` words.

        Args:
            text: Document text; only the first 4000 characters are sent.
            max_words: Word budget stated in the system prompt.

        Returns:
            The model's reply ("" when the model returns no content).
        """
        response = self.client.chat.completions.create(
            model=self.CHAT_MODEL,
            messages=[
                {"role": "system", "content": f"Summarize in {max_words} words."},
                {"role": "user", "content": text[:4000]}
            ],
            temperature=0.3
        )

        return self._message_text(response)

# Example usage of the three DocumentExtractor calls.
# NOTE(review): "your-api-key" looks like a placeholder — supply a real key
# before running.
extractor = DocumentExtractor(api_key="your-api-key")

# NOTE(review): invoice_text, document_text and long_report_text are not
# defined in this snippet; the caller must provide them.
invoice_data = extractor.extract_entities(invoice_text, doc_type="invoice")
category = extractor.classify_document(
    document_text, 
    categories=["invoice", "receipt", "contract", "report"]
)
summary = extractor.summarize_document(long_report_text)

Table Extraction

import pandas as pd
from typing import List

class TableExtractor:
    """Extract tables from PDF files (via camelot) or images (via OCR)."""

    def __init__(self):
        # Not used by any method in this class; kept for compatibility.
        self.detector = None

    @staticmethod
    def _is_pdf_path(source) -> bool:
        """Return True when ``source`` is a filesystem path ending in .pdf."""
        import os

        if not isinstance(source, (str, bytes, os.PathLike)):
            return False
        return os.fsdecode(source).lower().endswith(".pdf")

    @staticmethod
    def _group_rows(data, row_tolerance: int = 10) -> List[List[str]]:
        """Group OCR words into rows by their ``top`` coordinate.

        A new row starts whenever a word's ``top`` differs from the previous
        non-blank word's ``top`` by more than ``row_tolerance``. Blank words
        are skipped.
        """
        rows = []
        current_row = []
        last_top = None

        for word, top in zip(data["text"], data["top"]):
            if not word.strip():
                continue

            if last_top is not None and abs(top - last_top) > row_tolerance:
                if current_row:
                    rows.append(current_row)
                current_row = []

            current_row.append(word)
            last_top = top

        if current_row:
            rows.append(current_row)

        return rows

    @staticmethod
    def _rows_to_frames(rows: List[List[str]]) -> List[pd.DataFrame]:
        """Build one DataFrame from ``rows``, using the first row as header.

        Short rows are right-padded with "" to the widest row's length.
        Returns an empty list when there are no rows.
        """
        if not rows:
            return []

        max_cols = max(len(row) for row in rows)
        padded_rows = [row + [""] * (max_cols - len(row)) for row in rows]
        return [pd.DataFrame(padded_rows[1:], columns=padded_rows[0])]

    def extract_tables_from_image(self, image) -> List[pd.DataFrame]:
        """Extract tables from ``image``.

        camelot is only tried when ``image`` is a path ending in ``.pdf``,
        since ``camelot.read_pdf`` is a PDF reader. Every other input, and
        the case where camelot cannot be imported, goes through the OCR
        fallback.

        Args:
            image: A PDF path, or any input accepted by
                ``pytesseract.image_to_data``.

        Returns:
            A list of DataFrames, one per table found (possibly empty).
        """
        if self._is_pdf_path(image):
            try:
                import camelot
                tables = camelot.read_pdf(image, pages="1")
                return [table.df for table in tables]
            except ImportError:
                # camelot is unavailable: fall through to the OCR path.
                return self._extract_with_ocr(image)

        return self._extract_with_ocr(image)

    def _extract_with_ocr(self, image, row_tolerance: int = 10) -> List[pd.DataFrame]:
        """Rebuild a table from Tesseract word positions.

        Args:
            image: Input passed to ``pytesseract.image_to_data``.
            row_tolerance: Largest difference in ``top`` between consecutive
                words that still counts as the same row.

        Returns:
            A single-element list with the reconstructed DataFrame, or an
            empty list when no words were recognised.
        """
        import pytesseract

        data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
        return self._rows_to_frames(self._group_rows(data, row_tolerance))

# Example usage: pull tables out of an image file and print the first one.
# NOTE: this rebinds `extractor`, which the earlier snippet bound to a
# DocumentExtractor.
extractor = TableExtractor()
tables = extractor.extract_tables_from_image("table_image.png")
if tables:
    print(tables[0].to_string())

Document Classification

from transformers import pipeline
from typing import List

class DocumentClassifier:
    """Zero-shot document classifier built on a Hugging Face pipeline."""

    def __init__(self, model_name="facebook/bart-large-mnli"):
        """Create the ``zero-shot-classification`` pipeline for ``model_name``."""
        self.classifier = pipeline("zero-shot-classification", model=model_name)

    def classify(self, text: str, candidate_labels: List[str]) -> Dict:
        """Classify one text against ``candidate_labels``.

        Returns:
            A dict with ``"label"`` and ``"score"`` taken from the first
            entry of the pipeline's ``labels`` / ``scores`` lists, plus
            ``"all_scores"`` mapping every returned label to its score.
        """
        prediction = self.classifier(text, candidate_labels)
        ranked_labels = prediction["labels"]
        ranked_scores = prediction["scores"]

        score_by_label = {}
        for name, value in zip(ranked_labels, ranked_scores):
            score_by_label[name] = value

        return {
            "label": ranked_labels[0],
            "score": ranked_scores[0],
            "all_scores": score_by_label,
        }

    def batch_classify(self, texts: List[str], candidate_labels: List[str]) -> List[Dict]:
        """Classify each text in turn; results keep the input order."""
        outcomes = []
        for item in texts:
            outcomes.append(self.classify(item, candidate_labels))
        return outcomes

# Example usage: zero-shot classify one short string and print the top label.
# NOTE: this rebinds `result`, which the OCR snippet also assigns.
classifier = DocumentClassifier()
result = classifier.classify(
    "Invoice #12345 dated 01/15/2024 for $5,000",
    candidate_labels=["invoice", "receipt", "contract", "letter"]
)
print(f"Classification: {result['label']} ({result['score']:.2f})")

Best Practices

  • Pre-process images for better OCR accuracy
  • Handle multi-page documents with proper pagination
  • Implement confidence scoring for extracted data
  • Use human-in-the-loop for validation
  • Version control document templates
  • Monitor extraction accuracy over time
⭐

Premium Content

Document Processing with AI

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯End-to-end Projects
πŸ’ΌInterview Prep
πŸ“œCertificates
🀝Community Access

Already a member? Log in

Need Expert Generative AI Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement