Document Processing with AI
Document Processing Challenges
Documents come in various formats (PDFs, images, scanned documents) with complex layouts, tables, handwriting, and multiple languages. AI-powered document processing addresses these challenges through computer vision and natural language understanding.
OCR with Python
import pytesseract
from PIL import Image
import fitz # PyMuPDF
from typing import List, Dict
class DocumentOCR:
    """Run Tesseract OCR over image files and rasterised PDF pages.

    Uses ``pytesseract`` for recognition, Pillow (``Image``) for image
    handling and PyMuPDF (``fitz``) for rendering PDF pages to bitmaps.
    """

    def __init__(self, tesseract_path=None):
        """Optionally point pytesseract at a specific Tesseract executable.

        Args:
            tesseract_path: Path to the ``tesseract`` binary. When falsy,
                pytesseract's current configuration is left untouched.
        """
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

    @staticmethod
    def _words_from_data(data: Dict) -> List[Dict]:
        """Turn column-oriented ``image_to_data`` output into word records.

        Args:
            data: Mapping with parallel lists under the keys ``text``,
                ``conf``, ``left``, ``top``, ``width`` and ``height``.

        Returns:
            One dict per entry whose text is not empty/whitespace, with the
            keys ``text``, ``confidence`` and ``bbox`` (a
            ``(left, top, width, height)`` tuple).
        """
        columns = zip(
            data["text"],
            data["conf"],
            data["left"],
            data["top"],
            data["width"],
            data["height"],
        )
        return [
            {
                "text": word,
                "confidence": conf,
                "bbox": (left, top, width, height),
            }
            for word, conf, left, top, width, height in columns
            if word.strip()
        ]

    def extract_from_image(self, image_path: str) -> Dict:
        """OCR a single image file.

        Args:
            image_path: Path of the image to read.

        Returns:
            A dict with ``text`` (the full recognised text) and ``words``
            (per-word records, see ``_words_from_data``).
        """
        # The context manager releases the underlying file handle as soon as
        # both OCR passes are done.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
            data = pytesseract.image_to_data(
                image, output_type=pytesseract.Output.DICT
            )
        return {"text": text, "words": self._words_from_data(data)}

    def extract_from_pdf(self, pdf_path: str, dpi: int = 72) -> List[Dict]:
        """Render every page of a PDF to a bitmap and OCR it.

        Args:
            pdf_path: Path of the PDF file.
            dpi: Rendering resolution. The default of 72 corresponds to an
                identity zoom matrix, i.e. the same bitmap a bare
                ``get_pixmap()`` call produces; higher values usually give
                Tesseract more pixels per glyph to work with.

        Returns:
            One dict per page with ``page`` (1-based page number) and
            ``text`` (recognised text).

        Raises:
            ValueError: If ``dpi`` is not positive.
        """
        if dpi <= 0:
            raise ValueError(f"dpi must be positive, got {dpi!r}")

        zoom = dpi / 72
        pages = []
        # Closing the document releases the file handle even if OCR fails
        # part-way through.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                # alpha=False keeps the sample buffer at 3 bytes per pixel,
                # matching the "RGB" mode used below.
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
                # Image.frombytes takes the size as a single (width, height)
                # tuple, not as two positional arguments.
                image = Image.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )
                pages.append(
                    {
                        "page": page_num,
                        "text": pytesseract.image_to_string(image),
                    }
                )
        return pages
# Example usage: OCR one image with the default Tesseract configuration and
# report how many word records (entries with non-blank text) were returned.
ocr = DocumentOCR()
result = ocr.extract_from_image("document.png")
print(f"Extracted {len(result['words'])} words")
Layout Analysis
from transformers import AutoProcessor, AutoModelForObjectDetection
import torch
class LayoutAnalyzer:
    """Detect layout regions in a page image with a Hugging Face model."""

    # Buckets produced by ``categorize_elements``; labels outside this set
    # are ignored.
    _CATEGORIES = ("text", "table", "figure", "title", "list")

    def __init__(self, model_name: str = "microsoft/layoutlmv3-base"):
        """Load the processor and detection model for ``model_name``.

        Args:
            model_name: Hugging Face checkpoint used for both the processor
                and the model.
        """
        # NOTE(review): the default checkpoint is a LayoutLMv3 base model;
        # confirm it can be loaded through AutoModelForObjectDetection,
        # otherwise pass a checkpoint that ships an object-detection head.
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModelForObjectDetection.from_pretrained(model_name)

    def analyze_layout(self, image, threshold: float = 0.5):
        """Run detection on ``image`` and return the detected elements.

        Args:
            image: Image passed to the processor as ``images=``.
            threshold: Minimum score forwarded to the post-processing step.

        Returns:
            A list of dicts with ``label`` (looked up in the model's
            ``id2label`` mapping), ``score`` (float) and ``bbox`` (list of
            numbers).
        """
        inputs = self.processor(images=image, return_tensors="pt")
        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = self.model(**inputs)

        # NOTE(review): no ``target_sizes`` is passed, so the boxes stay in
        # whatever coordinate system the processor emits by default -
        # confirm this is what downstream consumers expect.
        detections = self.processor.post_process_object_detection(
            outputs, threshold=threshold
        )[0]

        id2label = self.model.config.id2label
        return [
            {
                "label": id2label[label.item()],
                "score": score.item(),
                "bbox": box.tolist(),
            }
            for score, label, box in zip(
                detections["scores"],
                detections["labels"],
                detections["boxes"],
            )
        ]

    def categorize_elements(self, elements):
        """Group detected elements by label.

        Labels are compared case-insensitively and with surrounding
        whitespace removed, so ``"Table"`` lands in the ``"table"`` bucket.
        Elements whose label matches no bucket are left out.

        Args:
            elements: Iterable of dicts that each carry a ``label`` key.

        Returns:
            A dict with the keys ``text``, ``table``, ``figure``, ``title``
            and ``list``, each mapping to a list of the matching elements
            in their original order.
        """
        categories = {name: [] for name in self._CATEGORIES}
        for elem in elements:
            bucket = categories.get(str(elem["label"]).strip().lower())
            if bucket is not None:
                bucket.append(elem)
        return categories
# Example usage: detect layout elements, then bucket them by label.
# NOTE(review): `image` is not defined anywhere in the visible code; the
# caller presumably loads a page image first - confirm before running.
analyzer = LayoutAnalyzer()
elements = analyzer.analyze_layout(image)
categorized = analyzer.categorize_elements(elements)
Information Extraction with GenAI
import openai
from typing import Dict, List
import json
class DocumentExtractor:
    """Extract, classify and summarise document text via OpenAI chat models."""

    # Inputs are truncated (by characters, not tokens) before being sent,
    # which bounds the prompt size for each task.
    MAX_EXTRACT_CHARS = 3000
    MAX_CLASSIFY_CHARS = 2000
    MAX_SUMMARY_CHARS = 4000

    def __init__(
        self,
        api_key: str,
        extraction_model: str = "gpt-4-turbo",
        chat_model: str = "gpt-3.5-turbo",
    ):
        """Create the API client.

        Args:
            api_key: OpenAI API key.
            extraction_model: Model used by ``extract_entities``. It must
                support JSON mode (``response_format``); the plain
                ``gpt-4`` alias is documented as not supporting it, hence
                the different default.
            chat_model: Model used for classification and summarisation.
        """
        self.client = openai.OpenAI(api_key=api_key)
        self.extraction_model = extraction_model
        self.chat_model = chat_model

    def _complete(
        self,
        model: str,
        system_prompt: str,
        user_content: str,
        temperature: float,
        **extra,
    ) -> str:
        """Send one system+user exchange and return the reply text.

        Returns an empty string when the API reports no message content.
        """
        response = self.client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ],
            temperature=temperature,
            **extra,
        )
        return response.choices[0].message.content or ""

    def extract_entities(self, text: str, doc_type: str = "general") -> Dict:
        """Ask the model for structured entities and parse its JSON reply.

        Args:
            text: Document text; only the first ``MAX_EXTRACT_CHARS``
                characters are sent.
            doc_type: Document kind mentioned in the system prompt.

        Returns:
            The parsed JSON object.

        Raises:
            json.JSONDecodeError: If the reply is not valid JSON (including
                an empty reply).
        """
        system_prompt = (
            f"Extract structured information from {doc_type} documents.\n"
            "Return valid JSON with extracted entities."
        )
        user_content = (
            f"Extract key information from:\n\n{text[:self.MAX_EXTRACT_CHARS]}"
        )
        content = self._complete(
            self.extraction_model,
            system_prompt,
            user_content,
            temperature=0,
            response_format={"type": "json_object"},
        )
        return json.loads(content)

    def classify_document(self, text: str, categories: List[str]) -> str:
        """Classify ``text`` into one of ``categories``.

        Args:
            text: Document text; only the first ``MAX_CLASSIFY_CHARS``
                characters are sent.
            categories: Candidate category names.

        Returns:
            The matching entry of ``categories`` when the model's answer
            equals it ignoring case, surrounding whitespace, quotes and
            trailing periods; otherwise the stripped raw answer.
        """
        answer = self._complete(
            self.chat_model,
            f"Classify document into: {', '.join(categories)}",
            text[:self.MAX_CLASSIFY_CHARS],
            temperature=0,
        ).strip()

        # Models often answer "Invoice." or '"invoice"'; map such variants
        # back to the caller's own spelling.
        normalized = answer.strip(" .\"'").casefold()
        for category in categories:
            if category.casefold() == normalized:
                return category
        return answer

    def summarize_document(self, text: str, max_words: int = 100) -> str:
        """Summarise ``text`` in roughly ``max_words`` words.

        Args:
            text: Document text; only the first ``MAX_SUMMARY_CHARS``
                characters are sent.
            max_words: Word budget stated in the system prompt.

        Returns:
            The model's reply text (empty string if there was none).
        """
        return self._complete(
            self.chat_model,
            f"Summarize in {max_words} words.",
            text[:self.MAX_SUMMARY_CHARS],
            temperature=0.3,
        )
# Example usage of DocumentExtractor.
# "your-api-key" is a placeholder; substitute a real key before running.
# NOTE(review): `invoice_text`, `document_text` and `long_report_text` are
# not defined in the visible code - the caller must supply them.
extractor = DocumentExtractor(api_key="your-api-key")
invoice_data = extractor.extract_entities(invoice_text, doc_type="invoice")
category = extractor.classify_document(
document_text,
categories=["invoice", "receipt", "contract", "report"]
)
summary = extractor.summarize_document(long_report_text)
Table Extraction
import pandas as pd
from typing import List
class TableExtractor:
    """Extract tables from PDFs (via camelot) or images (via OCR)."""

    def __init__(self):
        # Placeholder attribute; nothing in this class reads it.
        self.detector = None

    def extract_tables_from_image(self, image) -> List[pd.DataFrame]:
        """Extract tables from ``image``.

        A path ending in ``.pdf`` is handed to camelot (page 1 only) when
        camelot is installed. Everything else - image paths, in-memory
        images, or PDFs when camelot is missing - goes through the OCR
        fallback. Sending non-PDF input to ``camelot.read_pdf`` would fail,
        and the original ``except ImportError`` did not cover that case.

        Args:
            image: File path or an image object accepted by pytesseract.

        Returns:
            A list of DataFrames, one per detected table.
        """
        import os

        is_pdf_path = isinstance(image, (str, os.PathLike)) and str(
            image
        ).lower().endswith(".pdf")
        if is_pdf_path:
            try:
                import camelot
            except ImportError:
                # camelot is optional; fall through to OCR as before.
                pass
            else:
                tables = camelot.read_pdf(str(image), pages="1")
                return [table.df for table in tables]
        return self._extract_with_ocr(image)

    @staticmethod
    def _group_rows(words, tops, tolerance) -> List[List[str]]:
        """Group OCR words into rows by their vertical position.

        A new row starts whenever a word's ``top`` differs from the
        previous non-blank word's ``top`` by more than ``tolerance``.
        Blank words are skipped.
        """
        rows = []
        current_row = []
        last_top = None
        for word, top in zip(words, tops):
            if not word.strip():
                continue
            if last_top is not None and abs(top - last_top) > tolerance:
                if current_row:
                    rows.append(current_row)
                current_row = []
            current_row.append(word)
            last_top = top
        if current_row:
            rows.append(current_row)
        return rows

    @staticmethod
    def _rows_to_frames(rows) -> List[pd.DataFrame]:
        """Build a single-table result from rows; the first row is the header.

        Shorter rows are right-padded with empty strings to the widest row.
        Returns an empty list when there are no rows.
        """
        if not rows:
            return []
        max_cols = max(len(row) for row in rows)
        padded_rows = [row + [""] * (max_cols - len(row)) for row in rows]
        return [pd.DataFrame(padded_rows[1:], columns=padded_rows[0])]

    def _extract_with_ocr(self, image, row_tolerance: int = 10) -> List[pd.DataFrame]:
        """Approximate a table from OCR word positions.

        Args:
            image: File path or image object accepted by pytesseract.
            row_tolerance: Maximum vertical distance, in the units of the
                OCR ``top`` values, between consecutive words of one row.

        Returns:
            A list with one DataFrame, or an empty list if no words were
            recognised.
        """
        import os

        import pytesseract

        # Convert path-like objects to a plain string.
        # NOTE(review): presumably pytesseract only treats str as a file
        # path - confirm against its documentation.
        if isinstance(image, os.PathLike):
            image = os.fspath(image)

        data = pytesseract.image_to_data(
            image, output_type=pytesseract.Output.DICT
        )
        rows = self._group_rows(data["text"], data["top"], row_tolerance)
        return self._rows_to_frames(rows)
# Example usage: extract tables from an image file and print the first one.
# Note that this rebinds `extractor`, which an earlier snippet bound to a
# DocumentExtractor instance.
extractor = TableExtractor()
tables = extractor.extract_tables_from_image("table_image.png")
# Only print when at least one table was found.
if tables:
    print(tables[0].to_string())
Document Classification
from transformers import pipeline
from typing import List
class DocumentClassifier:
    """Zero-shot document classifier built on a Hugging Face pipeline."""

    def __init__(self, model_name="facebook/bart-large-mnli"):
        """Create the zero-shot classification pipeline for ``model_name``."""
        self.classifier = pipeline("zero-shot-classification", model=model_name)

    @staticmethod
    def _format_result(result) -> Dict:
        """Reduce one raw pipeline result to label, score and all scores.

        The first entry of ``labels``/``scores`` is reported as the winner.
        NOTE(review): this relies on the pipeline returning labels sorted
        by descending score - confirm against the pipeline documentation.
        """
        labels = result["labels"]
        scores = result["scores"]
        return {
            "label": labels[0],
            "score": scores[0],
            "all_scores": dict(zip(labels, scores)),
        }

    def classify(self, text: str, candidate_labels: List[str]) -> Dict:
        """Classify one text against ``candidate_labels``.

        Returns:
            A dict with ``label`` (top label), ``score`` (its score) and
            ``all_scores`` (label -> score for every candidate).
        """
        return self._format_result(self.classifier(text, candidate_labels))

    def batch_classify(self, texts: List[str], candidate_labels: List[str]) -> List[Dict]:
        """Classify several texts in one pipeline call.

        Args:
            texts: Texts to classify.
            candidate_labels: Labels every text is scored against.

        Returns:
            One result dict per input text, in input order (same shape as
            ``classify``). An empty input yields an empty list without
            invoking the pipeline.
        """
        batch = list(texts)
        if not batch:
            return []
        # Handing the whole list to the pipeline lets it do its own
        # batching instead of running one forward pass per Python call.
        raw = self.classifier(batch, candidate_labels)
        # Defensive: normalise a lone dict to a one-element list.
        if isinstance(raw, dict):
            raw = [raw]
        return [self._format_result(item) for item in raw]
# Example usage: zero-shot classify a short invoice-like string and print the
# top label with its score to two decimals.
# Note that this rebinds `result`, which an earlier snippet bound to the OCR
# output.
classifier = DocumentClassifier()
result = classifier.classify(
"Invoice #12345 dated 01/15/2024 for $5,000",
candidate_labels=["invoice", "receipt", "contract", "letter"]
)
print(f"Classification: {result['label']} ({result['score']:.2f})")
Best Practices
- Pre-process images for better OCR accuracy
- Handle multi-page documents with proper pagination
- Implement confidence scoring for extracted data
- Use human-in-the-loop for validation
- Version control document templates
- Monitor extraction accuracy over time