Summarization
Summarization condenses long documents into concise, coherent summaries using extractive methods (selecting key sentences) or abstractive methods (generating new text).
Extractive Summarization
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
class ExtractiveSummarizer:
    """Extractive summarizers that return a subset of the input's sentences.

    The text is split on ``'. '``, each fragment is scored by one of three
    methods (TextRank, TF-IDF weight, LSA), and the selected fragments are
    re-joined with ``'. '`` in the order they appear in the input.

    All three methods share the same edge-case contract:

    * ``num_sentences <= 0`` yields an empty string;
    * a text with at most ``num_sentences`` fragments is returned unchanged.
    """

    def __init__(self):
        # The vectorizer is re-fitted on every call (fit_transform), so no
        # vocabulary is carried over from one text to the next.
        self.vectorizer = TfidfVectorizer()

    @staticmethod
    def _split_sentences(text: str) -> list:
        """Split *text* into sentence fragments on the ``'. '`` delimiter."""
        return text.split('. ')

    @staticmethod
    def _join_top(sentences: list, scores, num_sentences: int) -> str:
        """Join the *num_sentences* best-scoring sentences in document order.

        Callers must pass ``num_sentences >= 1``: a ``[-0:]`` slice would
        select every sentence instead of none.
        """
        best = np.argsort(np.asarray(scores), kind="stable")[-num_sentences:]
        # Sorting the winning indices restores the original reading order.
        return '. '.join(sentences[i] for i in sorted(best))

    def text_rank_summarize(self, text: str, num_sentences: int = 3) -> str:
        """Summarize *text* with TextRank.

        Builds a sentence graph weighted by TF-IDF cosine similarity and
        scores each sentence with PageRank.

        Args:
            text: Document to summarize.
            num_sentences: Maximum number of sentences to keep.

        Returns:
            The selected sentences joined with ``'. '`` in document order.
        """
        if num_sentences <= 0:
            return ''
        sentences = self._split_sentences(text)
        if len(sentences) <= num_sentences:
            return text

        tfidf_matrix = self.vectorizer.fit_transform(sentences)
        similarity_matrix = cosine_similarity(tfidf_matrix)
        graph = nx.from_numpy_array(similarity_matrix)
        ranks = nx.pagerank(graph)  # maps node index -> score
        scores = [ranks[i] for i in range(len(sentences))]
        return self._join_top(sentences, scores, num_sentences)

    def tfidf_summarize(self, text: str, num_sentences: int = 3) -> str:
        """Summarize *text* by keeping the sentences with the largest TF-IDF mass.

        A sentence's score is the sum of the TF-IDF weights of its terms.

        Args:
            text: Document to summarize.
            num_sentences: Maximum number of sentences to keep.

        Returns:
            The selected sentences joined with ``'. '`` in document order.
        """
        if num_sentences <= 0:
            return ''
        sentences = self._split_sentences(text)
        if len(sentences) <= num_sentences:
            return text

        tfidf_matrix = self.vectorizer.fit_transform(sentences)
        # np.asarray(...).ravel() works whether the sparse sum comes back as
        # an np.matrix or as a plain ndarray.
        scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()
        return self._join_top(sentences, scores, num_sentences)

    def lsa_summarize(self, text: str, num_sentences: int = 3) -> str:
        """Summarize *text* with latent semantic analysis.

        Projects the TF-IDF matrix onto up to three SVD components and scores
        each sentence by the sum of its absolute component loadings.

        Args:
            text: Document to summarize.
            num_sentences: Maximum number of sentences to keep.

        Returns:
            The selected sentences joined with ``'. '`` in document order.
        """
        if num_sentences <= 0:
            return ''
        sentences = self._split_sentences(text)
        if len(sentences) <= num_sentences:
            return text

        from sklearn.decomposition import TruncatedSVD

        tfidf_matrix = self.vectorizer.fit_transform(sentences)
        # TruncatedSVD cannot extract more components than there are
        # features, so cap by the vocabulary size as well.
        n_components = min(3, len(sentences), tfidf_matrix.shape[1])
        svd = TruncatedSVD(n_components=n_components)
        reduced = svd.fit_transform(tfidf_matrix)
        importance = np.abs(reduced).sum(axis=1)
        return self._join_top(sentences, importance, num_sentences)
# Usage: build the extractive summarizer and request a TextRank summary of at
# most five sentences.
# NOTE: `long_article` is not defined in this snippet; bind it to the text you
# want to summarize before running these lines.
summarizer = ExtractiveSummarizer()
summary = summarizer.text_rank_summarize(long_article, num_sentences=5)
Abstractive Summarization with Transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
class AbstractiveSummarizer:
    """Abstractive summarizer built on a Hugging Face seq2seq checkpoint.

    Args:
        model_name: Checkpoint passed to ``from_pretrained`` for both the
            tokenizer and the model.
        max_input_length: Inputs are truncated to this many tokens before
            generation.
    """

    def __init__(self, model_name: str = "facebook/bart-large-cnn",
                 max_input_length: int = 1024):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.max_input_length = max_input_length

    def _encode(self, texts, padding: bool = False):
        """Tokenize one text or a list of texts into truncated PyTorch tensors."""
        return self.tokenizer(
            texts,
            return_tensors="pt",
            truncation=True,
            padding=padding,
            max_length=self.max_input_length,
        )

    def _generate(self, inputs, **generation_kwargs):
        """Run ``model.generate`` on tokenized *inputs*.

        The attention mask is forwarded together with the input ids so that
        padding tokens in a batch are ignored by the encoder.
        """
        return self.model.generate(
            inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            **generation_kwargs,
        )

    def summarize(self, text: str, max_length: int = 150,
                  min_length: int = 50) -> str:
        """Summarize a single text with beam search (4 beams).

        Args:
            text: Document to summarize.
            max_length: ``max_length`` passed to ``generate``.
            min_length: ``min_length`` passed to ``generate``.

        Returns:
            The decoded summary with special tokens removed.
        """
        summary_ids = self._generate(
            self._encode(text),
            max_length=max_length,
            min_length=min_length,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True,
        )
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def summarize_with_options(self, text: str, options: dict) -> dict:
        """Summarize *text* with caller-supplied generation options.

        Args:
            text: Document to summarize.
            options: Keyword arguments for ``generate`` that override the
                defaults below. ``None`` is treated as "no overrides".

        Returns:
            A dict with the decoded ``"summary"`` and the merged ``"options"``
            that were actually passed to ``generate``.
        """
        merged_options = {
            "max_length": 150,
            "min_length": 50,
            "num_beams": 4,
            "length_penalty": 2.0,
            "no_repeat_ngram_size": 3,
        }
        merged_options.update(options or {})

        summary_ids = self._generate(self._encode(text), **merged_options)
        return {
            "summary": self.tokenizer.decode(summary_ids[0], skip_special_tokens=True),
            "options": merged_options,
        }

    def batch_summarize(self, texts: list, batch_size: int = 8,
                        max_length: int = 150, num_beams: int = 4) -> list:
        """Summarize many texts, *batch_size* at a time.

        Args:
            texts: Documents to summarize.
            batch_size: Number of texts tokenized and generated together.
            max_length: ``max_length`` passed to ``generate``.
            num_beams: ``num_beams`` passed to ``generate``.

        Returns:
            One decoded summary per input text, in input order.

        Raises:
            ValueError: If *batch_size* is not positive.
        """
        if batch_size <= 0:
            # A negative step would make range() empty and silently drop
            # every text; zero would fail with a less helpful message.
            raise ValueError("batch_size must be a positive integer")

        summaries = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            # Texts in a batch are padded to a common length, hence the
            # attention mask forwarded by _generate matters here.
            summary_ids = self._generate(
                self._encode(batch, padding=True),
                max_length=max_length,
                num_beams=num_beams,
            )
            summaries.extend(
                self.tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
            )
        return summaries
# Usage: constructing the summarizer loads the default
# "facebook/bart-large-cnn" tokenizer and model; the call then summarizes one
# text with max_length=100 passed through to generation.
# NOTE: `long_article` is not defined in this snippet; bind it to the text you
# want to summarize before running these lines.
summarizer = AbstractiveSummarizer()
summary = summarizer.summarize(long_article, max_length=100)
Map-Reduce Summarization
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
class MapReduceSummarizer:
    """Summarize long documents by summarizing chunks, then merging the results.

    Args:
        llm: Chat model used for both the map and the reduce step. When
            omitted, a ``ChatOpenAI`` instance for ``"gpt-4"`` with
            ``temperature=0`` is created.
        chunk_size: ``chunk_size`` for the recursive character splitter.
        chunk_overlap: ``chunk_overlap`` for the recursive character splitter.
    """

    def __init__(self, llm=None, chunk_size: int = 4000,
                 chunk_overlap: int = 200):
        if llm is None:
            llm = ChatOpenAI(model="gpt-4", temperature=0)
        self.llm = llm
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def create_chains(self):
        """Return the ``(map_prompt, reduce_prompt)`` templates.

        Despite the name, this returns prompt templates only; ``summarize``
        pipes each one into the LLM to form the actual chains.
        """
        map_prompt = PromptTemplate.from_template(
            "Summarize this text chunk:\n"
            "{doc}\n"
            "Summary:"
        )
        reduce_prompt = PromptTemplate.from_template(
            "Combine these summaries into a final summary:\n"
            "{doc_summaries}\n"
            "Final Summary:"
        )
        return map_prompt, reduce_prompt

    def summarize(self, documents: list) -> str:
        """Summarize *documents* with a map step per chunk and one reduce step.

        Args:
            documents: Documents accepted by the splitter's
                ``split_documents``; each resulting chunk must expose
                ``page_content``.

        Returns:
            The ``content`` of the reduce step's response, or an empty string
            when there is nothing to summarize.
        """
        if not documents:
            return ""
        chunks = self.text_splitter.split_documents(documents)
        if not chunks:
            # Avoid asking the model to "combine" an empty list of summaries.
            return ""

        map_prompt, reduce_prompt = self.create_chains()

        # Map: one LLM call per chunk, in order.
        map_chain = map_prompt | self.llm
        summaries = [
            map_chain.invoke({"doc": chunk.page_content}).content
            for chunk in chunks
        ]

        # Reduce: a single LLM call over all chunk summaries.
        reduce_chain = reduce_prompt | self.llm
        final = reduce_chain.invoke({"doc_summaries": "\n".join(summaries)})
        return final.content
# Usage: summarize a list of documents chunk by chunk, then merge the chunk
# summaries into one final summary.
# NOTE: `documents` is not defined in this snippet. The summarizer passes it
# to `split_documents` and reads `page_content` from each chunk, so it is
# presumably a list of LangChain Document objects; confirm against your loader.
summarizer = MapReduceSummarizer()
summary = summarizer.summarize(documents)
Custom Summary Evaluator
class SummaryEvaluator:
    """Score a summary with two LLM-judged metrics and two computed ones.

    Args:
        llm: Object with an ``invoke(prompt)`` method whose result exposes a
            ``content`` attribute.
    """

    def __init__(self, llm):
        self.llm = llm

    def evaluate_coverage(self, original: str, summary: str) -> str:
        """Ask the LLM how well *summary* covers *original*.

        Only the first 500 characters of *original* are included in the
        prompt.

        Returns:
            The ``content`` of the LLM response, unparsed.
        """
        prompt = (
            "Evaluate how well this summary covers the original text.\n"
            f"Original (first 500 chars): {original[:500]}\n"
            f"Summary: {summary}\n"
            "Rate coverage 0-1 and list missing key points:"
        )
        return self.llm.invoke(prompt).content

    def evaluate_conciseness(self, summary: str) -> float:
        """Return the average number of words per sentence in *summary*.

        Sentences are the non-blank fragments obtained by splitting on
        ``'.'``, so the empty fragment after a trailing period is not counted.
        An empty summary yields ``0.0``.
        """
        word_count = len(summary.split())
        sentence_count = sum(
            1 for fragment in summary.split('.') if fragment.strip()
        )
        return word_count / max(sentence_count, 1)

    def evaluate_coherence(self, summary: str) -> str:
        """Ask the LLM to rate the coherence and flow of *summary*.

        Returns:
            The ``content`` of the LLM response, unparsed.
        """
        prompt = (
            "Evaluate the coherence and flow of this summary:\n"
            f"{summary}\n"
            "Rate coherence 0-1 and identify any issues:"
        )
        return self.llm.invoke(prompt).content

    def full_evaluation(self, original: str, summary: str) -> dict:
        """Run every metric and collect the results in one dict.

        Returns:
            A dict with keys ``"coverage"``, ``"conciseness"``,
            ``"coherence"`` and ``"compression_ratio"``. The compression
            ratio is ``len(summary) / len(original)``, or ``0.0`` when
            *original* is empty.
        """
        # Guard the division: an empty original has no meaningful ratio.
        compression_ratio = len(summary) / len(original) if original else 0.0
        return {
            "coverage": self.evaluate_coverage(original, summary),
            "conciseness": self.evaluate_conciseness(summary),
            "coherence": self.evaluate_coherence(summary),
            "compression_ratio": compression_ratio,
        }
# Usage: run all four metrics (coverage, conciseness, coherence, compression
# ratio) for one original/summary pair.
# NOTE: `llm`, `original_text` and `generated_summary` are not defined in this
# snippet. `llm` must provide `invoke(prompt)` returning an object with a
# `content` attribute, since that is how the evaluator calls it.
evaluator = SummaryEvaluator(llm)
metrics = evaluator.full_evaluation(original_text, generated_summary)
Key Takeaways
- Extractive methods select important sentences from the original
- Abstractive methods generate new, paraphrased summaries
- Map-Reduce handles long documents by summarizing chunks then combining
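- Compression ratio (summary length divided by original length) measures how much the summary shortens the source; a lower ratio means a shorter summary with less detail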
- Evaluation should assess coverage, coherence, and conciseness