🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services | ℹ️ About | ✉️ Contact | View Pricing Plans from $10

Summarization

🟒 Free Lesson

Advertisement

Summarization

Document: Long Text (1000+ words)
Extractive: Select Key Sentences (TextRank, LexRank)
Abstractive: Generate New Text (BART, T5, GPT)
Processing: Sentence Scoring, Compression Ratio
Compression: Short (10%) - Long (50%)
LLM: Map-Reduce, Refine Chain
Summary: Concise, Coherent, Accurate

Extractive: Selects sentences from original text
Abstractive: Generates new paraphrased text
Hybrid: Best of both approaches

Summarization condenses long documents into concise, coherent summaries using extractive methods (selecting key sentences) or abstractive methods (generating new text).

Extractive Summarization

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

class ExtractiveSummarizer:
    """Pick the highest-scoring sentences of a text using TF-IDF features.

    Sentences are obtained by splitting the text on the literal separator
    ``'. '``. Three scoring strategies are available (TextRank, summed
    TF-IDF weight, LSA). Every strategy returns the selected sentences
    joined with ``'. '`` in their original document order.
    """

    # Literal separator used both to split the text into sentences and to
    # join the selected sentences back together.
    _SEPARATOR = '. '

    def __init__(self):
        # A single vectorizer is shared by all methods; each call re-fits it.
        self.vectorizer = TfidfVectorizer()

    @staticmethod
    def _trivial_summary(text, sentences, num_sentences):
        """Return the summary when no scoring is needed, otherwise ``None``.

        * ``num_sentences <= 0`` -> empty summary.
        * The text has no more sentences than requested -> the text itself.
        """
        if num_sentences <= 0:
            return ''
        if len(sentences) <= num_sentences:
            return text
        return None

    def _join_selected(self, sentences, indices):
        """Join the sentences at ``indices`` in document order."""
        return self._SEPARATOR.join(sentences[i] for i in sorted(indices))

    def text_rank_summarize(self, text: str, num_sentences: int = 3) -> str:
        """Summarize ``text`` with TextRank.

        Builds a graph whose edge weights are the cosine similarities between
        the TF-IDF vectors of the sentences, ranks the nodes with PageRank and
        keeps the ``num_sentences`` best-ranked sentences.

        Args:
            text: Text to summarize.
            num_sentences: Maximum number of sentences in the summary.

        Returns:
            The selected sentences joined with ``'. '`` in document order;
            ``text`` unchanged when it has at most ``num_sentences``
            sentences; ``''`` when ``num_sentences`` is not positive.
        """
        sentences = text.split(self._SEPARATOR)
        shortcut = self._trivial_summary(text, sentences, num_sentences)
        if shortcut is not None:
            return shortcut

        tfidf_matrix = self.vectorizer.fit_transform(sentences)
        similarity_matrix = cosine_similarity(tfidf_matrix)
        scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))

        # Rank sentence indices (not the sentences themselves) so the winners
        # can be emitted in document order, like the other strategies do.
        by_score = sorted(range(len(sentences)), key=scores.__getitem__,
                          reverse=True)
        return self._join_selected(sentences, by_score[:num_sentences])

    def tfidf_summarize(self, text: str, num_sentences: int = 3) -> str:
        """Summarize ``text`` by keeping the sentences with the largest
        summed TF-IDF weight.

        Args:
            text: Text to summarize.
            num_sentences: Maximum number of sentences in the summary.

        Returns:
            The selected sentences joined with ``'. '`` in document order;
            ``text`` unchanged when it has at most ``num_sentences``
            sentences; ``''`` when ``num_sentences`` is not positive.
        """
        sentences = text.split(self._SEPARATOR)
        shortcut = self._trivial_summary(text, sentences, num_sentences)
        if shortcut is not None:
            return shortcut

        tfidf_matrix = self.vectorizer.fit_transform(sentences)
        # np.asarray(...).ravel() flattens the row sums whether the sparse
        # container returns an np.matrix or a plain ndarray.
        scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()
        top_indices = np.argsort(scores)[-num_sentences:]
        return self._join_selected(sentences, top_indices)

    def lsa_summarize(self, text: str, num_sentences: int = 3) -> str:
        """Summarize ``text`` with latent semantic analysis.

        Projects the TF-IDF matrix onto at most three SVD components and
        scores each sentence by the sum of the absolute values of its
        projected coordinates.

        Args:
            text: Text to summarize.
            num_sentences: Maximum number of sentences in the summary.

        Returns:
            The selected sentences joined with ``'. '`` in document order;
            ``text`` unchanged when it has at most ``num_sentences``
            sentences; ``''`` when ``num_sentences`` is not positive.
        """
        sentences = text.split(self._SEPARATOR)
        shortcut = self._trivial_summary(text, sentences, num_sentences)
        if shortcut is not None:
            return shortcut

        # Imported lazily so the dependency is only needed when LSA runs.
        from sklearn.decomposition import TruncatedSVD

        tfidf_matrix = self.vectorizer.fit_transform(sentences)
        # The number of components may exceed neither the number of sentences
        # (rows) nor the vocabulary size (columns).
        n_components = min(3, *tfidf_matrix.shape)
        svd = TruncatedSVD(n_components=n_components)
        reduced = svd.fit_transform(tfidf_matrix)
        importance = np.abs(reduced).sum(axis=1)
        top_indices = np.argsort(importance)[-num_sentences:]
        return self._join_selected(sentences, top_indices)

# Usage: build the summarizer and request a TextRank summary of at most
# five sentences.
# NOTE(review): `long_article` is not defined in this snippet; the caller
# must provide the text to summarize.
summarizer = ExtractiveSummarizer()
summary = summarizer.text_rank_summarize(long_article, num_sentences=5)

Abstractive Summarization with Transformers

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

class AbstractiveSummarizer:
    """Abstractive summarizer backed by a pretrained seq2seq model.

    The tokenizer and model are loaded with ``AutoTokenizer`` and
    ``AutoModelForSeq2SeqLM`` from ``model_name``. Inputs longer than
    ``MAX_INPUT_TOKENS`` tokens are truncated by the tokenizer.
    """

    # Maximum number of input tokens kept by the tokenizer.
    MAX_INPUT_TOKENS = 1024

    def __init__(self, model_name: str = "facebook/bart-large-cnn"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    def _generate(self, texts, padding=False, **generate_kwargs):
        """Tokenize ``texts`` and return the generated summary token ids.

        The attention mask produced by the tokenizer (when there is one) is
        forwarded to ``generate`` so that padding tokens of a batch are not
        attended to.
        """
        inputs = self.tokenizer(texts, return_tensors="pt",
                                truncation=True, padding=padding,
                                max_length=self.MAX_INPUT_TOKENS)
        return self.model.generate(
            inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            **generate_kwargs
        )

    def summarize(self, text: str, max_length: int = 150,
                  min_length: int = 50) -> str:
        """Summarize a single text with beam search.

        Args:
            text: Text to summarize.
            max_length: Upper bound passed to ``generate``.
            min_length: Lower bound passed to ``generate``.

        Returns:
            The decoded summary with special tokens removed.
        """
        summary_ids = self._generate(
            text,
            max_length=max_length,
            min_length=min_length,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    def summarize_with_options(self, text: str, options: dict) -> dict:
        """Summarize a single text with caller-supplied generation options.

        Args:
            text: Text to summarize.
            options: Keyword arguments for ``generate``; they override the
                defaults below. ``None`` or an empty dict keeps the defaults.

        Returns:
            A dict with the decoded ``"summary"`` and the effective
            ``"options"`` that were passed to ``generate``.
        """
        default_options = {
            "max_length": 150,
            "min_length": 50,
            "num_beams": 4,
            "length_penalty": 2.0,
            "no_repeat_ngram_size": 3
        }
        if options:
            default_options.update(options)

        summary_ids = self._generate(text, **default_options)
        return {
            "summary": self.tokenizer.decode(summary_ids[0], skip_special_tokens=True),
            "options": default_options
        }

    def batch_summarize(self, texts: list, batch_size: int = 8) -> list:
        """Summarize several texts, ``batch_size`` at a time.

        Args:
            texts: Texts to summarize.
            batch_size: Number of texts tokenized and generated together.

        Returns:
            One decoded summary per input text, in input order.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        summaries = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            # Padding is required to stack texts of different lengths into
            # one tensor; the attention mask forwarded by _generate keeps the
            # model from attending to the padded positions.
            summary_ids = self._generate(batch, padding=True,
                                         max_length=150, num_beams=4)
            summaries.extend(self.tokenizer.batch_decode(
                summary_ids, skip_special_tokens=True
            ))
        return summaries

# Usage: load the default model and summarize with a generation
# max_length of 100.
# NOTE(review): `long_article` is not defined in this snippet; the caller
# must provide the text to summarize.
summarizer = AbstractiveSummarizer()
summary = summarizer.summarize(long_article, max_length=100)

Map-Reduce Summarization

from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

class MapReduceSummarizer:
    """Summarize long documents with a map-reduce strategy.

    Documents are split into overlapping chunks, each chunk is summarized
    on its own (map), and the chunk summaries are then combined into one
    final summary (reduce).
    """

    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4", temperature=0)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,
            chunk_overlap=200
        )

    def create_chains(self):
        """Build the prompt templates for the map and reduce steps.

        Despite its name, this method returns the two prompt templates, not
        runnable chains; ``summarize`` pipes them into the LLM.

        Returns:
            A ``(map_prompt, reduce_prompt)`` tuple. ``map_prompt`` expects
            the variable ``doc``; ``reduce_prompt`` expects
            ``doc_summaries``.
        """
        map_prompt = PromptTemplate.from_template(
            """Summarize this text chunk:
            {doc}
            Summary:"""
        )
        reduce_prompt = PromptTemplate.from_template(
            """Combine these summaries into a final summary:
            {doc_summaries}
            Final Summary:"""
        )
        return map_prompt, reduce_prompt

    def summarize(self, documents: list) -> str:
        """Summarize ``documents`` and return the final summary text.

        Args:
            documents: Documents accepted by the text splitter's
                ``split_documents``; each resulting chunk must expose
                ``page_content``.

        Returns:
            The ``content`` of the reduce step's response, or ``""`` when
            there is nothing to summarize.
        """
        # Nothing to summarize: do not ask the LLM to combine an empty list
        # of summaries.
        if not documents:
            return ""
        chunks = self.text_splitter.split_documents(documents)
        if not chunks:
            return ""

        map_prompt, reduce_prompt = self.create_chains()

        # Map: one LLM call per chunk.
        map_chain = map_prompt | self.llm
        summaries = [
            map_chain.invoke({"doc": chunk.page_content}).content
            for chunk in chunks
        ]

        # Reduce: one LLM call over all chunk summaries.
        reduce_chain = reduce_prompt | self.llm
        final = reduce_chain.invoke({"doc_summaries": "\n".join(summaries)})
        return final.content

# Usage: summarize a list of documents with the map-reduce strategy.
# NOTE(review): `documents` is not defined in this snippet; the caller must
# provide it (objects exposing `page_content` after splitting).
summarizer = MapReduceSummarizer()
summary = summarizer.summarize(documents)

Custom Summary Evaluator

class SummaryEvaluator:
    """Evaluate a summary with LLM-based and simple length-based metrics.

    Args:
        llm: Object with an ``invoke(prompt)`` method whose result exposes a
            ``content`` attribute.
    """

    def __init__(self, llm):
        self.llm = llm

    def evaluate_coverage(self, original: str, summary: str) -> str:
        """Ask the LLM how well ``summary`` covers ``original``.

        Only the first 500 characters of ``original`` are sent to the LLM.

        Returns:
            The ``content`` of the LLM response, unparsed.
            NOTE(review): presumably free text for chat models; confirm
            before relying on a structure.
        """
        prompt = f"""Evaluate how well this summary covers the original text.
        Original (first 500 chars): {original[:500]}
        Summary: {summary}

        Rate coverage 0-1 and list missing key points:"""
        return self.llm.invoke(prompt).content

    def evaluate_conciseness(self, summary: str) -> float:
        """Return the average number of words per sentence of ``summary``.

        Sentences are the non-blank segments obtained by splitting on
        ``'.'``, so a trailing period does not add an empty sentence.
        Periods inside abbreviations or decimals still count as boundaries.
        An empty summary scores ``0.0``.
        """
        words = len(summary.split())
        sentences = sum(1 for segment in summary.split('.') if segment.strip())
        return words / max(sentences, 1)

    def evaluate_coherence(self, summary: str) -> str:
        """Ask the LLM to rate the coherence and flow of ``summary``.

        Returns:
            The ``content`` of the LLM response, unparsed.
        """
        prompt = f"""Evaluate the coherence and flow of this summary:
        {summary}

        Rate coherence 0-1 and identify any issues:"""
        return self.llm.invoke(prompt).content

    def full_evaluation(self, original: str, summary: str) -> dict:
        """Run every metric and collect the results.

        Returns:
            A dict with the keys ``"coverage"``, ``"conciseness"``,
            ``"coherence"`` and ``"compression_ratio"`` (summary length
            divided by original length, in characters; ``0.0`` when
            ``original`` is empty).
        """
        # Guard against an empty original instead of dividing by zero.
        compression_ratio = len(summary) / len(original) if original else 0.0
        return {
            "coverage": self.evaluate_coverage(original, summary),
            "conciseness": self.evaluate_conciseness(summary),
            "coherence": self.evaluate_coherence(summary),
            "compression_ratio": compression_ratio
        }

# Usage: evaluate a generated summary against its source text.
# NOTE(review): `llm`, `original_text` and `generated_summary` are not
# defined in this snippet; the caller must provide them.
evaluator = SummaryEvaluator(llm)
metrics = evaluator.full_evaluation(original_text, generated_summary)

Key Takeaways

  • Extractive methods select important sentences from the original
  • Abstractive methods generate new, paraphrased summaries
  • Map-Reduce handles long documents by summarizing chunks then combining
  • Compression ratio controls summary length vs. detail
  • Evaluation should assess coverage, coherence, and conciseness
⭐

Premium Content

Summarization

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯 End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert Generative AI Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement