🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services ℹ️ About ✉️ Contact View Pricing Plans from $10

Information Extraction

🟢 Free Lesson

Advertisement

Information Extraction

Unstructured Text: "Apple CEO Tim Cook" "announced iPhone 15" "in Cupertino on Sept 12"
NER: Entity Detection, Type Classification
Relations: Entity Pairs, Predicate Detection
Events: Trigger Detection, Argument Extraction
Knowledge Graph: Apple, Cook, iPhone
Structured Output: {"entity": "Apple", "type": "ORG", "relation": "CEO", "target": "Tim Cook"}
Applications: Search, Question Answering, Recommendation, Analytics, Compliance

Information extraction converts unstructured text into structured data through entity recognition, relation extraction, and event detection.

NER with spaCy

import spacy

class SpacyNERExtractor:
    """Wrap a spaCy pipeline and report its named entities as plain dicts."""

    def __init__(self, model: str = "en_core_web_sm"):
        """Load the spaCy pipeline named ``model`` and keep it on ``self.nlp``."""
        self.nlp = spacy.load(model)

    def extract_entities(self, text: str) -> list:
        """Return one dict per entity found in ``text``.

        Each dict has the keys ``"text"``, ``"label"``, ``"start"`` and
        ``"end"``; the last two are character offsets into ``text``.
        """
        found = []
        for span in self.nlp(text).ents:
            record = {
                "text": span.text,
                "label": span.label_,
                "start": span.start_char,
                "end": span.end_char,
            }
            found.append(record)
        return found

    def extract_entities_detailed(self, text: str) -> dict:
        """Return the entities of ``text`` grouped by label.

        The result maps each entity label to a list of dicts with the keys
        ``"text"``, ``"start"`` and ``"end"`` (character offsets), in the
        order the entities appear in the document.
        """
        grouped = {}
        for span in self.nlp(text).ents:
            bucket = grouped.setdefault(span.label_, [])
            bucket.append(
                {
                    "text": span.text,
                    "start": span.start_char,
                    "end": span.end_char,
                }
            )
        return grouped

    def extract_with_context(self, text: str, window: int = 50) -> list:
        """Return each entity together with the text surrounding it.

        ``window`` is the number of characters kept on either side of the
        entity; the slice is clamped to the bounds of ``text``.
        """
        text_length = len(text)
        return [
            {
                "entity": span.text,
                "type": span.label_,
                "context": text[
                    max(0, span.start_char - window):
                    min(text_length, span.end_char + window)
                ],
            }
            for span in self.nlp(text).ents
        ]

# Usage: build an extractor with the default model and run it on one sentence.
ner = SpacyNERExtractor()
entities = ner.extract_entities("Apple CEO Tim Cook announced new iPhone in Cupertino.")
# Each result dict carries "text", "label", "start" and "end" (character offsets).
# Illustrative output -- which entities are found depends on the loaded model:
# [{"text": "Apple", "label": "ORG", "start": 0, "end": 5},
#  {"text": "Tim Cook", "label": "PERSON", "start": 10, "end": 18},
#  {"text": "Cupertino", "label": "GPE", "start": 43, "end": 52}]

Relation Extraction

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

class RelationExtractor:
    """Extract relations with a classifier, an LLM prompt, or spaCy entity pairs.

    NOTE(review): the default checkpoint ``Babelscape/rebel-large`` appears to
    be a seq2seq (text-generation) model, while this class loads it with a
    sequence-classification head -- confirm the checkpoint/head pairing before
    relying on ``extract_relations``.
    """

    # Maps a predicted class index to a relation name.
    RELATION_LABELS = (
        "employed_by", "located_in", "founded_by", "produces",
        "part_of", "subsidiary_of",
    )
    # Reported when a predicted index has no entry in RELATION_LABELS.
    UNKNOWN_RELATION = "unknown"

    def __init__(self, model_name: str = "Babelscape/rebel-large"):
        """Load the tokenizer and classification model named ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        # spaCy pipeline for extract_entity_pairs, loaded lazily on first use.
        self._nlp = None

    def extract_relations(self, text: str) -> list:
        """Classify ``text`` with the loaded model and decode the prediction.

        Returns a list of ``{"relation": name}`` dicts.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)
        return self._decode_predictions(text, predictions)

    def extract_relations_llm(self, text: str, llm) -> list:
        """Ask ``llm`` for the relations in ``text`` and parse its JSON reply.

        ``llm`` must expose ``invoke(prompt)`` returning an object with a
        ``content`` string. A reply wrapped in a Markdown code fence is
        accepted.

        Raises:
            json.JSONDecodeError: if the reply is not valid JSON.
        """
        prompt = f"""Extract all relations from this text as JSON:
        Text: {text}
        Return format: [{{"head": "entity1", "relation": "relation", "tail": "entity2"}}]
        Relations:"""
        return self._parse_json_response(llm.invoke(prompt).content)

    @staticmethod
    def _parse_json_response(response: str):
        """Parse an LLM reply as JSON, tolerating a surrounding code fence."""
        import json

        payload = response.strip()
        if payload.startswith("```"):
            # Drop the opening fence line (``` or ```json) and the closing fence.
            _, _, payload = payload.partition("\n")
            payload = payload.rsplit("```", 1)[0]
        return json.loads(payload)

    def _decode_predictions(self, text: str, predictions: torch.Tensor) -> list:
        """Turn up to the first five class indices into ``{"relation": name}`` dicts.

        Indices outside ``RELATION_LABELS`` are reported as
        ``UNKNOWN_RELATION`` instead of raising ``IndexError``, because the
        label table is fixed while the model's class count is not.
        ``text`` is accepted for interface compatibility and is not used.
        """
        labels = self.RELATION_LABELS
        decoded = []
        for prediction in predictions[:5]:
            index = prediction.item()
            if 0 <= index < len(labels):
                decoded.append({"relation": labels[index]})
            else:
                decoded.append({"relation": self.UNKNOWN_RELATION})
        return decoded

    def extract_entity_pairs(self, text: str) -> list:
        """Return every unordered pair of spaCy entities found in ``text``.

        Each pair records both entities' text and label plus ``"distance"``,
        the absolute difference between the first entity's ``end`` and the
        second entity's ``start``.
        """
        # Load the pipeline once per instance rather than on every call.
        nlp = getattr(self, "_nlp", None)
        if nlp is None:
            import spacy
            nlp = self._nlp = spacy.load("en_core_web_sm")

        entities = list(nlp(text).ents)
        pairs = []
        for i, e1 in enumerate(entities):
            for e2 in entities[i + 1:]:
                pairs.append({
                    "entity1": {"text": e1.text, "type": e1.label_},
                    "entity2": {"text": e2.text, "type": e2.label_},
                    "distance": abs(e1.end - e2.start),
                })
        return pairs

# Usage
# NOTE: `llm` is not defined in this snippet. extract_relations_llm() calls
# llm.invoke(prompt).content, so an object offering that interface has to be
# created before these lines run.
extractor = RelationExtractor()
relations = extractor.extract_relations_llm(
    "Apple CEO Tim Cook announced new products at the event in Cupertino.",
    llm
)

Event Extraction

class EventExtractor:
    """Extract events from text with an LLM prompt or keyword rules."""

    # Trigger vocabularies for the rule-based extractor, keyed by event type.
    # The leading \b stops a trigger from matching inside a longer word
    # (e.g. "merged" inside "emerged", "fired" inside "misfired"), while
    # suffixed forms such as "mergers" still match.
    _TRIGGER_PATTERNS = {
        "MERGER": r"\b(merger|acquisition|acquired|merged)",
        "PRODUCT_LAUNCH": r"\b(launched|released|announced|unveiled)",
        "PERSONNEL": r"\b(appointed|hired|fired|resigned|joined)",
    }

    def __init__(self, llm):
        """Store ``llm``, which must expose ``invoke(prompt)`` -> object with ``content``."""
        self.llm = llm
        # Known event type names; not referenced by the methods of this class.
        self.event_types = [
            "MERGER", "ACQUISITION", "PRODUCT_LAUNCH",
            "FINANCIAL", "LEGAL", "PERSONNEL"
        ]

    def extract_events(self, text: str) -> list:
        """Ask the LLM for the events in ``text`` and parse its JSON reply.

        A reply wrapped in a Markdown code fence is accepted.

        Raises:
            json.JSONDecodeError: if the reply is not valid JSON.
        """
        prompt = f"""Extract events from this text. For each event provide:
        - type: event type
        - trigger: the word/phrase triggering the event
        - arguments: who, what, when, where, why
        Text: {text}
        Events (JSON):"""

        return self._parse_json_response(self.llm.invoke(prompt).content)

    @staticmethod
    def _parse_json_response(response: str):
        """Parse an LLM reply as JSON, tolerating a surrounding code fence."""
        import json

        payload = response.strip()
        if payload.startswith("```"):
            # Drop the opening fence line (``` or ```json) and the closing fence.
            _, _, payload = payload.partition("\n")
            payload = payload.rsplit("```", 1)[0]
        return json.loads(payload)

    def extract_events_rule_based(self, text: str, window: int = 100) -> list:
        """Find events by case-insensitive keyword triggers.

        Args:
            text: the text to scan.
            window: number of characters of context kept on each side of a
                trigger (clamped to the bounds of ``text``).

        Returns:
            A list of dicts with ``"type"``, ``"trigger"`` (the matched text)
            and ``"context"``, grouped by event type in pattern order.
        """
        import re

        events = []
        for event_type, pattern in self._TRIGGER_PATTERNS.items():
            for match in re.finditer(pattern, text, re.IGNORECASE):
                start = max(0, match.start() - window)
                end = min(len(text), match.end() + window)
                events.append({
                    "type": event_type,
                    "trigger": match.group(),
                    "context": text[start:end]
                })
        return events

    def event_to_triple(self, event: dict) -> tuple:
        """Convert an event dict into a ``(subject, predicate, object)`` tuple.

        The subject and object come from the event's ``"arguments"`` keys
        ``"who"`` and ``"what"``; the predicate is the event ``"type"``.
        Missing pieces fall back to ``"Unknown"`` / ``"UNKNOWN"``.
        """
        arguments = event.get("arguments")
        # An LLM may emit null or a non-object here; treat it as "no arguments".
        if not isinstance(arguments, dict):
            arguments = {}
        subject = arguments.get("who", "Unknown")
        predicate = event.get("type", "UNKNOWN")
        obj = arguments.get("what", "Unknown")
        return (subject, predicate, obj)

# Usage
# NOTE: `llm` is not defined in this snippet. EventExtractor stores it and
# extract_events() calls self.llm.invoke(prompt).content, so an object
# offering that interface has to be created before these lines run.
extractor = EventExtractor(llm)
events = extractor.extract_events(
    "Apple acquired Beats Electronics in 2014 for $3 billion."
)

OpenIE Pipeline

class OpenIEPipeline:
    """Open information extraction: LLM-produced triples plus a subject-keyed graph."""

    def __init__(self, llm):
        """Store ``llm``, which must expose ``invoke(prompt)`` -> object with ``content``."""
        self.llm = llm

    def extract_open_relations(self, text: str) -> list:
        """Ask the LLM for (subject, predicate, object) triples and parse its JSON reply.

        A reply wrapped in a Markdown code fence is accepted.

        Raises:
            json.JSONDecodeError: if the reply is not valid JSON.
        """
        prompt = f"""Extract all (subject, predicate, object) triples from this text:
        Text: {text}
        Return as JSON list of triples:"""

        return self._parse_json_response(self.llm.invoke(prompt).content)

    @staticmethod
    def _parse_json_response(response: str):
        """Parse an LLM reply as JSON, tolerating a surrounding code fence."""
        import json

        payload = response.strip()
        if payload.startswith("```"):
            # Drop the opening fence line (``` or ```json) and the closing fence.
            _, _, payload = payload.partition("\n")
            payload = payload.rsplit("```", 1)[0]
        return json.loads(payload)

    @staticmethod
    def _as_triple(triple) -> tuple:
        """Normalise one parsed triple to a ``(subject, predicate, object)`` tuple.

        Accepts a 3-item sequence or a dict with the keys ``"subject"``,
        ``"predicate"`` and ``"object"``. Unpacking a dict directly would
        yield its key names rather than its values, so dicts are read by key.

        Raises:
            ValueError: if the triple does not have the expected shape.
        """
        if isinstance(triple, dict):
            try:
                return triple["subject"], triple["predicate"], triple["object"]
            except KeyError as err:
                raise ValueError(f"triple is missing key {err}") from err
        subj, pred, obj = triple
        return subj, pred, obj

    def extract_and_link(self, text: str) -> dict:
        """Extract triples from ``text`` and index them by subject.

        Returns:
            ``{"triples": <parsed triples as returned by the LLM>,
            "entity_graph": {subject: {"relations": [{"predicate", "object"}, ...]}}}``.

        Raises:
            ValueError: if a triple is neither a 3-item sequence nor a dict
                with subject/predicate/object keys.
        """
        triples = self.extract_open_relations(text)
        entity_graph = {}
        for triple in triples:
            subj, pred, obj = self._as_triple(triple)
            node = entity_graph.setdefault(subj, {"relations": []})
            node["relations"].append({"predicate": pred, "object": obj})
        return {"triples": triples, "entity_graph": entity_graph}

    def batch_extract(self, texts: list) -> list:
        """Run ``extract_open_relations`` on each text, returning results in input order."""
        return [self.extract_open_relations(text) for text in texts]

# Usage
# NOTE: `llm` is not defined in this snippet; it must be an object whose
# invoke(prompt) returns something with a `.content` string.
openie = OpenIEPipeline(llm)
result = openie.extract_and_link(
    "Tim Cook is the CEO of Apple. Apple makes iPhones."
)
# Illustrative result -- the actual triples depend on the LLM's reply:
# {"triples": [["Tim Cook", "CEO of", "Apple"], ["Apple", "makes", "iPhones"]],
#  "entity_graph": {
#      "Tim Cook": {"relations": [{"predicate": "CEO of", "object": "Apple"}]},
#      "Apple": {"relations": [{"predicate": "makes", "object": "iPhones"}]}}}

Key Takeaways

  • NER identifies and classifies named entities in text
  • Relation extraction discovers relationships between entities
  • Event extraction captures actions and their participants
  • Knowledge graphs store extracted information as connected nodes
  • LLM-based extraction enables flexible, zero-shot information extraction
⭐

Premium Content

Information Extraction

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯 End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert Generative AI Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement