Information Extraction

Information extraction converts unstructured text into structured data through entity recognition, relation extraction, and event detection.

NER with spaCy

import spacy

class SpacyNERExtractor:
    """Named-entity extraction helpers built on a loaded spaCy pipeline.

    The pipeline is stored on ``self.nlp`` and is called once per method
    invocation; every method reads the entities from the resulting ``doc.ents``.
    """

    def __init__(self, model: str = "en_core_web_sm"):
        """Load the spaCy pipeline named *model* and keep it on the instance."""
        self.nlp = spacy.load(model)

    def extract_entities(self, text: str) -> list:
        """Return one dict per entity, in document order.

        Each dict has the keys ``text``, ``label``, ``start`` and ``end``;
        the offsets are the entity's ``start_char`` / ``end_char`` values.
        """
        found = []
        for span in self.nlp(text).ents:
            found.append({
                "text": span.text,
                "label": span.label_,
                "start": span.start_char,
                "end": span.end_char,
            })
        return found

    def extract_entities_detailed(self, text: str) -> dict:
        """Return the entities grouped by label.

        The result maps each label to a list of ``{"text", "start", "end"}``
        dicts. Labels appear in order of first occurrence and the entries
        under each label keep document order.
        """
        grouped = {}
        for span in self.nlp(text).ents:
            record = {
                "text": span.text,
                "start": span.start_char,
                "end": span.end_char,
            }
            grouped.setdefault(span.label_, []).append(record)
        return grouped

    def extract_with_context(self, text: str, window: int = 50) -> list:
        """Return each entity together with the text surrounding it.

        The context is the slice of *text* reaching *window* characters
        before and after the entity, clamped to the bounds of *text*.
        """
        text_length = len(text)
        return [
            {
                "entity": span.text,
                "type": span.label_,
                "context": text[
                    max(0, span.start_char - window):
                    min(text_length, span.end_char + window)
                ],
            }
            for span in self.nlp(text).ents
        ]

# Usage
# Constructing the extractor loads the default "en_core_web_sm" pipeline.
ner = SpacyNERExtractor()
entities = ner.extract_entities("Apple CEO Tim Cook announced new iPhone in Cupertino.")
# Every result dict carries "text", "label", "start" and "end" (character
# offsets). Illustrative output -- the entities found depend on the model:
# [{"text": "Apple", "label": "ORG", "start": 0, "end": 5},
#  {"text": "Tim Cook", "label": "PERSON", "start": 10, "end": 18},
#  {"text": "Cupertino", "label": "GPE", "start": 43, "end": 52}]

Relation Extraction

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

class RelationExtractor:
    """Relation extraction via a classifier, an LLM prompt, or entity pairing.

    NOTE(review): the default checkpoint "Babelscape/rebel-large" appears to be
    published as a sequence-to-sequence model. Loading it through
    ``AutoModelForSequenceClassification`` would then attach an untrained
    classification head -- confirm against the model card before relying on
    ``extract_relations``.
    """

    # Class-id -> relation-name table used by ``_decode_predictions``.
    RELATION_LABELS = (
        "employed_by", "located_in", "founded_by", "produces",
        "part_of", "subsidiary_of",
    )

    # Lazily loaded spaCy pipeline, cached so ``extract_entity_pairs`` does
    # not reload the model from disk on every call.
    _nlp = None

    def __init__(self, model_name: str = "Babelscape/rebel-large"):
        """Load the tokenizer and classification model named *model_name*."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

    def extract_relations(self, text: str) -> list:
        """Classify *text* and return the decoded relation label(s).

        Returns a list of ``{"relation": <label>}`` dicts, one per row of
        the model's logits.
        """
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)
        return self._decode_predictions(text, predictions)

    def extract_relations_llm(self, text: str, llm) -> list:
        """Ask *llm* for the relations in *text* and parse its JSON reply.

        *llm* must provide ``invoke(prompt)`` returning an object whose
        ``content`` attribute is the reply text. A reply wrapped in a
        Markdown code fence is unwrapped before parsing.

        Raises:
            json.JSONDecodeError: if the reply is not valid JSON.
        """
        prompt = f"""Extract all relations from this text as JSON:
        Text: {text}
        Return format: [{{"head": "entity1", "relation": "relation", "tail": "entity2"}}]
        Relations:"""
        response = llm.invoke(prompt).content
        return self._parse_json_response(response)

    @staticmethod
    def _parse_json_response(raw: str):
        """Parse *raw* as JSON, first unwrapping a surrounding code fence."""
        import json
        import re

        cleaned = raw.strip()
        # Chat models frequently wrap JSON as ```json ... ```; json.loads
        # rejects the fence markers, so peel them off when present.
        fenced = re.fullmatch(
            r"```[\w-]*[ \t]*\n(.*?)\n?[ \t]*```", cleaned, re.DOTALL
        )
        if fenced:
            cleaned = fenced.group(1).strip()
        return json.loads(cleaned)

    def _decode_predictions(self, text: str, predictions: torch.Tensor) -> list:
        """Map predicted class ids to relation names.

        At most the first five predictions are decoded. A class id outside
        ``RELATION_LABELS`` is reported as ``"unknown"`` instead of raising
        IndexError, since the model's label count is not tied to this table.
        *text* is accepted for interface compatibility and is not used.
        """
        labels = self.RELATION_LABELS
        decoded = []
        for prediction in predictions[:5]:
            class_id = prediction.item()
            in_range = 0 <= class_id < len(labels)
            decoded.append({"relation": labels[class_id] if in_range else "unknown"})
        return decoded

    def extract_entity_pairs(self, text: str) -> list:
        """Return every unordered pair of entities found in *text*.

        Each pair holds both entities' text and type plus ``distance``, the
        absolute difference between the first entity's ``end`` and the
        second entity's ``start``.
        """
        if self._nlp is None:
            import spacy
            self._nlp = spacy.load("en_core_web_sm")
        doc = self._nlp(text)
        entities = list(doc.ents)
        return [
            {
                "entity1": {"text": first.text, "type": first.label_},
                "entity2": {"text": second.text, "type": second.label_},
                "distance": abs(first.end - second.start),
            }
            for position, first in enumerate(entities)
            for second in entities[position + 1:]
        ]

# Usage
# NOTE: `llm` is not defined in this snippet. Supply a client whose
# `invoke(prompt)` result exposes the reply text as `.content`.
# Constructing RelationExtractor also loads the default tokenizer and model.
extractor = RelationExtractor()
relations = extractor.extract_relations_llm(
    "Apple CEO Tim Cook announced new products at the event in Cupertino.",
    llm
)

Event Extraction

class EventExtractor:
    """Extract events from text with an LLM prompt or with keyword rules."""

    # Trigger keywords per event type. The \b anchors stop a keyword from
    # matching inside a longer word ("merged" inside "emerged"). Acquisition
    # keywords get their own ACQUISITION label so rule-based output agrees
    # with the declared event types.
    _TRIGGER_PATTERNS = {
        "MERGER": r"\b(mergers?|merged)\b",
        "ACQUISITION": r"\b(acquisitions?|acquired)\b",
        "PRODUCT_LAUNCH": r"\b(launched|released|announced|unveiled)\b",
        "PERSONNEL": r"\b(appointed|hired|fired|resigned|joined)\b",
    }

    # Number of characters of surrounding text kept on each side of a trigger.
    _CONTEXT_CHARS = 100

    def __init__(self, llm):
        """Store *llm*; it must provide ``invoke(prompt)`` -> object with ``.content``."""
        self.llm = llm
        self.event_types = [
            "MERGER", "ACQUISITION", "PRODUCT_LAUNCH",
            "FINANCIAL", "LEGAL", "PERSONNEL"
        ]

    def extract_events(self, text: str) -> list:
        """Ask the LLM for the events in *text* and parse its JSON reply.

        The prompt lists ``self.event_types`` so the model knows which type
        labels to use. A reply wrapped in a Markdown code fence is unwrapped
        before parsing.

        Raises:
            json.JSONDecodeError: if the reply is not valid JSON.
        """
        allowed_types = ", ".join(self.event_types)
        prompt = f"""Extract events from this text. For each event provide:
        - type: event type (one of: {allowed_types})
        - trigger: the word/phrase triggering the event
        - arguments: who, what, when, where, why
        Text: {text}
        Events (JSON):"""

        response = self.llm.invoke(prompt).content
        return self._parse_json_response(response)

    @staticmethod
    def _parse_json_response(raw: str):
        """Parse *raw* as JSON, first unwrapping a surrounding code fence."""
        import json
        import re

        cleaned = raw.strip()
        # Chat models frequently wrap JSON as ```json ... ```; json.loads
        # rejects the fence markers, so peel them off when present.
        fenced = re.fullmatch(
            r"```[\w-]*[ \t]*\n(.*?)\n?[ \t]*```", cleaned, re.DOTALL
        )
        if fenced:
            cleaned = fenced.group(1).strip()
        return json.loads(cleaned)

    def extract_events_rule_based(self, text: str) -> list:
        """Find events by case-insensitive keyword matching.

        Returns one ``{"type", "trigger", "context"}`` dict per keyword hit.
        Hits are grouped by event type in the order the types are declared,
        and ordered by position within each type. ``context`` is the text up
        to 100 characters either side of the trigger.
        """
        import re

        events = []
        for event_type, pattern in self._TRIGGER_PATTERNS.items():
            for match in re.finditer(pattern, text, re.IGNORECASE):
                start = max(0, match.start() - self._CONTEXT_CHARS)
                end = min(len(text), match.end() + self._CONTEXT_CHARS)
                events.append({
                    "type": event_type,
                    "trigger": match.group(),
                    "context": text[start:end],
                })
        return events

    def event_to_triple(self, event: dict) -> tuple:
        """Convert an event dict to a ``(subject, predicate, object)`` tuple.

        The subject and object come from ``arguments["who"]`` and
        ``arguments["what"]`` and default to ``"Unknown"``; the predicate is
        the event ``type`` and defaults to ``"UNKNOWN"``. An ``arguments``
        value that is missing, ``None`` or not a dict is treated as empty.
        """
        arguments = event.get("arguments")
        if not isinstance(arguments, dict):
            # Parsed LLM output may carry null or a list here; do not crash.
            arguments = {}
        subject = arguments.get("who", "Unknown")
        predicate = event.get("type", "UNKNOWN")
        obj = arguments.get("what", "Unknown")
        return (subject, predicate, obj)

# Usage
# NOTE: `llm` is not defined in this snippet. Supply a client whose
# `invoke(prompt)` result exposes the reply text as `.content`.
# This rebinds `extractor`, which the relation-extraction example also uses.
extractor = EventExtractor(llm)
events = extractor.extract_events(
    "Apple acquired Beats Electronics in 2014 for $3 billion."
)

OpenIE Pipeline

class OpenIEPipeline:
    """Open information extraction driven by an LLM prompt."""

    def __init__(self, llm):
        """Store *llm*; it must provide ``invoke(prompt)`` -> object with ``.content``."""
        self.llm = llm

    def extract_open_relations(self, text: str) -> list:
        """Ask the LLM for (subject, predicate, object) triples in *text*.

        Returns the parsed JSON reply. A reply wrapped in a Markdown code
        fence is unwrapped before parsing.

        Raises:
            json.JSONDecodeError: if the reply is not valid JSON.
        """
        prompt = f"""Extract all (subject, predicate, object) triples from this text:
        Text: {text}
        Return as JSON list of triples:"""

        response = self.llm.invoke(prompt).content
        return self._parse_json_response(response)

    @staticmethod
    def _parse_json_response(raw: str):
        """Parse *raw* as JSON, first unwrapping a surrounding code fence."""
        import json
        import re

        cleaned = raw.strip()
        # Chat models frequently wrap JSON as ```json ... ```; json.loads
        # rejects the fence markers, so peel them off when present.
        fenced = re.fullmatch(
            r"```[\w-]*[ \t]*\n(.*?)\n?[ \t]*```", cleaned, re.DOTALL
        )
        if fenced:
            cleaned = fenced.group(1).strip()
        return json.loads(cleaned)

    @staticmethod
    def _unpack_triple(triple) -> tuple:
        """Return ``(subject, predicate, object)`` from one parsed triple.

        Accepts a three-item sequence, or a dict with ``subject``,
        ``predicate`` and ``object`` keys. The prompt does not pin the JSON
        shape, and unpacking a dict directly would yield its key names rather
        than its values.

        Raises:
            ValueError: if a dict lacks a required key or a sequence does not
                have exactly three items.
        """
        if isinstance(triple, dict):
            try:
                return triple["subject"], triple["predicate"], triple["object"]
            except KeyError as err:
                raise ValueError(f"triple is missing key {err}") from err
        subj, pred, obj = triple
        return subj, pred, obj

    def extract_and_link(self, text: str) -> dict:
        """Extract triples and index them by subject.

        Returns ``{"triples": ..., "entity_graph": ...}``. ``triples`` is the
        parsed LLM reply as-is. ``entity_graph`` maps each subject to
        ``{"relations": [{"predicate": ..., "object": ...}, ...]}``.
        """
        triples = self.extract_open_relations(text)
        entity_graph = {}
        for triple in triples:
            subj, pred, obj = self._unpack_triple(triple)
            node = entity_graph.setdefault(subj, {"relations": []})
            node["relations"].append({"predicate": pred, "object": obj})
        return {"triples": triples, "entity_graph": entity_graph}

    def batch_extract(self, texts: list) -> list:
        """Run ``extract_open_relations`` on each text and return the results in order."""
        return [self.extract_open_relations(text) for text in texts]

# Usage
# NOTE: `llm` is not defined in this snippet. Supply a client whose
# `invoke(prompt)` result exposes the reply text as `.content`.
openie = OpenIEPipeline(llm)
result = openie.extract_and_link(
    "Tim Cook is the CEO of Apple. Apple makes iPhones."
)
# The result has two keys. Illustrative value (the triples depend on the LLM's reply):
# {"triples": [["Tim Cook", "CEO of", "Apple"], ["Apple", "makes", "iPhones"]],
#  "entity_graph": {
#      "Tim Cook": {"relations": [{"predicate": "CEO of", "object": "Apple"}]},
#      "Apple": {"relations": [{"predicate": "makes", "object": "iPhones"}]}}}

Key Takeaways

NER identifies and classifies named entities in text
Relation extraction discovers relationships between entities
Event extraction captures actions and their participants
Knowledge graphs store extracted information as connected nodes
LLM-based extraction enables flexible, zero-shot information extraction

Information Extraction

Information Extraction

NER with spaCy

Relation Extraction

Event Extraction

OpenIE Pipeline

Key Takeaways

Premium Content

Need Expert Generative AI Help?