Information Extraction
Information extraction converts unstructured text into structured data through entity recognition, relation extraction, and event detection.
NER with spaCy
import spacy
class SpacyNERExtractor:
    """Named-entity extraction helpers built on a spaCy pipeline.

    The pipeline is loaded once in ``__init__`` and stored on ``self.nlp``;
    every method runs it over the given text and reshapes ``doc.ents``.
    """

    def __init__(self, model: str = "en_core_web_sm"):
        """Load the spaCy pipeline identified by ``model``."""
        self.nlp = spacy.load(model)

    def extract_entities(self, text: str) -> list:
        """Return one dict per entity, in document order.

        Each dict has the keys ``text``, ``label``, ``start`` and ``end``;
        ``start``/``end`` are the entity's character offsets in ``text``.
        """
        doc = self.nlp(text)
        return [
            {
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char,
            }
            for ent in doc.ents
        ]

    def extract_entities_detailed(self, text: str) -> dict:
        """Group entities by label.

        Returns a dict mapping each entity label to a list of
        ``{"text", "start", "end"}`` dicts, in document order.
        """
        doc = self.nlp(text)
        entities = {}
        for ent in doc.ents:
            entities.setdefault(ent.label_, []).append({
                "text": ent.text,
                "start": ent.start_char,
                "end": ent.end_char,
            })
        return entities

    def extract_with_context(self, text: str, window: int = 50) -> list:
        """Return each entity together with its surrounding text.

        Args:
            text: The text to analyse.
            window: Number of characters to include on each side of the
                entity; the slice is clamped to the bounds of ``text``.

        Returns:
            A list of ``{"entity", "type", "context"}`` dicts.

        Raises:
            ValueError: If ``window`` is negative.
        """
        # A negative window would shrink the slice *inside* the entity and
        # return a "context" that no longer contains the entity itself.
        if window < 0:
            raise ValueError("window must be non-negative")

        doc = self.nlp(text)
        results = []
        for ent in doc.ents:
            start = max(0, ent.start_char - window)
            end = min(len(text), ent.end_char + window)
            results.append({
                "entity": ent.text,
                "type": ent.label_,
                "context": text[start:end],
            })
        return results
# Usage: load the default spaCy model and extract entities from one sentence.
ner = SpacyNERExtractor()
entities = ner.extract_entities("Apple CEO Tim Cook announced new iPhone in Cupertino.")
# Illustrative result (the actual entities depend on the loaded model). Each
# dict also carries "start"/"end" character offsets, omitted here for brevity:
# [{"text": "Apple", "label": "ORG"}, {"text": "Tim Cook", "label": "PERSON"},
# {"text": "Cupertino", "label": "GPE"}]
Relation Extraction
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
class RelationExtractor:
    """Relation extraction via a Hugging Face classifier, an LLM, or spaCy pairs.

    Three independent strategies are offered:

    * ``extract_relations`` - run the loaded sequence-classification model.
    * ``extract_relations_llm`` - prompt a caller-supplied LLM for JSON.
    * ``extract_entity_pairs`` - enumerate candidate entity pairs with spaCy.
    """

    # Label assigned to each class index produced by the classifier.
    _RELATION_LABELS = (
        "employed_by", "located_in", "founded_by", "produces",
        "part_of", "subsidiary_of",
    )

    def __init__(self, model_name: str = "Babelscape/rebel-large"):
        """Load the tokenizer and classification model for ``model_name``."""
        # NOTE(review): the default checkpoint appears to be a seq2seq
        # (generation) model; loading it with a sequence-classification head
        # presumably leaves that head untrained, and its class ids are not
        # tied to _RELATION_LABELS. Confirm the intended checkpoint/head.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        # spaCy pipeline used by extract_entity_pairs; loaded on first use.
        self._nlp = None

    def extract_relations(self, text: str) -> list:
        """Classify ``text`` and return the decoded relation label(s)."""
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)
        return self._decode_predictions(text, predictions)

    def extract_relations_llm(self, text: str, llm) -> list:
        """Ask ``llm`` for the relations in ``text`` and parse its JSON reply.

        Args:
            text: The text to analyse.
            llm: Any object where ``llm.invoke(prompt).content`` yields the
                reply string.

        Returns:
            The parsed JSON value; the prompt requests a list of
            ``{"head", "relation", "tail"}`` dicts.

        Raises:
            json.JSONDecodeError: If no JSON can be recovered from the reply.
        """
        prompt = (
            "Extract all relations from this text as JSON:\n"
            f"Text: {text}\n"
            'Return format: [{"head": "entity1", "relation": "relation", "tail": "entity2"}]\n'
            "Relations:"
        )
        response = llm.invoke(prompt).content
        return self._parse_json_response(response)

    @staticmethod
    def _parse_json_response(response: str):
        """Parse an LLM reply as JSON, tolerating code fences and extra prose."""
        import json

        cleaned = response.strip()
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            # Replies are often wrapped in ```json fences or prefixed with
            # commentary; retry on the outermost [...] span.
            start, end = cleaned.find("["), cleaned.rfind("]")
            if start == -1 or end <= start:
                raise
            return json.loads(cleaned[start:end + 1])

    def _decode_predictions(self, text: str, predictions: torch.Tensor) -> list:
        """Map up to five predicted class ids to ``{"relation": label}`` dicts.

        ``text`` is accepted for interface compatibility but is not used.
        Class ids outside the known label table decode to ``"unknown"``
        instead of raising ``IndexError``.
        """
        labels = self._RELATION_LABELS
        decoded = []
        # reshape(-1) lets 0-dim and batched tensors be sliced uniformly.
        for p in predictions.reshape(-1)[:5]:
            idx = p.item()
            label = labels[idx] if 0 <= idx < len(labels) else "unknown"
            decoded.append({"relation": label})
        return decoded

    def extract_entity_pairs(self, text: str) -> list:
        """Return every ordered pair of entities found in ``text``.

        Each item holds ``entity1``/``entity2`` (text and type) and
        ``distance``: the absolute difference between the first span's end
        token index and the second span's start token index.
        """
        # Loading a spaCy model is expensive, so do it once per instance.
        nlp = getattr(self, "_nlp", None)
        if nlp is None:
            import spacy
            nlp = self._nlp = spacy.load("en_core_web_sm")

        entities = list(nlp(text).ents)
        return [
            {
                "entity1": {"text": e1.text, "type": e1.label_},
                "entity2": {"text": e2.text, "type": e2.label_},
                "distance": abs(e1.end - e2.start),
            }
            for i, e1 in enumerate(entities)
            for e2 in entities[i + 1:]
        ]
# Usage
# NOTE: `llm` is not defined in this snippet. extract_relations_llm calls
# llm.invoke(prompt).content, so any object exposing that interface works.
# NOTE(review): RelationExtractor() also loads the Hugging Face tokenizer and
# model in __init__, even though only the LLM-based method is used here.
extractor = RelationExtractor()
relations = extractor.extract_relations_llm(
"Apple CEO Tim Cook announced new products at the event in Cupertino.",
llm
)
Event Extraction
class EventExtractor:
    """Event extraction using either an LLM prompt or keyword rules."""

    def __init__(self, llm):
        """Store the LLM client and the event-type vocabulary.

        Args:
            llm: Any object where ``llm.invoke(prompt).content`` yields the
                reply string.
        """
        self.llm = llm
        self.event_types = [
            "MERGER", "ACQUISITION", "PRODUCT_LAUNCH",
            "FINANCIAL", "LEGAL", "PERSONNEL"
        ]

    def extract_events(self, text: str) -> list:
        """Ask the LLM for the events in ``text`` and parse its JSON reply.

        The prompt lists ``self.event_types`` so the model answers with the
        declared vocabulary rather than free-form type names.

        Raises:
            json.JSONDecodeError: If no JSON can be recovered from the reply.
        """
        prompt = (
            "Extract events from this text. For each event provide:\n"
            f"- type: event type (one of: {', '.join(self.event_types)})\n"
            "- trigger: the word/phrase triggering the event\n"
            "- arguments: who, what, when, where, why\n"
            f"Text: {text}\n"
            "Events (JSON):"
        )
        response = self.llm.invoke(prompt).content
        return self._parse_json_response(response)

    @staticmethod
    def _parse_json_response(response: str):
        """Parse an LLM reply as JSON, tolerating code fences and extra prose."""
        import json

        cleaned = response.strip()
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            # Replies are often wrapped in ```json fences or prefixed with
            # commentary; retry on the outermost [...] span.
            start, end = cleaned.find("["), cleaned.rfind("]")
            if start == -1 or end <= start:
                raise
            return json.loads(cleaned[start:end + 1])

    def extract_events_rule_based(self, text: str) -> list:
        """Find events by matching trigger keywords, case-insensitively.

        Returns:
            A list of ``{"type", "trigger", "context"}`` dicts grouped by
            event type (in pattern order), then by position in ``text``.
            ``context`` is the trigger plus up to 100 characters either side.
        """
        import re

        # The leading \b stops triggers from matching mid-word ("emerged"
        # contains "merged", "misfired" contains "fired"). There is no
        # trailing \b, so suffixed forms such as "mergers" still match.
        # Acquisitions get their own type instead of being labelled MERGER.
        patterns = {
            "MERGER": r"\b(merger|merged)",
            "ACQUISITION": r"\b(acquisition|acquired)",
            "PRODUCT_LAUNCH": r"\b(launched|released|announced|unveiled)",
            "PERSONNEL": r"\b(appointed|hired|fired|resigned|joined)",
        }
        events = []
        for event_type, pattern in patterns.items():
            for match in re.finditer(pattern, text, re.IGNORECASE):
                start = max(0, match.start() - 100)
                end = min(len(text), match.end() + 100)
                events.append({
                    "type": event_type,
                    "trigger": match.group(),
                    "context": text[start:end],
                })
        return events

    def event_to_triple(self, event: dict) -> tuple:
        """Convert an event dict to a ``(subject, predicate, object)`` tuple.

        Subject and object come from ``arguments["who"]`` and
        ``arguments["what"]`` (default ``"Unknown"``); the predicate is the
        event ``type`` (default ``"UNKNOWN"``).
        """
        arguments = event.get("arguments")
        # LLM output may carry null or a non-mapping here; treat as empty.
        if not isinstance(arguments, dict):
            arguments = {}
        subject = arguments.get("who", "Unknown")
        predicate = event.get("type", "UNKNOWN")
        obj = arguments.get("what", "Unknown")
        return (subject, predicate, obj)
# Usage
# NOTE: `llm` is not defined in this snippet. EventExtractor calls
# llm.invoke(prompt).content, so any object exposing that interface works.
# This rebinds the module-level name `extractor` to an EventExtractor.
extractor = EventExtractor(llm)
events = extractor.extract_events(
"Apple acquired Beats Electronics in 2014 for $3 billion."
)
OpenIE Pipeline
class OpenIEPipeline:
    """Open information extraction: LLM-produced triples plus a subject index."""

    def __init__(self, llm):
        """Store the LLM client.

        Args:
            llm: Any object where ``llm.invoke(prompt).content`` yields the
                reply string.
        """
        self.llm = llm

    def extract_open_relations(self, text: str) -> list:
        """Ask the LLM for (subject, predicate, object) triples in ``text``.

        Returns:
            The parsed JSON value; the prompt requests a list of triples.

        Raises:
            json.JSONDecodeError: If no JSON can be recovered from the reply.
        """
        prompt = (
            "Extract all (subject, predicate, object) triples from this text:\n"
            f"Text: {text}\n"
            "Return as JSON list of triples:"
        )
        response = self.llm.invoke(prompt).content
        return self._parse_json_response(response)

    @staticmethod
    def _parse_json_response(response: str):
        """Parse an LLM reply as JSON, tolerating code fences and extra prose."""
        import json

        cleaned = response.strip()
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            # Replies are often wrapped in ```json fences or prefixed with
            # commentary; retry on the outermost [...] span.
            start, end = cleaned.find("["), cleaned.rfind("]")
            if start == -1 or end <= start:
                raise
            return json.loads(cleaned[start:end + 1])

    @staticmethod
    def _as_spo(triple):
        """Return ``(subject, predicate, object)`` for a triple, or ``None``.

        Accepts a 3-item list/tuple or a dict with the keys ``subject``,
        ``predicate`` and ``object``; any other shape yields ``None``.
        """
        if isinstance(triple, dict):
            try:
                return triple["subject"], triple["predicate"], triple["object"]
            except KeyError:
                return None
        if isinstance(triple, (list, tuple)) and len(triple) == 3:
            return tuple(triple)
        return None

    def extract_and_link(self, text: str) -> dict:
        """Extract triples and index them by subject.

        Returns:
            ``{"triples": <parsed LLM output, unchanged>,
            "entity_graph": {subject: {"relations": [{"predicate", "object"}]}}}``.
            Triples whose shape cannot be interpreted are left out of
            ``entity_graph`` but remain in ``triples``.
        """
        triples = self.extract_open_relations(text)
        entities = {}
        for triple in triples:
            parts = self._as_spo(triple)
            if parts is None:
                # Unpacking a dict would yield its keys, and a wrong-length
                # list would raise; skip anything that is not a clean triple.
                continue
            subj, pred, obj = parts
            node = entities.setdefault(subj, {"relations": []})
            node["relations"].append({"predicate": pred, "object": obj})
        return {"triples": triples, "entity_graph": entities}

    def batch_extract(self, texts: list) -> list:
        """Run ``extract_open_relations`` on each text, preserving order."""
        return [self.extract_open_relations(text) for text in texts]
# Usage
# NOTE: `llm` is not defined in this snippet. OpenIEPipeline calls
# llm.invoke(prompt).content, so any object exposing that interface works.
openie = OpenIEPipeline(llm)
result = openie.extract_and_link(
"Tim Cook is the CEO of Apple. Apple makes iPhones."
)
# Illustrative result (the triples depend on the LLM's reply):
# {"triples": [["Tim Cook", "CEO of", "Apple"], ["Apple", "makes", "iPhones"]],
#  "entity_graph": {
#      "Tim Cook": {"relations": [{"predicate": "CEO of", "object": "Apple"}]},
#      "Apple": {"relations": [{"predicate": "makes", "object": "iPhones"}]}}}
Key Takeaways
- NER identifies and classifies named entities in text
- Relation extraction discovers relationships between entities
- Event extraction captures actions and their participants
- Knowledge graphs store extracted triples as entity nodes connected by relation edges
- LLM-based extraction enables flexible, zero-shot information extraction