Named Entity Recognition
Named Entity Recognition (NER) identifies and classifies named entities in text into predefined categories like persons, organizations, locations, and dates.
CRF-Based NER
import sklearn_crfsuite
from sklearn_crfsuite import metrics
class CRFNER:
    """Linear-chain CRF tagger driven by handcrafted per-token features.

    Training and evaluation sentences are lists of ``(word, tag)`` pairs.
    Feature extraction (and therefore ``predict``) also accepts sentences
    given as plain lists of token strings, since no tags exist at inference
    time.
    """

    def __init__(self):
        # L-BFGS training with both L1 (c1) and L2 (c2) regularisation.
        self.model = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100
        )

    @staticmethod
    def _token_text(token) -> str:
        """Return the surface form of a token given as ``str`` or ``(word, tag)``."""
        # Indexing a plain string with [0] would silently yield its first
        # character, so strings are passed through untouched.
        return token if isinstance(token, str) else token[0]

    def _word_features(self, sentence: list, i: int) -> dict:
        """Build the feature dict for the token at position ``i``.

        Args:
            sentence: Tokens as ``(word, tag)`` pairs or plain strings.
            i: Index of the token to featurise.

        Returns:
            Features of the token itself plus its immediate left and right
            neighbours; ``BOS`` / ``EOS`` flags replace a missing neighbour.
        """
        word = self._token_text(sentence[i])
        features = {
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word[-3:]': word[-3:],
            'word[-2:]': word[-2:],
            'word.isupper()': word.isupper(),
            'word.istitle()': word.istitle(),
            'word.isdigit()': word.isdigit(),
        }

        if i > 0:
            prev_word = self._token_text(sentence[i - 1])
            features.update({
                '-1:word.lower()': prev_word.lower(),
                '-1:word.istitle()': prev_word.istitle(),
                '-1:word.isupper()': prev_word.isupper(),
            })
        else:
            features['BOS'] = True

        if i < len(sentence) - 1:
            next_word = self._token_text(sentence[i + 1])
            features.update({
                '+1:word.lower()': next_word.lower(),
                '+1:word.istitle()': next_word.istitle(),
                '+1:word.isupper()': next_word.isupper(),
            })
        else:
            features['EOS'] = True

        return features

    def _sentence_features(self, sentence: list) -> list:
        """Return the feature dicts for every token of one sentence."""
        return [self._word_features(sentence, i) for i, _ in enumerate(sentence)]

    def prepare_data(self, sentences: list) -> tuple:
        """Split tagged sentences into feature sequences and tag sequences.

        Args:
            sentences: List of sentences, each a list of ``(word, tag)`` pairs.

        Returns:
            ``(X, y)`` where ``X[k]`` holds the feature dicts of sentence ``k``
            and ``y[k]`` its tags.
        """
        X = [self._sentence_features(s) for s in sentences]
        y = [[tag for _, tag in s] for s in sentences]
        return X, y

    def train(self, train_sentences: list):
        """Fit the CRF on sentences of ``(word, tag)`` pairs."""
        X_train, y_train = self.prepare_data(train_sentences)
        self.model.fit(X_train, y_train)

    def predict(self, sentence: list) -> list:
        """Predict one tag per token.

        Args:
            sentence: Tokens as plain strings or ``(word, tag)`` pairs (any
                tags present are ignored).

        Returns:
            The predicted tag sequence; an empty list for an empty sentence.
        """
        if not sentence:
            return []
        return self.model.predict_single(self._sentence_features(sentence))

    def evaluate(self, test_sentences: list) -> dict:
        """Score the fitted model on tagged sentences.

        Returns:
            A dict with a text ``report`` and the weighted ``f1`` score, both
            computed over every label the model learned.
        """
        X_test, y_test = self.prepare_data(test_sentences)
        y_pred = self.model.predict(X_test)
        labels = list(self.model.classes_)
        return {
            'report': metrics.flat_classification_report(y_test, y_pred, labels=labels),
            'f1': metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
        }
# Usage: fit the CRF tagger on a toy corpus of (token, BIO-tag) sentences.
train_data = [
    [
        ("Apple", "B-ORG"),
        ("CEO", "O"),
        ("Tim", "B-PER"),
        ("Cook", "I-PER"),
    ],
    [
        ("visited", "O"),
        ("New", "B-LOC"),
        ("York", "I-LOC"),
    ],
]
ner = CRFNER()
ner.train(train_data)
BiLSTM-CRF Model
import torch
import torch.nn as nn
class BiLSTMCRF(nn.Module):
    """BiLSTM encoder that feeds per-token emission scores to a CRF layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        tag_to_ix: Mapping from tag name to tag index; its length is the
            size of the tag set.
        embedding_dim: Width of the word embeddings.
        hidden_dim: Total BiLSTM output width. Each direction gets
            ``hidden_dim // 2`` units, so the value must be even.

    Raises:
        ValueError: If ``hidden_dim`` is odd.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super().__init__()
        if hidden_dim % 2:
            # The two directions together emit 2 * (hidden_dim // 2) features;
            # an odd value would not match the input width of `hidden2tag`.
            raise ValueError(f"hidden_dim must be even, got {hidden_dim}")

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=2, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size)

    def _get_lstm_features(self, sentence):
        """Map token indices to emission scores, one row per token.

        NOTE(review): the CRF layer iterates over the first dimension of the
        result as time steps, so `sentence` is assumed to be a single
        unbatched 1-D tensor of token indices — confirm against callers.
        """
        embeds = self.word_embeds(sentence)
        lstm_out, _ = self.lstm(embeds)
        return self.hidden2tag(lstm_out)

    def neg_log_likelihood(self, sentence, tags):
        """Return the training loss for one sentence and its gold tags.

        The CRF layer's forward pass computes ``log Z - gold_score``, which
        already is the negative log-likelihood, so it is returned as is.
        Negating it again would turn the loss into the log-likelihood and
        make gradient descent push the gold path's probability down.
        """
        feats = self._get_lstm_features(sentence)
        return self.crf(feats, tags)

    def forward(self, sentence):
        """Decode and return the best tag sequence for one sentence.

        NOTE(review): relies on ``self.crf`` exposing ``viterbi_decode`` —
        confirm the CRF layer in use implements it.
        """
        lstm_feats = self._get_lstm_features(sentence)
        return self.crf.viterbi_decode(lstm_feats)
class CRF(nn.Module):
    """Linear-chain CRF over the emission scores of a single sentence.

    ``transitions[i, j]`` is the learned score of moving from tag ``j`` to
    tag ``i``. There are no dedicated START/STOP tags: tag index 0 is treated
    as the implicit tag that precedes the first token, both when scoring a
    gold path and when summing or maximising over all paths.

    All methods take ``feats`` as a ``(seq_len, tagset_size)`` tensor of
    emission scores (one row per token).
    """

    # Score given to "impossible" start states; large enough to vanish in exp().
    _IMPOSSIBLE = -10000.0

    def __init__(self, tagset_size):
        super().__init__()
        self.tagset_size = tagset_size
        self.transitions = nn.Parameter(torch.randn(tagset_size, tagset_size))

    def forward(self, feats, tags):
        """Return the negative log-likelihood of ``tags`` given ``feats``.

        Args:
            feats: ``(seq_len, tagset_size)`` emission scores.
            tags: 1-D long tensor with the gold tag index of every token.

        Returns:
            Tensor of shape ``(1,)`` holding ``log Z - score(gold path)``.
        """
        forward_score = self._forward_algorithm(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def viterbi_decode(self, feats):
        """Return the highest-scoring tag sequence as a list of tag indices.

        An empty ``feats`` (no tokens) decodes to an empty list.
        """
        if feats.size(0) == 0:
            return []

        with torch.no_grad():
            viterbi_var = self._initial_scores(feats)
            backpointers = []
            for feat in feats:
                # candidate[next, prev]: best score ending in `prev`, then
                # moving to `next`. The emission does not depend on `prev`,
                # so it is added after the max.
                candidate = viterbi_var.unsqueeze(0) + self.transitions
                best_scores, best_prev = candidate.max(dim=1)
                viterbi_var = best_scores + feat
                backpointers.append(best_prev)

            best_tag = int(viterbi_var.argmax())
            path = [best_tag]
            # backpointers[0] only points at the implicit start tag, so the
            # walk back stops before it.
            for best_prev in reversed(backpointers[1:]):
                best_tag = int(best_prev[best_tag])
                path.append(best_tag)

        path.reverse()
        return path

    def _initial_scores(self, feats):
        """Scores before the first token: only the implicit start tag 0 is allowed."""
        # new_full keeps the device and dtype of the emissions.
        init = feats.new_full((self.tagset_size,), self._IMPOSSIBLE)
        init[0] = 0.0
        return init

    def _forward_algorithm(self, feats):
        """Return ``log Z``, the log-sum-exp of the scores of all tag paths."""
        forward_var = self._initial_scores(feats)
        for feat in feats:
            # scores[next, prev] = alpha[prev] + transitions[next, prev] + emit[next]
            scores = forward_var.unsqueeze(0) + self.transitions + feat.unsqueeze(1)
            forward_var = torch.logsumexp(scores, dim=1)
        return torch.logsumexp(forward_var, dim=0)

    def _score_sentence(self, feats, tags):
        """Return the score of the gold path ``tags`` as a tensor of shape ``(1,)``."""
        score = feats.new_zeros(1)
        # Prepend the implicit start tag so that path[i] precedes path[i + 1].
        start = torch.zeros(1, dtype=torch.long, device=tags.device)
        path = torch.cat([start, tags])
        for i, feat in enumerate(feats):
            score = score + self.transitions[path[i + 1], path[i]] + feat[path[i + 1]]
        return score

    def _log_sum_exp(self, vec):
        """Numerically stable ``log(sum(exp(vec)))`` over all entries (0-dim result)."""
        return torch.logsumexp(vec.reshape(-1), dim=0)
Transformer-Based NER
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import torch
class TransformerNER:
    """NER wrapper around a Hugging Face token-classification ``pipeline``.

    Args:
        model_name: Checkpoint name or path handed to ``from_pretrained``.
    """

    def __init__(self, model_name: str = "dbmdz/bert-large-cased-finetuned-conll03-english"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
        self.pipe = pipeline("ner", model=self.model, tokenizer=self.tokenizer)

    def extract_entities(self, text: str) -> list:
        """Return the pipeline's token-level predictions for ``text``.

        Returns:
            One dict per predicted token with keys ``text``, ``label``,
            ``score`` (a plain ``float`` rounded to 4 places), ``start`` and
            ``end``.
        """
        return [
            {
                "text": e["word"],
                "label": e["entity"],
                # float() turns numpy scalars into JSON-serialisable floats.
                "score": round(float(e["score"]), 4),
                "start": e["start"],
                "end": e["end"]
            }
            for e in self.pipe(text)
        ]

    @staticmethod
    def _surface(text: str, start, end, fallback: str) -> str:
        """Slice ``text[start:end]``, or return ``fallback`` when offsets are missing."""
        if start is None or end is None:
            return fallback
        return text[start:end]

    @staticmethod
    def _join_piece(so_far: str, piece: str) -> str:
        """Append a token to an entity string, gluing ``##`` word pieces on directly."""
        if piece.startswith("##"):
            return so_far + piece[2:]
        return so_far + " " + piece

    def extract_entities_grouped(self, text: str) -> list:
        """Merge B-/I- tagged tokens into whole entities.

        An ``I-X`` token extends the open entity only when that entity also
        has type ``X``; otherwise it opens a new entity, as does every ``B-``
        token. Any other label closes the open entity. Entity text is cut
        from ``text`` by character offsets so word pieces and original
        spacing are preserved; when offsets are unavailable the tokens are
        joined instead.

        Returns:
            One dict per entity with keys ``text``, ``label`` (without the
            B-/I- prefix), ``start``, ``end`` and ``score`` (the lowest token
            score inside the entity).
        """
        grouped = []
        current_entity = None

        for entity in self.extract_entities(text):
            prefix, _, entity_type = entity["label"].partition("-")

            if prefix not in ("B", "I") or not entity_type:
                if current_entity:
                    grouped.append(current_entity)
                    current_entity = None
                continue

            continues_current = (
                prefix == "I"
                and current_entity is not None
                and current_entity["label"] == entity_type
            )
            if continues_current:
                current_entity["end"] = entity["end"]
                current_entity["text"] = self._surface(
                    text,
                    current_entity["start"],
                    current_entity["end"],
                    self._join_piece(current_entity["text"], entity["text"]),
                )
                current_entity["score"] = min(current_entity["score"], entity["score"])
                continue

            if current_entity:
                grouped.append(current_entity)
            current_entity = {
                "text": self._surface(text, entity["start"], entity["end"], entity["text"]),
                "label": entity_type,
                "start": entity["start"],
                "end": entity["end"],
                "score": entity["score"]
            }

        if current_entity:
            grouped.append(current_entity)

        return grouped

    def batch_extract(self, texts: list) -> list:
        """Return the token-level predictions of every text, in input order."""
        return [self.extract_entities(text) for text in texts]
# Usage
ner = TransformerNER()
# Use the grouped variant: extract_entities() returns one entry per token with
# raw B-/I- labels, whereas the merged entities shown below come from grouping.
entities = ner.extract_entities_grouped("Apple CEO Tim Cook visited New York yesterday.")
# Illustrative output (scores depend on the model):
# [{"text": "Apple", "label": "ORG", "score": 0.99, ...},
# {"text": "Tim Cook", "label": "PER", "score": 0.98, ...},
# {"text": "New York", "label": "LOC", "score": 0.99, ...}]
Custom NER Training
from transformers import (
AutoTokenizer, AutoModelForTokenClassification,
TrainingArguments, Trainer
)
from datasets import Dataset
import numpy as np
class CustomNERTrainer:
    """Fine-tune a pretrained transformer for token-level NER.

    Args:
        model_name: Checkpoint name or path handed to ``from_pretrained``.
        label_list: All tag names; a tag's position in the list is its id.
    """

    def __init__(self, model_name: str, label_list: list):
        self.label_list = label_list
        self.label_to_id = {l: i for i, l in enumerate(label_list)}
        self.id_to_label = {i: l for l, i in self.label_to_id.items()}

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Storing the label maps in the model config keeps tag names (instead
        # of generic LABEL_n placeholders) attached to the saved checkpoint.
        self.model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=len(label_list),
            id2label=self.id_to_label,
            label2id=self.label_to_id
        )

    def tokenize_and_align_labels(self, examples):
        """Tokenize pre-split words and align word-level tags to sub-tokens.

        Args:
            examples: Batch with ``"tokens"`` (list of word lists) and
                ``"ner_tags"`` (list of tag lists; tags may be tag names or
                integer tag ids).

        Returns:
            The tokenizer output with an added ``"labels"`` entry. Only the
            first sub-token of each word carries the word's tag id; special
            tokens and remaining sub-tokens get -100 so the loss skips them.
        """
        tokenized = self.tokenizer(
            examples["tokens"],
            truncation=True,
            is_split_into_words=True
        )

        labels = []
        for i, label in enumerate(examples["ner_tags"]):
            word_ids = tokenized.word_ids(batch_index=i)
            label_ids = []
            previous_word_idx = None

            for word_idx in word_ids:
                if word_idx is None or word_idx == previous_word_idx:
                    label_ids.append(-100)
                else:
                    tag = label[word_idx]
                    # Integer tags are already ids; names go through the map.
                    label_ids.append(tag if isinstance(tag, int) else self.label_to_id[tag])
                previous_word_idx = word_idx

            labels.append(label_ids)

        tokenized["labels"] = labels
        return tokenized

    def train(self, train_data: dict, val_data: dict, output_dir: str = "./ner_model"):
        """Fine-tune the model and evaluate it after every epoch.

        Args:
            train_data: Column dict with ``"tokens"`` and ``"ner_tags"``.
            val_data: Validation data in the same layout.
            output_dir: Directory for checkpoints written by the Trainer.
        """
        # Imported here so the block does not depend on edits to the
        # file-level import list.
        from transformers import DataCollatorForTokenClassification

        train_dataset = Dataset.from_dict(train_data).map(
            self.tokenize_and_align_labels, batched=True
        )
        val_dataset = Dataset.from_dict(val_data).map(
            self.tokenize_and_align_labels, batched=True
        )

        # NOTE(review): newer transformers releases rename
        # `evaluation_strategy` to `eval_strategy` — confirm against the
        # installed version.
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            warmup_steps=500,
            weight_decay=0.01,
            evaluation_strategy="epoch"
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            # Sentences are tokenized without padding, so every batch must be
            # padded (inputs and labels alike) before it can be stacked.
            data_collator=DataCollatorForTokenClassification(self.tokenizer),
            compute_metrics=self.compute_metrics
        )

        trainer.train()

    def compute_metrics(self, eval_pred):
        """Compute token accuracy over positions that carry a real label.

        Args:
            eval_pred: ``(predictions, labels)`` where predictions are logits
                with the tag dimension last and labels use -100 for ignored
                positions.

        Returns:
            ``{"accuracy": fraction of labelled tokens predicted correctly}``;
            0.0 when no position is labelled.
        """
        predictions, labels = eval_pred
        predicted_ids = np.argmax(predictions, axis=2)
        labels = np.asarray(labels)

        scored = labels != -100
        total = int(scored.sum())
        if total == 0:
            return {"accuracy": 0.0}

        correct = int((predicted_ids[scored] == labels[scored]).sum())
        return {"accuracy": correct / total}

    def predict(self, text: str) -> list:
        """Tag raw text and return one ``{"token", "label"}`` dict per sub-token.

        The [CLS], [SEP] and [PAD] tokens are left out of the result.
        """
        # Inputs must live on the model's device (the Trainer may have moved
        # the model to an accelerator).
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True).to(self.model.device)

        # Inference only: switch off dropout and skip gradient tracking.
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)

        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        return [
            {"token": token, "label": self.id_to_label[pred.item()]}
            for token, pred in zip(tokens, predictions[0])
            if token not in ["[CLS]", "[SEP]", "[PAD]"]
        ]
# Usage
# Trainer data is a column dict: parallel lists of word sequences and their
# tag sequences (not the list of (word, tag) pairs used by the CRF example).
train_data = {
    "tokens": [["Apple", "CEO", "Tim", "Cook"], ["visited", "New", "York"]],
    "ner_tags": [["B-ORG", "O", "B-PER", "I-PER"], ["O", "B-LOC", "I-LOC"]],
}
val_data = {
    "tokens": [["Tim", "Cook", "visited", "Apple"]],
    "ner_tags": [["B-PER", "I-PER", "O", "B-ORG"]],
}
trainer = CustomNERTrainer("bert-base-cased", ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"])
trainer.train(train_data, val_data)
Key Takeaways
- CRF captures label dependencies with handcrafted features
- BiLSTM-CRF combines neural features with structured prediction
- Transformers provide state-of-the-art NER with minimal feature engineering
- BIO tagging marks entity boundaries: B- (Beginning) opens an entity, I- (Inside) continues it, and O (Outside) labels tokens that belong to no entity
- Custom NER enables domain-specific entity extraction