Data Labeling and Annotation
Labeling Workflow
A robust labeling pipeline combines automated pre-labeling with human review to achieve high quality at scale.
Active Learning Implementation
import numpy as np
from typing import List, Tuple
from sklearn.ensemble import RandomForestClassifier
class ActiveLearner:
    """Pool-based active learner that chooses which samples to label next.

    The learner keeps its own array copy of the feature pool, tracks which
    rows have been labeled, and ranks the remaining rows by how uncertain the
    wrapped model's class probabilities are.

    Attributes:
        model: Object exposing ``fit(X, y)`` and ``predict_proba(X)``.
        X_pool: Array copy of the feature pool.
        y_pool: Array of labels aligned with ``X_pool``. When no labels are
            supplied it is allocated with ``None`` placeholders so that
            labels passed to ``label_instances`` are never discarded.
        labeled_indices: Pool indices labeled so far, in labeling order.
        unlabeled_indices: Pool indices that still await a label.
    """

    def __init__(self, model, X_pool, y_pool=None):
        """Set up the learner over ``X_pool``.

        Args:
            model: Object exposing ``fit`` and ``predict_proba``.
            X_pool: Feature pool (array or nested sequence); it is copied.
            y_pool: Optional labels aligned with ``X_pool``. An ndarray is
                used as-is (and updated in place by ``label_instances``);
                other sequences are converted to an array.
        """
        self.model = model
        # np.array copies, and also makes list inputs safe for the
        # index-list lookups used by the query and train methods.
        self.X_pool = np.array(X_pool)
        n_samples = len(self.X_pool)
        if y_pool is None:
            self.y_pool = np.full(n_samples, None, dtype=object)
        else:
            self.y_pool = np.asarray(y_pool)
        self.labeled_indices = []
        self.unlabeled_indices = list(range(n_samples))

    def _predict_unlabeled(self):
        """Return the model's class probabilities for the unlabeled rows."""
        return np.asarray(
            self.model.predict_proba(self.X_pool[self.unlabeled_indices])
        )

    def query_uncertainty(self, n_instances: int = 10) -> List[int]:
        """Pick the unlabeled samples with the highest predictive entropy.

        Args:
            n_instances: Maximum number of pool indices to return.

        Returns:
            Up to ``n_instances`` pool indices, ordered from lower to higher
            entropy. Empty when ``n_instances <= 0`` or nothing is unlabeled.
        """
        # Guard explicitly: a slice of [-0:] would return the whole pool.
        if n_instances <= 0 or not self.unlabeled_indices:
            return []
        proba = self._predict_unlabeled()
        # The small epsilon keeps log() finite for zero probabilities.
        entropy = -np.sum(proba * np.log(proba + 1e-10), axis=1)
        top_positions = np.argsort(entropy)[-n_instances:]
        return [self.unlabeled_indices[i] for i in top_positions]

    def query_margin(self, n_instances: int = 10) -> List[int]:
        """Pick the unlabeled samples with the smallest top-two margin.

        Args:
            n_instances: Maximum number of pool indices to return.

        Returns:
            Up to ``n_instances`` pool indices, smallest margin first. Empty
            when ``n_instances <= 0`` or nothing is unlabeled.
        """
        if n_instances <= 0 or not self.unlabeled_indices:
            return []
        sorted_proba = np.sort(self._predict_unlabeled(), axis=1)
        if sorted_proba.shape[1] < 2:
            # Only one class column: treat the missing runner-up as 0.
            margins = sorted_proba[:, -1]
        else:
            margins = sorted_proba[:, -1] - sorted_proba[:, -2]
        top_positions = np.argsort(margins)[:n_instances]
        return [self.unlabeled_indices[i] for i in top_positions]

    def label_instances(self, indices: List[int], labels: List[int]):
        """Store labels for pool samples and mark those samples as labeled.

        Labeling an index that is already labeled overwrites its label
        without duplicating it in ``labeled_indices``.

        Args:
            indices: Pool indices being labeled.
            labels: Labels aligned one-to-one with ``indices``.

        Raises:
            ValueError: If ``indices`` and ``labels`` differ in length.
            IndexError: If an index falls outside the pool.
        """
        positions = [int(idx) for idx in indices]
        labels = list(labels)
        if len(positions) != len(labels):
            raise ValueError(
                f"got {len(positions)} indices but {len(labels)} labels"
            )
        pool_size = len(self.y_pool)
        # Validate everything first so a bad index cannot leave the
        # bookkeeping half-updated.
        for idx in positions:
            if not 0 <= idx < pool_size:
                raise IndexError(f"index {idx} is outside the pool")

        seen = set(self.labeled_indices)
        for idx, label in zip(positions, labels):
            self.y_pool[idx] = label
            if idx not in seen:
                seen.add(idx)
                self.labeled_indices.append(idx)
        # One pass over the pool instead of one list.remove() per index;
        # slice assignment keeps the same list object for existing holders.
        self.unlabeled_indices[:] = [
            idx for idx in self.unlabeled_indices if idx not in seen
        ]

    def train(self):
        """Fit the model on every labeled sample; no-op if none are labeled."""
        if not self.labeled_indices:
            return
        X_train = self.X_pool[self.labeled_indices]
        y_train = self.y_pool[self.labeled_indices]
        if y_train.dtype == object:
            # Rebuild so NumPy infers a concrete dtype from the stored labels.
            y_train = np.array(y_train.tolist())
        self.model.fit(X_train, y_train)
# Example usage; X_pool and y_pool are expected to be defined by the caller.
learner = ActiveLearner(RandomForestClassifier(), X_pool, y_pool)
# Seed the learner with a few known labels and fit once before querying:
# scikit-learn estimators raise NotFittedError if predict_proba is called
# before fit. (Assumes y_pool holds labels for the seed rows.)
seed_indices = list(range(min(10, len(X_pool))))
learner.label_instances(seed_indices, [y_pool[i] for i in seed_indices])
learner.train()
query_indices = learner.query_uncertainty(n_instances=20)
Pre-labeling with LLMs
import openai
from typing import List, Dict
class LLMPreLabeler:
    """Pre-label texts with a chat model and flag doubtful ones for review.

    Attributes:
        client: OpenAI client used for chat completions.
        model: Name of the chat model sent with each request.
        review_threshold: Confidence below which a prediction is flagged
            with ``needs_review``.
    """

    # Confidence assumed when the model omits it or returns an unusable value.
    DEFAULT_CONFIDENCE = 0.5

    def __init__(
        self,
        api_key: str,
        model: str = "gpt-3.5-turbo",
        review_threshold: float = 0.8,
    ):
        """Create the API client.

        Args:
            api_key: OpenAI API key.
            model: Chat model used for classification.
            review_threshold: Predictions with confidence below this value
                are marked ``needs_review``.
        """
        self.client = openai.OpenAI(api_key=api_key)
        self.model = model
        self.review_threshold = review_threshold

    @staticmethod
    def _parse_response(content) -> Dict:
        """Decode the model's reply into a dict; return ``{}`` if unusable."""
        import json

        try:
            parsed = json.loads(content)
        except (TypeError, ValueError):
            # TypeError: content is None; ValueError: malformed JSON.
            return {}
        return parsed if isinstance(parsed, dict) else {}

    def _coerce_confidence(self, raw) -> float:
        """Turn the reported confidence into a float within [0, 1].

        Missing, non-numeric, NaN or out-of-range values fall back to
        ``DEFAULT_CONFIDENCE``.
        """
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return self.DEFAULT_CONFIDENCE
        # The chained comparison is False for NaN, so NaN falls back too.
        if not 0.0 <= value <= 1.0:
            return self.DEFAULT_CONFIDENCE
        return value

    def pre_label_text(
        self,
        texts: List[str],
        labels: List[str]
    ) -> List[Dict]:
        """Classify each text into one of ``labels``.

        One chat-completion request is sent per text.

        Args:
            texts: Texts to classify.
            labels: Allowed label names.

        Returns:
            One dict per text with keys ``text``, ``predicted_label``,
            ``confidence`` and ``needs_review``. ``needs_review`` is True
            when the confidence is below ``review_threshold`` or the
            predicted label is not one of ``labels`` (including when the
            reply could not be parsed).
        """
        # The prompt does not depend on the text, so build it once.
        system_prompt = (
            f"Classify the text into one of: {', '.join(labels)}.\n"
            'Return JSON with "label" and "confidence" fields.'
        )
        results = []
        for text in texts:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": text},
                ],
                temperature=0,
                response_format={"type": "json_object"},
            )
            parsed = self._parse_response(response.choices[0].message.content)
            predicted = parsed.get("label")
            confidence = self._coerce_confidence(parsed.get("confidence"))
            results.append({
                "text": text,
                "predicted_label": predicted,
                "confidence": confidence,
                "needs_review": (
                    confidence < self.review_threshold
                    or predicted not in labels
                ),
            })
        return results

    def batch_pre_label(
        self,
        texts: List[str],
        labels: List[str],
        batch_size: int = 10
    ) -> List[Dict]:
        """Pre-label ``texts`` in consecutive chunks of ``batch_size``.

        Each chunk is handed to ``pre_label_text``, which still issues one
        request per text; chunking only bounds how much work is done per call.

        Args:
            texts: Texts to classify.
            labels: Allowed label names.
            batch_size: Number of texts per chunk; must be positive.

        Returns:
            The concatenated results, in the same order as ``texts``.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        all_results = []
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            all_results.extend(self.pre_label_text(chunk, labels))
        return all_results
import os

# Read the key from the environment instead of committing it to source; the
# placeholder remains only as a fallback so the example keeps its shape.
pre_labeler = LLMPreLabeler(
    api_key=os.environ.get("OPENAI_API_KEY", "your-api-key")
)
labeled_data = pre_labeler.pre_label_text(
    texts=["Great product!", "Terrible service", "Average experience"],
    labels=["positive", "negative", "neutral"],
)
Inter-Annotator Agreement
from typing import List
import numpy as np
class AgreementCalculator:
    """Inter-annotator agreement statistics (Cohen's and Fleiss' kappa)."""

    @staticmethod
    def cohens_kappa(labels1: List[int], labels2: List[int]) -> float:
        """Compute Cohen's kappa for two annotators over the same items.

        Args:
            labels1: Labels from the first annotator (any sequence of
                hashable values, including a 1-D array).
            labels2: Labels from the second annotator, aligned with
                ``labels1``.

        Returns:
            ``(p_o - p_e) / (1 - p_e)``, where ``p_o`` is the observed
            agreement and ``p_e`` the agreement expected by chance. Returns
            1.0 when ``p_e`` is exactly 1 (both annotators used one and the
            same category throughout).

        Raises:
            ValueError: If the inputs differ in length or are empty.
        """
        from collections import Counter

        # Materialize as lists so arrays and tuples behave like lists
        # (``+`` on arrays would add element-wise instead of concatenating).
        first = list(labels1)
        second = list(labels2)
        if len(first) != len(second):
            raise ValueError(
                f"label sequences differ in length: {len(first)} vs {len(second)}"
            )
        n = len(first)
        if n == 0:
            raise ValueError("at least one labeled item is required")

        agreements = sum(1 for a, b in zip(first, second) if a == b)
        p_o = agreements / n

        counts1 = Counter(first)
        counts2 = Counter(second)
        # A category used by only one annotator contributes 0, so iterating
        # one counter is enough. Kept as an integer for an exact p_e == 1 test.
        chance_pairs = sum(counts1[cat] * counts2[cat] for cat in counts1)
        if chance_pairs == n * n:
            return 1.0
        p_e = chance_pairs / (n * n)
        return (p_o - p_e) / (1 - p_e)

    @staticmethod
    def fleiss_kappa(ratings: np.ndarray) -> float:
        """Compute Fleiss' kappa from an items-by-categories count table.

        Args:
            ratings: 2-D table where ``ratings[i, j]`` is the number of
                raters who put item ``i`` in category ``j``. Every row must
                sum to the same number of raters.

        Returns:
            ``(P - P_e) / (1 - P_e)``, or 1.0 when ``P_e`` is exactly 1
            (every rating falls in a single category).

        Raises:
            ValueError: If the table is not 2-D, has no items, has rows with
                differing rater counts, or has fewer than two raters.
        """
        table = np.asarray(ratings, dtype=float)
        if table.ndim != 2 or table.shape[0] == 0:
            raise ValueError("ratings must be a non-empty 2-D table")
        n_items = table.shape[0]

        raters_per_item = table.sum(axis=1)
        n_raters = raters_per_item[0]
        # The formulas below assume a constant number of raters per item.
        if not np.all(raters_per_item == n_raters):
            raise ValueError("every item must be rated by the same number of raters")
        if n_raters < 2:
            raise ValueError("at least two raters per item are required")

        p_i = (table * (table - 1)).sum(axis=1) / (n_raters * (n_raters - 1))
        P = p_i.mean()
        p_j = table.sum(axis=0) / (n_items * n_raters)
        P_e = (p_j ** 2).sum()
        if P_e == 1:
            return 1.0
        return float((P - P_e) / (1 - P_e))
# Score how closely two annotators agree; both label lists are expected to
# be defined by the caller.
calc = AgreementCalculator()
kappa = calc.cohens_kappa(
    labels1=annotator1_labels,
    labels2=annotator2_labels,
)
print(f"Cohen's Kappa: {kappa:.3f}")
Annotation Interface
from dataclasses import dataclass
from typing import List, Optional
import json
@dataclass
class AnnotationTask:
    """One data item waiting for, or carrying, a human annotation."""

    task_id: str  # identifier; AnnotationManager generates "task_<n>"
    data_item: str  # the raw item to annotate
    annotation_type: str  # kind of task, e.g. "classification"
    labels: List[str]  # labels offered for this task
    assigned_to: Optional[str] = None  # annotator id; None while unassigned
    completed: bool = False  # set to True when an annotation is submitted
    annotation: Optional[str] = None  # submitted annotation; None until then


class AnnotationManager:
    """In-memory queue of annotation tasks.

    Attributes:
        tasks: Tasks that have not been completed yet.
        completed_tasks: Tasks with a submitted annotation, in submit order.
    """

    def __init__(self):
        self.tasks: List[AnnotationTask] = []
        self.completed_tasks: List[AnnotationTask] = []
        # Running counter so ids stay unique across create_batch calls.
        self._next_task_number = 0

    def create_batch(self, data_items: List[str], labels: List[str]):
        """Queue one classification task per data item.

        Task ids continue from where the previous batch stopped
        (``task_0``, ``task_1``, ...), so later batches never reuse an id.

        Args:
            data_items: Items to annotate.
            labels: Labels offered for every task in the batch.
        """
        for item in data_items:
            task = AnnotationTask(
                task_id=f"task_{self._next_task_number}",
                data_item=item,
                annotation_type="classification",
                # Copy so tasks do not share (and mutate) one list object.
                labels=list(labels),
            )
            self._next_task_number += 1
            self.tasks.append(task)

    def assign_task(self, annotator_id: str) -> Optional[AnnotationTask]:
        """Assign the first unassigned task to ``annotator_id``.

        Returns:
            The newly assigned task, or ``None`` when every pending task
            already has an annotator.
        """
        for task in self.tasks:
            if task.assigned_to is None:
                task.assigned_to = annotator_id
                return task
        return None

    def submit_annotation(self, task_id: str, annotation: str):
        """Record ``annotation`` for a pending task and mark it completed.

        Args:
            task_id: Id of the pending task.
            annotation: The annotation to store.

        Returns:
            True when a pending task with ``task_id`` was found and
            completed, False otherwise.
        """
        for position, task in enumerate(self.tasks):
            if task.task_id == task_id:
                task.annotation = annotation
                task.completed = True
                # pop by position: list.remove would match by dataclass
                # equality and could drop a different but equal task.
                self.completed_tasks.append(self.tasks.pop(position))
                return True
        return False

    def export_annotations(self) -> List[dict]:
        """Return completed annotations as ``{"id", "data", "label"}`` dicts."""
        return [
            {
                "id": task.task_id,
                "data": task.data_item,
                "label": task.annotation,
            }
            for task in self.completed_tasks
        ]
manager = AnnotationManager()
manager.create_batch(
    data_items=["Text 1", "Text 2", "Text 3"],
    labels=["positive", "negative", "neutral"],
)
task = manager.assign_task("annotator_1")
# assign_task returns None once every task has an annotator, so check
# before reading task.task_id.
if task is not None:
    manager.submit_annotation(task.task_id, "positive")
annotations = manager.export_annotations()
Best Practices
- Start with clear annotation guidelines
- Use pre-labeling to accelerate human annotation
- Measure inter-annotator agreement regularly
- Implement quality control with gold standard items
- Use active learning to prioritize difficult examples
- Provide annotator feedback and training