Data Labeling and Annotation

Labeling Workflow

A robust labeling pipeline combines automated pre-labeling with human review to achieve high quality at scale.

Active Learning Implementation

import numpy as np
from typing import List, Tuple
from sklearn.ensemble import RandomForestClassifier

class ActiveLearner:
    """Pool-based active learner that chooses which unlabeled rows to annotate.

    The learner keeps a private copy of ``X_pool`` and two index lists,
    ``labeled_indices`` and ``unlabeled_indices``, that always refer to rows of
    that pool.  ``model`` only needs ``predict_proba(X)`` and ``fit(X, y)``.

    ``y_pool`` is stored by reference (it is not copied), so labels written by
    :meth:`label_instances` are visible to the caller.  When ``y_pool`` is
    omitted, label storage is allocated on the first call to
    :meth:`label_instances`.
    """

    def __init__(self, model, X_pool, y_pool=None):
        self.model = model
        self.X_pool = X_pool.copy()
        self.y_pool = y_pool
        self.labeled_indices = []
        self.unlabeled_indices = list(range(len(X_pool)))

    def _unlabeled_proba(self):
        """Return the model's class probabilities for every unlabeled row."""
        return np.asarray(
            self.model.predict_proba(self.X_pool[self.unlabeled_indices])
        )

    def query_uncertainty(self, n_instances: int = 10) -> List[int]:
        """Return pool indices of the rows with the highest predictive entropy.

        At most ``n_instances`` indices are returned, ordered by ascending
        entropy (the most uncertain row is last).  Returns an empty list when
        ``n_instances`` is not positive or nothing is left to label.
        """
        # Guard first: a slice of ``[-0:]`` would otherwise select everything,
        # and an empty pool should not be sent to the model at all.
        if n_instances <= 0 or not self.unlabeled_indices:
            return []

        proba = self._unlabeled_proba()
        # The small constant keeps log() finite for zero probabilities.
        entropy = -np.sum(proba * np.log(proba + 1e-10), axis=1)
        top_indices = np.argsort(entropy)[-n_instances:]

        return [self.unlabeled_indices[i] for i in top_indices]

    def query_margin(self, n_instances: int = 10) -> List[int]:
        """Return pool indices of the rows with the smallest top-two margin.

        The margin is the gap between the two largest class probabilities; at
        most ``n_instances`` indices are returned, smallest margin first.
        Returns an empty list when ``n_instances`` is not positive or nothing
        is left to label.
        """
        if n_instances <= 0 or not self.unlabeled_indices:
            return []

        proba = self._unlabeled_proba()
        if proba.shape[1] < 2:
            # Only one class column: there is no runner-up to subtract.
            margins = proba[:, -1]
        else:
            sorted_proba = np.sort(proba, axis=1)
            margins = sorted_proba[:, -1] - sorted_proba[:, -2]

        top_indices = np.argsort(margins)[:n_instances]

        return [self.unlabeled_indices[i] for i in top_indices]

    def label_instances(self, indices: List[int], labels: List[int]):
        """Record ``labels`` for ``indices`` and move them to the labeled set.

        Raises:
            ValueError: if the two sequences differ in length, if ``indices``
                contains duplicates, or if any index is not currently
                unlabeled.  Nothing is modified when an error is raised.
        """
        indices = [int(i) for i in indices]
        labels = list(labels)
        if len(indices) != len(labels):
            raise ValueError(
                f"got {len(indices)} indices but {len(labels)} labels"
            )
        if not indices:
            return

        chosen = set(indices)
        if len(chosen) != len(indices):
            raise ValueError("indices must not contain duplicates")
        unknown = chosen.difference(self.unlabeled_indices)
        if unknown:
            raise ValueError(
                f"indices are not in the unlabeled pool: {sorted(unknown)}"
            )

        if self.y_pool is None:
            # No label store was supplied; create one so train() can use the
            # labels instead of silently discarding them.
            self.y_pool = np.empty(len(self.X_pool), dtype=object)

        for idx, label in zip(indices, labels):
            self.y_pool[idx] = label
        self.labeled_indices.extend(indices)
        # Single pass instead of one O(n) list.remove() per labeled index.
        self.unlabeled_indices[:] = [
            i for i in self.unlabeled_indices if i not in chosen
        ]

    def train(self):
        """Fit the model on all labeled rows; do nothing if there are none.

        Raises:
            ValueError: if rows are marked as labeled but no labels are stored.
        """
        if not self.labeled_indices:
            return
        if self.y_pool is None:
            raise ValueError("labeled rows exist but no labels were stored")

        X_train = self.X_pool[self.labeled_indices]
        # Element-wise gather works for arrays and plain lists alike and lets
        # numpy infer a proper dtype when the store was allocated lazily.
        y_train = np.asarray([self.y_pool[i] for i in self.labeled_indices])
        self.model.fit(X_train, y_train)

learner = ActiveLearner(RandomForestClassifier(), X_pool, y_pool)

# Bootstrap: the classifier has to be fitted before it can score the pool, so
# label a small random seed set from the known labels and train on it first.
# NOTE(review): assumes y_pool holds the true label for every row of X_pool and
# supports integer indexing - confirm against where X_pool/y_pool are defined.
seed_indices = np.random.default_rng(0).choice(
    len(X_pool), size=min(20, len(X_pool)), replace=False
).tolist()
learner.label_instances(seed_indices, [y_pool[i] for i in seed_indices])
learner.train()

# Ask for the 20 remaining rows the fitted model is least sure about.
query_indices = learner.query_uncertainty(n_instances=20)

Pre-labeling with LLMs

import openai
from typing import List, Dict

class LLMPreLabeler:
    """Pre-label texts with a chat-completion model so humans only review.

    Each text is sent on its own request; the model is asked to answer with a
    JSON object containing ``"label"`` and ``"confidence"``.  Replies that
    cannot be trusted (unparsable, unknown label, low confidence) are flagged
    with ``needs_review`` rather than aborting the run.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "gpt-3.5-turbo",
        review_threshold: float = 0.8,
    ):
        """Create the API client.

        Args:
            api_key: key passed to ``openai.OpenAI``.
            model: chat model name used for every request.
            review_threshold: predictions with a confidence below this value
                are flagged for human review.
        """
        self.client = openai.OpenAI(api_key=api_key)
        self.model = model
        self.review_threshold = review_threshold

    @staticmethod
    def _parse_reply(content) -> Dict:
        """Decode the model reply; return ``{}`` if it is not a JSON object."""
        import json

        try:
            parsed = json.loads(content)
        except (TypeError, ValueError):
            # TypeError: content was None; ValueError: malformed JSON.
            return {}
        return parsed if isinstance(parsed, dict) else {}

    @staticmethod
    def _coerce_confidence(value, default: float = 0.5) -> float:
        """Convert a reported confidence to a finite float, else ``default``."""
        import math

        try:
            confidence = float(value)
        except (TypeError, ValueError):
            return default
        return confidence if math.isfinite(confidence) else default

    def pre_label_text(
        self,
        texts: List[str],
        labels: List[str]
    ) -> List[Dict]:
        """Classify each text into one of ``labels``.

        Returns:
            One dict per input text, in order, with the keys ``text``,
            ``predicted_label``, ``confidence`` (0.5 when the model gave no
            usable number) and ``needs_review``.  ``needs_review`` is true when
            the confidence is below the review threshold or the predicted
            label is not one of ``labels``.
        """
        # The instruction does not depend on the text, so build it once.
        system_prompt = (
            f"Classify the text into one of: {', '.join(labels)}.\n"
            'Return JSON with "label" and "confidence" fields.'
        )
        results = []

        for text in texts:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": text}
                ],
                temperature=0,
                response_format={"type": "json_object"}
            )

            reply = self._parse_reply(response.choices[0].message.content)
            label = reply.get("label")
            confidence = self._coerce_confidence(reply.get("confidence", 0.5))
            results.append({
                "text": text,
                "predicted_label": label,
                "confidence": confidence,
                "needs_review": (
                    label not in labels or confidence < self.review_threshold
                )
            })

        return results

    def batch_pre_label(
        self,
        texts: List[str],
        labels: List[str],
        batch_size: int = 10
    ) -> List[Dict]:
        """Pre-label ``texts`` in consecutive chunks of ``batch_size``.

        Every text still gets its own request; chunking only bounds how many
        texts are handed to :meth:`pre_label_text` per call.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        all_results = []
        for start in range(0, len(texts), batch_size):
            all_results.extend(
                self.pre_label_text(texts[start:start + batch_size], labels)
            )

        return all_results

# Example usage: pre-label three short reviews with sentiment classes.
# "your-api-key" is a placeholder and must be replaced with a real key;
# pre_label_text() issues one chat-completion request per text.
pre_labeler = LLMPreLabeler(api_key="your-api-key")
labeled_data = pre_labeler.pre_label_text(
    texts=["Great product!", "Terrible service", "Average experience"],
    labels=["positive", "negative", "neutral"]
)

Inter-Annotator Agreement

from typing import List
import numpy as np

class AgreementCalculator:
    """Chance-corrected agreement statistics for annotation quality checks."""

    @staticmethod
    def cohens_kappa(labels1: List[int], labels2: List[int]) -> float:
        """Return Cohen's kappa for two annotators who labeled the same items.

        Args:
            labels1: labels from the first annotator, one per item.
            labels2: labels from the second annotator, aligned with
                ``labels1``.  Any sequence (list, tuple, numpy array) works.

        Returns:
            ``(p_o - p_e) / (1 - p_e)``, where ``p_o`` is the observed
            agreement and ``p_e`` the agreement expected by chance.  When both
            annotators used one single identical category throughout
            (``p_e == 1``) the result is defined as ``1.0``.

        Raises:
            ValueError: if the inputs differ in length or are empty.
        """
        from collections import Counter

        # Materialize first: ``+`` on numpy arrays would add element-wise
        # instead of concatenating, and arrays have no ``count`` method.
        first, second = list(labels1), list(labels2)
        if len(first) != len(second):
            raise ValueError(
                f"label sequences differ in length: {len(first)} != {len(second)}"
            )
        n = len(first)
        if n == 0:
            raise ValueError("cannot compute agreement for zero items")

        agreements = sum(1 for a, b in zip(first, second) if a == b)
        p_o = agreements / n

        counts1, counts2 = Counter(first), Counter(second)
        # Categories used by only one annotator contribute zero, so iterating
        # over one annotator's categories is enough.
        joint = sum(
            count * counts2.get(cat, 0) for cat, count in counts1.items()
        )

        # Exact integer test for p_e == 1, avoiding a float comparison.
        if joint == n * n:
            return 1.0

        p_e = joint / (n * n)
        return (p_o - p_e) / (1 - p_e)

    @staticmethod
    def fleiss_kappa(ratings: np.ndarray) -> float:
        """Return Fleiss' kappa for a table of per-item category counts.

        Args:
            ratings: 2-D array-like of shape ``(n_items, n_categories)`` where
                ``ratings[i, j]`` is the number of raters who assigned item
                ``i`` to category ``j``.  Every row must sum to the same
                number of raters.

        Returns:
            ``(P - P_e) / (1 - P_e)`` as a Python float; ``1.0`` when every
            rating falls in a single category (``P_e == 1``).

        Raises:
            ValueError: if the table is not 2-D, has no items, has rows with
                differing rater counts, or has fewer than two raters per item.
        """
        counts = np.asarray(ratings, dtype=float)
        if counts.ndim != 2 or counts.shape[0] == 0:
            raise ValueError("ratings must be a non-empty 2-D table")

        n_items = counts.shape[0]
        raters_per_item = counts.sum(axis=1)
        n_raters = raters_per_item[0]
        if not np.all(raters_per_item == n_raters):
            raise ValueError("every item must be rated by the same number of raters")
        if n_raters < 2:
            # The per-item agreement divides by n_raters * (n_raters - 1).
            raise ValueError("at least two raters per item are required")

        p_i = (counts * (counts - 1)).sum(axis=1) / (n_raters * (n_raters - 1))
        P = p_i.mean()

        p_j = counts.sum(axis=0) / (n_items * n_raters)
        P_e = (p_j ** 2).sum()

        if P_e == 1:
            return 1.0

        return float((P - P_e) / (1 - P_e))

# Example usage: agreement between two annotators' label lists.
# NOTE(review): annotator1_labels / annotator2_labels are not defined in this
# snippet; they are expected to be equal-length sequences of labels.
calc = AgreementCalculator()
kappa = calc.cohens_kappa(annotator1_labels, annotator2_labels)
print(f"Cohen's Kappa: {kappa:.3f}")

Annotation Interface

from dataclasses import dataclass
from typing import List, Optional
import json

@dataclass
class AnnotationTask:
    """One unit of annotation work: a data item and the labels it may receive."""

    # Identifier used to look the task up when an annotation is submitted.
    task_id: str
    # The content to be annotated.
    data_item: str
    # Kind of annotation requested; AnnotationManager.create_batch sets
    # "classification".
    annotation_type: str
    # Label choices offered for this task.
    labels: List[str]
    # Annotator the task was handed to; None until it is assigned.
    assigned_to: Optional[str] = None
    # Set to True once an annotation has been submitted.
    completed: bool = False
    # The submitted annotation; None until the task is completed.
    annotation: Optional[str] = None

class AnnotationManager:
    """In-memory queue of annotation tasks with assignment and export.

    ``tasks`` holds tasks that are still open; ``completed_tasks`` holds tasks
    whose annotation has been submitted, in submission order.
    """

    def __init__(self):
        self.tasks: List[AnnotationTask] = []
        self.completed_tasks: List[AnnotationTask] = []
        # Running counter so task IDs stay unique across several batches.
        self._next_task_number = 0

    def create_batch(self, data_items: List[str], labels: List[str]):
        """Create one open classification task per data item.

        Task IDs are ``task_0``, ``task_1``, ... and keep counting across
        calls, so a later batch never reuses an ID from an earlier one.
        """
        for item in data_items:
            task = AnnotationTask(
                task_id=f"task_{self._next_task_number}",
                data_item=item,
                annotation_type="classification",
                # Copy so editing one task's choices (or the caller's list)
                # does not change every other task.
                labels=list(labels)
            )
            self._next_task_number += 1
            self.tasks.append(task)

    def assign_task(self, annotator_id: str) -> Optional["AnnotationTask"]:
        """Assign the first unassigned open task to ``annotator_id``.

        Returns:
            The task that was assigned, or ``None`` if every open task already
            has an annotator.
        """
        for task in self.tasks:
            if task.assigned_to is None:
                task.assigned_to = annotator_id
                return task
        return None

    def submit_annotation(self, task_id: str, annotation: str):
        """Store ``annotation`` on the open task ``task_id`` and complete it.

        Returns:
            ``True`` if an open task with that ID was found and completed,
            ``False`` otherwise (unknown ID or already completed); nothing is
            changed in that case.
        """
        for position, task in enumerate(self.tasks):
            if task.task_id == task_id:
                task.annotation = annotation
                task.completed = True
                self.completed_tasks.append(task)
                # Delete by position; safe because the loop ends right here.
                del self.tasks[position]
                return True
        return False

    def export_annotations(self) -> List[dict]:
        """Return completed annotations as ``{"id", "data", "label"}`` dicts."""
        return [
            {
                "id": task.task_id,
                "data": task.data_item,
                "label": task.annotation
            }
            for task in self.completed_tasks
        ]

# Example usage: create three tasks, hand the first one to an annotator,
# record that annotator's answer, and export what has been completed.
manager = AnnotationManager()
manager.create_batch(
    data_items=["Text 1", "Text 2", "Text 3"],
    labels=["positive", "negative", "neutral"]
)
# assign_task() returns None when nothing is left to assign; here the batch
# created just above guarantees a task is available.
task = manager.assign_task("annotator_1")
manager.submit_annotation(task.task_id, "positive")
annotations = manager.export_annotations()

Best Practices

- Start with clear annotation guidelines.
- Use pre-labeling to accelerate human annotation.
- Measure inter-annotator agreement regularly.
- Implement quality control with gold-standard items.
- Use active learning to prioritize difficult examples.
- Provide annotators with feedback and training.

Data Labeling and Annotation

Data Labeling and Annotation

Labeling Workflow

Active Learning Implementation

Pre-labeling with LLMs

Inter-Annotator Agreement

Annotation Interface

Best Practices

Premium Content

Need Expert Generative AI Help?