🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services · ℹ️ About · ✉️ Contact · View Pricing Plans from $10

Data Labeling and Annotation

🟢 Free Lesson

Advertisement

Data Labeling and Annotation

Data Labeling Pipeline: Raw Data → Pre-label → Human Review → Validation → Labeled Data

Active Learning:
  • Uncertainty Sampling
  • Query by Committee
  • Expected Model Change
  • Diversity Sampling

Quality Assurance:
  • Inter-annotator Agreement
  • Gold Standard Testing
  • Consistency Checks
  • Feedback Loops

Labeling Workflow

A robust labeling pipeline combines automated pre-labeling with human review to achieve high quality at scale.

Active Learning Implementation

import numpy as np
from typing import List, Tuple
from sklearn.ensemble import RandomForestClassifier

class ActiveLearner:
    """Pool-based active learner that ranks unlabeled rows for annotation.

    The learner keeps a private copy of the feature pool, tracks which row
    positions are labeled and which are not, and asks ``model`` (any object
    exposing ``predict_proba`` and ``fit``) how uncertain it is about each
    unlabeled row.

    Args:
        model: Classifier used for scoring and training. It must already be
            fitted before either ``query_*`` method is called.
        X_pool: Feature pool; must support ``.copy()``, ``len()`` and
            indexing with a list of row positions (e.g. a NumPy array).
        y_pool: Optional label store indexed like ``X_pool``. It is kept by
            reference and updated in place by ``label_instances``. When
            omitted, submitted labels are kept internally instead.
    """

    def __init__(self, model, X_pool, y_pool=None):
        self.model = model
        self.X_pool = X_pool.copy()
        self.y_pool = y_pool
        self.labeled_indices = []
        self.unlabeled_indices = list(range(len(X_pool)))
        # Labels submitted through label_instances, keyed by pool index.
        # Used by train() when no y_pool was supplied.
        self._labels = {}

    def _unlabeled_proba(self):
        """Return class probabilities for every currently unlabeled row."""
        return np.asarray(
            self.model.predict_proba(self.X_pool[self.unlabeled_indices])
        )

    def query_uncertainty(self, n_instances: int = 10) -> List[int]:
        """Return pool indices of the rows with the highest predictive entropy.

        Args:
            n_instances: Maximum number of indices to return.

        Returns:
            Up to ``n_instances`` pool indices ordered by increasing entropy
            (the most uncertain row is last). Empty when ``n_instances`` is
            not positive or nothing is left to label.
        """
        # A slice of [-0:] would select the whole pool, so handle it up front.
        if n_instances <= 0 or not self.unlabeled_indices:
            return []

        proba = self._unlabeled_proba()
        # The small epsilon keeps log() finite for zero probabilities.
        entropy = -np.sum(proba * np.log(proba + 1e-10), axis=1)
        top_indices = np.argsort(entropy)[-n_instances:]
        return [self.unlabeled_indices[i] for i in top_indices]

    def query_margin(self, n_instances: int = 10) -> List[int]:
        """Return pool indices of the rows with the smallest top-two margin.

        Args:
            n_instances: Maximum number of indices to return.

        Returns:
            Up to ``n_instances`` pool indices ordered by increasing margin
            between the two most probable classes. Empty when
            ``n_instances`` is not positive or nothing is left to label.
        """
        if n_instances <= 0 or not self.unlabeled_indices:
            return []

        sorted_proba = np.sort(self._unlabeled_proba(), axis=1)
        if sorted_proba.shape[1] < 2:
            # Only one class column: there is no runner-up to subtract.
            margins = sorted_proba[:, -1]
        else:
            margins = sorted_proba[:, -1] - sorted_proba[:, -2]
        top_indices = np.argsort(margins)[:n_instances]
        return [self.unlabeled_indices[i] for i in top_indices]

    def label_instances(self, indices: List[int], labels: List[int]):
        """Record labels for unlabeled rows and move them to the labeled set.

        All arguments are validated before any state changes, so a failed
        call leaves the learner untouched.

        Args:
            indices: Pool indices to label; each must currently be unlabeled
                and appear only once.
            labels: One label per entry of ``indices``.

        Raises:
            ValueError: If the lengths differ, an index is repeated, or an
                index is not in the unlabeled set.
        """
        indices = list(indices)
        labels = list(labels)
        if len(indices) != len(labels):
            raise ValueError(
                f"got {len(indices)} indices but {len(labels)} labels"
            )

        unlabeled = set(self.unlabeled_indices)
        newly_labeled = set()
        for idx in indices:
            if idx not in unlabeled or idx in newly_labeled:
                raise ValueError(f"index {idx!r} is not an unlabeled pool index")
            newly_labeled.add(idx)

        for idx, label in zip(indices, labels):
            if self.y_pool is not None:
                self.y_pool[idx] = label
            self._labels[idx] = label
            self.labeled_indices.append(idx)

        # One pass instead of list.remove() per index; slice assignment keeps
        # the same list object for callers holding a reference to it.
        self.unlabeled_indices[:] = [
            i for i in self.unlabeled_indices if i not in newly_labeled
        ]

    def train(self):
        """Fit the model on all labeled rows; do nothing if there are none.

        Labels come from ``y_pool`` when it was supplied, otherwise from the
        labels recorded by ``label_instances``.

        Raises:
            ValueError: If no ``y_pool`` was supplied and a labeled index has
                no recorded label.
        """
        if not self.labeled_indices:
            return

        if self.y_pool is not None:
            y_train = np.asarray(self.y_pool)[self.labeled_indices]
        else:
            try:
                y_train = np.asarray(
                    [self._labels[i] for i in self.labeled_indices]
                )
            except KeyError as err:
                raise ValueError(
                    f"no label recorded for pool index {err.args[0]!r}"
                ) from err

        self.model.fit(self.X_pool[self.labeled_indices], y_train)

# Example usage. X_pool and y_pool are not defined in this listing and must be
# supplied by the surrounding code.
# NOTE(review): the RandomForestClassifier is passed in unfitted, and
# query_uncertainty calls model.predict_proba — confirm the model is trained
# before this query runs.
learner = ActiveLearner(RandomForestClassifier(), X_pool, y_pool)
query_indices = learner.query_uncertainty(n_instances=20)

Pre-labeling with LLMs

import openai
from typing import List, Dict

class LLMPreLabeler:
    """Pre-label texts with a chat model and flag uncertain results for review.

    Args:
        api_key: API key passed to ``openai.OpenAI``.
        model: Chat model name sent with every request.
        review_threshold: Results whose confidence is below this value are
            marked ``needs_review``.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "gpt-3.5-turbo",
        review_threshold: float = 0.8,
    ):
        self.client = openai.OpenAI(api_key=api_key)
        self.model = model
        self.review_threshold = review_threshold

    @staticmethod
    def _parse_reply(content) -> Dict:
        """Decode the model reply; return ``{}`` if it is not a JSON object."""
        import json

        try:
            parsed = json.loads(content)
        except (TypeError, ValueError):
            # TypeError: content is None; ValueError: malformed JSON.
            return {}
        return parsed if isinstance(parsed, dict) else {}

    @staticmethod
    def _coerce_confidence(value, default: float = 0.5) -> float:
        """Convert a reported confidence to a float in [0, 1], else ``default``."""
        try:
            confidence = float(value)
        except (TypeError, ValueError):
            return default
        # The comparison is also False for NaN, which falls back to default.
        if not 0.0 <= confidence <= 1.0:
            return default
        return confidence

    def pre_label_text(
        self,
        texts: List[str],
        labels: List[str]
    ) -> List[Dict]:
        """Classify each text into one of ``labels`` with one request per text.

        A reply that cannot be parsed, lacks a usable confidence, or names a
        label outside ``labels`` does not abort the run; the item is returned
        with ``needs_review`` set so a human can handle it.

        Args:
            texts: Texts to classify.
            labels: Candidate label names offered to the model.

        Returns:
            One dict per text, in input order, with the keys ``text``,
            ``predicted_label``, ``confidence`` (float; 0.5 when the reply
            gives no usable value) and ``needs_review``.
        """
        # The instruction is identical for every text, so build it once.
        system_prompt = f"""Classify the text into one of: {', '.join(labels)}.
Return JSON with "label" and "confidence" fields."""
        results = []

        for text in texts:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": text}
                ],
                temperature=0,
                response_format={"type": "json_object"}
            )

            reply = self._parse_reply(response.choices[0].message.content)
            predicted = reply.get("label")
            confidence = self._coerce_confidence(reply.get("confidence", 0.5))
            results.append({
                "text": text,
                "predicted_label": predicted,
                "confidence": confidence,
                "needs_review": (
                    confidence < self.review_threshold
                    or predicted not in labels
                )
            })

        return results

    def batch_pre_label(
        self,
        texts: List[str],
        labels: List[str],
        batch_size: int = 10
    ) -> List[Dict]:
        """Pre-label ``texts`` in consecutive slices of ``batch_size``.

        Args:
            texts: Texts to classify.
            labels: Candidate label names offered to the model.
            batch_size: Number of texts handed to ``pre_label_text`` per call.

        Returns:
            The concatenated ``pre_label_text`` results, in input order.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        all_results = []
        for start in range(0, len(texts), batch_size):
            all_results.extend(
                self.pre_label_text(texts[start:start + batch_size], labels)
            )
        return all_results

# Example usage. "your-api-key" is a placeholder string; pre_label_text sends
# one chat completion request per text, so this call needs a working key.
pre_labeler = LLMPreLabeler(api_key="your-api-key")
labeled_data = pre_labeler.pre_label_text(
    texts=["Great product!", "Terrible service", "Average experience"],
    labels=["positive", "negative", "neutral"]
)

Inter-Annotator Agreement

from typing import List
import numpy as np

class AgreementCalculator:
    """Chance-corrected inter-annotator agreement statistics."""

    @staticmethod
    def cohens_kappa(labels1: List[int], labels2: List[int]) -> float:
        """Return Cohen's kappa for two annotators who labeled the same items.

        Args:
            labels1: Labels from the first annotator, one per item.
            labels2: Labels from the second annotator, in the same item order.

        Returns:
            ``(p_o - p_e) / (1 - p_e)``, where ``p_o`` is the observed
            agreement rate and ``p_e`` the agreement expected by chance.
            Returns 1.0 when ``p_e`` is 1 (both annotators used one and the
            same label throughout).

        Raises:
            ValueError: If the sequences differ in length or are empty.
        """
        import math
        from collections import Counter

        n = len(labels1)
        if n != len(labels2):
            raise ValueError(
                f"label sequences differ in length: {n} vs {len(labels2)}"
            )
        if n == 0:
            raise ValueError("cannot compute kappa for zero items")

        agreements = sum(1 for a, b in zip(labels1, labels2) if a == b)
        p_o = agreements / n

        counts1 = Counter(labels1)
        counts2 = Counter(labels2)
        # A category used by only one annotator contributes zero, so only the
        # shared categories need to be summed.
        p_e = sum(
            (counts1[cat] / n) * (counts2[cat] / n)
            for cat in counts1.keys() & counts2.keys()
        )

        if math.isclose(p_e, 1.0):
            return 1.0

        return (p_o - p_e) / (1 - p_e)

    @staticmethod
    def fleiss_kappa(ratings: np.ndarray) -> float:
        """Return Fleiss' kappa for a table of per-item category counts.

        Args:
            ratings: 2-D array of shape (n_items, n_categories) where entry
                ``[i, j]`` is the number of raters who put item ``i`` in
                category ``j``. Every row must sum to the same rater count.

        Returns:
            ``(P - P_e) / (1 - P_e)``, where ``P`` is the mean per-item
            agreement and ``P_e`` the agreement expected by chance. Returns
            1.0 when ``P_e`` is 1.

        Raises:
            ValueError: If ``ratings`` is not a non-empty 2-D table, rows have
                different rater counts, or there are fewer than two raters.
        """
        import math

        ratings = np.asarray(ratings)
        if ratings.ndim != 2 or ratings.size == 0:
            raise ValueError("ratings must be a non-empty 2-D array")

        n_items = ratings.shape[0]
        raters_per_item = ratings.sum(axis=1)
        n_raters = raters_per_item[0]
        if np.any(raters_per_item != n_raters):
            raise ValueError("every item must be rated by the same number of raters")
        if n_raters < 2:
            # The pairwise-agreement denominator n * (n - 1) would be zero.
            raise ValueError("at least two raters per item are required")

        p_i = (ratings * (ratings - 1)).sum(axis=1) / (n_raters * (n_raters - 1))
        P = p_i.mean()

        p_j = ratings.sum(axis=0) / (n_items * n_raters)
        P_e = (p_j ** 2).sum()

        if math.isclose(P_e, 1.0):
            return 1.0

        return float((P - P_e) / (1 - P_e))

# Example usage. annotator1_labels and annotator2_labels are not defined in
# this listing and must be supplied by the surrounding code.
# cohens_kappa is a static method; it is called through an instance here.
calc = AgreementCalculator()
kappa = calc.cohens_kappa(annotator1_labels, annotator2_labels)
print(f"Cohen's Kappa: {kappa:.3f}")

Annotation Interface

from dataclasses import dataclass
from typing import List, Optional
import json

@dataclass
class AnnotationTask:
    """One item to annotate, together with its assignment and result state."""
    # Identifier of the task.
    task_id: str
    # The item to be annotated.
    data_item: str
    # Kind of annotation requested, stored as a free-form string.
    annotation_type: str
    # Label names associated with the task.
    labels: List[str]
    # Annotator the task is assigned to; None until it is assigned.
    assigned_to: Optional[str] = None
    # Completion flag; False by default.
    completed: bool = False
    # The submitted annotation; None by default.
    annotation: Optional[str] = None

class AnnotationManager:
    def __init__(self):
        self.tasks: List[AnnotationTask] = []
        self.completed_tasks: List[AnnotationTask] = []
    
    def create_batch(self, data_items: List[str], labels: List[str]):
        for i, item in enumerate(data_items):
            task = AnnotationTask(
                task_id=f"task_{i}",
                data_item=item,
                annotation_type="classification",
                labels=labels
            )
            self.tasks.append(task)
    
    def assign_task(self, annotator_id: str) -> Optional[AnnotationTask]:
        for task in self.tasks:
            if task.assigned_to is None:
                task.assigned_to = annotator_id
                return task
        return None
    
    def submit_annotation(self, task_id: str, annotation: str):
        for task in self.tasks:
            if task.task_id == task_id:
                task.annotation = annotation
                task.completed = True
                self.completed_tasks.append(task)
                self.tasks.remove(task)
                break
    
    def export_annotations(self) -> List[Dict]:
        return [
            {
                "id": task.task_id,
                "data": task.data_item,
                "label": task.annotation
            }
            for task in self.completed_tasks
        ]

# Example usage: create three tasks, assign the first one, submit a label for
# it, then export the completed work.
manager = AnnotationManager()
manager.create_batch(
    data_items=["Text 1", "Text 2", "Text 3"],
    labels=["positive", "negative", "neutral"]
)
# assign_task returns None when no unassigned task is left; here three tasks
# were just created, so a task is returned.
task = manager.assign_task("annotator_1")
manager.submit_annotation(task.task_id, "positive")
annotations = manager.export_annotations()

Best Practices

  • Start with clear annotation guidelines
  • Use pre-labeling to accelerate human annotation
  • Measure inter-annotator agreement regularly
  • Implement quality control with gold standard items
  • Use active learning to prioritize difficult examples
  • Provide annotator feedback and training
⭐

Premium Content

Data Labeling and Annotation

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯 End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert Generative AI Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement