Multilingual Models
Multilingual models enable NLP tasks across many languages using a single model with shared representations and cross-lingual transfer learning.
Multilingual Classification
from transformers import pipeline
class MultilingualClassifier:
    """Zero-shot text classifier built on a Hugging Face ``pipeline``.

    Every input text is scored against the same list of candidate labels,
    whatever language the text is written in.

    NOTE(review): the ``zero-shot-classification`` pipeline is designed for
    checkpoints fine-tuned on an NLI task. The default ``xlm-roberta-base``
    looks like a plain pretrained checkpoint, so its scores are presumably
    not meaningful -- confirm, and pass an NLI-tuned multilingual model for
    real use. The default is kept unchanged for backward compatibility.
    """

    # Candidate labels used when the caller does not supply any.
    DEFAULT_LABELS = ("positive", "negative", "neutral")

    def __init__(self, model_name: str = "xlm-roberta-base", labels: list = None):
        """Create the pipeline and store the candidate labels.

        Args:
            model_name: Model identifier handed to ``pipeline``.
            labels: Candidate labels to score each text against. When
                omitted, ``DEFAULT_LABELS`` is used.

        Raises:
            ValueError: If ``labels`` is provided but empty.
        """
        # Copy so later mutation of the caller's list cannot change our labels.
        chosen = list(self.DEFAULT_LABELS if labels is None else labels)
        # Validate before the (potentially slow) pipeline construction.
        if not chosen:
            raise ValueError("labels must contain at least one candidate label")
        self.classifier = pipeline("zero-shot-classification", model=model_name)
        self.labels = chosen

    def classify(self, text: str, language: str = "auto") -> dict:
        """Classify one text and report the first label the pipeline returns.

        Args:
            text: The text to classify.
            language: Tag echoed back in the result. It is not passed to the
                pipeline and does not influence the prediction.

        Returns:
            A dict with keys ``"text"``, ``"label"``, ``"confidence"`` and
            ``"language"``. ``label``/``confidence`` are the first entries of
            the pipeline's ``labels``/``scores`` lists (the pipeline is
            expected to order them best-first).
        """
        result = self.classifier(text, self.labels)
        return {
            "text": text,
            "label": result["labels"][0],
            "confidence": result["scores"][0],
            "language": language,
        }

    def classify_batch(self, texts: list, language: str = "auto") -> list:
        """Classify each text in ``texts`` with :meth:`classify`.

        Args:
            texts: Texts to classify; an empty list yields an empty list.
            language: Tag echoed back in every result (defaults to
                ``"auto"``, matching :meth:`classify`).

        Returns:
            One result dict per input text, in input order.
        """
        return [self.classify(text, language) for text in texts]
# Usage: build a classifier with the default model and classify a Spanish sentence.
classifier = MultilingualClassifier()
result = classifier.classify("Este producto es excelente")
# Illustrative result (the actual label and confidence depend on the model):
# {"text": "Este producto es excelente", "label": "positive", "confidence": 0.89, "language": "auto"}
Cross-Lingual Transfer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
class CrossLingualTransfer:
    """Apply one sequence-classification model to texts in several languages."""

    def __init__(self, model_name: str = "xlm-roberta-base"):
        """Load the tokenizer and classification model named ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

    def train_on_english(self, train_data: list):
        """Train on English data, transfer to other languages.

        Placeholder: no training is performed and ``train_data`` is ignored.
        """
        return None

    def predict(self, text: str, language: str = "en") -> dict:
        """Classify a single text.

        Args:
            text: The text to classify.
            language: Accepted for symmetry with the callers; it is not used
                when computing the prediction.

        Returns:
            ``{"class": <index of the highest-probability class>,
            "confidence": <that class's softmax probability>}``.
        """
        encoded = self.tokenizer(text, truncation=True, return_tensors="pt")
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            logits = self.model(**encoded).logits
            probabilities = torch.softmax(logits, dim=-1)
        best = probabilities.argmax(dim=-1).item()
        confidence = probabilities[0, best].item()
        return {"class": best, "confidence": confidence}

    def zero_shot_transfer(self, texts_by_language: dict) -> dict:
        """Run :meth:`predict` on every text, grouped by language.

        Args:
            texts_by_language: Mapping of language code to a list of texts.

        Returns:
            Mapping of the same language codes to the list of prediction
            dicts for that language's texts, in input order.
        """
        return {
            lang: [self.predict(text, lang) for text in texts]
            for lang, texts in texts_by_language.items()
        }
# Usage: run the same model over English, Spanish and French inputs.
# `results` maps each language code to a list of {"class", "confidence"} dicts,
# one per input text.
# NOTE(review): train_on_english is a no-op, so the classification head is
# presumably untrained here -- confirm before trusting these predictions.
transfer = CrossLingualTransfer()
results = transfer.zero_shot_transfer({
"en": ["Great product!"],
"es": ["Excelente producto!"],
"fr": ["Excellent produit!"]
})
Multilingual Embeddings
from sentence_transformers import SentenceTransformer
import numpy as np
class MultilingualEmbeddings:
    """Sentence embeddings and cosine similarities from a SentenceTransformer."""

    def __init__(self, model_name: str = "paraphrase-multilingual-MiniLM-L12-v2"):
        """Load the sentence-embedding model named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def encode(self, texts: list) -> np.ndarray:
        """Return the model's embeddings for ``texts`` (one row per text)."""
        return self.model.encode(texts)

    @staticmethod
    def _unit_rows(vectors) -> np.ndarray:
        """Return ``vectors`` as a 2-D float array with unit-length rows."""
        matrix = np.asarray(vectors, dtype=float)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # Leave all-zero rows as zeros instead of dividing by zero (NaN).
        norms[norms == 0] = 1.0
        return matrix / norms

    def similarity(self, text1: str, text2: str) -> float:
        """Return the cosine similarity between the embeddings of two texts.

        Returns:
            A Python ``float``; ``0.0`` if either embedding is all zeros.
        """
        first, second = self._unit_rows(self.model.encode([text1, text2]))
        return float(np.dot(first, second))

    def cross_lingual_similarity(self, texts_by_lang: dict) -> dict:
        """Average cosine similarity between the texts of each language pair.

        Args:
            texts_by_lang: Mapping of language code to a list of texts.

        Returns:
            Mapping of ``"<lang1>-<lang2>"`` (each unordered pair once, in
            the mapping's iteration order) to the mean cosine similarity over
            every (text of lang1, text of lang2) combination. The value is
            ``nan`` when either language has no texts.
        """
        # Flatten all texts, remembering which rows belong to which language,
        # so the model is called once and each pair reads its own rows.
        spans = {}
        all_texts = []
        for lang, texts in texts_by_lang.items():
            start = len(all_texts)
            all_texts.extend(texts)
            spans[lang] = slice(start, len(all_texts))

        # Nothing to encode when every language list is empty.
        unit = self._unit_rows(self.model.encode(all_texts)) if all_texts else None

        langs = list(spans)
        similarities = {}
        for idx, lang1 in enumerate(langs):
            for lang2 in langs[idx + 1:]:
                span1, span2 = spans[lang1], spans[lang2]
                key = f"{lang1}-{lang2}"
                if span1.start == span1.stop or span2.start == span2.stop:
                    # No text pairs to average for this language pair.
                    similarities[key] = float("nan")
                    continue
                # Rows are unit length, so the matrix product holds the
                # cosine similarity of every cross-language text pair.
                pairwise = unit[span1] @ unit[span2].T
                similarities[key] = float(np.mean(pairwise))
        return similarities
# Usage: compare an English sentence with its Spanish translation.
embeddings = MultilingualEmbeddings()
sim = embeddings.similarity("Hello world", "Hola mundo")
# Illustrative value, e.g. 0.85: a high cosine similarity indicates the two
# sentences are embedded close together despite the different languages.
Key Takeaways
- Multilingual models like XLM-R handle 100+ languages
- Cross-lingual transfer enables training on one language, deploying on many
- Shared embeddings capture semantic meaning across languages
- Zero-shot classification works without language-specific training data, provided the underlying model has been fine-tuned for natural language inference (NLI)
- Sentence transformers provide multilingual semantic similarity