Machine Translation
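Machine translation converts text from one language to another. This guide covers a sequence-to-sequence (seq2seq) LSTM model with attention, pre-trained transformer models, multi-language translation with automatic language detection, and metrics for evaluating translation quality.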
Seq2Seq with Attention
import torch
import torch.nn as nn
import random
class Encoder(nn.Module):
    """Bidirectional LSTM encoder for a seq2seq translation model.

    Source tokens are embedded, passed through a (possibly multi-layer)
    bidirectional LSTM, and the last layer's final forward and backward
    states are concatenated and projected down to ``hidden_dim``.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        """Create the embedding, LSTM and projection layers.

        Args:
            input_dim: Number of rows in the embedding table.
            emb_dim: Size of each token embedding.
            hidden_dim: LSTM hidden size per direction.
            n_layers: Number of stacked LSTM layers.
            dropout: Dropout probability, used on the embeddings and
                passed to the LSTM.
        """
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(
            emb_dim,
            hidden_dim,
            n_layers,
            dropout=dropout,
            bidirectional=True,
        )
        # Forward and backward states are concatenated before projection.
        both_directions = hidden_dim * 2
        self.fc_hidden = nn.Linear(both_directions, hidden_dim)
        self.fc_cell = nn.Linear(both_directions, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source token ids.

        Args:
            src: Token id tensor; expected to be ``(src_len, batch)`` because
                the LSTM is built with the default sequence-first layout.

        Returns:
            A tuple ``(outputs, hidden, cell)``: the per-step LSTM outputs,
            and the projected final hidden and cell states, each with a
            leading dimension of size 1 added.
        """
        token_vectors = self.embedding(src)
        token_vectors = self.dropout(token_vectors)
        outputs, (h_n, c_n) = self.rnn(token_vectors)

        # The last two entries are the top layer's forward/backward states.
        joined_h = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        joined_c = torch.cat((c_n[-2, :, :], c_n[-1, :, :]), dim=1)

        hidden = torch.tanh(self.fc_hidden(joined_h))
        cell = torch.tanh(self.fc_cell(joined_c))
        return outputs, hidden.unsqueeze(0), cell.unsqueeze(0)
class Attention(nn.Module):
    """Additive attention: scores each encoder step against the decoder state."""

    def __init__(self, enc_hidden_dim, dec_hidden_dim):
        """Create the scoring layers.

        Args:
            enc_hidden_dim: Encoder hidden size per direction (encoder
                outputs are expected to have ``2 * enc_hidden_dim`` features).
            dec_hidden_dim: Decoder hidden size.
        """
        super().__init__()
        joint_features = enc_hidden_dim * 2 + dec_hidden_dim
        self.attn = nn.Linear(joint_features, dec_hidden_dim)
        self.v = nn.Linear(dec_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights over the source positions.

        Args:
            hidden: Decoder state; expected shape ``(1, batch, dec_hidden_dim)``.
            encoder_outputs: Encoder outputs, sequence-first; expected shape
                ``(src_len, batch, 2 * enc_hidden_dim)``.

        Returns:
            Tensor of weights, softmax-normalised over dimension 1
            (the source positions after the batch-first permutation).
        """
        steps = encoder_outputs.shape[0]

        # Copy the decoder state once per source step, then go batch-first.
        tiled_state = hidden.repeat(steps, 1, 1)
        state_bf = tiled_state.permute(1, 0, 2)
        enc_bf = encoder_outputs.permute(1, 0, 2)

        joint = torch.cat((state_bf, enc_bf), dim=2)
        energy = torch.tanh(self.attn(joint))
        scores = self.v(energy).squeeze(2)
        return torch.softmax(scores, dim=1)
class Decoder(nn.Module):
    """Single-layer LSTM decoder that attends over the encoder outputs.

    Each call to :meth:`forward` decodes one target step: the previous token
    is embedded, an attention-weighted summary of the encoder outputs is
    computed, both are fed to the LSTM, and the LSTM output, context and
    embedding are projected to vocabulary logits.
    """

    def __init__(self, output_dim, emb_dim, enc_hidden_dim, dec_hidden_dim, dropout, attention):
        """Create the decoder layers.

        Args:
            output_dim: Size of the target vocabulary (number of logits).
            emb_dim: Size of each target token embedding.
            enc_hidden_dim: Encoder hidden size per direction; encoder
                outputs are expected to carry ``2 * enc_hidden_dim`` features.
            dec_hidden_dim: Decoder LSTM hidden size.
            dropout: Dropout probability applied to the token embeddings.
            attention: Callable ``attention(hidden, encoder_outputs)``
                returning per-source-position weights of shape
                ``(batch, src_len)``.
        """
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # nn.LSTM's keyword is ``num_layers`` (``n_layers`` raises TypeError).
        # ``dropout`` is not forwarded: LSTM dropout only acts between
        # stacked layers, so with one layer it has no effect and only
        # triggers a warning.
        self.rnn = nn.LSTM(emb_dim + enc_hidden_dim * 2, dec_hidden_dim, num_layers=1)
        self.fc_out = nn.Linear(enc_hidden_dim * 2 + dec_hidden_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.attention = attention

    def forward(self, input, hidden, cell, encoder_outputs):
        """Decode a single target step.

        Args:
            input: Previous target token ids; expected shape ``(batch,)``.
            hidden: Decoder hidden state; expected ``(1, batch, dec_hidden_dim)``.
            cell: Decoder cell state, same shape as ``hidden``.
            encoder_outputs: Sequence-first encoder outputs; expected
                ``(src_len, batch, 2 * enc_hidden_dim)``.

        Returns:
            A tuple ``(prediction, hidden, cell)`` with the vocabulary logits
            for this step and the updated LSTM states.
        """
        # Add a length-1 sequence dimension so the LSTM sees one time step.
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))

        # (batch, 1, src_len) weights x (batch, src_len, feat) -> context.
        weights = self.attention(hidden, encoder_outputs).unsqueeze(1)
        encoder_outputs_bf = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(weights, encoder_outputs_bf).permute(1, 0, 2)

        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))

        features = torch.cat(
            (output.squeeze(0), weighted.squeeze(0), embedded.squeeze(0)),
            dim=1,
        )
        prediction = self.fc_out(features)
        return prediction, hidden, cell
# Usage: build the encoder, the attention module and the decoder that
# consumes them. Encoder and decoder share the same hidden size (512).
encoder = Encoder(
    input_dim=10000,
    emb_dim=256,
    hidden_dim=512,
    n_layers=2,
    dropout=0.5,
)
attention = Attention(
    enc_hidden_dim=512,
    dec_hidden_dim=512,
)
decoder = Decoder(
    output_dim=10000,
    emb_dim=256,
    enc_hidden_dim=512,
    dec_hidden_dim=512,
    dropout=0.5,
    attention=attention,
)
Transformer-Based Translation
class TransformerTranslator:
    """Translate text with a pre-trained Hugging Face seq2seq checkpoint."""

    def __init__(self, model_name: str = "Helsinki-NLP/opus-mt-en-es"):
        """Load the tokenizer and model for ``model_name``.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``.
        """
        # Imported here so ``transformers`` is only needed on instantiation.
        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    def translate(self, text: str) -> str:
        """Translate a single string with the model's default generation settings."""
        encoded = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            return_tensors="pt",
        )
        generated = self.model.generate(**encoded)
        first_sequence = generated[0]
        return self.tokenizer.decode(first_sequence, skip_special_tokens=True)

    def translate_batch(self, texts: list, batch_size: int = 32) -> list:
        """Translate ``texts`` in chunks of ``batch_size``, preserving order.

        Args:
            texts: Strings to translate.
            batch_size: Number of strings sent to the model per call.

        Returns:
            The translations, one per input string.
        """
        results = []
        chunk_starts = range(0, len(texts), batch_size)
        for start in chunk_starts:
            chunk = texts[start:start + batch_size]
            # Padding is needed so the chunk can be stacked into one tensor.
            encoded = self.tokenizer(
                chunk,
                max_length=512,
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            generated = self.model.generate(**encoded)
            results += self.tokenizer.batch_decode(generated, skip_special_tokens=True)
        return results

    def translate_with_options(self, text: str, options: dict) -> dict:
        """Translate ``text`` using generation settings read from ``options``.

        Args:
            text: String to translate.
            options: May contain ``"num_beams"`` (default 4),
                ``"length_penalty"`` (default 1.0) and ``"max_length"``
                (default 512).

        Returns:
            A dict with the ``"translation"`` and the ``"options"`` passed in.
        """
        generation_settings = {
            "num_beams": options.get("num_beams", 4),
            "length_penalty": options.get("length_penalty", 1.0),
            "max_length": options.get("max_length", 512),
        }
        encoded = self.tokenizer(
            text,
            max_length=512,
            truncation=True,
            return_tensors="pt",
        )
        generated = self.model.generate(**encoded, **generation_settings)
        decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        return {"translation": decoded, "options": options}
# Usage: the default checkpoint is "Helsinki-NLP/opus-mt-en-es".
translator = TransformerTranslator()
translation = translator.translate(text="Hello, how are you?")
# Example output: "Hola, ¿cómo estás?" (exact wording depends on the checkpoint)
Multi-Language Translator
class MultiLanguageTranslator:
    """Translate between language pairs, loading one model per pair on demand."""

    def __init__(self):
        """Start with an empty model cache and the list of advertised pairs."""
        # Maps (src_lang, tgt_lang) -> (tokenizer, model).
        self.models = {}
        self.language_pairs = [
            ("en", "es"), ("en", "fr"), ("en", "de"),
            ("en", "zh"), ("en", "ja"), ("en", "ko"),
        ]

    def load_model(self, src_lang: str, tgt_lang: str):
        """Load and cache the ``Helsinki-NLP/opus-mt`` model for a pair.

        Loading is best-effort: a failure is reported on stdout and the
        pair is left out of ``self.models`` rather than raising.

        Args:
            src_lang: Source language code used in the checkpoint name.
            tgt_lang: Target language code used in the checkpoint name.
        """
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

        model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        except Exception as exc:
            # ``Exception`` (not a bare except) so KeyboardInterrupt and
            # SystemExit still propagate; the cause is printed because the
            # failure may be a network or dependency problem, not only a
            # missing checkpoint.
            print(f"Model not found for {src_lang}-{tgt_lang}: {exc}")
        else:
            self.models[(src_lang, tgt_lang)] = (tokenizer, model)

    def translate(self, text: str, src_lang: str, tgt_lang: str) -> str:
        """Translate ``text`` from ``src_lang`` to ``tgt_lang``.

        Args:
            text: String to translate.
            src_lang: Source language code.
            tgt_lang: Target language code.

        Returns:
            The decoded translation.

        Raises:
            KeyError: If no model could be loaded for the language pair.
        """
        pair = (src_lang, tgt_lang)
        if pair not in self.models:
            self.load_model(src_lang, tgt_lang)
        try:
            tokenizer, model = self.models[pair]
        except KeyError:
            # Same exception type as before, but with a message that names
            # the pair instead of a bare tuple.
            raise KeyError(
                f"No translation model available for {src_lang}-{tgt_lang}"
            ) from None
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        translated = model.generate(**inputs)
        return tokenizer.decode(translated[0], skip_special_tokens=True)

    def detect_and_translate(self, text: str, target_lang: str) -> dict:
        """Detect the language of ``text`` and translate it to ``target_lang``.

        Returns:
            A dict with ``"source_language"``, ``"target_language"``,
            ``"original"`` and ``"translation"``.

        Raises:
            KeyError: If no model is available for the detected pair.
        """
        src_lang = self._detect_language(text)
        translation = self.translate(text, src_lang, target_lang)
        return {
            "source_language": src_lang,
            "target_language": target_lang,
            "original": text,
            "translation": translation,
        }

    def _detect_language(self, text: str) -> str:
        """Return the language code reported by ``langdetect.detect``."""
        # NOTE(review): the code is used verbatim in the checkpoint name;
        # confirm langdetect's codes match the opus-mt naming for all inputs.
        from langdetect import detect

        return detect(text)
# Usage: detect the source language, then translate into Spanish.
translator = MultiLanguageTranslator()
result = translator.detect_and_translate(text="Hello world", target_lang="es")
# result holds "source_language", "target_language", "original" and
# "translation", e.g. source_language "en" and translation "Hola mundo".
Translation Evaluator
class TranslationEvaluator:
    """Score a hypothesis translation against a reference translation."""

    def __init__(self):
        """No state is kept; every metric is computed from its arguments."""
        pass

    def bleu_score(self, reference: str, hypothesis: str) -> float:
        """Return NLTK's sentence-level BLEU on whitespace-split tokens."""
        from nltk.translate.bleu_score import sentence_bleu

        references = [reference.split()]
        candidate = hypothesis.split()
        return sentence_bleu(references, candidate)

    def meteor_score(self, reference: str, hypothesis: str) -> float:
        """Return NLTK's METEOR score on whitespace-split tokens."""
        from nltk.translate.meteor_score import meteor_score

        ref_tokens = reference.split()
        hyp_tokens = hypothesis.split()
        return meteor_score([ref_tokens], hyp_tokens)

    def edit_distance(self, reference: str, hypothesis: str) -> int:
        """Return the character-level Levenshtein distance.

        Insertions, deletions and substitutions each cost 1.
        """
        import numpy as np

        n_rows = len(reference) + 1
        n_cols = len(hypothesis) + 1
        table = np.zeros((n_rows, n_cols))
        # Turning a prefix into the empty string costs its length.
        table[:, 0] = np.arange(n_rows)
        table[0, :] = np.arange(n_cols)

        for r, ref_char in enumerate(reference, start=1):
            for c, hyp_char in enumerate(hypothesis, start=1):
                if ref_char == hyp_char:
                    table[r, c] = table[r - 1, c - 1]
                    continue
                cheapest = min(table[r - 1, c], table[r, c - 1], table[r - 1, c - 1])
                table[r, c] = cheapest + 1
        return int(table[n_rows - 1, n_cols - 1])

    def evaluate_pair(self, reference: str, hypothesis: str) -> dict:
        """Return BLEU, METEOR and edit distance for one sentence pair.

        Returns:
            A dict with the keys ``"bleu"``, ``"meteor"`` and
            ``"edit_distance"``.
        """
        bleu = self.bleu_score(reference, hypothesis)
        meteor = self.meteor_score(reference, hypothesis)
        distance = self.edit_distance(reference, hypothesis)
        return {"bleu": bleu, "meteor": meteor, "edit_distance": distance}
# Usage: the reference must be a known-good translation in the SAME language
# as the hypothesis. The metrics measure token/character overlap, so
# comparing the English source sentence with Spanish output is meaningless.
evaluator = TranslationEvaluator()
scores = evaluator.evaluate_pair(
    "El gato está en la alfombra",  # reference (human) translation
    "El gato esta en la alfombra",  # hypothesis (system output)
)
Key Takeaways
- Seq2Seq with attention enables variable-length input/output translation
- Transformers provide state-of-the-art translation quality
- Beam search improves output quality over greedy decoding
- BLEU is the most widely used automatic metric for translation evaluation; METEOR and edit distance give complementary views
- Pre-trained models like MarianMT enable quick deployment