🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services · ℹ️ About · ✉️ Contact · View Pricing Plans from $10

Machine Translation

🟢 Free Lesson

Advertisement

Machine Translation

Source Text ("Hello World", English) -> Encoder (BiLSTM / Transformer; Context Vectors) -> Attention (Self-Attention, Cross-Attention) -> Decoder (Autoregressive Token Generation) -> Target Text ("Hola Mundo", Spanish)
Training Strategy: Teacher Forcing · Inference Decoding: Beam Search · Evaluation Metric: BLEU Score
Pipeline: Tokenize -> Embed -> Encode -> Attend -> Decode -> Detokenize

Machine translation converts text from one language to another using sequence-to-sequence models with attention mechanisms and transformer architectures.

Seq2Seq with Attention

import torch
import torch.nn as nn
import random

class Encoder(nn.Module):
    """Bidirectional LSTM encoder that also produces an initial decoder state.

    The final forward and backward states of the top LSTM layer are
    concatenated and projected (through ``tanh``) down to ``hidden_dim`` so
    they can seed a unidirectional, single-layer decoder.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        emb_dim: Embedding width.
        hidden_dim: LSTM hidden size per direction.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the embedding and between LSTM layers.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
            bidirectional=True,
        )
        # Forward + backward states are concatenated, hence twice the width.
        bidirectional_dim = hidden_dim * 2
        self.fc_hidden = nn.Linear(bidirectional_dim, hidden_dim)
        self.fc_cell = nn.Linear(bidirectional_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` and return ``(outputs, hidden, cell)``.

        ``src`` holds token indices; the LSTM is built without ``batch_first``,
        so the layout is presumably ``(src_len, batch)`` -- confirm against the
        caller. ``hidden`` and ``cell`` are returned with a leading axis of
        size 1.
        """
        embedded = self.dropout(self.embedding(src))
        outputs, (h_n, c_n) = self.rnn(embedded)

        # The last two entries are the two directions of the top layer.
        top_hidden = torch.cat([h_n[-2], h_n[-1]], dim=1)
        top_cell = torch.cat([c_n[-2], c_n[-1]], dim=1)

        hidden = torch.tanh(self.fc_hidden(top_hidden))
        cell = torch.tanh(self.fc_cell(top_cell))
        return outputs, hidden[None], cell[None]

class Attention(nn.Module):
    """Additive attention over encoder outputs.

    Scores each source position from the concatenation of the decoder state
    and that position's encoder output, then normalises the scores with a
    softmax over the source axis.

    Args:
        enc_hidden_dim: Encoder hidden size per direction (encoder outputs are
            expected to be twice this wide).
        dec_hidden_dim: Decoder hidden size.
    """

    def __init__(self, enc_hidden_dim, dec_hidden_dim):
        super().__init__()
        concat_dim = enc_hidden_dim * 2 + dec_hidden_dim
        self.attn = nn.Linear(concat_dim, dec_hidden_dim)
        self.v = nn.Linear(dec_hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights with the source axis last.

        ``encoder_outputs`` is indexed as ``(src_len, batch, features)``;
        ``hidden`` is tiled ``src_len`` times along its first axis, so it is
        presumably ``(1, batch, dec_hidden_dim)`` -- confirm against the caller.
        """
        n_positions = encoder_outputs.shape[0]

        # Copy the decoder state once per source position, then go batch-major.
        tiled_state = hidden.repeat(n_positions, 1, 1).permute(1, 0, 2)
        batch_major_outputs = encoder_outputs.permute(1, 0, 2)

        features = torch.cat((tiled_state, batch_major_outputs), dim=2)
        scores = self.v(torch.tanh(self.attn(features))).squeeze(2)
        return torch.softmax(scores, dim=1)

class Decoder(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    Each call consumes one target token per batch element, builds an
    attention-weighted summary of the encoder outputs, feeds
    ``[embedding; summary]`` to a one-layer LSTM and projects
    ``[lstm_output; summary; embedding]`` to vocabulary logits.

    Args:
        output_dim: Size of the target vocabulary.
        emb_dim: Embedding width.
        enc_hidden_dim: Encoder hidden size per direction (encoder outputs are
            expected to be twice this wide).
        dec_hidden_dim: Decoder LSTM hidden size.
        dropout: Dropout probability applied to the token embedding.
        attention: Callable ``(hidden, encoder_outputs) -> weights`` whose
            result has the source axis last.
    """

    def __init__(self, output_dim, emb_dim, enc_hidden_dim, dec_hidden_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # nn.LSTM's keyword is ``num_layers``; the previous ``n_layers=1``
        # raised TypeError at construction. Inter-layer dropout is not passed
        # because it has no effect on a single-layer LSTM and only triggers a
        # warning.
        self.rnn = nn.LSTM(emb_dim + enc_hidden_dim * 2, dec_hidden_dim, num_layers=1)
        self.fc_out = nn.Linear(enc_hidden_dim * 2 + dec_hidden_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.attention = attention

    def forward(self, input, hidden, cell, encoder_outputs):
        """Run one decoding step and return ``(prediction, hidden, cell)``.

        ``input`` is unsqueezed to gain a leading length-1 time axis, so it is
        presumably a 1-D tensor of token indices (one per batch element) --
        confirm against the caller. ``encoder_outputs`` is indexed as
        ``(src_len, batch, features)``.
        """
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))

        # Attention weights gain a singleton axis so bmm yields one context
        # vector per batch element.
        a = self.attention(hidden, encoder_outputs).unsqueeze(1)
        encoder_outputs_perm = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs_perm).permute(1, 0, 2)

        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))

        # Drop the length-1 time axis before the output projection.
        prediction = self.fc_out(
            torch.cat((output.squeeze(0), weighted.squeeze(0), embedded.squeeze(0)), dim=1)
        )
        return prediction, hidden, cell

# Usage
# Build the three seq2seq components. The attention module and the decoder are
# given the same hidden size (512) as the encoder's per-direction hidden_dim,
# and the decoder receives the attention module it will call at every step.
encoder = Encoder(input_dim=10000, emb_dim=256, hidden_dim=512, n_layers=2, dropout=0.5)
attention = Attention(enc_hidden_dim=512, dec_hidden_dim=512)
decoder = Decoder(output_dim=10000, emb_dim=256, enc_hidden_dim=512, dec_hidden_dim=512, dropout=0.5, attention=attention)

Transformer-Based Translation

class TransformerTranslator:
    """Thin wrapper around a pretrained Hugging Face seq2seq translation model.

    The tokenizer and model are loaded once in the constructor and reused by
    every translate call.
    """

    def __init__(self, model_name: str = "Helsinki-NLP/opus-mt-en-es"):
        """Load the tokenizer and model identified by ``model_name``."""
        # Imported lazily so the module can be imported without transformers.
        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    def translate(self, text: str) -> str:
        """Translate one string with the model's default generation settings."""
        encoded = self.tokenizer(
            text, return_tensors="pt", truncation=True, max_length=512
        )
        generated = self.model.generate(**encoded)
        first_sequence = generated[0]
        return self.tokenizer.decode(first_sequence, skip_special_tokens=True)

    def translate_batch(self, texts: list, batch_size: int = 32) -> list:
        """Translate ``texts`` in chunks of ``batch_size``, preserving order."""
        results = []
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            # Padding lets differently sized inputs share one tensor.
            encoded = self.tokenizer(
                chunk,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512,
            )
            generated = self.model.generate(**encoded)
            results += self.tokenizer.batch_decode(generated, skip_special_tokens=True)
        return results

    def translate_with_options(self, text: str, options: dict) -> dict:
        """Translate ``text`` using generation settings taken from ``options``.

        Recognised keys (with the fallback used when a key is absent):
        ``num_beams`` (4), ``length_penalty`` (1.0), ``max_length`` (512).
        Returns a dict holding the translation and the ``options`` passed in.
        """
        encoded = self.tokenizer(
            text, return_tensors="pt", truncation=True, max_length=512
        )
        generation_settings = {
            "num_beams": options.get("num_beams", 4),
            "length_penalty": options.get("length_penalty", 1.0),
            "max_length": options.get("max_length", 512),
        }
        generated = self.model.generate(**encoded, **generation_settings)
        decoded = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        return {"translation": decoded, "options": options}

# Usage
# Called with no arguments, so the constructor's default model name is used.
translator = TransformerTranslator()
translation = translator.translate("Hello, how are you?")
# Illustrative output: "Hola, como estas?" (exact text depends on the model).

Multi-Language Translator

class MultiLanguageTranslator:
    """Translate between language pairs with lazily loaded OPUS-MT models.

    Models are fetched on first use under the name
    ``Helsinki-NLP/opus-mt-{src}-{tgt}`` and cached in ``self.models`` keyed
    by ``(src_lang, tgt_lang)``.
    """

    def __init__(self):
        # Cache: (src_lang, tgt_lang) -> (tokenizer, model).
        self.models = {}
        # Advertised language pairs; informational, not read by this class.
        self.language_pairs = [
            ("en", "es"), ("en", "fr"), ("en", "de"),
            ("en", "zh"), ("en", "ja"), ("en", "ko")
        ]

    def load_model(self, src_lang: str, tgt_lang: str):
        """Best-effort load of the model for ``src_lang`` -> ``tgt_lang``.

        On success the pair is added to ``self.models``. When the model
        cannot be loaded a message is printed and the cache is left untouched;
        no exception is raised for that case.
        """
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
        model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        except (OSError, ValueError):
            # Narrowed from a bare ``except:`` so KeyboardInterrupt, SystemExit
            # and genuine programming errors are no longer swallowed.
            print(f"Model not found for {src_lang}-{tgt_lang}")
        else:
            self.models[(src_lang, tgt_lang)] = (tokenizer, model)

    def translate(self, text: str, src_lang: str, tgt_lang: str) -> str:
        """Translate ``text`` from ``src_lang`` to ``tgt_lang``.

        Returns ``text`` unchanged when source and target are the same
        language and no model has been registered for that identity pair.

        Raises:
            KeyError: If no model could be loaded for the requested pair.
        """
        pair = (src_lang, tgt_lang)

        # There is nothing to translate, and no "xx-xx" model to load.
        if src_lang == tgt_lang and pair not in self.models:
            return text

        if pair not in self.models:
            self.load_model(src_lang, tgt_lang)

        try:
            tokenizer, model = self.models[pair]
        except KeyError:
            # Same exception type as before, but with a message that names the
            # missing pair instead of echoing the raw tuple key.
            raise KeyError(
                f"No translation model available for {src_lang}-{tgt_lang}"
            ) from None

        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        translated = model.generate(**inputs)
        return tokenizer.decode(translated[0], skip_special_tokens=True)

    def detect_and_translate(self, text: str, target_lang: str) -> dict:
        """Detect the language of ``text`` and translate it to ``target_lang``.

        Returns a dict with the keys ``source_language``, ``target_language``,
        ``original`` and ``translation``.
        """
        src_lang = self._detect_language(text)
        translation = self.translate(text, src_lang, target_lang)
        return {
            "source_language": src_lang,
            "target_language": target_lang,
            "original": text,
            "translation": translation
        }

    def _detect_language(self, text: str) -> str:
        """Return the language code that ``langdetect.detect`` reports for ``text``."""
        from langdetect import detect
        return detect(text)

# Usage
# Note: this rebinds the module-level name `translator`.
translator = MultiLanguageTranslator()
result = translator.detect_and_translate("Hello world", "es")
# Result keys (values illustrative; detection and model output may differ):
# {"source_language": "en", "target_language": "es", "original": "Hello world", "translation": "Hola mundo"}

Translation Evaluator

class TranslationEvaluator:
    """Score a hypothesis translation against a single reference.

    The BLEU and METEOR scores work on whitespace-split tokens; the edit
    distance works on characters.
    """

    def bleu_score(self, reference: str, hypothesis: str) -> float:
        """Return NLTK's sentence-level BLEU for whitespace-split tokens."""
        from nltk.translate.bleu_score import sentence_bleu
        reference_tokens = reference.split()
        hypothesis_tokens = hypothesis.split()
        return sentence_bleu([reference_tokens], hypothesis_tokens)

    def meteor_score(self, reference: str, hypothesis: str) -> float:
        """Return NLTK's METEOR score for whitespace-split tokens."""
        from nltk.translate.meteor_score import meteor_score
        return meteor_score([reference.split()], hypothesis.split())

    def edit_distance(self, reference: str, hypothesis: str) -> int:
        """Return the character-level Levenshtein distance between the strings.

        Insertions, deletions and substitutions each cost 1. Only two rows of
        the dynamic-programming table are kept, so memory grows with
        ``len(hypothesis)`` instead of the product of both lengths, and the
        arithmetic stays in plain integers.
        """
        # Row 0: turning an empty reference prefix into hypothesis[:j] takes j inserts.
        previous_row = list(range(len(hypothesis) + 1))
        for i, ref_char in enumerate(reference, start=1):
            # Column 0: turning reference[:i] into an empty string takes i deletes.
            current_row = [i]
            for j, hyp_char in enumerate(hypothesis, start=1):
                if ref_char == hyp_char:
                    current_row.append(previous_row[j - 1])
                else:
                    current_row.append(
                        1 + min(previous_row[j], current_row[j - 1], previous_row[j - 1])
                    )
            previous_row = current_row
        return previous_row[-1]

    def evaluate_pair(self, reference: str, hypothesis: str) -> dict:
        """Return a dict with the ``bleu``, ``meteor`` and ``edit_distance`` scores."""
        return {
            "bleu": self.bleu_score(reference, hypothesis),
            "meteor": self.meteor_score(reference, hypothesis),
            "edit_distance": self.edit_distance(reference, hypothesis)
        }

# Usage
# `scores` is a dict with the keys "bleu", "meteor" and "edit_distance".
# NOTE(review): the reference here is English while the hypothesis is Spanish;
# these overlap metrics presumably expect both in the target language -- confirm
# that this pairing is intended for the demo.
evaluator = TranslationEvaluator()
scores = evaluator.evaluate_pair("The cat is on the mat", "El gato esta en la alfombra")

Key Takeaways

  • Seq2Seq with attention enables variable-length input/output translation
  • Transformers provide state-of-the-art translation quality
  • Beam search improves output quality over greedy decoding
  • BLEU score is the standard metric for translation evaluation
  • Pre-trained models like MarianMT enable quick deployment
⭐

Premium Content

Machine Translation

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯 End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert Generative AI Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement