Introduction
Text generation uses language models to produce coherent text based on prompts.
Generation Pipeline
from transformers import pipeline

# Using GPT-2
generator = pipeline('text-generation', model='gpt2')
# Asking for more than one sequence requires sampling (or beam search), so
# sampling is requested explicitly rather than relied on as a model default.
result = generator(
    "Once upon a time",
    max_length=50,
    num_return_sequences=2,
    do_sample=True,
)
# Each returned item is a dict; the text is stored under 'generated_text'.
for r in result:
    print(r['generated_text'])
Sampling Methods
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained('gpt2')

# Greedy (no sampling): always pick the single most likely next token.
inputs = tokenizer("The sky is", return_tensors='pt')
greedy = model.generate(**inputs, do_sample=False, max_new_tokens=10)

# Sampling with temperature: values below 1.0 sharpen the distribution.
# top_k=0 switches off the library's default top-k filter (50) so that only
# the temperature shapes the distribution being sampled from.
sampled = model.generate(
    **inputs, do_sample=True, temperature=0.7, top_k=0, max_new_tokens=10
)

# Top-k sampling: sample only among the 50 most likely next tokens.
sampled_k = model.generate(**inputs, do_sample=True, top_k=50, max_new_tokens=10)

# Top-p (nucleus) sampling: sample from the smallest set of tokens whose
# cumulative probability reaches 0.9. top_k=0 disables the default top-k
# filter so the nucleus is not additionally capped at 50 tokens.
sampled_p = model.generate(
    **inputs, do_sample=True, top_p=0.9, top_k=0, max_new_tokens=10
)
Beam Search
# Beam search: track the 5 most likely candidate sequences and return the best one
# Beam-search settings collected in one place: 5 beams, no repeated 2-grams,
# and at most 10 newly generated tokens.
_beam_kwargs = {
    "num_beams": 5,
    "no_repeat_ngram_size": 2,
    "max_new_tokens": 10,
}
outputs = model.generate(**inputs, **_beam_kwargs)
Custom Training
# Fine-tune GPT-2 on custom data
from transformers import (
    DataCollatorForLanguageModeling,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
)

model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# GPT-2 ships without a padding token; reuse the end-of-sequence token so
# examples of different lengths can be padded into one batch.
tokenizer.pad_token = tokenizer.eos_token

# With mlm=False the collator pads each batch and copies input_ids into
# labels, which the causal LM needs in order to return a training loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Prepare data and train
# NOTE(review): tokenized_dataset is not defined in this snippet; it must be
# created beforehand (presumably examples tokenized with `tokenizer` above).
training_args = TrainingArguments(output_dir='./results', num_train_epochs=3)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer.train()
Practice Problems
- Generate text with GPT-2
- Use different sampling methods
- Implement beam search
- Fine-tune a language model
- Control generation with parameters