Speech and Audio Generation
Text-to-Speech (TTS)
from transformers import pipeline
def text_to_speech(text, model="facebook/mms-tts-eng"):
    """Synthesize speech for ``text`` with a transformers TTS pipeline.

    A fresh ``"text-to-speech"`` pipeline is built for ``model`` on every
    call and then invoked on ``text``.

    Args:
        text: The text to be spoken.
        model: Model identifier handed to ``pipeline``.

    Returns:
        The pipeline's output for ``text``, returned unmodified.
    """
    synthesizer = pipeline("text-to-speech", model=model)
    return synthesizer(text)
# Advanced TTS with voice cloning
from tortoise.api import TextToSpeech


def clone_voice(text, voice_samples):
    """Speak ``text`` in the voice captured by ``voice_samples``.

    Builds a Tortoise ``TextToSpeech`` engine and runs it with the
    ``"ultra_fast"`` preset.

    Args:
        text: The text to be spoken.
        voice_samples: Reference recordings of the target speaker, passed
            through unchanged to ``tts_with_preset``.
            NOTE(review): presumably audio clips as produced by Tortoise's
            own voice-loading helpers -- confirm the expected format.

    Returns:
        Whatever ``tts_with_preset`` returns for the request.
    """
    # The engine class exported by tortoise.api is TextToSpeech; the
    # previously used name TextToSpeaker does not exist in that module.
    engine = TextToSpeech()
    return engine.tts_with_preset(
        text,
        voice_samples=voice_samples,
        preset="ultra_fast",
    )
Speech-to-Text (STT)
import whisper
def speech_to_text(audio_path, model_size="base"):
    """Transcribe the audio at ``audio_path`` with Whisper.

    The Whisper model named by ``model_size`` is loaded on every call.

    Args:
        audio_path: Passed straight to the model's ``transcribe`` method.
        model_size: Model name handed to ``whisper.load_model``.

    Returns:
        A dict holding the ``"text"``, ``"language"`` and ``"segments"``
        entries of the transcription result.
    """
    transcription = whisper.load_model(model_size).transcribe(audio_path)
    wanted_keys = ("text", "language", "segments")
    return {key: transcription[key] for key in wanted_keys}
Audio Generation
# Music generation
from audiocraft.models import MusicGen
def generate_music(description, duration=10):
    """Generate music from a text ``description`` with MusicGen.

    The pretrained ``facebook/musicgen-small`` checkpoint is loaded on
    every call.

    Args:
        description: Text prompt; it is sent to the model as a
            one-element batch.
        duration: Forwarded to ``set_generation_params``.
            NOTE(review): presumably the clip length in seconds -- confirm.

    Returns:
        The output of ``generate`` for the single-prompt batch.
    """
    generator = MusicGen.get_pretrained('facebook/musicgen-small')
    generator.set_generation_params(duration=duration)
    prompts = [description]
    return generator.generate(prompts)
# Sound effects
def generate_sound_effect(description):
    """Generate a sound effect from a text ``description``.

    Args:
        description: Text prompt passed directly to the model's
            ``generate`` method.

    Returns:
        Whatever the loaded model's ``generate`` returns.
    """
    # NOTE(review): load_audio_model is not defined in the visible part of
    # this file -- confirm it is provided elsewhere before relying on this.
    return load_audio_model().generate(description)
Summary
Speech and audio models (text-to-speech, speech-to-text, and audio generation) enable natural-language voice interfaces and creative audio content creation. They're essential for voice assistants, accessibility, and media production.
Next: We'll explore multimodal models.