Audio & Speech Data Science
Audio data contains rich information beyond words — emotion, intent, and context. Learn to process and analyze audio for modern ML applications.
Audio Signal Processing
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
def load_and_preprocess(audio_path, sr=16000, pre_emphasis=0.97):
    """Load an audio file, apply pre-emphasis, and peak-normalize it.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        sr: Target sample rate passed to ``librosa.load``.
        pre_emphasis: Coefficient ``a`` of the first-order filter
            ``y'[n] = y[n] - a * y[n - 1]``. Defaults to 0.97.

    Returns:
        Tuple ``(y_pre, sr)``: the filtered signal divided by its peak
        absolute value, and the sample rate returned by ``librosa.load``.
        An empty recording is returned unchanged.
    """
    y, sr = librosa.load(audio_path, sr=sr)

    # An empty recording has no first sample to seed the filter with and no
    # peak to normalize by, so return it untouched instead of raising.
    if y.size == 0:
        return y, sr

    # Pre-emphasis: keep the first sample as-is, then subtract a scaled copy
    # of the previous sample from every later sample.
    y_pre = np.append(y[0], y[1:] - pre_emphasis * y[:-1])

    # Peak-normalize; the epsilon avoids dividing by zero on all-zero audio.
    y_pre = y_pre / (np.max(np.abs(y_pre)) + 1e-8)
    return y_pre, sr
def extract_mfcc(y, sr, n_mfcc=13, n_fft=512, hop_length=160):
    """Return MFCCs stacked with their first- and second-order deltas.

    Args:
        y: Audio signal passed to ``librosa.feature.mfcc``.
        sr: Sample rate of ``y``.
        n_mfcc: Number of cepstral coefficients to compute.
        n_fft: FFT window size.
        hop_length: Hop between successive frames, in samples.

    Returns:
        The coefficient matrix, its delta and its delta-delta, stacked
        vertically in that order with ``np.vstack``.
    """
    coefficients = librosa.feature.mfcc(
        y=y,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_length,
    )

    # Both derivatives are taken from the static coefficients.
    first_order = librosa.feature.delta(coefficients)
    second_order = librosa.feature.delta(coefficients, order=2)

    return np.vstack((coefficients, first_order, second_order))
def extract_mel_spectrogram(y, sr, n_mels=128, n_fft=512, hop_length=160):
    """Compute a mel spectrogram of ``y`` and return it on a dB scale.

    Args:
        y: Audio signal passed to ``librosa.feature.melspectrogram``.
        sr: Sample rate of ``y``.
        n_mels: Number of mel bands.
        n_fft: FFT window size.
        hop_length: Hop between successive frames, in samples.

    Returns:
        The mel spectrogram converted with ``librosa.power_to_db`` using the
        spectrogram's own maximum as the reference.
    """
    analysis_params = dict(n_mels=n_mels, n_fft=n_fft, hop_length=hop_length)
    power = librosa.feature.melspectrogram(y=y, sr=sr, **analysis_params)
    return librosa.power_to_db(power, ref=np.max)
def extract_features(y, sr):
    """Extract a dictionary of spectral, cepstral, chroma and tempo features.

    Args:
        y: Audio signal.
        sr: Sample rate of ``y``.

    Returns:
        Dict with these keys:
        ``spectral_centroid``, ``spectral_bandwidth``, ``spectral_rolloff``
        and ``zero_crossing_rate`` hold the first row of the corresponding
        librosa feature. ``mfcc`` (20 coefficients) and ``chroma`` hold the
        full feature matrices. ``tempo`` holds the first element of the
        tempo estimate.
    """
    # librosa 0.10 moved the tempo estimator to ``librosa.feature.tempo`` and
    # deprecated ``librosa.beat.tempo``. Prefer the new location and fall
    # back to the old one so earlier librosa releases keep working.
    estimate_tempo = getattr(librosa.feature, "tempo", None)
    if estimate_tempo is None:
        estimate_tempo = librosa.beat.tempo

    features = {}

    # Spectral features: each call returns one row per signal; keep that row.
    features['spectral_centroid'] = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    features['spectral_bandwidth'] = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
    features['spectral_rolloff'] = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
    features['zero_crossing_rate'] = librosa.feature.zero_crossing_rate(y)[0]

    # MFCC
    features['mfcc'] = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)

    # Chroma
    features['chroma'] = librosa.feature.chroma_stft(y=y, sr=sr)

    # Tempo
    features['tempo'] = estimate_tempo(y=y, sr=sr)[0]

    return features
# Visualize audio features
def plot_audio_features(y, sr):
    """Show waveform, spectrogram, MFCC and mel spectrogram in one figure.

    Draws four vertically stacked panels for the signal ``y`` sampled at
    ``sr`` and displays the figure with ``plt.show()``. Returns nothing.
    """
    fig, axes = plt.subplots(4, 1, figsize=(12, 10))
    wave_ax, spec_ax, mfcc_ax, mel_ax = axes

    # Panel 1: raw waveform.
    librosa.display.waveshow(y, sr=sr, ax=wave_ax)
    wave_ax.set_title('Waveform')

    # Panel 2: STFT magnitude in dB relative to its maximum.
    magnitude = np.abs(librosa.stft(y))
    stft_db = librosa.amplitude_to_db(magnitude, ref=np.max)
    librosa.display.specshow(stft_db, sr=sr, x_axis='time', y_axis='hz', ax=spec_ax)
    spec_ax.set_title('Spectrogram')

    # Panel 3: 13 MFCCs over time.
    cepstrum = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    librosa.display.specshow(cepstrum, sr=sr, x_axis='time', ax=mfcc_ax)
    mfcc_ax.set_title('MFCC')

    # Panel 4: mel spectrogram in dB relative to its maximum.
    mel_power = librosa.feature.melspectrogram(y=y, sr=sr)
    mel_power_db = librosa.power_to_db(mel_power, ref=np.max)
    librosa.display.specshow(mel_power_db, sr=sr, x_axis='time', y_axis='mel', ax=mel_ax)
    mel_ax.set_title('Mel Spectrogram')

    plt.tight_layout()
    plt.show()
Speech Recognition with wav2vec 2.0
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
import numpy as np
class SpeechRecognizer:
    """Speech-to-text wrapper around a pretrained wav2vec 2.0 CTC model."""

    # Sample rate every recording is resampled to before reaching the model.
    SAMPLE_RATE = 16000

    def __init__(self, model_name="facebook/wav2vec2-large-960h"):
        """Load the processor and CTC model for ``model_name`` in eval mode."""
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
        self.model.eval()

    def _compute_logits(self, audio_path):
        """Load ``audio_path`` and return ``(audio, logits)`` from one forward pass."""
        audio, _ = librosa.load(audio_path, sr=self.SAMPLE_RATE)
        inputs = self.processor(
            audio,
            sampling_rate=self.SAMPLE_RATE,
            return_tensors="pt",
            padding=True
        )
        # Forward everything the processor produced (including an attention
        # mask when there is one) so both public methods run the model the
        # same way.
        with torch.no_grad():
            logits = self.model(**inputs).logits
        return audio, logits

    @staticmethod
    def _group_word_spans(frame_ids, blank_id, delimiter_id):
        """Group non-blank frame indices into one list per word.

        A word is a run of frames between word-delimiter frames; blank frames
        are skipped and delimiter frames close the word in progress.
        """
        spans = []
        current = []
        for frame, token_id in enumerate(frame_ids):
            if token_id == delimiter_id:
                if current:
                    spans.append(current)
                    current = []
            elif token_id != blank_id:
                current.append(frame)
        if current:
            spans.append(current)
        return spans

    def transcribe(self, audio_path):
        """Transcribe the audio file at ``audio_path`` and return the text."""
        _, logits = self._compute_logits(audio_path)
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.batch_decode(predicted_ids)
        return transcription[0]

    def transcribe_with_timestamps(self, audio_path):
        """Transcribe with word-level timestamps.

        Returns:
            List of dicts, one per word, with keys ``'word'`` (decoded text),
            ``'start'`` and ``'end'`` (seconds from the start of the audio)
            and ``'confidence'`` (mean probability of the predicted token
            over the word's non-blank frames).
        """
        audio, logits = self._compute_logits(audio_path)

        # Per-frame best token and its probability for the single utterance.
        probs = torch.softmax(logits, dim=-1)[0]
        frame_conf, frame_ids = probs.max(dim=-1)
        frame_ids = frame_ids.tolist()
        frame_conf = frame_conf.tolist()
        if not frame_ids:
            return []

        # Audio duration divided evenly over the output frames.
        time_per_frame = len(audio) / (len(frame_ids) * self.SAMPLE_RATE)

        tokenizer = self.processor.tokenizer
        # The tokenizer's pad token is used as the CTC blank.
        blank_token_id = tokenizer.pad_token_id
        delimiter_id = getattr(tokenizer, "word_delimiter_token_id", None)

        timestamps = []
        for span in self._group_word_spans(frame_ids, blank_token_id, delimiter_id):
            first, last = span[0], span[-1]
            timestamps.append({
                # Decode the raw frame ids of the span, blanks included, so
                # the tokenizer's CTC collapsing keeps genuine double letters.
                'word': self.processor.decode(frame_ids[first:last + 1]),
                'start': first * time_per_frame,
                'end': (last + 1) * time_per_frame,
                'confidence': sum(frame_conf[i] for i in span) / len(span)
            })
        return timestamps
# Fine-tuning for custom domain
def fine_tune_wav2vec():
    """Prepare a wav2vec 2.0 CTC model and a tokenized dataset for fine-tuning.

    Loads the ``facebook/wav2vec2-base`` processor and model, loads the
    placeholder dataset ``"your_custom_dataset"``, and maps every example to
    model inputs plus CTC label ids. No training is run here.

    Returns:
        Tuple ``(model, processor, tokenized_dataset)``.
    """
    from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
    from datasets import load_dataset

    model_name = "facebook/wav2vec2-base"
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(
        model_name,
        # len() counts added tokens as well, so every label id has an output.
        vocab_size=len(processor.tokenizer),
        # The CTC loss uses the pad token as blank; keep it aligned with the
        # tokenizer that produces the labels.
        pad_token_id=processor.tokenizer.pad_token_id,
    )

    # Load custom dataset
    dataset = load_dataset("your_custom_dataset")

    def preprocess_function(examples):
        """Turn one example into model inputs and label ids."""
        audio = examples["audio"]
        features = processor(
            audio["array"],
            sampling_rate=audio["sampling_rate"],
        )
        # The processor returns a batch of one. Store the single item without
        # the batch dimension so a padding collator can batch examples later.
        example = {key: value[0] for key, value in features.items()}
        # Tokenize the transcript directly; this replaces the deprecated
        # ``as_target_processor()`` context manager.
        example["labels"] = processor.tokenizer(examples["text"]).input_ids
        return example

    tokenized_dataset = dataset.map(
        preprocess_function,
        remove_columns=dataset["train"].column_names,
    )
    return model, processor, tokenized_dataset
Audio Classification
import torch
import torch.nn as nn
import torchaudio
class AudioClassifier(nn.Module):
    """CNN + bidirectional LSTM + attention-pooling classifier for spectrograms.

    Args:
        num_classes: Number of output classes (size of the returned logits).
        n_mels: Accepted for interface compatibility. It is not used, because
            the CNN ends in an adaptive pooling layer that collapses the
            frequency axis to size 1 whatever its input height.
        hidden_dim: Hidden size of each LSTM direction.
    """

    def __init__(self, num_classes, n_mels=128, hidden_dim=256):
        super().__init__()

        # CNN for spectrogram features. Two 2x2 max-pools shrink frequency
        # and time by 4; the final pool averages frequency down to one row
        # and leaves the time axis as it is.
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, None))
        )

        # LSTM for temporal modeling; input_size matches the CNN's 128 channels.
        self.lstm = nn.LSTM(
            input_size=128,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.3
        )

        # Attention: one score per time step over the bidirectional output.
        self.attention = nn.Linear(hidden_dim * 2, 1)

        # Classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            x: Spectrogram batch, either ``(batch, 1, freq, time)`` or
                ``(batch, freq, time)``. A missing channel axis is added.
        """
        # Accept spectrograms without the singleton channel axis.
        if x.dim() == 3:
            x = x.unsqueeze(1)

        # CNN features: (batch, 128, 1, time')
        cnn_out = self.cnn(x)

        # Reshape for LSTM: (batch, time', 128)
        lstm_in = cnn_out.squeeze(2).permute(0, 2, 1)

        # LSTM: (batch, time', 2 * hidden_dim)
        lstm_out, _ = self.lstm(lstm_in)

        # Attention pooling: softmax over time, then a weighted sum.
        attention_weights = torch.softmax(self.attention(lstm_out), dim=1)
        context = torch.sum(attention_weights * lstm_out, dim=1)

        # Classify
        return self.classifier(context)
# Audio data augmentation
class AudioAugmentation:
    """Waveform and spectrogram augmentations for audio training data.

    Args:
        sr: Sample rate of the waveforms, used by ``pitch_shift``.
    """

    def __init__(self, sr=16000):
        self.sr = sr

    def time_stretch(self, y, rate):
        """Return ``y`` time-stretched by ``rate`` via ``librosa.effects.time_stretch``."""
        return librosa.effects.time_stretch(y, rate=rate)

    def pitch_shift(self, y, n_steps):
        """Return ``y`` pitch-shifted by ``n_steps`` via ``librosa.effects.pitch_shift``."""
        return librosa.effects.pitch_shift(y, sr=self.sr, n_steps=n_steps)

    def add_noise(self, y, noise_level=0.005):
        """Return ``y`` plus Gaussian noise scaled by ``noise_level``."""
        noise = np.random.randn(len(y)) * noise_level
        return y + noise

    def time_mask(self, y, max_mask_time=0.1):
        """Return a copy of ``y`` with one contiguous stretch set to zero.

        Args:
            y: 1-D waveform array; it is not modified.
            max_mask_time: Fraction of the signal length to mask. The mask
                always has this length; only its position is random.
        """
        total = len(y)
        # Clamp so a fraction >= 1 masks everything and a negative fraction
        # masks nothing, instead of producing an invalid random range.
        mask_length = min(max(int(total * max_mask_time), 0), total)

        # Number of candidate start positions. When the mask covers the whole
        # signal (or the signal is empty) the only valid start is 0.
        num_starts = total - mask_length
        start = np.random.randint(0, num_starts) if num_starts > 0 else 0

        y_masked = y.copy()
        y_masked[start:start + mask_length] = 0
        return y_masked

    def spec_augment(self, spec, num_freq_mask=2, num_time_mask=2,
                     max_freq=10, max_time=20, mask_value=0):
        """SpecAugment for spectrograms.

        Args:
            spec: 2-D array indexed ``[frequency, time]``; it is not modified.
            num_freq_mask: Number of frequency masks to draw.
            num_time_mask: Number of time masks to draw.
            max_freq: Exclusive upper bound on the width of a frequency mask.
            max_time: Exclusive upper bound on the width of a time mask.
            mask_value: Value written into masked cells. The default 0 is the
                maximum of a dB spectrogram referenced to its own peak, so
                pass the spectrogram's floor for such inputs.

        Returns:
            A masked copy of ``spec``.
        """
        augmented = spec.copy()
        n_freq, n_time = spec.shape[0], spec.shape[1]

        # Cap each mask width at the axis length so the start-offset range is
        # never empty on small spectrograms. A non-positive cap means there
        # is nothing to mask along that axis.
        freq_cap = min(max_freq, n_freq)
        time_cap = min(max_time, n_time)

        if freq_cap > 0:
            for _ in range(num_freq_mask):
                f = np.random.randint(0, freq_cap)
                f0 = np.random.randint(0, n_freq - f)
                augmented[f0:f0 + f, :] = mask_value

        if time_cap > 0:
            for _ in range(num_time_mask):
                t = np.random.randint(0, time_cap)
                t0 = np.random.randint(0, n_time - t)
                augmented[:, t0:t0 + t] = mask_value

        return augmented
Best Practices
- Always resample to a consistent sample rate (16 kHz for speech)
- Use mel spectrograms as input for most audio classification tasks
- Data augmentation is crucial — time stretching, pitch shifting, noise addition
- Fine-tune pre-trained models for better performance with less data
- Evaluate with WER (Word Error Rate) for speech recognition tasks