Audio & Speech Data Science

Audio data contains rich information beyond words – emotion, intent, and context. Learn to process and analyze audio for modern ML applications.

Audio Signal Processing

import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

def load_and_preprocess(audio_path, sr=16000, pre_emphasis=0.97):
    """Load an audio file, apply pre-emphasis, and peak-normalize it.

    Args:
        audio_path: Audio source handed to ``librosa.load``.
        sr: Sample rate handed to ``librosa.load``.
        pre_emphasis: Coefficient ``a`` of the first-order filter
            ``y[n] - a * y[n - 1]``. The default matches the previously
            hard-coded value.

    Returns:
        Tuple ``(y_pre, sr)``: the filtered signal divided by its peak
        absolute value, and the sample rate returned by ``librosa.load``.
        An empty clip is returned unchanged.
    """
    y, sr = librosa.load(audio_path, sr=sr)

    # An empty clip has no first sample to seed the filter with and no peak
    # to normalize by; hand it back instead of failing on y[0] / np.max.
    if y.size == 0:
        return y, sr

    # Pre-emphasis: keep the first sample, difference every later one.
    y_pre = np.append(y[0], y[1:] - pre_emphasis * y[:-1])

    # The small epsilon keeps an all-zero (silent) clip from dividing by zero.
    y_pre = y_pre / (np.max(np.abs(y_pre)) + 1e-8)

    return y_pre, sr

def extract_mfcc(y, sr, n_mfcc=13, n_fft=512, hop_length=160):
    """Compute MFCCs together with their first- and second-order deltas.

    Args:
        y: Audio signal passed to ``librosa.feature.mfcc``.
        sr: Sample rate of ``y``.
        n_mfcc: Number of cepstral coefficients to request.
        n_fft: FFT window size.
        hop_length: Hop between successive frames, in samples.

    Returns:
        The static coefficients, the delta and the delta-delta stacked
        vertically (``np.vstack``) in that order.
    """
    static = librosa.feature.mfcc(
        y=y,
        sr=sr,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_length,
    )

    # Order 1 is the delta, order 2 the delta-delta; both are taken from the
    # static coefficients.
    derivatives = [librosa.feature.delta(static, order=order) for order in (1, 2)]

    return np.vstack([static, *derivatives])

def extract_mel_spectrogram(y, sr, n_mels=128, n_fft=512, hop_length=160):
    """Compute a mel spectrogram and convert it to decibels.

    Args:
        y: Audio signal passed to ``librosa.feature.melspectrogram``.
        sr: Sample rate of ``y``.
        n_mels: Number of mel bands.
        n_fft: FFT window size.
        hop_length: Hop between successive frames, in samples.

    Returns:
        The mel spectrogram in dB, referenced to its own maximum
        (``ref=np.max``).
    """
    analysis_params = {
        "n_mels": n_mels,
        "n_fft": n_fft,
        "hop_length": hop_length,
    }
    power = librosa.feature.melspectrogram(y=y, sr=sr, **analysis_params)

    # dB relative to the loudest bin of this spectrogram.
    return librosa.power_to_db(power, ref=np.max)

def extract_features(y, sr):
    """Extract a dictionary of spectral, cepstral, chroma and tempo features.

    Args:
        y: Audio signal.
        sr: Sample rate of ``y``.

    Returns:
        Dict with the keys ``spectral_centroid``, ``spectral_bandwidth``,
        ``spectral_rolloff``, ``zero_crossing_rate`` (first row of each
        librosa result), ``mfcc`` (20 coefficients), ``chroma`` and
        ``tempo`` (first element of the tempo estimate).
    """
    features = {}

    # Spectral features: each call returns a 2-D result; keep its first row.
    features['spectral_centroid'] = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    features['spectral_bandwidth'] = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
    features['spectral_rolloff'] = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
    features['zero_crossing_rate'] = librosa.feature.zero_crossing_rate(y)[0]

    # MFCC
    features['mfcc'] = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)

    # Chroma
    features['chroma'] = librosa.feature.chroma_stft(y=y, sr=sr)

    # Tempo: librosa 0.10 moved the estimator to librosa.feature.tempo and
    # deprecated librosa.beat.tempo. Prefer the new location and fall back to
    # the old one so earlier librosa releases keep working.
    try:
        estimate_tempo = librosa.feature.tempo
    except AttributeError:
        estimate_tempo = librosa.beat.tempo
    features['tempo'] = estimate_tempo(y=y, sr=sr)[0]

    return features

# Visualize audio features
def plot_audio_features(y, sr):
    """Show a four-row figure of a signal's main time-frequency views.

    The rows are, top to bottom: waveform, STFT spectrogram in dB, MFCCs
    (13 coefficients) and mel spectrogram in dB. The figure is displayed
    with ``plt.show()``; nothing is returned.

    Args:
        y: Audio signal.
        sr: Sample rate of ``y``.
    """
    _, (ax_wave, ax_stft, ax_mfcc, ax_mel) = plt.subplots(4, 1, figsize=(12, 10))

    # Row 1: raw waveform.
    librosa.display.waveshow(y, sr=sr, ax=ax_wave)
    ax_wave.set_title('Waveform')

    # Row 2: STFT magnitude in dB relative to its maximum.
    stft_magnitude = np.abs(librosa.stft(y))
    stft_db = librosa.amplitude_to_db(stft_magnitude, ref=np.max)
    librosa.display.specshow(stft_db, sr=sr, x_axis='time', y_axis='hz', ax=ax_stft)
    ax_stft.set_title('Spectrogram')

    # Row 3: MFCCs.
    coefficients = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    librosa.display.specshow(coefficients, sr=sr, x_axis='time', ax=ax_mfcc)
    ax_mfcc.set_title('MFCC')

    # Row 4: mel power spectrogram in dB relative to its maximum.
    mel_power = librosa.feature.melspectrogram(y=y, sr=sr)
    mel_power_db = librosa.power_to_db(mel_power, ref=np.max)
    librosa.display.specshow(mel_power_db, sr=sr, x_axis='time', y_axis='mel', ax=ax_mel)
    ax_mel.set_title('Mel Spectrogram')

    plt.tight_layout()
    plt.show()

Speech Recognition with wav2vec 2.0

import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
import numpy as np

class SpeechRecognizer:
    """Speech-to-text wrapper around a pretrained wav2vec 2.0 CTC model.

    The processor and model are loaded once in the constructor and the model
    is switched to evaluation mode. Audio is always loaded at 16 kHz.
    """

    def __init__(self, model_name="facebook/wav2vec2-large-960h"):
        """Load the processor and CTC model identified by ``model_name``."""
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
        self.model.eval()

    def _compute_logits(self, audio_path):
        """Load ``audio_path`` at 16 kHz and return ``(logits, audio, sr)``."""
        audio, sr = librosa.load(audio_path, sr=16000)

        inputs = self.processor(
            audio,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True
        )

        with torch.no_grad():
            # Forward every processor output, not just input_values, so an
            # attention mask reaches the model whenever the processor emits one.
            logits = self.model(**inputs).logits

        return logits, audio, sr

    @staticmethod
    def _group_frames_into_words(frame_ids, frame_probs, id_to_token,
                                 blank_id, delimiter_id, time_per_frame):
        """Collapse per-frame CTC predictions into timed words.

        Args:
            frame_ids: Predicted token id for each frame.
            frame_probs: Probability of the predicted token for each frame.
            id_to_token: Callable mapping a token id to its text.
            blank_id: Id of the CTC blank token.
            delimiter_id: Id of the token that separates words.
            time_per_frame: Duration of one frame in seconds.

        Returns:
            List of dicts with keys ``word``, ``start``, ``end`` and
            ``confidence`` (mean probability over the word's frames).
        """
        # Each entry is [chars, probs, first_frame, last_frame].
        raw_words = []
        current = None
        previous_id = blank_id

        for frame, (token_id, prob) in enumerate(zip(frame_ids, frame_probs)):
            # CTC rule: a token repeated on consecutive frames is one
            # character; a blank in between makes it two.
            repeated = token_id == previous_id
            previous_id = token_id

            if token_id == blank_id:
                continue
            if token_id == delimiter_id:
                # The next character frame opens a new word.
                current = None
                continue

            if current is None:
                current = [[], [], frame, frame]
                raw_words.append(current)
            if not repeated:
                current[0].append(id_to_token(token_id))
            current[1].append(prob)
            current[3] = frame

        return [
            {
                'word': ''.join(chars),
                'start': first * time_per_frame,
                'end': (last + 1) * time_per_frame,
                'confidence': sum(probs) / len(probs),
            }
            for chars, probs, first, last in raw_words
        ]

    def transcribe(self, audio_path):
        """Transcribe the audio at ``audio_path`` and return the text."""
        logits, _, _ = self._compute_logits(audio_path)

        # Greedy CTC decoding: most likely token per frame, then let the
        # processor collapse repeats and blanks.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.batch_decode(predicted_ids)

        return transcription[0]

    def transcribe_with_timestamps(self, audio_path):
        """Transcribe with word-level timestamps.

        Returns:
            List of dicts, one per word, with keys ``word``, ``start`` and
            ``end`` (seconds from the beginning of the clip) and
            ``confidence`` (mean probability of the predicted tokens over the
            frames that make up the word).
        """
        logits, audio, sr = self._compute_logits(audio_path)

        num_frames = logits.shape[1]
        if num_frames == 0:
            return []

        # Probability and id of the most likely token on every frame.
        frame_probs, frame_ids = torch.softmax(logits, dim=-1)[0].max(dim=-1)

        tokenizer = self.processor.tokenizer
        # Frames are spread evenly over the clip; deriving times from the
        # frame index avoids the drift of a running float accumulator.
        time_per_frame = len(audio) / (num_frames * sr)

        return self._group_frames_into_words(
            frame_ids.tolist(),
            frame_probs.tolist(),
            tokenizer.convert_ids_to_tokens,
            tokenizer.pad_token_id,  # used as the CTC blank
            tokenizer.word_delimiter_token_id,
            time_per_frame,
        )

# Fine-tuning for custom domain
def fine_tune_wav2vec():
    """Prepare a wav2vec 2.0 CTC model and dataset for fine-tuning.

    Loads the processor and model for ``facebook/wav2vec2-base``, loads the
    dataset named ``your_custom_dataset`` and maps every example to model
    inputs plus tokenized labels.

    Returns:
        Tuple ``(model, processor, tokenized_dataset)``. The examples are
        stored unpadded and without a batch dimension; padding is left to
        the data collator used at training time.
    """
    from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
    from datasets import load_dataset

    model_name = "facebook/wav2vec2-base"
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(
        model_name,
        # len() counts added tokens too, so every label id fits the CTC head.
        vocab_size=len(processor.tokenizer),
        # The CTC loss uses the configured pad token as its blank symbol.
        pad_token_id=processor.tokenizer.pad_token_id,
    )

    # Load custom dataset
    dataset = load_dataset("your_custom_dataset")

    def preprocess_function(example):
        """Turn one dataset row into model inputs and label ids."""
        audio = example["audio"]
        features = processor(
            audio["array"],
            sampling_rate=audio["sampling_rate"],
        )

        # The processor returns a batch of one; store the single item so the
        # dataset does not carry a spurious leading batch dimension.
        encoded = {name: values[0] for name, values in features.items()}

        # Tokenize the transcript directly instead of going through the
        # deprecated as_target_processor() context manager.
        encoded["labels"] = processor.tokenizer(example["text"]).input_ids
        return encoded

    tokenized_dataset = dataset.map(preprocess_function, remove_columns=dataset["train"].column_names)

    return model, processor, tokenized_dataset

Audio Classification

import torch
import torch.nn as nn
import torchaudio

class AudioClassifier(nn.Module):
    """Spectrogram classifier: CNN front end, BiLSTM, attention pooling.

    Args:
        num_classes: Size of the output layer.
        n_mels: Accepted for interface compatibility; the layers defined
            here do not use it because the frequency axis is pooled
            adaptively to size 1.
        hidden_dim: Hidden size of each LSTM direction.
    """

    def __init__(self, num_classes, n_mels=128, hidden_dim=256):
        super().__init__()

        # Three conv stages. The first two halve both axes with max pooling;
        # the last collapses frequency to 1 and leaves time untouched.
        stage_channels = [(1, 32), (32, 64), (64, 128)]
        final_stage = len(stage_channels) - 1
        conv_layers = []
        for stage, (in_channels, out_channels) in enumerate(stage_channels):
            if stage == final_stage:
                pooling = nn.AdaptiveAvgPool2d((1, None))
            else:
                pooling = nn.MaxPool2d(2)
            conv_layers.extend([
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
                pooling,
            ])
        self.cnn = nn.Sequential(*conv_layers)

        # Temporal model over the 128-channel CNN output.
        self.lstm = nn.LSTM(
            input_size=128,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.3,
        )

        # Both directions are concatenated, hence twice the hidden size.
        recurrent_width = hidden_dim * 2

        # One attention score per time step.
        self.attention = nn.Linear(recurrent_width, 1)

        # Classification head.
        self.classifier = nn.Sequential(
            nn.Linear(recurrent_width, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return class logits for a batch of single-channel spectrograms."""
        conv_maps = self.cnn(x)

        # The frequency axis has size 1 after adaptive pooling; drop it and
        # move time ahead of channels: (batch, time, features).
        n_batch, n_channels, _, n_steps = conv_maps.size()
        sequence = conv_maps.reshape(n_batch, n_channels, n_steps).transpose(1, 2)

        encoded, _ = self.lstm(sequence)

        # Attention pooling: softmax over the time axis, then weighted sum.
        scores = self.attention(encoded)
        weights = torch.softmax(scores, dim=1)
        pooled = (weights * encoded).sum(dim=1)

        return self.classifier(pooled)

# Audio data augmentation
class AudioAugmentation:
    """Random augmentations for waveforms and spectrograms.

    Randomness comes from NumPy's global random state (``np.random``).

    Args:
        sr: Sample rate used by ``pitch_shift``.
    """

    def __init__(self, sr=16000):
        self.sr = sr

    @staticmethod
    def _random_span(size, max_width):
        """Pick a random ``(start, width)`` that fits inside ``size``.

        ``width`` is drawn from ``[0, max_width)`` and clamped to ``size``;
        ``start`` is drawn so that ``start + width <= size``.
        """
        width = np.random.randint(0, max_width) if max_width > 0 else 0
        width = min(width, size)
        # +1 because randint's upper bound is exclusive; this also keeps the
        # range non-empty when the span covers the whole axis.
        start = np.random.randint(0, size - width + 1)
        return start, width

    def time_stretch(self, y, rate):
        """Change the speed of ``y`` by ``rate`` via librosa."""
        return librosa.effects.time_stretch(y, rate=rate)

    def pitch_shift(self, y, n_steps):
        """Shift the pitch of ``y`` by ``n_steps`` via librosa."""
        return librosa.effects.pitch_shift(y, sr=self.sr, n_steps=n_steps)

    def add_noise(self, y, noise_level=0.005):
        """Return ``y`` plus Gaussian noise scaled by ``noise_level``."""
        # Draw noise with the full shape of y so multi-channel input works too.
        noise = np.random.randn(*np.shape(y)) * noise_level
        return y + noise

    def time_mask(self, y, max_mask_time=0.1):
        """Zero a random contiguous span of ``y``.

        Args:
            y: Signal to mask; it is copied, not modified.
            max_mask_time: Fraction of the signal length to zero out.

        Returns:
            A copy of ``y`` with ``int(len(y) * max_mask_time)`` samples
            (clamped to the signal length) set to 0.
        """
        total = len(y)
        # Clamp so fractions outside [0, 1] and empty input stay valid.
        mask_length = min(max(int(total * max_mask_time), 0), total)
        start = np.random.randint(0, total - mask_length + 1)
        y_masked = y.copy()
        y_masked[start:start + mask_length] = 0
        return y_masked

    def spec_augment(self, spec, num_freq_mask=2, num_time_mask=2, max_freq=10, max_time=20,
                     mask_value=0):
        """SpecAugment for spectrograms.

        Args:
            spec: 2-D array indexed as ``[frequency, time]``; it is copied,
                not modified.
            num_freq_mask: Number of frequency bands to mask.
            num_time_mask: Number of time spans to mask.
            max_freq: Exclusive upper bound on the width of a frequency mask.
            max_time: Exclusive upper bound on the width of a time mask.
            mask_value: Value written into masked cells. For a spectrogram in
                dB relative to its peak (``power_to_db(..., ref=np.max)``),
                0 is the loudest value, so pass e.g. ``spec.min()`` instead.

        Returns:
            The masked copy of ``spec``.
        """
        augmented = spec.copy()
        n_freq, n_time = spec.shape[0], spec.shape[1]

        # Mask widths are clamped to the axis size so spectrograms narrower
        # than max_freq / max_time no longer make randint raise.
        for _ in range(num_freq_mask):
            f0, f = self._random_span(n_freq, max_freq)
            augmented[f0:f0 + f, :] = mask_value

        for _ in range(num_time_mask):
            t0, t = self._random_span(n_time, max_time)
            augmented[:, t0:t0 + t] = mask_value

        return augmented

Best Practices

Always resample to a consistent sample rate (16 kHz for speech)
Use mel spectrograms as input for most audio classification tasks
Data augmentation is crucial – time stretching, pitch shifting, noise addition
Fine-tune pre-trained models for better performance with less data
Evaluate with WER (Word Error Rate) for speech recognition tasks