🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services · ℹ️ About · ✉️ Contact · View Pricing Plans from $10

Synthetic Data Generation

🟒 Free Lesson

Advertisement

Synthetic Data Generation

Synthetic Data Pipeline: Real Data, Analysis, Gen Model, Generation, Validation. Generation Methods: • GANs (Generative Adversarial Networks) • VAEs (Variational Autoencoders) • LLM-based Generation • Diffusion Models. Applications: • Training Data Augmentation • Privacy-Preserving Sharing • Edge Case Generation • Testing and Benchmarking

Why Synthetic Data?

Synthetic data addresses challenges like data scarcity, privacy regulations, class imbalance, and the need for diverse test scenarios. It enables ML development when real data is limited or sensitive.

Tabular Data Generation with GANs

import numpy as np
import pandas as pd
from typing import List, Dict
import torch
import torch.nn as nn

class TabularGAN:
    """Generator/discriminator pair of small fully connected networks.

    The generator maps a ``latent_dim``-sized noise vector to ``input_dim``
    outputs squashed through a sigmoid; the discriminator maps an
    ``input_dim``-sized row to a single sigmoid score.
    """

    def __init__(self, input_dim: int, latent_dim: int = 100):
        """Build both networks.

        Args:
            input_dim: Number of features in one data row.
            latent_dim: Size of the noise vector fed to the generator.
        """
        self.latent_dim = latent_dim

        # Generator: two (Linear -> ReLU -> BatchNorm) stages, then a
        # sigmoid-activated projection to the feature dimension.
        gen_layers = []
        for n_in, n_out in ((latent_dim, 128), (128, 256)):
            gen_layers += [nn.Linear(n_in, n_out), nn.ReLU(), nn.BatchNorm1d(n_out)]
        gen_layers += [nn.Linear(256, input_dim), nn.Sigmoid()]
        self.generator = nn.Sequential(*gen_layers)

        # Discriminator: two (Linear -> LeakyReLU -> Dropout) stages, then a
        # single sigmoid-activated output unit.
        disc_layers = []
        for n_in, n_out in ((input_dim, 256), (256, 128)):
            disc_layers += [nn.Linear(n_in, n_out), nn.LeakyReLU(0.2), nn.Dropout(0.3)]
        disc_layers += [nn.Linear(128, 1), nn.Sigmoid()]
        self.discriminator = nn.Sequential(*disc_layers)

    def generate_samples(self, n_samples: int) -> np.ndarray:
        """Draw ``n_samples`` rows from the generator as a NumPy array.

        The generator is switched to eval mode and run without gradient
        tracking on standard-normal noise.
        """
        self.generator.eval()
        with torch.no_grad():
            latent = torch.randn(n_samples, self.latent_dim)
            generated = self.generator(latent)
        return generated.numpy()

class SyntheticDataGenerator:
    """Wrap a TabularGAN so it can be fitted to and sampled as DataFrames.

    Only the numeric columns of the training frame are used; their names are
    remembered so generated samples come back with matching column labels.
    """

    def __init__(self):
        self.gan = None
        # Names of the numeric columns seen by fit(); generate() uses them
        # to label the columns of its output.
        self.feature_names = []

    def fit(self, data: pd.DataFrame, epochs: int = 100):
        """Set up a GAN sized to the numeric columns of ``data``.

        Args:
            data: Training frame; non-numeric columns are ignored.
            epochs: Accepted for API compatibility. NOTE: no training loop is
                run here, so this value is currently unused.

        Raises:
            ValueError: If ``data`` has no numeric columns.
        """
        numeric_data = data.select_dtypes(include=[np.number])
        n_features = numeric_data.shape[1]
        if n_features == 0:
            raise ValueError("No numeric columns to train on")

        # Remember the column names so generate() can label its output.
        self.feature_names = list(numeric_data.columns)
        self.gan = TabularGAN(input_dim=n_features)

        print(f"Training GAN on {n_features} features")

    def generate(self, n_samples: int) -> pd.DataFrame:
        """Return ``n_samples`` synthetic rows labelled with the fitted columns.

        Raises:
            ValueError: If fit() has not been called yet.
        """
        if self.gan is None:
            raise ValueError("Generator not trained")

        synthetic = self.gan.generate_samples(n_samples)
        return pd.DataFrame(synthetic, columns=self.feature_names)

# Example usage: fit on an existing DataFrame, then draw 1,000 synthetic rows.
# `training_data` is not defined in this snippet; fit() annotates it as a
# pandas DataFrame, so one must be in scope before this runs.
generator = SyntheticDataGenerator()
generator.fit(training_data, epochs=50)
synthetic_df = generator.generate(1000)

LLM-Based Text Generation

import openai
from typing import List, Dict
import json

class TextDataGenerator:
    """Generate and augment text training examples via the OpenAI chat API."""

    def __init__(self, api_key: str):
        """Create the API client used by every request.

        Args:
            api_key: OpenAI API key passed straight to ``openai.OpenAI``.
        """
        self.client = openai.OpenAI(api_key=api_key)

    def generate_training_examples(
        self,
        task_description: str,
        n_examples: int = 10,
        style: str = "professional"
    ) -> List[Dict]:
        """Ask the model for ``n_examples`` synthetic examples of a task.

        Args:
            task_description: Plain-language description of the task.
            n_examples: Number of examples requested in the prompt.
            style: Writing style requested in the system prompt.

        Returns:
            The list stored under the ``"examples"`` key of the model's JSON
            reply.

        Raises:
            json.JSONDecodeError: If the reply is not valid JSON.
            KeyError: If the reply has no ``"examples"`` key.
        """
        # JSON mode returns a single JSON *object*, so the prompt must ask for
        # an object with the "examples" key that is read below.
        system_prompt = (
            "Generate synthetic training data.\n"
            f"Style: {style}\n"
            'Return a JSON object with an "examples" key holding an array of '
            'objects with "input" and "output" fields.'
        )
        user_prompt = (
            f"Task: {task_description}\n"
            f"Generate {n_examples} diverse examples."
        )

        # NOTE(review): response_format={"type": "json_object"} is only
        # accepted by models that support JSON mode -- confirm "gpt-4" does.
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.8,
            response_format={"type": "json_object"}
        )

        payload = json.loads(response.choices[0].message.content)
        return payload["examples"]

    def augment_dataset(
        self,
        existing_data: List[Dict],
        augmentation_factor: int = 2
    ) -> List[Dict]:
        """Return one model-rephrased variant of every item in ``existing_data``.

        Args:
            existing_data: Examples to rephrase; each is sent as JSON text.
            augmentation_factor: NOTE(review): currently unused -- exactly one
                variant is produced per item regardless of this value.

        Returns:
            The rephrased items only (originals are not included), in input
            order.

        Raises:
            json.JSONDecodeError: If a reply is not valid JSON. The system
                prompt does not explicitly request JSON, so this can happen.
        """
        augmented = []

        for item in existing_data:
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Rephrase the following example while preserving meaning."},
                    {"role": "user", "content": json.dumps(item)}
                ],
                temperature=0.9
            )

            augmented.append(json.loads(response.choices[0].message.content))

        return augmented

# Example usage. "your-api-key" is a placeholder; supply a real OpenAI API key.
text_generator = TextDataGenerator(api_key="your-api-key")
# Request 20 examples; `style` is left at its default.
training_data = text_generator.generate_training_examples(
    task_description="Classify customer support tickets by urgency",
    n_examples=20
)

Privacy-Preserving Generation

import numpy as np
from typing import Tuple

class DifferentialPrivacyGenerator:
    """Add Laplace or Gaussian noise scaled by a privacy budget.

    ``epsilon`` divides the noise scale in both mechanisms, so a smaller
    epsilon means more noise. ``delta`` is used only by the Gaussian
    mechanism.
    """

    def __init__(self, epsilon: float = 1.0, delta: float = 1e-5):
        """Store the privacy parameters.

        Raises:
            ValueError: If ``epsilon`` is not positive or ``delta`` is not in
                the open interval (0, 1). Both would otherwise surface later
                as a division by zero or a NaN noise scale.
        """
        if not epsilon > 0:
            raise ValueError("epsilon must be positive")
        if not 0 < delta < 1:
            raise ValueError("delta must be in the open interval (0, 1)")
        self.epsilon = epsilon
        self.delta = delta

    def add_laplace_noise(self, data: np.ndarray, sensitivity: float) -> np.ndarray:
        """Return ``data`` plus Laplace noise of scale ``sensitivity / epsilon``.

        ``sensitivity`` may be a scalar or an array that broadcasts against
        ``data``.
        """
        scale = sensitivity / self.epsilon
        noise = np.random.laplace(0, scale, np.shape(data))
        return data + noise

    def add_gaussian_noise(self, data: np.ndarray, sensitivity: float) -> np.ndarray:
        """Return ``data`` plus zero-mean Gaussian noise.

        The standard deviation is
        ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.
        """
        sigma = sensitivity * np.sqrt(2 * np.log(1.25 / self.delta)) / self.epsilon
        noise = np.random.normal(0, sigma, np.shape(data))
        return data + noise

    def private_mean(self, data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Return a Laplace-noised column mean and the per-column value range.

        Args:
            data: Samples along axis 0.

        Returns:
            ``(noisy_mean, sensitivity)`` where ``sensitivity`` is
            ``max - min`` along axis 0 and the noise scale is
            ``sensitivity / len(data) / epsilon``. Both are arrays for 2-D
            input and NumPy scalars for 1-D input.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if len(data) == 0:
            raise ValueError("data must contain at least one row")

        true_mean = np.mean(data, axis=0)
        # NOTE(review): the range is measured from the data itself; a strict
        # differential-privacy analysis normally uses bounds known in advance.
        sensitivity = np.max(data, axis=0) - np.min(data, axis=0)
        noisy_mean = self.add_laplace_noise(true_mean, sensitivity / len(data))
        return noisy_mean, sensitivity

# Example usage: epsilon=0.5 is smaller than the class default of 1.0, which
# gives a larger Laplace scale (scale = sensitivity / epsilon), i.e. more noise.
dp_gen = DifferentialPrivacyGenerator(epsilon=0.5)
# `original_data` is not defined in this snippet; presumably a NumPy array,
# since add_laplace_noise reads its `.shape` -- confirm.
private_data = dp_gen.add_laplace_noise(original_data, sensitivity=1.0)

Synthetic Data Validation

from scipy import stats
from sklearn.metrics import pairwise_distances

class SyntheticDataValidator:
    """Compare a synthetic dataset against the real data it imitates."""

    def __init__(self, real_data, synthetic_data):
        """Store both datasets.

        distribution_comparison() reads ``.columns`` and indexes both by
        column name, so DataFrame-like inputs are expected.
        """
        self.real = real_data
        self.synthetic = synthetic_data

    def distribution_comparison(self) -> Dict:
        """Run a two-sample Kolmogorov-Smirnov test per numeric real column.

        Columns are selected by the dtype of the *real* data; every NumPy
        integer or float width qualifies (not only int64/float64).

        Returns:
            ``{column: {"ks_statistic", "ks_pvalue", "similar_distribution"}}``
            where ``similar_distribution`` is ``ks_pvalue > 0.05``.

        Raises:
            KeyError: If a numeric real column is missing from the synthetic
                data.
        """
        results = {}
        for col in self.real.columns:
            dtype = self.real[col].dtype
            # Restrict to plain NumPy int/uint/float dtypes of any width.
            if not (isinstance(dtype, np.dtype) and dtype.kind in ("i", "u", "f")):
                continue
            ks_stat, ks_pval = stats.ks_2samp(self.real[col], self.synthetic[col])
            results[col] = {
                "ks_statistic": ks_stat,
                "ks_pvalue": ks_pval,
                "similar_distribution": ks_pval > 0.05
            }
        return results

    @staticmethod
    def _nearest_neighbor_distances(data) -> np.ndarray:
        """Return each row's Euclidean distance to its closest *other* row."""
        distances = pairwise_distances(data, metric='euclidean')
        # The diagonal holds each row's distance to itself (always 0); mask it
        # so the row-wise minimum reflects a genuine neighbour.
        np.fill_diagonal(distances, np.inf)
        return distances.min(axis=1)

    def privacy_check(self) -> Dict:
        """Compare mean nearest-neighbour distances within each dataset.

        ``privacy_preserved`` is True when the synthetic mean exceeds half of
        the real mean. With a single-row dataset there is no other row, so
        that dataset's mean is ``inf``.

        NOTE(review): both distances are measured *within* a dataset; this
        does not measure how close synthetic rows are to real rows.
        """
        real_nn_mean = np.mean(self._nearest_neighbor_distances(self.real))
        synthetic_nn_mean = np.mean(self._nearest_neighbor_distances(self.synthetic))

        return {
            "real_nearest_neighbor_mean": real_nn_mean,
            "synthetic_nearest_neighbor_mean": synthetic_nn_mean,
            "privacy_preserved": synthetic_nn_mean > real_nn_mean * 0.5
        }

# Example usage: validate a synthetic DataFrame against the real one.
# `real_df` is not defined in this snippet and must be supplied by the caller.
validator = SyntheticDataValidator(real_df, synthetic_df)
# Per-column Kolmogorov-Smirnov comparison of numeric columns.
dist_results = validator.distribution_comparison()
# Nearest-neighbour distance summary for both datasets.
privacy_results = validator.privacy_check()

Best Practices

  • Validate synthetic data against real data distributions
  • Ensure privacy guarantees meet regulatory requirements
  • Use multiple generation methods for diversity
  • Test model performance on both real and synthetic data
  • Document generation process and limitations
  • Monitor for bias amplification
⭐

Premium Content

Synthetic Data Generation

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯 End-to-end Projects
💼 Interview Prep
📜 Certificates
🤝 Community Access

Already a member? Log in

Need Expert Generative AI Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement