Synthetic Data Generation
Why Synthetic Data?
Synthetic data addresses challenges like data scarcity, privacy regulations, class imbalance, and the need for diverse test scenarios. It enables ML development when real data is limited or sensitive.
Tabular Data Generation with GANs
import numpy as np
import pandas as pd
from typing import List, Dict
import torch
import torch.nn as nn
class TabularGAN:
    """Pair of small fully-connected networks for tabular rows.

    ``generator`` maps a ``latent_dim``-wide noise vector to ``input_dim``
    values passed through a sigmoid. ``discriminator`` maps an
    ``input_dim``-wide row to a single sigmoid score.
    """

    def __init__(self, input_dim: int, latent_dim: int = 100):
        self.latent_dim = latent_dim

        # Noise -> 128 -> 256 -> input_dim, with batch norm after each
        # hidden activation.
        generator_layers = [
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Linear(256, input_dim),
            nn.Sigmoid(),
        ]
        self.generator = nn.Sequential(*generator_layers)

        # Row -> 256 -> 128 -> 1, with leaky ReLU and dropout on the
        # hidden layers.
        discriminator_layers = [
            nn.Linear(input_dim, 256),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.discriminator = nn.Sequential(*discriminator_layers)

    def generate_samples(self, n_samples: int) -> np.ndarray:
        """Return ``n_samples`` generator outputs as a NumPy array.

        The generator is switched to eval mode and is left in that mode
        when the method returns.
        """
        self.generator.eval()
        with torch.no_grad():
            latent = torch.randn(n_samples, self.latent_dim)
            return self.generator(latent).numpy()
class SyntheticDataGenerator:
    """Thin wrapper that sizes a ``TabularGAN`` from a DataFrame and samples it.

    Attributes are ``None`` until ``fit`` has been called:
        gan: the ``TabularGAN`` sized to the numeric columns of the data.
        feature_names: names of the numeric columns, in DataFrame order,
            used to label generated samples.
    """

    def __init__(self):
        self.gan = None
        self.feature_names = None

    def fit(self, data: pd.DataFrame, epochs: int = 100):
        """Build the GAN for the numeric columns of ``data``.

        Args:
            data: Source frame; only numeric columns are used.
            epochs: Accepted for API compatibility. This method does not
                run a training loop, so the value is currently unused.

        Raises:
            ValueError: If ``data`` has no numeric columns.
        """
        numeric_data = data.select_dtypes(include=[np.number])
        n_features = numeric_data.shape[1]
        if n_features == 0:
            raise ValueError("data contains no numeric columns to model")

        gan = TabularGAN(input_dim=n_features)
        # Assign both attributes only after construction succeeds so a
        # failure cannot leave the object half-initialised.
        self.gan = gan
        # Remember the column names: generate() needs them to label output.
        self.feature_names = list(numeric_data.columns)
        print(f"Training GAN on {n_features} features")

    def generate(self, n_samples: int) -> pd.DataFrame:
        """Sample ``n_samples`` rows and return them as a labelled DataFrame.

        Raises:
            ValueError: If ``fit`` has not been called.
        """
        if self.gan is None:
            raise ValueError("Generator not trained")
        synthetic = self.gan.generate_samples(n_samples)
        return pd.DataFrame(synthetic, columns=self.feature_names)
# Example usage: build a generator, fit it, then request 1000 synthetic rows.
# NOTE(review): `training_data` is not defined before this point in the
# visible file; fit() calls `select_dtypes` on it, so it is presumably a
# pandas DataFrame supplied by the caller. Confirm.
generator = SyntheticDataGenerator()
generator.fit(training_data, epochs=50)
synthetic_df = generator.generate(1000)
LLM-Based Text Generation
import openai
from typing import List, Dict
import json
class TextDataGenerator:
    """Generate and augment text training examples via the OpenAI chat API."""

    def __init__(self, api_key: str):
        """Create the API client used by every method of this class."""
        self.client = openai.OpenAI(api_key=api_key)

    def generate_training_examples(
        self,
        task_description: str,
        n_examples: int = 10,
        style: str = "professional",
        model: str = "gpt-4",
    ) -> List[Dict]:
        """Ask the model for ``n_examples`` synthetic input/output pairs.

        Args:
            task_description: What the examples should demonstrate.
            n_examples: Number of examples requested in the prompt.
            style: Writing style requested in the system prompt.
            model: Chat model name.

        Returns:
            The list stored under the ``"examples"`` key of the model's
            JSON reply.

        Raises:
            json.JSONDecodeError: If the reply is not valid JSON.
            KeyError: If the reply has no ``"examples"`` key.
        """
        # The request uses JSON-object mode, so the prompt must ask for an
        # object and name the key that is read back below.
        system_prompt = (
            "Generate synthetic training data.\n"
            f"Style: {style}\n"
            'Return a JSON object with an "examples" key whose value is an '
            'array of objects with "input" and "output" fields.'
        )
        user_prompt = (
            f"Task: {task_description}\n"
            f"Generate {n_examples} diverse examples."
        )
        # NOTE(review): `response_format` is only accepted by models that
        # support JSON mode; confirm the default model does, or pass `model=`.
        response = self.client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.8,
            response_format={"type": "json_object"},
        )
        payload = json.loads(response.choices[0].message.content)
        return payload["examples"]

    def augment_dataset(
        self,
        existing_data: List[Dict],
        augmentation_factor: int = 2,
        model: str = "gpt-3.5-turbo",
    ) -> List[Dict]:
        """Produce rephrased variants of each item in ``existing_data``.

        Args:
            existing_data: Examples to rephrase; each is sent as JSON.
            augmentation_factor: Number of rephrasings requested per item.
            model: Chat model name.

        Returns:
            Only the newly generated items (the originals are not included),
            ``augmentation_factor`` per input item, in input order.

        Raises:
            ValueError: If ``augmentation_factor`` is less than 1.
            json.JSONDecodeError: If a reply is not valid JSON.
        """
        if augmentation_factor < 1:
            raise ValueError("augmentation_factor must be >= 1")

        augmented: List[Dict] = []
        for item in existing_data:
            response = self.client.chat.completions.create(
                model=model,
                messages=[
                    {
                        "role": "system",
                        # The reply is parsed with json.loads, so the prompt
                        # has to request JSON explicitly.
                        "content": (
                            "Rephrase the following example while preserving "
                            "meaning. Return only a JSON object with the "
                            "same keys."
                        ),
                    },
                    {"role": "user", "content": json.dumps(item)},
                ],
                temperature=0.9,
                # One request yields all variants for this item.
                n=augmentation_factor,
            )
            augmented.extend(
                json.loads(choice.message.content)
                for choice in response.choices
            )
        return augmented
# Example usage: request 20 synthetic examples for a ticket-urgency task.
# NOTE(review): "your-api-key" looks like a placeholder; presumably the real
# key should come from configuration rather than source. Confirm.
text_generator = TextDataGenerator(api_key="your-api-key")
training_data = text_generator.generate_training_examples(
task_description="Classify customer support tickets by urgency",
n_examples=20
)
Privacy-Preserving Generation
import numpy as np
from typing import Tuple
class DifferentialPrivacyGenerator:
    """Add Laplace or Gaussian noise scaled by a privacy budget.

    Args:
        epsilon: Privacy budget; smaller values add more noise.
        delta: Failure probability used by the Gaussian noise scale.

    Raises:
        ValueError: If ``epsilon`` is not positive or ``delta`` is not
            strictly between 0 and 1.
    """

    def __init__(self, epsilon: float = 1.0, delta: float = 1e-5):
        # Both values are divisors or log arguments below; reject bad ones
        # here rather than failing later inside NumPy. `not x > 0` also
        # rejects NaN.
        if not epsilon > 0:
            raise ValueError("epsilon must be positive")
        if not 0 < delta < 1:
            raise ValueError("delta must be strictly between 0 and 1")
        self.epsilon = epsilon
        self.delta = delta

    def add_laplace_noise(self, data: np.ndarray, sensitivity: float) -> np.ndarray:
        """Return ``data`` plus Laplace noise of scale ``sensitivity / epsilon``.

        ``data`` may be any array-like; ``sensitivity`` may be a scalar or an
        array that broadcasts against ``data``.
        """
        values = np.asarray(data)
        scale = np.asarray(sensitivity) / self.epsilon
        noise = np.random.laplace(0, scale, values.shape)
        return values + noise

    def add_gaussian_noise(self, data: np.ndarray, sensitivity: float) -> np.ndarray:
        """Return ``data`` plus zero-mean Gaussian noise.

        The standard deviation is
        ``sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon``.
        """
        values = np.asarray(data)
        sigma = (
            np.asarray(sensitivity)
            * np.sqrt(2 * np.log(1.25 / self.delta))
            / self.epsilon
        )
        noise = np.random.normal(0, sigma, values.shape)
        return values + noise

    def private_mean(self, data: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Return a Laplace-noised mean over axis 0 and the sensitivity used.

        Args:
            data: Array-like whose first axis indexes records.

        Returns:
            ``(noisy_mean, sensitivity)`` where ``sensitivity`` is the
            observed per-column range ``max - min``.

        Raises:
            ValueError: If ``data`` is empty.
        """
        values = np.atleast_1d(np.asarray(data))
        if values.size == 0:
            raise ValueError("data must contain at least one record")

        true_mean = np.mean(values, axis=0)
        # NOTE(review): the range is measured from the data itself;
        # presumably a formal guarantee would need bounds fixed in advance.
        # Confirm against the privacy requirements.
        sensitivity = np.max(values, axis=0) - np.min(values, axis=0)
        # The noise scale uses the range divided by the record count.
        noisy_mean = self.add_laplace_noise(true_mean, sensitivity / values.shape[0])
        return noisy_mean, sensitivity
# Example usage: perturb a dataset with Laplace noise at epsilon = 0.5.
# NOTE(review): `original_data` is not defined in the visible file;
# add_laplace_noise reads `.shape` from it, so it is presumably a NumPy
# array supplied by the caller. Confirm.
dp_gen = DifferentialPrivacyGenerator(epsilon=0.5)
private_data = dp_gen.add_laplace_noise(original_data, sensitivity=1.0)
Synthetic Data Validation
from scipy import stats
from sklearn.metrics import pairwise_distances
class SyntheticDataValidator:
    """Compare a synthetic dataset against the real one it was modelled on.

    Args:
        real_data: Real records; ``distribution_comparison`` reads
            ``.columns`` and indexes by column name.
        synthetic_data: Synthetic records with the same layout.
    """

    def __init__(self, real_data, synthetic_data):
        self.real = real_data
        self.synthetic = synthetic_data

    @staticmethod
    def _is_comparable(column) -> bool:
        """Return True for numeric, non-boolean columns of any width."""
        return pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column)

    @staticmethod
    def _nearest_neighbor_distances(records) -> np.ndarray:
        """Return each record's distance to its closest *other* record."""
        distances = np.array(
            pairwise_distances(records, metric='euclidean'), dtype=float
        )
        if distances.shape[0] < 2:
            raise ValueError("need at least two records to compute nearest neighbors")
        # A record's distance to itself is 0; mask the diagonal so the
        # row-wise minimum is the true nearest neighbour, not the record.
        np.fill_diagonal(distances, np.inf)
        return distances.min(axis=1)

    def distribution_comparison(self) -> Dict:
        """Run a two-sample KS test on every shared numeric column.

        Columns that are missing from the synthetic data, non-numeric in
        either dataset, or empty after dropping NaNs are skipped.

        Returns:
            Mapping of column name to ``ks_statistic``, ``ks_pvalue`` and
            ``similar_distribution`` (True when the p-value exceeds 0.05).
        """
        results = {}
        for col in self.real.columns:
            if col not in self.synthetic.columns:
                continue
            real_col = self.real[col]
            synthetic_col = self.synthetic[col]
            if not (self._is_comparable(real_col) and self._is_comparable(synthetic_col)):
                continue

            real_values = real_col.dropna()
            synthetic_values = synthetic_col.dropna()
            if real_values.empty or synthetic_values.empty:
                continue

            ks_stat, ks_pval = stats.ks_2samp(real_values, synthetic_values)
            results[col] = {
                "ks_statistic": ks_stat,
                "ks_pvalue": ks_pval,
                "similar_distribution": bool(ks_pval > 0.05),
            }
        return results

    def privacy_check(self) -> Dict:
        """Compare mean nearest-neighbour distances within each dataset.

        Returns:
            ``real_nearest_neighbor_mean``, ``synthetic_nearest_neighbor_mean``
            and ``privacy_preserved`` (True when the synthetic mean exceeds
            half of the real mean).

        Raises:
            ValueError: If either dataset has fewer than two records.
        """
        # NOTE(review): both distances are measured within a dataset;
        # presumably a synthetic-to-real distance would detect copied
        # records more directly. Confirm the intended metric.
        real_mean = float(np.mean(self._nearest_neighbor_distances(self.real)))
        synthetic_mean = float(
            np.mean(self._nearest_neighbor_distances(self.synthetic))
        )
        return {
            "real_nearest_neighbor_mean": real_mean,
            "synthetic_nearest_neighbor_mean": synthetic_mean,
            "privacy_preserved": bool(synthetic_mean > real_mean * 0.5),
        }
# Example usage: run both validation checks on a real/synthetic pair.
# NOTE(review): `real_df` is not defined in the visible file; presumably a
# DataFrame with the same columns as `synthetic_df`. Confirm.
validator = SyntheticDataValidator(real_df, synthetic_df)
dist_results = validator.distribution_comparison()
privacy_results = validator.privacy_check()
Best Practices
- Validate synthetic data against real data distributions
- Ensure privacy guarantees meet regulatory requirements
- Use multiple generation methods for diversity
- Test model performance on both real and synthetic data
- Document generation process and limitations
- Monitor for bias amplification