Tabular Deep Learning
Tabular data is everywhere. Learn when deep learning beats gradient boosting and how to apply modern architectures like TabNet and FT-Transformer.
When to Use DL for Tabular
TabNet: Attentive Interpretable Tabular Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
class AttentiveTransformer(nn.Module):
    """Turn a hidden representation into a per-feature attention mask.

    A linear layer followed by batch normalisation projects the input to
    ``output_dim`` scores. The scores are converted into a non-negative
    mask whose entries sum to (approximately) one per row, and the mask is
    multiplied element-wise with ``processed_features``.

    Args:
        input_dim: Width of the tensor passed to ``forward`` as ``x``.
        output_dim: Width of the produced mask; it must broadcast against
            ``processed_features``.
        relaxation_factor: Divisor applied to the scores before masking.
    """

    def __init__(self, input_dim, output_dim, relaxation_factor=1.5):
        super().__init__()
        self.fc = nn.Linear(input_dim, output_dim)
        self.bn = nn.BatchNorm1d(output_dim)
        self.relaxation_factor = relaxation_factor

    def forward(self, x, processed_features):
        """Return the attention mask computed from ``x`` times ``processed_features``."""
        scores = self.bn(self.fc(x))
        mask = self._sparsemax(scores, self.relaxation_factor)
        return mask * processed_features

    def _sparsemax(self, x, relaxation_factor):
        """Sparse normalisation used for feature selection.

        NOTE: this is a ReLU-then-renormalise stand-in rather than the exact
        sparsemax projection: negative scores are clipped to zero and the
        remaining ones are divided by their row sum.
        """
        clipped = F.relu(x / relaxation_factor)
        # The epsilon keeps the division finite when every score in a row
        # was clipped to zero (the resulting mask row is then all zeros).
        row_total = clipped.sum(dim=-1, keepdim=True) + 1e-8
        return clipped / row_total
class FeatureTransformer(nn.Module):
    """Linear -> BatchNorm -> ReLU block, optionally preceded by a shared module.

    Args:
        input_dim: Width of the tensor passed to ``forward``.
        output_dim: Width of the returned tensor.
        shared: Optional module applied to the input before this block's own
            layers (for example, layers shared between decision steps).

    When ``shared`` is given, the block's own linear layer is sized from the
    shared module's output width rather than from ``input_dim``. Sizing it
    from ``input_dim`` made ``forward`` fail with a shape mismatch whenever
    the shared module changed the feature width.
    """

    def __init__(self, input_dim, output_dim, shared=None):
        super().__init__()
        self.shared = shared
        fc_in = input_dim if shared is None else self._infer_out_features(shared, input_dim)
        self.fc = nn.Linear(fc_in, output_dim)
        self.bn = nn.BatchNorm1d(output_dim)

    @staticmethod
    def _infer_out_features(module, default):
        """Return the ``out_features`` of the last sub-module that declares it.

        Heuristic: sub-modules are visited in registration order, which for
        an ``nn.Sequential`` is also data-flow order. Falls back to
        ``default`` when no sub-module exposes an integer ``out_features``.
        """
        width = default
        for sub in module.modules():
            out_features = getattr(sub, "out_features", None)
            if isinstance(out_features, int):
                width = out_features
        return width

    def forward(self, x):
        """Apply the optional shared module, then Linear -> BatchNorm -> ReLU."""
        if self.shared is not None:
            x = self.shared(x)
        return F.relu(self.bn(self.fc(x)))
class TabNet(nn.Module):
    """Simplified TabNet-style model with sequential attentive decision steps.

    Each step masks the raw input with the current attention, transforms it
    to ``hidden_dim`` features, derives the next attention mask from those
    features, and adds the features to a running aggregate. A final linear
    layer maps the aggregate to ``output_dim`` outputs.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of outputs (e.g. class logits).
        n_steps: Number of decision steps.
        relaxation_factor: Forwarded to every ``AttentiveTransformer``.
        hidden_dim: Width of the per-step transformed representation.
        n_independent: Accepted for interface compatibility; currently unused.
        n_shared: The first ``n_shared`` steps run the shared input layers
            before their own step-specific layers.
    """

    def __init__(self, input_dim, output_dim, n_steps=3, relaxation_factor=1.5,
                 hidden_dim=64, n_independent=2, n_shared=2):
        super().__init__()
        self.n_steps = n_steps
        self.hidden_dim = hidden_dim

        # Shared feature transformer: maps input_dim -> hidden_dim.
        shared_layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
        )

        # Step-specific transformers. A step that uses the shared layers
        # receives hidden_dim-wide activations, so its own linear layer must
        # be sized with hidden_dim; sizing it with input_dim raised a shape
        # mismatch whenever input_dim != hidden_dim.
        self.feature_transformers = nn.ModuleList([
            FeatureTransformer(hidden_dim, hidden_dim, shared_layers)
            if i < n_shared
            else FeatureTransformer(input_dim, hidden_dim)
            for i in range(n_steps)
        ])

        # One attention module per step, mapping hidden features back to a
        # mask over the input features.
        self.attention_maps = nn.ModuleList([
            AttentiveTransformer(hidden_dim, input_dim, relaxation_factor)
            for _ in range(n_steps)
        ])

        # Final output
        self.final_fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run all decision steps on ``x`` and return the final projection."""
        # Allocate with x's dtype/device so non-float32 inputs do not hit a
        # dtype mismatch when the step outputs are accumulated.
        aggregated = x.new_zeros(x.size(0), self.hidden_dim)
        attention = torch.ones_like(x)

        for step in range(self.n_steps):
            # Transform the attention-masked input features.
            transformed = self.feature_transformers[step](x * attention)
            # The new mask is scaled by the previous one inside the module.
            attention = self.attention_maps[step](transformed, attention)
            # Out-of-place accumulation keeps the autograd graph free of
            # in-place modifications.
            aggregated = aggregated + transformed

        return self.final_fc(aggregated)
# Training TabNet
def train_tabnet(model, train_loader, val_loader, epochs=100,
                 save_path='best_tabnet.pth'):
    """Train a classifier with Adam + cosine annealing, checkpointing the best epoch.

    Args:
        model: Module mapping a feature batch to class logits.
        train_loader: Iterable of ``(X_batch, y_batch)`` training pairs.
        val_loader: Iterable of ``(X_batch, y_batch)`` validation pairs.
        epochs: Number of passes over ``train_loader``.
        save_path: Where the ``state_dict`` of the best model is written.

    Returns:
        The best validation accuracy observed (``0.0`` if validation never
        produced a positive accuracy, e.g. because ``val_loader`` is empty).

    Side effects:
        Writes ``save_path`` whenever validation accuracy improves and prints
        a progress line every 10 epochs.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    criterion = nn.CrossEntropyLoss()
    best_val_acc = 0.0

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        n_batches = 0  # counted manually: works for loaders without __len__
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            n_batches += 1
        scheduler.step()

        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                predicted = model(X_batch).argmax(dim=1)
                total += y_batch.size(0)
                correct += (predicted == y_batch).sum().item()
        # An empty validation loader must not abort training with a
        # ZeroDivisionError; treat it as "no accuracy available".
        val_acc = correct / total if total else 0.0

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), save_path)

        if (epoch + 1) % 10 == 0:
            mean_loss = train_loss / max(n_batches, 1)
            print(f"Epoch {epoch+1}: Loss={mean_loss:.4f}, Val Acc={val_acc:.4f}")

    return best_val_acc
FT-Transformer (Feature Tokenizer + Transformer)
import torch
import torch.nn as nn
class FeatureTokenizer(nn.Module):
    """Turn each numeric feature into one ``embed_dim``-wide token.

    Every feature owns a single learned embedding vector. The token for a
    feature is that vector plus the feature's value times 0.1, added to every
    embedding dimension.

    Args:
        num_features: Number of leading columns of the input to tokenize.
        embed_dim: Width of each token.
    """

    def __init__(self, num_features, embed_dim):
        super().__init__()
        # One single-row embedding table per feature (kept as a ModuleList so
        # parameter names stay ``embeddings.<i>.weight``).
        self.embeddings = nn.ModuleList([
            nn.Embedding(1, embed_dim) for _ in range(num_features)
        ])
        self.num_features = num_features

    def forward(self, x):
        """Return tokens of shape ``(batch, num_features, embed_dim)``.

        ``x`` is indexed as ``(batch, features)``; only the first
        ``num_features`` columns are used.
        """
        values = x[:, :self.num_features]
        # Each table has exactly one row, so its weight IS the feature's
        # embedding. Reading it directly avoids the lookup, which previously
        # failed because it was fed a float index tensor (embedding indices
        # must be integer typed).
        base = torch.cat([emb.weight for emb in self.embeddings], dim=0)
        # (1, F, D) + (B, F, 1) -> (B, F, D); 0.1 is the simple value scaling.
        return base.unsqueeze(0) + values.unsqueeze(-1) * 0.1
class FTTransformer(nn.Module):
    """Feature-tokenizer + Transformer encoder classifier.

    Features are tokenized, a learned CLS token is prepended, learned
    positional embeddings are added, and the sequence is passed through a
    Transformer encoder. The layer-normalised CLS position feeds a linear
    classification head.

    Args:
        num_features: Number of input features (one token each).
        num_classes: Number of output logits.
        embed_dim: Token width, used as the encoder's ``d_model``.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout rate inside the encoder layers.
    """

    def __init__(self, num_features, num_classes, embed_dim=192,
                 num_heads=8, num_layers=6, dropout=0.1):
        super().__init__()
        self.tokenizer = FeatureTokenizer(num_features, embed_dim)

        # Learned CLS token and one positional vector per sequence slot
        # (CLS + every feature token).
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.randn(1, num_features + 1, embed_dim))

        # Transformer encoder; the feed-forward width is 4x the token width.
        layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim * 4,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return class logits computed from the CLS position."""
        feature_tokens = self.tokenizer(x)
        # Broadcast the single CLS token across the batch and put it first.
        cls = self.cls_token.expand(x.size(0), -1, -1)
        sequence = torch.cat([cls, feature_tokens], dim=1) + self.pos_embed
        encoded = self.norm(self.transformer(sequence))
        return self.head(encoded[:, 0])
Wide & Deep Learning
import torch
import torch.nn as nn
class WideAndDeep(nn.Module):
    """Wide & Deep model: a linear path plus an embedding + MLP path.

    The wide path is a single linear layer over the continuous features.
    The deep path embeds each categorical feature, concatenates the
    embeddings with the continuous features and feeds them through an MLP.
    The two paths' outputs are summed.

    Args:
        num_features: Number of continuous features.
        num_categories_per_feature: Iterable with the cardinality of each
            categorical feature.
        embed_dim: Embedding width used for every categorical feature.
        hidden_dims: Widths of the MLP's hidden layers, in order.
        num_classes: Number of outputs of both paths.
        dropout: Dropout rate after every hidden layer.
    """

    def __init__(self, num_features, num_categories_per_feature,
                 embed_dim=16, hidden_dims=(256, 128, 64), num_classes=2,
                 dropout=0.3):
        super().__init__()
        # Materialise once so any iterable (not only sequences) is accepted.
        cardinalities = list(num_categories_per_feature)

        # Wide component (linear)
        self.wide = nn.Linear(num_features, num_classes)

        # Deep component (embeddings + MLP). Each table has one row more
        # than the declared cardinality — presumably a spare index for
        # unknown/padding values; confirm against the data encoding.
        self.embeddings = nn.ModuleList([
            nn.Embedding(cardinality + 1, embed_dim) for cardinality in cardinalities
        ])

        width = num_features + len(cardinalities) * embed_dim
        layers = []
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(width, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            width = hidden_dim
        layers.append(nn.Linear(width, num_classes))
        self.deep = nn.Sequential(*layers)

    def forward(self, x_continuous, x_categorical):
        """Return ``wide(x_continuous) + deep([x_continuous, embeddings])``.

        ``x_categorical`` is indexed as ``(batch, feature)``; column ``i`` is
        looked up in the ``i``-th embedding table.
        """
        wide_out = self.wide(x_continuous)

        embedded = [
            table(x_categorical[:, i]) for i, table in enumerate(self.embeddings)
        ]
        deep_out = self.deep(torch.cat([x_continuous, *embedded], dim=1))

        return wide_out + deep_out
# Usage example: 10 continuous inputs plus four categorical inputs.
num_continuous = 10
# Cardinality (number of distinct categories) of each categorical feature.
num_categorical_features = [100, 50, 200, 30]
model = WideAndDeep(
    num_features=num_continuous,
    num_categories_per_feature=num_categorical_features,
)
Model Comparison
import time
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
def benchmark_models(X_train, X_test, y_train, y_test, cat_features):
    """Compare gradient boosting (LightGBM) with deep learning (TabNet).

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels; the number of distinct values sets the
            number of classes for LightGBM.
        y_test: Test labels.
        cat_features: Accepted for interface compatibility; currently not
            forwarded to either model.

    Returns:
        Dict keyed by model name (``'LightGBM'``, ``'TabNet'``), each holding
        ``'accuracy'``, weighted ``'f1'`` and ``'train_time'`` in seconds.
    """
    results = {}

    # LightGBM (baseline)
    import lightgbm as lgb

    start = time.time()
    train_data = lgb.Dataset(X_train, label=y_train)
    params = {'objective': 'multiclass', 'num_class': len(np.unique(y_train)), 'verbose': -1}
    model = lgb.train(params, train_data, num_boost_round=100)
    lgb_time = time.time() - start

    # The multiclass objective predicts one probability per class.
    lgb_pred = model.predict(X_test).argmax(axis=1)
    results['LightGBM'] = {
        'accuracy': accuracy_score(y_test, lgb_pred),
        'f1': f1_score(y_test, lgb_pred, average='weighted'),
        'train_time': lgb_time
    }

    # TabNet
    import torch
    from pytorch_tabnet.tab_model import TabNetClassifier

    start = time.time()
    tabnet = TabNetClassifier(
        n_d=64, n_a=64,
        n_steps=3,
        optimizer_params=dict(lr=2e-2),
        # scheduler_params only take effect together with scheduler_fn;
        # without it no learning-rate schedule is applied.
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        scheduler_params={"step_size": 50, "gamma": 0.9},
        mask_type='entmax'
    )
    # NOTE(review): the test split doubles as the early-stopping eval set,
    # so the TabNet test metrics are optimistically biased; pass a separate
    # validation split if one is available.
    tabnet.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        max_epochs=100,
        patience=10
    )
    tabnet_time = time.time() - start

    tabnet_pred = tabnet.predict(X_test)
    results['TabNet'] = {
        'accuracy': accuracy_score(y_test, tabnet_pred),
        'f1': f1_score(y_test, tabnet_pred, average='weighted'),
        'train_time': tabnet_time
    }

    return results
Best Practices
- Start with LightGBM — it's the baseline for tabular data
- Use DL when you have 100K+ samples and complex feature interactions
- Feature tokenization is key for transformer-based tabular models
- Embed categorical features rather than one-hot encoding for DL
- Ensemble gradient boosting with DL for best performance