AI for Data Science
AI assistants for data science automate code generation, analysis planning, and insight discovery, significantly accelerating the data science workflow.
AI Code Generation for Data Analysis
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
class DataScienceAssistant:
    """LLM-backed helper that drafts, visualizes and explains pandas code.

    Each analysis snippet produced by ``generate_analysis_code`` is recorded
    in ``code_history`` as ``{"question": ..., "code": ...}``.
    """

    _ANALYSIS_TEMPLATE = (
        "Generate Python pandas code to answer this question about a DataFrame.\n"
        "DataFrame info: {df_info}\n"
        "Question: {question}\n"
        "Return only executable Python code, no explanations:"
    )
    _VISUALIZATION_TEMPLATE = (
        "Generate Python code to create a {chart_type} visualization.\n"
        "Data description: {data_description}\n"
        "Use matplotlib and seaborn. Return only executable code:"
    )
    _EXPLAIN_TEMPLATE = (
        "Explain this Python code step by step:\n"
        "{code}\n"
        "Provide a clear, concise explanation:"
    )
    _FEATURES_TEMPLATE = (
        "Suggest feature engineering techniques for this dataset:\n"
        "DataFrame info: {df_info}\n"
        "Target variable: {target}\n"
        "Provide specific code suggestions:"
    )

    def __init__(self, llm=None):
        """Create the assistant.

        Args:
            llm: Chat model to use. When omitted, a ``ChatOpenAI`` instance
                with ``model="gpt-4"`` and ``temperature=0`` is created.
        """
        if llm is None:
            llm = ChatOpenAI(model="gpt-4", temperature=0)
        self.llm = llm
        self.code_history = []

    def _ask(self, template: str, **variables) -> str:
        """Fill ``template`` with ``variables``, run it and return the reply text."""
        chain = ChatPromptTemplate.from_template(template) | self.llm
        return chain.invoke(variables).content

    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """Return ``text`` stripped of whitespace and of a wrapping Markdown fence.

        Chat models frequently wrap code in a fenced block even when asked
        for bare code; the fence lines are not valid Python, so they are
        removed before the code is handed back to the caller.
        """
        cleaned = text.strip()
        lines = cleaned.splitlines()
        if len(lines) < 2 or not lines[0].startswith("```"):
            return cleaned
        body = lines[1:]  # drop the opening fence and its optional language tag
        if body[-1].strip() == "```":
            body = body[:-1]
        return "\n".join(body).strip()

    def generate_analysis_code(self, question: str, df_info: str) -> str:
        """Generate pandas code answering ``question`` and log it in the history.

        Args:
            question: Natural-language question about the DataFrame.
            df_info: Textual description of the DataFrame given to the model.

        Returns:
            The generated code with surrounding whitespace and any wrapping
            Markdown fence removed.
        """
        reply = self._ask(
            self._ANALYSIS_TEMPLATE, question=question, df_info=df_info
        )
        code = self._strip_code_fences(reply)
        self.code_history.append({"question": question, "code": code})
        return code

    def generate_visualization(self, data_description: str, chart_type: str = "auto") -> str:
        """Generate matplotlib/seaborn plotting code for the described data.

        Args:
            data_description: Textual description of the data to plot.
            chart_type: Chart type inserted into the prompt.

        Returns:
            The generated code with surrounding whitespace and any wrapping
            Markdown fence removed.
        """
        reply = self._ask(
            self._VISUALIZATION_TEMPLATE,
            data_description=data_description,
            chart_type=chart_type,
        )
        return self._strip_code_fences(reply)

    def explain_code(self, code: str) -> str:
        """Return the model's step-by-step explanation of ``code``."""
        return self._ask(self._EXPLAIN_TEMPLATE, code=code)

    def suggest_features(self, df_info: str, target: str) -> str:
        """Return the model's feature-engineering suggestions for ``target``."""
        return self._ask(self._FEATURES_TEMPLATE, df_info=df_info, target=target)
# Usage: ask the assistant for pandas code that answers a question about a CSV file.
assistant = DataScienceAssistant()
df = pd.read_csv("sales_data.csv")
# Describe the frame to the model through its column dtypes and summary
# statistics instead of sending the raw rows.
df_info = str(df.dtypes) + "\n" + str(df.describe())
code = assistant.generate_analysis_code(
"What are the top 10 products by revenue?",
df_info
)
# NOTE(review): exec() runs model-generated code with full access to the
# current namespace (including `df`) and to the host process. The output is
# not validated or sandboxed here -- inspect `code` before running it, and
# only execute it in a trusted or isolated environment.
exec(code)
Automated Insight Discovery
from dataclasses import dataclass
from typing import List
import numpy as np
@dataclass
class Insight:
    """One finding produced by automated analysis of a dataset."""

    # Kind of finding; InsightDiscovery in this file uses "statistical".
    category: str
    # Short headline for the finding.
    title: str
    # Full text of the finding.
    description: str
    # Confidence score; the discovery prompt asks for values in the 0-1 range.
    confidence: float
    # Code associated with the finding (presumably to reproduce it -- TODO
    # confirm); may be an empty string.
    code: str
class InsightDiscovery:
    """Use an LLM to surface insights, explain anomalies and draft reports.

    ``llm`` must expose ``invoke(prompt)`` returning an object with a
    ``content`` attribute holding the reply text.
    """

    def __init__(self, llm):
        self.llm = llm

    def discover_insights(self, df: pd.DataFrame) -> List["Insight"]:
        """Ask the model for insights based on summary statistics and correlations.

        Every non-empty line of the reply becomes one ``Insight`` whose
        description is the stripped line and whose title is its first 50
        characters. The reply is not parsed further: ``category`` is always
        "statistical" and ``confidence`` is a fixed placeholder of 0.8.

        Args:
            df: Data to analyze.

        Returns:
            One ``Insight`` per non-empty reply line, in reply order.
        """
        summary_stats = df.describe().to_string()
        correlations = df.corr(numeric_only=True).to_string()
        prompt = (
            "Analyze this data summary and find key insights:\n"
            "Summary Statistics:\n"
            f"{summary_stats}\n"
            "Correlations:\n"
            f"{correlations}\n"
            "List 5 important insights with titles, descriptions, and confidence (0-1):"
        )
        response = self.llm.invoke(prompt).content
        stripped_lines = (line.strip() for line in response.splitlines())
        return [
            Insight(
                category="statistical",
                title=text[:50],
                description=text,
                confidence=0.8,
                code="",
            )
            for text in stripped_lines
            if text
        ]

    def detect_anomalies(
        self, df: pd.DataFrame, column: str, threshold: float = 3.0
    ) -> List[dict]:
        """Flag rows far from the column mean and ask the model to explain them.

        A row is anomalous when its value differs from the column mean by
        more than ``threshold`` standard deviations.

        Args:
            df: Data to inspect.
            column: Name of the numeric column to test.
            threshold: Number of standard deviations that marks an anomaly.

        Returns:
            A single-element list holding a dict with ``"indices"`` (index
            labels of the anomalous rows) and ``"explanation"`` (the model's
            reply). When nothing is flagged the model is not called, so it
            cannot invent an explanation for anomalies that do not exist.
        """
        values = df[column]
        deviation = (values - values.mean()).abs()
        anomalies = df[deviation > threshold * values.std()]
        if anomalies.empty:
            return [{"indices": [], "explanation": "No anomalies detected."}]
        prompt = (
            f"Explain these anomalies in {column}:\n"
            f"Count: {len(anomalies)}\n"
            f"Values: {anomalies[column].tolist()[:5]}\n"
            "Provide business context:"
        )
        explanation = self.llm.invoke(prompt).content
        return [{"indices": anomalies.index.tolist(), "explanation": explanation}]

    def generate_report(self, df: pd.DataFrame, insights: List["Insight"]) -> str:
        """Ask the model for a markdown executive summary of the dataset.

        The prompt carries the frame's shape and column names, the number of
        insights, and one line per insight so the report can cite the
        findings themselves rather than only their count.

        Args:
            df: The analyzed data.
            insights: Findings to include in the report.

        Returns:
            The model's reply text.
        """
        sections = [
            "Generate an executive summary report for this dataset:",
            f"Shape: {df.shape}",
            f"Columns: {list(df.columns)}",
            f"Insights found: {len(insights)}",
        ]
        sections.extend(
            f"- [{item.category}] {item.description} (confidence: {item.confidence})"
            for item in insights
        )
        sections.append("Provide a professional report in markdown format:")
        return self.llm.invoke("\n".join(sections)).content
Smart Data Cleaning
class DataCleaningAssistant:
    """Profile data quality and clean DataFrames with help from an LLM.

    ``llm`` must expose ``invoke(prompt)`` returning an object with a
    ``content`` attribute holding the reply text.
    """

    # Strategies offered to the model and accepted by ``handle_missing``.
    _STRATEGIES = ("mean", "median", "mode", "drop", "interpolate")

    def __init__(self, llm):
        self.llm = llm

    def analyze_quality(self, df: pd.DataFrame) -> dict:
        """Summarize missing values, duplicates, dtypes and cardinality.

        Returns:
            A dict with keys ``"missing"`` (nulls per column),
            ``"duplicates"`` (number of duplicated rows), ``"dtypes"``
            (dtype name per column) and ``"unique_counts"`` (distinct values
            per column).
        """
        return {
            "missing": df.isnull().sum().to_dict(),
            # Cast to a plain int: a NumPy scalar shows up as "np.int64(...)"
            # when the whole report dict is embedded in a prompt.
            "duplicates": int(df.duplicated().sum()),
            "dtypes": df.dtypes.astype(str).to_dict(),
            "unique_counts": {col: df[col].nunique() for col in df.columns},
        }

    def suggest_cleaning(self, quality_report: dict) -> str:
        """Ask the model for cleaning steps based on an ``analyze_quality`` report."""
        prompt = (
            "Based on this data quality report, suggest cleaning steps:\n"
            f"Missing values: {quality_report['missing']}\n"
            f"Duplicates: {quality_report['duplicates']}\n"
            "Provide specific pandas code for each cleaning step:"
        )
        return self.llm.invoke(prompt).content

    def handle_missing(self, df: pd.DataFrame, strategy: str = "auto") -> pd.DataFrame:
        """Fill or drop missing values.

        Args:
            df: Data to clean.
            strategy: ``"auto"`` lets the model choose; otherwise one of
                ``"mean"``, ``"median"``, ``"mode"``, ``"drop"`` or
                ``"interpolate"``.

        Returns:
            The cleaned frame. An unrecognized ``strategy`` returns ``df``
            unchanged.
        """
        if strategy == "auto":
            prompt = (
                "Suggest the best strategy for handling missing values:\n"
                f"{df.isnull().sum().to_dict()}\n"
                "Choose from: mean, median, mode, drop, interpolate"
            )
            suggestion = self.llm.invoke(prompt).content
            strategy = self._pick_strategy(suggestion)
        return self._apply_strategy(df, strategy)

    @classmethod
    def _pick_strategy(cls, suggestion: str) -> str:
        """Return the first strategy named as a whole word, defaulting to median."""
        # Whole-word matching avoids false hits such as "mode" inside "model".
        words = "".join(
            ch if ch.isalpha() else " " for ch in suggestion.lower()
        ).split()
        return next((word for word in words if word in cls._STRATEGIES), "median")

    @staticmethod
    def _apply_strategy(df: pd.DataFrame, strategy: str) -> pd.DataFrame:
        """Apply one named missing-value strategy to ``df``."""
        if strategy == "mean":
            return df.fillna(df.mean(numeric_only=True))
        if strategy == "median":
            return df.fillna(df.median(numeric_only=True))
        if strategy == "mode":
            modes = df.mode()
            # An empty frame has no mode row to fill from.
            return df if modes.empty else df.fillna(modes.iloc[0])
        if strategy == "drop":
            return df.dropna()
        if strategy == "interpolate":
            filled = df.copy()
            # Interpolation is only defined for numeric data.
            numeric_cols = filled.select_dtypes(include="number").columns
            filled[numeric_cols] = filled[numeric_cols].interpolate()
            return filled
        return df

    def generate_cleaning_pipeline(self, df: pd.DataFrame) -> str:
        """Ask the model for a cleaning function tailored to ``df``'s quality issues."""
        quality = self.analyze_quality(df)
        prompt = (
            "Generate a complete pandas cleaning pipeline for this data:\n"
            f"Quality issues: {quality}\n"
            "Return a function that takes a DataFrame and returns a cleaned DataFrame:"
        )
        return self.llm.invoke(prompt).content
Feature Engineering Assistant
class FeatureEngineer:
    """Suggest and build engineered features with help from an LLM.

    ``llm`` must expose ``invoke(prompt)`` returning an object with a
    ``content`` attribute holding the reply text.
    """

    def __init__(self, llm):
        self.llm = llm

    def suggest_features(self, df: pd.DataFrame, target: str) -> str:
        """Ask the model for feature ideas for predicting ``target``.

        The prompt includes the column names, dtypes and the first two rows
        of ``df``.
        """
        prompt = (
            f"Suggest feature engineering techniques for predicting {target}:\n"
            f"Columns: {list(df.columns)}\n"
            f"Dtypes: {df.dtypes.to_dict()}\n"
            f"Sample values: {df.head(2).to_dict()}\n"
            "Suggest specific features with code:"
        )
        return self.llm.invoke(prompt).content

    def create_features(self, df: pd.DataFrame, feature_specs: list) -> pd.DataFrame:
        """Return a copy of ``df`` extended with the requested features.

        Supported spec dicts (specs with any other ``"type"`` are skipped):
            * ``{"type": "interaction", "col1": a, "col2": b}`` adds ``a_x_b``
            * ``{"type": "polynomial", "col": c}`` adds ``c_sq``
            * ``{"type": "binning", "col": c, "bins": n}`` adds ``c_binned``
              (``bins`` defaults to 5)
            * ``{"type": "log", "col": c}`` adds ``c_log`` via ``log1p``

        Specs are applied in order, so a later spec may refer to a column
        created by an earlier one.

        Args:
            df: Source data; it is not modified.
            feature_specs: Feature descriptions as listed above.

        Returns:
            A new DataFrame containing the original and the new columns.

        Raises:
            KeyError: If a spec lacks a required key or names a missing column.
        """
        # Work on a copy so the caller's frame is not mutated as a side effect.
        result = df.copy()
        for spec in feature_specs:
            kind = spec["type"]
            if kind == "interaction":
                left, right = spec["col1"], spec["col2"]
                result[f"{left}_x_{right}"] = result[left] * result[right]
            elif kind == "polynomial":
                col = spec["col"]
                result[f"{col}_sq"] = result[col] ** 2
            elif kind == "binning":
                col = spec["col"]
                result[f"{col}_binned"] = pd.cut(result[col], bins=spec.get("bins", 5))
            elif kind == "log":
                col = spec["col"]
                result[f"{col}_log"] = np.log1p(result[col])
        return result

    def auto_feature_selection(self, df: pd.DataFrame, target: str) -> str:
        """Ask the model to recommend feature-selection methods.

        The feature count sent to the model is the number of columns other
        than ``target``, so it stays correct when ``target`` is not a column
        of ``df``.
        """
        feature_count = sum(1 for col in df.columns if col != target)
        prompt = (
            "Suggest feature selection methods for:\n"
            f"Features: {feature_count}\n"
            f"Target: {target}\n"
            "Recommend methods and provide code:"
        )
        return self.llm.invoke(prompt).content
Key Takeaways
- AI assistants accelerate code generation for data tasks
- Automated insights surface patterns humans might miss
- Smart cleaning suggests optimal strategies for data quality
- Feature engineering automates tedious preprocessing steps
- Report generation creates executive summaries from data