🎉 75% of content is free forever — Unlock Premium from $10/mo →
CW
Search courses…
💼 Services ℹ️ About ✉️ Contact View Pricing Plans from $10

AI for Data Science

🟢 Free Lesson

Advertisement

AI for Data Science

Data Sources: CSV, SQL, APIs, JSON, Parquet | AI Assistant: Code Generation, Analysis Planning, Visualization, Feature Engineering | Generated Code: Pandas Operations, SQL Queries | Visualizations: Matplotlib, Plotly Charts | Insights: Anomaly Detection, Trend Analysis | Reports: Auto-Generated, Executive Summary | ML Models: AutoML, Feature Selection

AI assistants for data science automate code generation, analysis planning, and insight discovery, significantly accelerating the data science workflow.

AI Code Generation for Data Analysis

import pandas as pd
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

class DataScienceAssistant:
    """LLM-backed helper that writes, visualizes and explains pandas code.

    Every public method fills a prompt template, sends it to the chat model
    held in ``self.llm`` and returns the text of the reply. Methods that are
    expected to return runnable code additionally remove Markdown code
    fences, because chat models frequently wrap code in fenced blocks even
    when asked for code only, and a fenced reply is not valid Python.
    """

    def __init__(self):
        # temperature=0 is passed so repeated questions vary as little as
        # the model allows.
        self.llm = ChatOpenAI(model="gpt-4", temperature=0)
        # One {"question": ..., "code": ...} entry per generate_analysis_code call.
        self.code_history = []

    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """Return *text* trimmed, without a surrounding Markdown code fence."""
        cleaned = text.strip()
        if not cleaned.startswith("```"):
            return cleaned
        # Drop the opening fence line (it may carry a language tag).
        lines = cleaned.splitlines()[1:]
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        return "\n".join(lines).strip()

    def _ask(self, template: str, **variables) -> str:
        """Fill *template* with *variables*, run it through the LLM, return the reply text."""
        prompt = ChatPromptTemplate.from_template(template)
        chain = prompt | self.llm
        return chain.invoke(variables).content

    def generate_analysis_code(self, question: str, df_info: str) -> str:
        """Ask the model for pandas code answering *question*.

        Args:
            question: Natural-language question about the DataFrame.
            df_info: Text describing the DataFrame, inserted into the prompt.

        Returns:
            The generated code with whitespace and Markdown fences removed.
            The question/code pair is also appended to ``self.code_history``.
        """
        reply = self._ask(
            """Generate Python pandas code to answer this question about a DataFrame.
            DataFrame info: {df_info}
            Question: {question}
            Return only executable Python code, no explanations:""",
            question=question,
            df_info=df_info,
        )
        code = self._strip_code_fences(reply)
        self.code_history.append({"question": question, "code": code})
        return code

    def generate_visualization(self, data_description: str, chart_type: str = "auto") -> str:
        """Ask the model for matplotlib/seaborn plotting code.

        Args:
            data_description: Text describing the data to plot.
            chart_type: Chart kind inserted into the prompt (default "auto").

        Returns:
            The generated code with whitespace and Markdown fences removed.
        """
        reply = self._ask(
            """Generate Python code to create a {chart_type} visualization.
            Data description: {data_description}
            Use matplotlib and seaborn. Return only executable code:""",
            data_description=data_description,
            chart_type=chart_type,
        )
        return self._strip_code_fences(reply)

    def explain_code(self, code: str) -> str:
        """Return the model's step-by-step explanation of *code*, unmodified."""
        return self._ask(
            """Explain this Python code step by step:
            {code}
            Provide a clear, concise explanation:""",
            code=code,
        )

    def suggest_features(self, df_info: str, target: str) -> str:
        """Return the model's feature-engineering suggestions, unmodified.

        Args:
            df_info: Text describing the DataFrame, inserted into the prompt.
            target: Name of the target variable, inserted into the prompt.
        """
        return self._ask(
            """Suggest feature engineering techniques for this dataset:
            DataFrame info: {df_info}
            Target variable: {target}
            Provide specific code suggestions:""",
            df_info=df_info,
            target=target,
        )

# Usage
assistant = DataScienceAssistant()
df = pd.read_csv("sales_data.csv")
# Column dtypes plus summary statistics are what the model gets to see.
df_info = str(df.dtypes) + "\n" + str(df.describe())

code = assistant.generate_analysis_code(
    "What are the top 10 products by revenue?",
    df_info
)
# SECURITY: `code` is unreviewed model output. exec() runs it with the full
# privileges of this process, so inspect it (or run it in a sandbox) before
# using this pattern outside a demo. The explicit namespace only limits which
# names the generated code can see (pandas and the DataFrame); it is NOT a
# security boundary.
exec(code, {"pd": pd, "df": df})

Automated Insight Discovery

from dataclasses import dataclass
from typing import List
import numpy as np

@dataclass
class Insight:
    """One finding about a dataset, stored as a plain record."""

    # Kind of finding, e.g. "statistical".
    category: str
    # Short headline for the finding.
    title: str
    # Full text of the finding.
    description: str
    # Confidence score attached to the finding.
    confidence: float
    # Code associated with the finding; may be an empty string.
    code: str

class InsightDiscovery:
    """Uses an LLM to turn DataFrame statistics into insights, anomaly notes and reports.

    ``llm`` must expose ``invoke(prompt)`` returning an object with a
    ``content`` attribute holding the reply text.
    """

    def __init__(self, llm):
        self.llm = llm

    def discover_insights(self, df: pd.DataFrame) -> List["Insight"]:
        """Ask the model for insights based on summary statistics and correlations.

        Args:
            df: Data to summarize with ``describe()`` and ``corr()``.

        Returns:
            One ``Insight`` per non-blank line of the model's reply. The reply
            is not parsed further: the whole line is the description, its
            first 50 characters are the title, and the confidence is the
            fixed placeholder 0.8.
        """
        summary_stats = df.describe().to_string()
        correlations = df.corr(numeric_only=True).to_string()

        prompt = f"""Analyze this data summary and find key insights:
        Summary Statistics:
        {summary_stats}

        Correlations:
        {correlations}

        List 5 important insights with titles, descriptions, and confidence (0-1):"""

        response = self.llm.invoke(prompt).content
        insights = []
        for raw_line in response.split("\n"):
            # Strip first so titles are not wasted on indentation or a
            # trailing carriage return.
            text = raw_line.strip()
            if not text:
                continue
            insights.append(Insight(
                category="statistical",
                title=text[:50],
                description=text,
                confidence=0.8,
                code=""
            ))
        return insights

    def detect_anomalies(self, df: pd.DataFrame, column: str) -> List[dict]:
        """Flag values more than three standard deviations from the column mean.

        Args:
            df: Data to inspect.
            column: Name of the column to check.

        Returns:
            A single-element list holding a dict with the row ``indices`` of
            the flagged values and the model's ``explanation``. The model is
            shown the anomaly count and at most the first five values.
        """
        values = df[column]
        mean = values.mean()
        std = values.std()
        anomalies = df[(values - mean).abs() > 3 * std]

        prompt = f"""Explain these anomalies in {column}:
        Count: {len(anomalies)}
        Values: {anomalies[column].tolist()[:5]}
        Provide business context:"""

        explanation = self.llm.invoke(prompt).content
        return [{"indices": anomalies.index.tolist(), "explanation": explanation}]

    def generate_report(self, df: pd.DataFrame, insights: List["Insight"]) -> str:
        """Ask the model for a markdown executive summary of the dataset.

        Args:
            df: Dataset whose shape and column names are included in the prompt.
            insights: Findings to include; each one's description and
                confidence are listed in the prompt so the report can refer
                to the actual findings rather than just their number.

        Returns:
            The model's reply text.
        """
        if insights:
            findings = "\n".join(
                f"        - {item.description} (confidence: {item.confidence})"
                for item in insights
            )
        else:
            findings = "        (none)"

        prompt = f"""Generate an executive summary report for this dataset:
        Shape: {df.shape}
        Columns: {list(df.columns)}
        Insights found: {len(insights)}
{findings}

        Provide a professional report in markdown format:"""

        return self.llm.invoke(prompt).content

Smart Data Cleaning

class DataCleaningAssistant:
    """Profiles data quality and uses an LLM to suggest or apply cleaning steps.

    ``llm`` must expose ``invoke(prompt)`` returning an object with a
    ``content`` attribute holding the reply text.
    """

    def __init__(self, llm):
        self.llm = llm

    def analyze_quality(self, df: pd.DataFrame) -> dict:
        """Return per-column missing counts, duplicate-row count, dtypes and unique counts."""
        return {
            "missing": df.isnull().sum().to_dict(),
            # Plain int rather than a numpy integer so the report serializes cleanly.
            "duplicates": int(df.duplicated().sum()),
            "dtypes": df.dtypes.astype(str).to_dict(),
            "unique_counts": {col: df[col].nunique() for col in df.columns},
        }

    def suggest_cleaning(self, quality_report: dict) -> str:
        """Ask the model for cleaning steps given a report from ``analyze_quality``.

        Only the ``missing`` and ``duplicates`` entries are sent to the model.
        """
        prompt = f"""Based on this data quality report, suggest cleaning steps:
        Missing values: {quality_report['missing']}
        Duplicates: {quality_report['duplicates']}

        Provide specific pandas code for each cleaning step:"""

        return self.llm.invoke(prompt).content

    @staticmethod
    def _pick_strategy(suggestion: str) -> str:
        """Return the first strategy named in *suggestion*, or "median" if none is."""
        # Compare whole words so e.g. "median rather than mean" selects
        # median, and "model" is not mistaken for "mode".
        words = "".join(
            ch if ch.isalpha() else " " for ch in suggestion.lower()
        ).split()
        for word in words:
            if word in ("mean", "median", "mode"):
                return word
            if word.startswith("drop"):
                return "drop"
            if word.startswith("interpolat"):
                return "interpolate"
        return "median"

    @staticmethod
    def _apply_strategy(df: pd.DataFrame, strategy: str) -> pd.DataFrame:
        """Apply one named missing-value strategy; unknown names return *df* unchanged."""
        if strategy == "mean":
            return df.fillna(df.mean(numeric_only=True))
        if strategy == "median":
            return df.fillna(df.median(numeric_only=True))
        if strategy == "drop":
            return df.dropna()
        if strategy == "mode":
            modes = df.mode()
            # An empty frame has no mode row to fill from.
            return df if modes.empty else df.fillna(modes.iloc[0])
        if strategy == "interpolate":
            result = df.copy()
            # Interpolation is only applied to numeric columns.
            numeric_cols = result.select_dtypes(include="number").columns
            result[numeric_cols] = result[numeric_cols].interpolate()
            return result
        return df

    def handle_missing(self, df: pd.DataFrame, strategy: str = "auto") -> pd.DataFrame:
        """Fill or drop missing values.

        Args:
            df: Data to clean.
            strategy: "auto" asks the model to choose; otherwise one of
                "mean", "median", "mode", "drop" or "interpolate" is applied
                directly. Any other value leaves the data untouched.

        Returns:
            A DataFrame with the strategy applied. With "auto", the first
            strategy named in the model's reply is used, falling back to
            median when the reply names none.
        """
        if strategy == "auto":
            prompt = f"""Suggest the best strategy for handling missing values:
            {df.isnull().sum().to_dict()}
            Choose from: mean, median, mode, drop, interpolate"""

            suggestion = self.llm.invoke(prompt).content
            strategy = self._pick_strategy(suggestion)
        return self._apply_strategy(df, strategy)

    def generate_cleaning_pipeline(self, df: pd.DataFrame) -> str:
        """Ask the model for a full cleaning function based on ``analyze_quality(df)``."""
        quality = self.analyze_quality(df)
        prompt = f"""Generate a complete pandas cleaning pipeline for this data:
        Quality issues: {quality}

        Return a function that takes a DataFrame and returns a cleaned DataFrame:"""

        return self.llm.invoke(prompt).content

Feature Engineering Assistant

class FeatureEngineer:
    """Suggests features via an LLM and builds simple derived columns.

    ``llm`` must expose ``invoke(prompt)`` returning an object with a
    ``content`` attribute holding the reply text.
    """

    def __init__(self, llm):
        self.llm = llm

    def suggest_features(self, df: pd.DataFrame, target: str) -> str:
        """Ask the model for feature ideas for predicting *target*.

        The prompt contains the column names, dtypes and the first two rows
        of *df*, so those sample values are sent to the model.
        """
        prompt = f"""Suggest feature engineering techniques for predicting {target}:
        Columns: {list(df.columns)}
        Dtypes: {df.dtypes.to_dict()}
        Sample values: {df.head(2).to_dict()}

        Suggest specific features with code:"""

        return self.llm.invoke(prompt).content

    def create_features(self, df: pd.DataFrame, feature_specs: list) -> pd.DataFrame:
        """Add derived columns to *df* according to *feature_specs*.

        Each spec is a dict with a ``"type"`` key:

        - ``"interaction"``: needs ``col1`` and ``col2``; adds ``<col1>_x_<col2>``.
        - ``"polynomial"``: needs ``col``; adds ``<col>_sq``.
        - ``"binning"``: needs ``col``, optional ``bins`` (default 5); adds ``<col>_binned``.
        - ``"log"``: needs ``col``; adds ``<col>_log`` computed with ``log1p``.

        Specs with any other type are skipped.

        Returns:
            The same DataFrame object that was passed in; the new columns are
            added in place.
        """
        for spec in feature_specs:
            kind = spec["type"]
            if kind == "interaction":
                left, right = spec["col1"], spec["col2"]
                df[f"{left}_x_{right}"] = df[left] * df[right]
            elif kind == "polynomial":
                col = spec["col"]
                df[f"{col}_sq"] = df[col] ** 2
            elif kind == "binning":
                col = spec["col"]
                df[f"{col}_binned"] = pd.cut(df[col], bins=spec.get("bins", 5))
            elif kind == "log":
                col = spec["col"]
                df[f"{col}_log"] = np.log1p(df[col])
        return df

    def auto_feature_selection(self, df: pd.DataFrame, target: str) -> str:
        """Ask the model to recommend feature-selection methods.

        Only the number of feature columns and the target name are sent.
        The feature count excludes the target column when it is present in
        *df*; when it is absent, every column counts as a feature.
        """
        feature_count = sum(1 for col in df.columns if col != target)
        prompt = f"""Suggest feature selection methods for:
        Features: {feature_count}
        Target: {target}

        Recommend methods and provide code:"""

        return self.llm.invoke(prompt).content

Key Takeaways

  • AI assistants accelerate code generation for data tasks
  • Automated insights surface patterns humans might miss
  • Smart cleaning suggests optimal strategies for data quality
  • Feature engineering automates tedious preprocessing steps
  • Report generation creates executive summaries from data
⭐

Premium Content

AI for Data Science

Unlock this lesson and 900+ advanced tutorials with a Premium plan.

🎯End-to-end Projects
💼Interview Prep
📜Certificates
🤝Community Access

Already a member? Log in

Need Expert Generative AI Help?

Get personalized tutoring, project support, or professional consulting.

Advertisement