4 min read
Building AI Data Analysis Assistants
AI assistants that understand data and generate insights are transforming analytics. Build systems that interpret datasets, generate visualizations, and provide actionable recommendations.
Data Analysis Assistant
import json
from dataclasses import dataclass
from typing import Optional

import pandas as pd
@dataclass
class AnalysisResult:
    """Structured output of a data-analysis run."""

    # Narrative overview of what the dataset contains.
    summary: str
    # Key findings, one bullet per entry.
    insights: list[str]
    # Visualization specs (chart_type, columns, purpose, code).
    visualizations: list[dict]
    # Suggested follow-up analyses.
    recommendations: list[str]
    # Python code that reproduces the key findings.
    code: str
class DataAnalysisAssistant:
    """LLM-powered data analysis assistant.

    Profiles a DataFrame locally, then asks the chat client for a
    narrative analysis and a list of visualization recommendations.
    """

    def __init__(self, client):
        # `client` must expose `async chat_completion(model=..., messages=...)`
        # returning an object with a string `.content` attribute.
        self.client = client

    async def analyze(
        self,
        df: pd.DataFrame,
        question: Optional[str] = None,
    ) -> "AnalysisResult":
        """Analyze *df* and return a structured AnalysisResult.

        Args:
            df: Data to analyze.
            question: Optional focus question; when None a general
                analysis is requested.
        """
        # Summarize the data so the LLM never sees the raw rows.
        profile = self._profile_data(df)
        analysis = await self._generate_analysis(profile, question)
        viz_recs = await self._recommend_visualizations(profile, question)
        # Use .get throughout: the model's JSON is not guaranteed to
        # contain every requested key.
        return AnalysisResult(
            summary=analysis.get("summary", ""),
            insights=analysis.get("insights", []),
            visualizations=viz_recs,
            recommendations=analysis.get("recommendations", []),
            code=analysis.get("code", ""),
        )

    def _profile_data(self, df: pd.DataFrame) -> dict:
        """Create a compact data profile for LLM context."""
        return {
            "shape": df.shape,
            "columns": list(df.columns),
            "dtypes": df.dtypes.to_dict(),
            "sample": df.head(5).to_dict(),
            "stats": df.describe().to_dict(),
            "nulls": df.isnull().sum().to_dict(),
            "unique_counts": {col: df[col].nunique() for col in df.columns},
        }

    async def _generate_analysis(
        self,
        profile: dict,
        question: Optional[str],
    ) -> dict:
        """Ask the LLM for a JSON analysis of the profiled data."""
        profile_str = json.dumps(profile, indent=2, default=str)
        prompt = f"""Analyze this dataset.
Data Profile:
{profile_str}
{f'Specific question: {question}' if question else 'Provide general analysis'}
Provide:
1. Summary of what the data contains
2. Key insights (5-7 points)
3. Anomalies or issues found
4. Recommendations for further analysis
5. Python code to reproduce key findings
Return as JSON."""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
        )
        try:
            return json.loads(response.content)
        except json.JSONDecodeError:
            # Model ignored the JSON instruction; fall back to raw text.
            return {"summary": response.content, "insights": [], "recommendations": []}

    async def _recommend_visualizations(
        self,
        profile: dict,
        question: Optional[str],
    ) -> list[dict]:
        """Ask the LLM for a JSON array of visualization specs."""
        prompt = f"""Recommend visualizations for this data.
Columns: {profile['columns']}
Types: {profile['dtypes']}
{f'Focus: {question}' if question else ''}
For each recommendation provide:
- chart_type: type of chart
- columns: columns to use
- purpose: what it shows
- code: matplotlib/seaborn code
Return as JSON array."""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
        )
        try:
            recs = json.loads(response.content)
        except json.JSONDecodeError:
            return []
        # Guard against the model returning a JSON object instead of an array.
        return recs if isinstance(recs, list) else []
Conversational Data Analysis
class ConversationalAnalyst:
    """Multi-turn chat interface over a single DataFrame.

    Keeps the running conversation in ``self.history`` and, when the
    assistant replies with a fenced ```python block, executes it against
    the DataFrame and appends the result to the reply.
    """

    def __init__(self, client, df: pd.DataFrame):
        self.client = client
        self.df = df
        self.history: list[dict] = []  # alternating user/assistant turns
        self.context = self._create_context()

    def _create_context(self) -> str:
        """Build a static description of the data for the system prompt."""
        return f"""Dataset Information:
- Shape: {self.df.shape}
- Columns: {list(self.df.columns)}
- Types: {self.df.dtypes.to_dict()}
- Sample: {self.df.head(3).to_dict()}"""

    async def chat(self, message: str) -> str:
        """Send *message* and return the reply (plus any code output)."""
        self.history.append({"role": "user", "content": message})
        messages = [
            {"role": "system", "content": f"""You are a data analyst assistant.
{self.context}
You can:
- Answer questions about the data
- Generate analysis code
- Explain patterns and insights
- Recommend visualizations
When you write code, it will be executed on the DataFrame 'df'."""},
            *self.history,
        ]
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=messages,
        )
        self.history.append({
            "role": "assistant",
            "content": response.content,
        })
        # If the reply contains runnable code, execute it and report back.
        if "```python" in response.content:
            code = self._extract_code(response.content)
            result = self._execute_code(code)
            if result:
                return f"{response.content}\n\nExecution Result:\n{result}"
        return response.content

    def _extract_code(self, response: str) -> str:
        """Extract the first ```python fenced block from *response*.

        Returns "" when no fenced Python block is present.
        """
        marker = "```python"
        if marker not in response:
            return ""
        start = response.find(marker) + len(marker)
        end = response.find("```", start)
        if end == -1:
            # Fix: an unterminated fence previously sliced off the last
            # character (find() returned -1); take the rest instead.
            end = len(response)
        return response[start:end].strip()

    def _execute_code(self, code: str) -> str:
        """Execute analysis code against the DataFrame, returning its result.

        SECURITY NOTE: stripping ``__builtins__`` is NOT a real sandbox —
        model-generated code can still reach dangerous attributes through
        the objects it is handed. Only execute code from trusted sources.
        """
        try:
            local_vars = {"df": self.df, "pd": pd}
            exec(code, {"__builtins__": {}}, local_vars)
            # Convention: generated code stores its answer in `result`.
            if "result" in local_vars:
                return str(local_vars["result"])
            return "Code executed successfully"
        except Exception as e:
            # Surface the error in the conversation instead of raising.
            return f"Error: {e}"
Automated Reporting
class AutoReportGenerator:
    """Generate formatted Markdown analysis reports from a DataFrame."""

    # Prompt openers per report audience; "executive" is the fallback.
    _PROMPTS = {
        "executive": "Generate an executive summary with key metrics and insights",
        "technical": "Generate a technical analysis with statistical details",
        "exploratory": "Generate an exploratory data analysis report",
    }

    def __init__(self, client=None):
        # Fix: the original class never stored a client, yet
        # generate_report read `self.client`. Default of None keeps
        # no-arg construction working.
        self.client = client

    def _profile_data(self, df: pd.DataFrame) -> dict:
        """Create a compact profile of *df* for LLM context.

        Fix: generate_report called this method, but the original class
        never defined it; mirrors DataAnalysisAssistant's profiler.
        """
        return {
            "shape": df.shape,
            "columns": list(df.columns),
            "dtypes": df.dtypes.to_dict(),
            "sample": df.head(5).to_dict(),
            "stats": df.describe().to_dict(),
            "nulls": df.isnull().sum().to_dict(),
        }

    async def generate_report(
        self,
        df: pd.DataFrame,
        report_type: str = "executive",
    ) -> str:
        """Generate a formatted Markdown analysis report.

        Args:
            df: Data to report on.
            report_type: "executive", "technical" or "exploratory";
                unknown values fall back to "executive".
        """
        profile = self._profile_data(df)
        prompt = f"""{self._PROMPTS.get(report_type, self._PROMPTS['executive'])}
Data:
{json.dumps(profile, indent=2, default=str)}
Format as Markdown with:
1. Title
2. Executive Summary
3. Key Metrics
4. Insights
5. Visualizations (describe what to create)
6. Recommendations
7. Next Steps"""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
        )
        return response.content
AI data analysis assistants make insights accessible to everyone, regardless of technical expertise. The key is combining natural language understanding with proper data handling.