November 27, 2023 1 min read

Code Interpreter: AI-Powered Data Analysis

OpenAI Code Interpreter Data Analysis Python AI

Code Interpreter: AI-Powered Data Analysis

Code Interpreter is one of the most powerful tools in the Assistants API. It gives your assistant the ability to write and execute Python code, perform data analysis, create visualizations, and work with files - all in a sandboxed environment.

Understanding Code Interpreter

from openai import OpenAI
from typing import List, Dict, Optional
import json

class CodeInterpreterAssistant:
    """Assistant with code interpreter capabilities."""

    def __init__(self, client: OpenAI):
        self.client = client

    def create_data_analyst(self) -> str:
        """Create a data analysis assistant."""
        assistant = self.client.beta.assistants.create(
            name="Data Analyst",
            instructions="""You are an expert data analyst.
            When given data:
            1. First explore and understand the data structure
            2. Perform requested analysis
            3. Create clear visualizations
            4. Provide actionable insights

            Guidelines:
            - Always handle missing data appropriately
            - Use pandas for data manipulation
            - Use matplotlib/seaborn for visualizations
            - Include statistical significance where relevant
            - Explain findings in business terms""",
            model="gpt-4-1106-preview",
            tools=[{"type": "code_interpreter"}]
        )
        return assistant.id

    def analyze_file(
        self,
        assistant_id: str,
        file_path: str,
        analysis_prompt: str
    ) -> Dict:
        """Upload a file and run analysis."""
        # Upload file
        with open(file_path, "rb") as f:
            file = self.client.files.create(file=f, purpose="assistants")

        # Create thread with file
        thread = self.client.beta.threads.create()
        self.client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content=analysis_prompt,
            file_ids=[file.id]
        )

        # Run analysis
        run = self.client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=assistant_id
        )

        # Wait for completion
        import time
        while run.status in ["queued", "in_progress"]:
            time.sleep(1)
            run = self.client.beta.threads.runs.retrieve(
                thread_id=thread.id,
                run_id=run.id
            )

        # Get results
        messages = self.client.beta.threads.messages.list(thread_id=thread.id)

        return {
            "thread_id": thread.id,
            "run_id": run.id,
            "status": run.status,
            "response": self._parse_response(messages)
        }

    def _parse_response(self, messages) -> Dict:
        """Parse the assistant response including code and files."""
        result = {
            "text": [],
            "code_blocks": [],
            "generated_files": []
        }

        for message in messages.data:
            if message.role != "assistant":
                continue

            for content in message.content:
                if content.type == "text":
                    result["text"].append(content.text.value)

                    # Extract code blocks from annotations
                    for annotation in content.text.annotations:
                        if annotation.type == "file_path":
                            result["generated_files"].append({
                                "file_id": annotation.file_path.file_id,
                                "reference": annotation.text
                            })

                elif content.type == "image_file":
                    result["generated_files"].append({
                        "file_id": content.image_file.file_id,
                        "type": "image"
                    })

        return result

Practical Analysis Examples

class DataAnalysisWorkflows:
    """Pre-built analysis workflows using code interpreter."""

    def __init__(self, client: OpenAI, assistant_id: str):
        self.client = client
        self.assistant_id = assistant_id
        self.ci = CodeInterpreterAssistant(client)

    def exploratory_analysis(self, file_path: str) -> Dict:
        """Run exploratory data analysis."""
        prompt = """Perform a comprehensive exploratory data analysis:

        1. Data Overview:
           - Shape, columns, data types
           - Missing values summary
           - Unique values per column

        2. Statistical Summary:
           - Descriptive statistics for numeric columns
           - Value counts for categorical columns

        3. Visualizations:
           - Distribution plots for numeric columns
           - Correlation heatmap
           - Bar charts for categorical columns

        4. Key Findings:
           - Summarize the most important insights
           - Flag any data quality issues"""

        return self.ci.analyze_file(self.assistant_id, file_path, prompt)

    def sales_analysis(self, file_path: str) -> Dict:
        """Analyze sales data."""
        prompt = """Analyze this sales data and provide:

        1. Revenue Analysis:
           - Total revenue by period (month/quarter)
           - Revenue by product category
           - Revenue by region/segment

        2. Trend Analysis:
           - Month-over-month growth
           - Seasonal patterns
           - Year-over-year comparison if applicable

        3. Top Performers:
           - Top 10 products by revenue
           - Top 10 customers
           - Best performing regions

        4. Visualizations:
           - Revenue trend line chart
           - Category breakdown pie chart
           - Regional heatmap

        5. Recommendations:
           - Areas for improvement
           - Growth opportunities"""

        return self.ci.analyze_file(self.assistant_id, file_path, prompt)

    def customer_segmentation(self, file_path: str) -> Dict:
        """Perform customer segmentation analysis."""
        prompt = """Perform customer segmentation analysis:

        1. RFM Analysis (if applicable):
           - Calculate Recency, Frequency, Monetary scores
           - Segment customers into groups

        2. Clustering:
           - Use K-means or similar algorithm
           - Determine optimal number of clusters
           - Visualize clusters

        3. Segment Profiles:
           - Describe each segment characteristics
           - Size of each segment
           - Average metrics per segment

        4. Recommendations:
           - Marketing strategies per segment
           - Retention priorities"""

        return self.ci.analyze_file(self.assistant_id, file_path, prompt)

    def anomaly_detection(self, file_path: str, target_column: str) -> Dict:
        """Detect anomalies in data."""
        prompt = f"""Analyze the '{target_column}' column for anomalies:

        1. Statistical Methods:
           - Z-score analysis
           - IQR method
           - Moving average deviation

        2. Visualization:
           - Time series plot with anomalies highlighted
           - Box plot showing outliers

        3. Anomaly Summary:
           - List all detected anomalies
           - Severity classification
           - Potential causes/patterns

        4. Recommendations:
           - Which anomalies require investigation
           - Suggested thresholds for monitoring"""

        return self.ci.analyze_file(self.assistant_id, file_path, prompt)

Interactive Analysis Session

class InteractiveAnalysisSession:
    """Manage interactive data analysis sessions."""

    def __init__(self, client: OpenAI, assistant_id: str):
        self.client = client
        self.assistant_id = assistant_id
        self.thread_id = None
        self.file_ids = []

    def start_session(self, file_paths: List[str] = None) -> str:
        """Start a new analysis session."""
        # Upload files if provided
        if file_paths:
            for path in file_paths:
                with open(path, "rb") as f:
                    file = self.client.files.create(file=f, purpose="assistants")
                    self.file_ids.append(file.id)

        # Create thread
        self.thread_id = self.client.beta.threads.create().id

        # Initial context message
        if self.file_ids:
            self.client.beta.threads.messages.create(
                thread_id=self.thread_id,
                role="user",
                content="I've uploaded data files for analysis. Please acknowledge and briefly describe what you see.",
                file_ids=self.file_ids
            )
            self._run_and_wait()

        return self.thread_id

    def ask(self, question: str, file_path: str = None) -> str:
        """Ask a question or request analysis."""
        file_ids = []
        if file_path:
            with open(file_path, "rb") as f:
                file = self.client.files.create(file=f, purpose="assistants")
                file_ids.append(file.id)
                self.file_ids.append(file.id)

        self.client.beta.threads.messages.create(
            thread_id=self.thread_id,
            role="user",
            content=question,
            file_ids=file_ids if file_ids else None
        )

        return self._run_and_wait()

    def _run_and_wait(self) -> str:
        """Run assistant and return response."""
        import time

        run = self.client.beta.threads.runs.create(
            thread_id=self.thread_id,
            assistant_id=self.assistant_id
        )

        while run.status in ["queued", "in_progress"]:
            time.sleep(1)
            run = self.client.beta.threads.runs.retrieve(
                thread_id=self.thread_id,
                run_id=run.id
            )

        messages = self.client.beta.threads.messages.list(
            thread_id=self.thread_id,
            limit=1
        )

        return messages.data[0].content[0].text.value

    def download_outputs(self, output_dir: str = "analysis_output") -> List[str]:
        """Download all generated files from the session."""
        import os
        os.makedirs(output_dir, exist_ok=True)

        messages = self.client.beta.threads.messages.list(thread_id=self.thread_id)
        downloaded = []

        for message in messages.data:
            for content in message.content:
                if content.type == "image_file":
                    file_id = content.image_file.file_id
                    path = os.path.join(output_dir, f"{file_id}.png")
                    content = self.client.files.content(file_id)
                    with open(path, "wb") as f:
                        f.write(content.read())
                    downloaded.append(path)

        return downloaded

# Example usage
"""
session = InteractiveAnalysisSession(client, assistant_id)
session.start_session(["sales_data.csv"])

# Interactive questions
print(session.ask("What's the total revenue by month?"))
print(session.ask("Create a chart showing the trend"))
print(session.ask("Which products are declining in sales?"))
print(session.ask("Predict next month's revenue using linear regression"))

# Download all charts
outputs = session.download_outputs()
"""

Best Practices

Be specific about the analysis you want
Request visualizations explicitly
Ask for intermediate steps to verify methodology
Download generated files promptly
Use follow-up questions to drill down into insights

Tomorrow, we’ll explore the Retrieval tool for building knowledge-based AI applications!