7 min read
GPT-4 for Data Analysis: Practical Applications
GPT-4’s reasoning capabilities make it an exceptional data analysis assistant. It can interpret data, generate insights, and even write analysis code. Here’s how to leverage it effectively.
Natural Language to SQL
GPT-4 excels at understanding complex data questions:
class NL2SQLEngine:
    """Convert natural language questions to SQL using GPT-4.

    Holds an async chat-completion client and a schema provider; the
    generated SQL can optionally be validated by a second model call.
    """

    def __init__(self, client, schema_provider):
        # client: async API wrapper exposing chat_completion(...)
        # schema_provider: object exposing async get_schema(database)
        self.client = client
        self.schema_provider = schema_provider

    async def query(
        self,
        question: str,
        database: str,
        validate: bool = True,
    ) -> dict:
        """Convert *question* to SQL for *database*.

        Returns a dict with keys "question", "sql", "model", and — when
        *validate* is true — "validation" holding the model's verdict.
        """
        # The live schema is embedded in the prompt so the model can
        # reference real table/column names.
        schema = await self.schema_provider.get_schema(database)
        prompt = f"""Convert this question to SQL.
Database Schema:
{schema}
Question: {question}
Requirements:
- Use appropriate JOINs
- Handle NULL values
- Use aliases for readability
- Include comments for complex logic
- Optimize for performance
Return only the SQL query."""
        # temperature=0 for deterministic, reproducible SQL generation.
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a SQL expert. Generate correct, efficient SQL."},
                {"role": "user", "content": prompt},
            ],
            temperature=0,
        )
        sql = self._extract_sql(response.content)
        result = {
            "question": question,
            "sql": sql,
            "model": "gpt-4",
        }
        if validate:
            result["validation"] = await self._validate_sql(sql, schema)
        return result

    def _extract_sql(self, content: str) -> str:
        """Extract the SQL statement from a model response.

        Strips a surrounding markdown code fence (```sql or plain ```)
        if present; otherwise returns the trimmed response as-is.
        """
        for fence in ("```sql", "```"):
            if fence in content:
                start = content.find(fence) + len(fence)
                end = content.find("```", start)
                # Fix: with an unclosed fence, find() returns -1 and the
                # old slice silently dropped the final character; fall
                # back to the end of the string instead.
                if end == -1:
                    end = len(content)
                return content[start:end].strip()
        return content.strip()

    async def _validate_sql(self, sql: str, schema: str) -> dict:
        """Ask the model to validate *sql* against *schema*.

        Returns the parsed JSON verdict, or {"valid": True, "raw": ...}
        when the response is not valid JSON (best-effort fallback).
        """
        prompt = f"""Validate this SQL query.
Schema:
{schema}
Query:
{sql}
Check for:
1. Syntax errors
2. Invalid table/column references
3. Type mismatches
4. Potential performance issues
5. SQL injection risks
Return JSON:
{{"valid": true/false, "issues": ["issue1", ...], "suggestions": ["suggestion1", ...]}}"""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
        )
        import json
        try:
            return json.loads(response.content)
        except json.JSONDecodeError:
            # Narrowed from a bare except: only malformed JSON is an
            # expected failure here; anything else should propagate.
            return {"valid": True, "raw": response.content}
Data Interpretation
class DataInterpreter:
"""Interpret data results with GPT-4."""
async def interpret_results(
self,
data: list[dict],
question: str,
context: str = ""
) -> str:
"""Interpret query results in natural language."""
# Format data for prompt
if len(data) > 20:
data_sample = data[:10] + [{"...": f"({len(data) - 20} more rows)"}] + data[-10:]
else:
data_sample = data
import json
data_str = json.dumps(data_sample, indent=2, default=str)
prompt = f"""Interpret these data results.
Original Question: {question}
{f'Context: {context}' if context else ''}
Results ({len(data)} rows):
{data_str}
Provide:
1. Direct answer to the question
2. Key insights from the data
3. Notable patterns or anomalies
4. Caveats or limitations"""
response = await self.client.chat_completion(
model="gpt-4",
messages=[{"role": "user", "content": prompt}]
)
return response.content
async def statistical_summary(
self,
data: list[dict],
focus_columns: list[str] = None
) -> dict:
"""Generate statistical summary."""
import json
data_str = json.dumps(data[:100], indent=2, default=str)
focus = f"Focus on columns: {', '.join(focus_columns)}" if focus_columns else ""
prompt = f"""Analyze this dataset statistically.
Data (sample of {len(data)} rows):
{data_str}
{focus}
Provide comprehensive statistical analysis including:
1. Data types and distributions
2. Central tendencies (mean, median, mode)
3. Spread (std dev, range, IQR)
4. Correlations between numeric columns
5. Categorical breakdowns
6. Outliers and anomalies
7. Data quality issues
Return as structured JSON."""
response = await self.client.chat_completion(
model="gpt-4",
messages=[{"role": "user", "content": prompt}],
temperature=0.1
)
return {"analysis": response.content}
Analysis Code Generation
class AnalysisCodeGenerator:
    """Generate data-analysis Python code with GPT-4."""

    def __init__(self, client=None):
        # Fix: methods read self.client but the original class never set
        # it; optional arg keeps existing no-arg construction working.
        self.client = client

    async def generate_pandas_analysis(
        self,
        data_description: str,
        analysis_goal: str,
    ) -> str:
        """Return pandas analysis code for the described data and goal."""
        prompt = f"""Write Python code to analyze this data.
Data Description:
{data_description}
Analysis Goal: {analysis_goal}
Requirements:
- Use pandas for data manipulation
- Include data loading (assume CSV file)
- Add data cleaning steps
- Perform the requested analysis
- Create relevant visualizations with matplotlib/seaborn
- Add comments explaining each step
- Handle potential errors"""
        # Low temperature: code generation benefits from determinism.
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a data scientist writing clean, well-documented Python code."},
                {"role": "user", "content": prompt},
            ],
            temperature=0.2,
        )
        return response.content

    async def generate_visualization(
        self,
        data_description: str,
        chart_request: str,
    ) -> str:
        """Return matplotlib/seaborn code for the requested chart."""
        prompt = f"""Create a visualization for this data.
Data: {data_description}
Request: {chart_request}
Generate Python code using matplotlib/seaborn that:
- Creates the requested chart
- Includes proper labels and titles
- Uses appropriate colors
- Handles edge cases
- Is ready to save as image"""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
        )
        return response.content
Anomaly Detection
class AnomalyAnalyzer:
"""Detect anomalies with GPT-4 reasoning."""
async def detect_anomalies(
self,
data: list[dict],
context: str = ""
) -> dict:
"""Detect anomalies in data."""
import json
data_str = json.dumps(data[:50], indent=2, default=str)
prompt = f"""Analyze this data for anomalies.
{f'Context: {context}' if context else ''}
Data:
{data_str}
Identify:
1. Statistical outliers
2. Pattern breaks
3. Unexpected values
4. Missing data patterns
5. Suspicious combinations
For each anomaly:
- Description
- Severity (high/medium/low)
- Possible causes
- Recommended action
Return as JSON."""
response = await self.client.chat_completion(
model="gpt-4",
messages=[{"role": "user", "content": prompt}],
temperature=0.1
)
return {"anomalies": response.content}
async def explain_metric_change(
self,
metric_name: str,
current_value: float,
previous_value: float,
context_data: dict = None
) -> str:
"""Explain why a metric changed."""
change_pct = ((current_value - previous_value) / previous_value * 100) if previous_value else 0
context_str = ""
if context_data:
import json
context_str = f"\nContext data:\n{json.dumps(context_data, indent=2)}"
prompt = f"""Explain this metric change.
Metric: {metric_name}
Previous: {previous_value}
Current: {current_value}
Change: {change_pct:.1f}%
{context_str}
Provide:
1. Possible explanations for this change
2. What additional data would help confirm the cause
3. Whether this change is concerning
4. Recommended next steps"""
response = await self.client.chat_completion(
model="gpt-4",
messages=[{"role": "user", "content": prompt}]
)
return response.content
Report Generation
class ReportGenerator:
    """Generate data-analysis reports with GPT-4."""

    def __init__(self, client=None):
        # Fix: methods read self.client but the original class never set
        # it; optional arg keeps existing no-arg construction working.
        self.client = client

    async def generate_executive_summary(
        self,
        analyses: list[dict],
        audience: str = "executives",
    ) -> str:
        """Roll a list of analysis dicts into an executive summary."""
        import json
        # default=str so dates/Decimals serialize instead of raising.
        analyses_str = json.dumps(analyses, indent=2, default=str)
        prompt = f"""Create an executive summary from these analyses.
Analyses:
{analyses_str}
Audience: {audience}
Structure:
1. Key Findings (3-5 bullet points)
2. Metrics Overview (table format)
3. Trends and Patterns
4. Concerns and Risks
5. Recommendations
Keep it concise and actionable. Use business language appropriate for {audience}."""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
        )
        return response.content

    async def generate_data_story(
        self,
        data_journey: list[dict],
        narrative_style: str = "analytical",
    ) -> str:
        """Turn an analysis journey into a narrative data story."""
        import json
        journey_str = json.dumps(data_journey, indent=2, default=str)
        prompt = f"""Create a data story from this analysis journey.
Analysis Journey:
{journey_str}
Style: {narrative_style}
Create a compelling narrative that:
- Starts with the business question
- Walks through the discovery process
- Highlights key insights with data
- Builds to conclusions
- Ends with clear recommendations
Make it engaging while maintaining accuracy."""
        response = await self.client.chat_completion(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
        )
        return response.content
Complete Analysis Pipeline
class DataAnalysisPipeline:
    """End-to-end data analysis with GPT-4: question -> SQL -> report."""

    def __init__(self, client, schema_provider=None):
        # Fix: `schema_provider` was an undefined global in the original
        # (NameError at construction); it is now an explicit, optional
        # constructor argument.
        self.nl2sql = NL2SQLEngine(client, schema_provider)
        # The helper classes read self.client internally but take no
        # constructor arguments; attach the shared client explicitly so
        # their model calls work.
        self.interpreter = DataInterpreter()
        self.interpreter.client = client
        self.code_gen = AnalysisCodeGenerator()
        self.code_gen.client = client
        self.anomaly = AnomalyAnalyzer()
        self.anomaly.client = client
        self.reporter = ReportGenerator()
        self.reporter.client = client

    async def execute_sql(self, sql: str) -> list[dict]:
        """Run *sql* against the target database and return rows as dicts.

        Fix: the original called this method without ever defining it.
        Override (or monkey-patch) with your actual database connection.
        """
        raise NotImplementedError("Provide a database connection by overriding execute_sql")

    async def analyze(
        self,
        question: str,
        database: str,
    ) -> dict:
        """Complete analysis from question to report."""
        # Step 1: natural language -> SQL
        sql_result = await self.nl2sql.query(question, database)
        # Step 2: execute query (subclass supplies the DB connection)
        data = await self.execute_sql(sql_result["sql"])
        # Step 3: interpret results in plain language
        interpretation = await self.interpreter.interpret_results(
            data, question
        )
        # Step 4: check for anomalies
        anomalies = await self.anomaly.detect_anomalies(data)
        # Step 5: roll everything into an executive summary
        report = await self.reporter.generate_executive_summary([
            {"question": question, "findings": interpretation},
            {"anomalies": anomalies},
        ])
        return {
            "question": question,
            "sql": sql_result["sql"],
            "row_count": len(data),
            "interpretation": interpretation,
            "anomalies": anomalies,
            "report": report,
        }
GPT-4 transforms data analysis from technical SQL writing to conversational exploration. The ability to interpret results and generate insights makes data accessible to everyone.