
Auto-Generated Insights: AI-Driven Data Discovery

Auto-generated insights use AI to discover patterns, anomalies, and trends in your data without manual exploration. This guide covers building an insight generation system, from statistical detection to AI-written narratives.

Types of Automated Insights

from enum import Enum
from dataclasses import dataclass
from typing import List, Dict, Optional

class InsightType(Enum):
    TREND = "trend"  # Changes over time
    ANOMALY = "anomaly"  # Unusual values
    CORRELATION = "correlation"  # Relationships
    DISTRIBUTION = "distribution"  # Data shape
    COMPARISON = "comparison"  # Segment differences
    DRIVER = "driver"  # Contributing factors

@dataclass
class Insight:
    type: InsightType
    title: str
    description: str
    importance: float  # 0-1
    data_points: List[Dict]
    visualization: Optional[Dict] = None
    actions: Optional[List[str]] = None
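
Each generated insight is a self-contained record. As a quick illustration, a single anomaly insight might look like this (all values are invented for illustration):

# A hypothetical anomaly insight; the values are invented for illustration
example = Insight(
    type=InsightType.ANOMALY,
    title="Revenue spike on 2024-03-15",
    description="Daily revenue was 4.2 standard deviations above its mean.",
    importance=0.85,
    data_points=[{"date": "2024-03-15", "value": 125000, "z_score": 4.2}],
    actions=["Check for duplicate transactions"]
)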

Insight Generation Engine

import numpy as np
import pandas as pd
from scipy import stats
from typing import List, Dict, Optional
import anthropic

# Reuses the Insight and InsightType definitions from the previous section

class InsightEngine:
    """Generate insights automatically from data"""

    def __init__(self):
        self.client = anthropic.Anthropic()

    def generate_insights(
        self,
        df: pd.DataFrame,
        metric_columns: List[str],
        dimension_columns: List[str],
        date_column: Optional[str] = None
    ) -> List[Insight]:
        """Generate all types of insights from dataframe"""

        insights = []

        # Trend insights
        if date_column:
            insights.extend(
                self._find_trends(df, metric_columns, date_column)
            )

        # Anomaly insights
        insights.extend(
            self._find_anomalies(df, metric_columns)
        )

        # Correlation insights
        insights.extend(
            self._find_correlations(df, metric_columns)
        )

        # Distribution insights
        insights.extend(
            self._find_distribution_insights(df, metric_columns)
        )

        # Comparison insights
        for dim in dimension_columns:
            insights.extend(
                self._find_comparisons(df, metric_columns, dim)
            )

        # Driver analysis
        for metric in metric_columns:
            insights.extend(
                self._find_drivers(df, metric, dimension_columns)
            )

        # Rank and return top insights
        insights.sort(key=lambda x: x.importance, reverse=True)
        return insights[:10]  # Top 10 insights

    def _find_trends(
        self,
        df: pd.DataFrame,
        metrics: List[str],
        date_col: str
    ) -> List[Insight]:
        """Identify significant trends"""

        insights = []

        for metric in metrics:
            # Group by date and calculate trend
            trend_df = df.groupby(date_col)[metric].sum().reset_index()

            if len(trend_df) < 3:
                continue

            # Calculate trend direction and strength
            x = np.arange(len(trend_df))
            y = trend_df[metric].values

            slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

            if p_value < 0.05 and abs(r_value) > 0.7:
                if y[0] == 0:
                    continue  # avoid division by zero in the percent change

                direction = "increasing" if slope > 0 else "decreasing"
                pct_change = ((y[-1] - y[0]) / y[0]) * 100

                importance = min(abs(r_value) * abs(pct_change) / 100, 1.0)

                insights.append(Insight(
                    type=InsightType.TREND,
                    title=f"{metric} is {direction}",
                    description=f"{metric} has been consistently {direction}, "
                               f"changing by {pct_change:.1f}% over the period.",
                    importance=importance,
                    data_points=[{
                        "start_value": y[0],
                        "end_value": y[-1],
                        "change_pct": pct_change,
                        "r_squared": r_value ** 2
                    }],
                    visualization={"type": "line_chart", "metric": metric}
                ))

        return insights

    def _find_anomalies(
        self,
        df: pd.DataFrame,
        metrics: List[str]
    ) -> List[Insight]:
        """Detect anomalies using statistical methods"""

        insights = []

        for metric in metrics:
            values = df[metric].dropna()

            if len(values) < 10:
                continue

            # Calculate Z-scores
            z_scores = np.abs(stats.zscore(values))
            anomaly_mask = z_scores > 3

            if anomaly_mask.any():
                anomaly_indices = np.where(anomaly_mask)[0]
                anomaly_values = values.iloc[anomaly_indices]

                insights.append(Insight(
                    type=InsightType.ANOMALY,
                    title=f"Unusual values detected in {metric}",
                    description=f"Found {len(anomaly_indices)} unusual data points "
                               f"in {metric} that deviate significantly from normal.",
                    importance=0.8,
                    data_points=[{
                        "count": len(anomaly_indices),
                        "values": anomaly_values.tolist()[:5],
                        "threshold_z": 3
                    }],
                    actions=["Investigate data quality", "Review source systems"]
                ))

        return insights

    def _find_correlations(
        self,
        df: pd.DataFrame,
        metrics: List[str]
    ) -> List[Insight]:
        """Find significant correlations between metrics"""

        insights = []

        if len(metrics) < 2:
            return insights

        corr_matrix = df[metrics].corr()

        for i, m1 in enumerate(metrics):
            for j, m2 in enumerate(metrics):
                if i >= j:
                    continue

                corr = corr_matrix.loc[m1, m2]

                if abs(corr) > 0.7:
                    direction = "positively" if corr > 0 else "negatively"

                    insights.append(Insight(
                        type=InsightType.CORRELATION,
                        title=f"{m1} and {m2} are {direction} correlated",
                        description=f"When {m1} increases, {m2} tends to "
                                   f"{'increase' if corr > 0 else 'decrease'}. "
                                   f"Correlation strength: {abs(corr):.2f}",
                        importance=abs(corr),
                        data_points=[{
                            "metric1": m1,
                            "metric2": m2,
                            "correlation": corr
                        }],
                        visualization={"type": "scatter", "x": m1, "y": m2}
                    ))

        return insights

    def _find_distribution_insights(
        self,
        df: pd.DataFrame,
        metrics: List[str]
    ) -> List[Insight]:
        """Analyze distributions for insights"""

        insights = []

        for metric in metrics:
            values = df[metric].dropna()

            if len(values) < 10:
                continue

            # Check for skewness
            skewness = stats.skew(values)

            if abs(skewness) > 1:
                direction = "right" if skewness > 0 else "left"

                insights.append(Insight(
                    type=InsightType.DISTRIBUTION,
                    title=f"{metric} distribution is skewed {direction}",
                    description=f"The distribution of {metric} is heavily "
                               f"skewed to the {direction}, indicating "
                               f"{'many low values with few high outliers' if skewness > 0 else 'many high values with few low outliers'}.",
                    importance=min(abs(skewness) / 3, 0.7),
                    data_points=[{
                        "skewness": skewness,
                        "mean": values.mean(),
                        "median": values.median()
                    }],
                    visualization={"type": "histogram", "metric": metric}
                ))

        return insights

    def _find_comparisons(
        self,
        df: pd.DataFrame,
        metrics: List[str],
        dimension: str
    ) -> List[Insight]:
        """Compare metrics across dimensions"""

        insights = []

        for metric in metrics:
            grouped = df.groupby(dimension)[metric].agg(['mean', 'sum', 'count'])

            if len(grouped) < 2:
                continue

            # Find best and worst performers
            best = grouped['sum'].idxmax()
            worst = grouped['sum'].idxmin()

            best_val = grouped.loc[best, 'sum']
            worst_val = grouped.loc[worst, 'sum']

            if best != worst and worst_val != 0:  # guard against division by zero
                diff_pct = ((best_val - worst_val) / worst_val) * 100

                if diff_pct > 50:  # Significant difference
                    insights.append(Insight(
                        type=InsightType.COMPARISON,
                        title=f"Large {metric} gap between {dimension}s",
                        description=f"{best} leads in {metric} with {best_val:,.0f}, "
                                   f"while {worst} has only {worst_val:,.0f} - "
                                   f"a {diff_pct:.0f}% difference.",
                        importance=min(diff_pct / 200, 0.9),
                        data_points=[{
                            "best": {"name": best, "value": best_val},
                            "worst": {"name": worst, "value": worst_val},
                            "difference_pct": diff_pct
                        }],
                        visualization={"type": "bar", "dimension": dimension, "metric": metric}
                    ))

        return insights

    def _find_drivers(
        self,
        df: pd.DataFrame,
        target_metric: str,
        dimensions: List[str]
    ) -> List[Insight]:
        """Find what drives a metric"""

        insights = []

        for dim in dimensions:
            if df[dim].dtype == 'object' or df[dim].nunique() < 20:
                grouped = df.groupby(dim)[target_metric].sum()
                total = grouped.sum()
                if total == 0:
                    continue  # avoid division by zero
                contribution = grouped / total * 100

                top_contributor = contribution.idxmax()
                top_pct = contribution.max()

                if top_pct > 30:  # Significant contributor
                    insights.append(Insight(
                        type=InsightType.DRIVER,
                        title=f"{top_contributor} drives {target_metric}",
                        description=f"{top_contributor} accounts for {top_pct:.1f}% "
                                   f"of total {target_metric}.",
                        importance=top_pct / 100,
                        data_points=[{
                            "driver": top_contributor,
                            "contribution_pct": top_pct,
                            "dimension": dim
                        }],
                        actions=[f"Focus on {top_contributor}", f"Analyze {dim} breakdown"]
                    ))

        return insights
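
A minimal end-to-end sketch on synthetic data (the column names and values here are invented for illustration):

# Build a small synthetic dataset with a clear upward revenue trend
rng = np.random.default_rng(42)
sample_df = pd.DataFrame({
    "date": pd.date_range("2024-01-01", periods=90, freq="D"),
    "region": rng.choice(["APAC", "EMEA", "AMER"], size=90),
    "revenue": np.linspace(1000, 2000, 90) + rng.normal(0, 50, 90),
    "orders": rng.poisson(40, size=90)
})

engine = InsightEngine()
top_insights = engine.generate_insights(
    sample_df,
    metric_columns=["revenue", "orders"],
    dimension_columns=["region"],
    date_column="date"
)

for insight in top_insights:
    print(f"[{insight.type.value}] {insight.title} ({insight.importance:.2f})")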

Generating Insight Narratives

def generate_insight_narrative(insights: List[Insight]) -> str:
    """Generate a narrative summary of insights"""

    client = anthropic.Anthropic()

    insights_text = "\n".join([
        f"- {i.title}: {i.description} (importance: {i.importance:.2f})"
        for i in insights[:5]
    ])

    prompt = f"""Create a brief executive summary based on these data insights:

{insights_text}

Requirements:
- 2-3 paragraphs
- Lead with the most important finding
- Connect related insights
- End with recommended actions

Write in a professional, business style."""

    response = client.messages.create(
        model="claude-3-sonnet-20240229",
        max_tokens=500,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.content[0].text
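
Wiring the two pieces together (assuming the synthetic dataframe from the sketch above, and an Anthropic API key in the environment):

insights = InsightEngine().generate_insights(
    sample_df,
    metric_columns=["revenue", "orders"],
    dimension_columns=["region"],
    date_column="date"
)
print(generate_insight_narrative(insights))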

Conclusion

Automated insight generation transforms raw data into actionable intelligence. Combine statistical analysis with AI-powered narrative generation to surface the most important patterns in your data.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.