Data Analysis with Copilot: AI-Powered Insights
Microsoft Fabric’s Copilot transforms data analysis by enabling natural language interactions with your data. This guide shows how to leverage Copilot for effective data analysis.
Getting Started with Copilot Analysis
# Copilot can help with various analysis tasks
COPILOT_ANALYSIS_CAPABILITIES = {
"exploratory_analysis": [
"Describe the data structure",
"Find patterns and anomalies",
"Identify correlations",
"Summarize distributions"
],
"statistical_analysis": [
"Calculate descriptive statistics",
"Perform hypothesis testing",
"Run regression analysis",
"Generate forecasts"
],
"visualization": [
"Suggest appropriate chart types",
"Generate visualization code",
"Create dashboards",
"Build reports"
]
}
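Several of the statistical capabilities above, such as regression and forecasting, are not demonstrated later in this guide. As a taste of what a prompt like "Run a regression analysis of amount against quantity" might produce, here is a minimal sketch using Spark MLlib; the exact code Copilot generates will vary, and the column names simply mirror the sales_data table used in the sections below.
# Prompt: "Run a regression analysis of amount against quantity"
# A sketch of the kind of code Copilot may generate; actual output will differ
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

spark = SparkSession.builder.getOrCreate()
sales = spark.read.format("delta").load("Tables/sales_data")

# Drop rows with missing values in the modeled columns
sales = sales.na.drop(subset=["quantity", "amount"])

# Assemble the predictor into the feature vector MLlib expects
assembler = VectorAssembler(inputCols=["quantity"], outputCol="features")
training = assembler.transform(sales).select("features", "amount")

# Fit a simple linear regression of amount on quantity
lr = LinearRegression(featuresCol="features", labelCol="amount")
model = lr.fit(training)

print(f"Coefficient: {model.coefficients[0]:.4f}")
print(f"Intercept: {model.intercept:.4f}")
print(f"R^2: {model.summary.r2:.3f}")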
Interactive Analysis Session
# Example: How to structure prompts for effective analysis
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.getOrCreate()
# Load your data
df = spark.read.format("delta").load("Tables/sales_data")
# Prompt 1: "Describe this dataset"
# Copilot generates:
def describe_dataset(df):
"""Comprehensive dataset description"""
print("=== Dataset Overview ===")
print(f"Total rows: {df.count():,}")
print(f"Total columns: {len(df.columns)}")
print("\n=== Column Types ===")
for col_name, col_type in df.dtypes:
print(f" {col_name}: {col_type}")
print("\n=== Null Values ===")
null_counts = df.select([
count(when(col(c).isNull(), c)).alias(c)
for c in df.columns
]).collect()[0]
for col_name in df.columns:
null_count = null_counts[col_name]
if null_count > 0:
print(f" {col_name}: {null_count:,} nulls")
print("\n=== Numeric Statistics ===")
df.describe().show()
return df
describe_dataset(df)
Pattern Discovery
# Prompt: "Find patterns in the sales data over time"
# Copilot generates:
def analyze_time_patterns(df):
"""Discover temporal patterns in data"""
# Ensure date column is properly typed
df = df.withColumn("order_date", to_date(col("order_date")))
# Daily patterns
daily_sales = df.groupBy("order_date").agg(
count("*").alias("order_count"),
sum("amount").alias("total_sales"),
avg("amount").alias("avg_order_value")
).orderBy("order_date")
print("=== Daily Sales Trend ===")
daily_sales.show(10)
# Day of week patterns
dow_sales = df.withColumn(
"day_of_week", dayofweek(col("order_date"))
).groupBy("day_of_week").agg(
count("*").alias("order_count"),
avg("amount").alias("avg_amount")
).orderBy("day_of_week")
print("\n=== Day of Week Pattern ===")
dow_sales.show()
# Monthly patterns
monthly_sales = df.withColumn(
"year_month", date_format(col("order_date"), "yyyy-MM")
).groupBy("year_month").agg(
count("*").alias("order_count"),
sum("amount").alias("total_sales")
).orderBy("year_month")
print("\n=== Monthly Sales Trend ===")
monthly_sales.show(12)
# Identify peak periods
stats = monthly_sales.agg(
avg("total_sales").alias("avg"),
stddev("total_sales").alias("std")
).collect()[0]
peak_months = monthly_sales.filter(
col("total_sales") > stats["avg"] + stats["std"]
)
print("\n=== Peak Months (Above Average) ===")
peak_months.show()
return {
"daily": daily_sales,
"day_of_week": dow_sales,
"monthly": monthly_sales,
"peaks": peak_months
}
patterns = analyze_time_patterns(df)
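A useful follow-up is to visualize the monthly aggregate. Because it has already been reduced to a handful of rows, it is safe to bring to the driver and plot. The sketch below assumes matplotlib is available in the notebook environment, as it typically is in Fabric notebooks.
# Follow-up prompt: "Plot the monthly sales trend"
# A sketch; assumes matplotlib is available in the notebook environment
import matplotlib.pyplot as plt

monthly_pdf = patterns["monthly"].toPandas()  # small aggregate, safe to collect

plt.figure(figsize=(10, 4))
plt.plot(monthly_pdf["year_month"], monthly_pdf["total_sales"], marker="o")
plt.xticks(rotation=45)
plt.xlabel("Month")
plt.ylabel("Total sales")
plt.title("Monthly Sales Trend")
plt.tight_layout()
plt.show()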
Anomaly Detection
# Prompt: "Identify anomalies in the transaction data"
# Copilot generates:
def detect_anomalies(df, value_column="amount", date_column="order_date"):
"""Detect anomalies using statistical methods"""
from pyspark.sql.window import Window
# Method 1: IQR-based outliers
quantiles = df.approxQuantile(value_column, [0.25, 0.75], 0.05)
q1, q3 = quantiles[0], quantiles[1]
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers_iqr = df.filter(
(col(value_column) < lower_bound) |
(col(value_column) > upper_bound)
)
print(f"=== IQR Outliers ===")
print(f"Lower bound: {lower_bound:.2f}")
print(f"Upper bound: {upper_bound:.2f}")
print(f"Outlier count: {outliers_iqr.count()}")
outliers_iqr.show(5)
# Method 2: Rolling window anomalies
window_spec = Window.orderBy(date_column).rowsBetween(-6, 0)
df_with_rolling = df.withColumn(
"rolling_avg", avg(col(value_column)).over(window_spec)
).withColumn(
"rolling_std", stddev(col(value_column)).over(window_spec)
).withColumn(
"z_score", (col(value_column) - col("rolling_avg")) / col("rolling_std")
)
rolling_anomalies = df_with_rolling.filter(abs(col("z_score")) > 3)
print(f"\n=== Rolling Window Anomalies (|Z| > 3) ===")
print(f"Count: {rolling_anomalies.count()}")
rolling_anomalies.select(
date_column, value_column, "rolling_avg", "z_score"
).show(5)
# Method 3: Sudden changes
lag_window = Window.orderBy(date_column)
df_with_change = df.withColumn(
"prev_value", lag(col(value_column)).over(lag_window)
).withColumn(
"pct_change", (col(value_column) - col("prev_value")) / col("prev_value") * 100
)
sudden_changes = df_with_change.filter(abs(col("pct_change")) > 50)
print(f"\n=== Sudden Changes (>50% change) ===")
sudden_changes.show(5)
return {
"iqr_outliers": outliers_iqr,
"rolling_anomalies": rolling_anomalies,
"sudden_changes": sudden_changes
}
anomalies = detect_anomalies(df)
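Once anomalies are flagged, a natural next prompt is to persist them for later review. A minimal sketch is shown below; the target table name sales_anomalies is hypothetical, and the write path simply follows the same Tables/ convention used when loading the data.
# Follow-up prompt: "Save the detected outliers to a table"
# A sketch; the target table name "sales_anomalies" is hypothetical
from pyspark.sql.functions import lit

(
    anomalies["iqr_outliers"]
    .withColumn("detection_method", lit("iqr"))
    .write.format("delta")
    .mode("overwrite")
    .save("Tables/sales_anomalies")
)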
Correlation Analysis
# Prompt: "Find correlations between different metrics"
# Copilot generates:
def analyze_correlations(df, numeric_columns=None):
"""Analyze correlations between numeric columns"""
import pandas as pd
import numpy as np
    if numeric_columns is None:
        # Auto-detect numeric columns; isinstance is more robust than matching type names as strings
        from pyspark.sql.types import NumericType
        numeric_columns = [
            f.name for f in df.schema.fields
            if isinstance(f.dataType, NumericType)
        ]
# Convert to pandas for correlation analysis
pdf = df.select(numeric_columns).toPandas()
# Calculate correlation matrix
corr_matrix = pdf.corr()
print("=== Correlation Matrix ===")
print(corr_matrix.round(2))
# Find strong correlations
print("\n=== Strong Correlations (|r| > 0.7) ===")
for i in range(len(numeric_columns)):
for j in range(i+1, len(numeric_columns)):
corr = corr_matrix.iloc[i, j]
if abs(corr) > 0.7:
print(f"{numeric_columns[i]} <-> {numeric_columns[j]}: {corr:.3f}")
# Visualize (generates code for visualization)
visualization_code = """
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap')
plt.tight_layout()
plt.show()
"""
print("\n=== Visualization Code ===")
print(visualization_code)
return corr_matrix
correlations = analyze_correlations(df)
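Rather than only printing the suggested snippet, you can render the heatmap directly in the notebook. The sketch below assumes seaborn and matplotlib are available in the environment and reuses the correlation matrix returned above.
# Follow-up: render the suggested heatmap directly
# Assumes seaborn and matplotlib are available in the notebook environment
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 8))
sns.heatmap(correlations, annot=True, cmap="coolwarm", center=0)
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()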
Insight Generation
# Prompt: "Generate key insights from the data"
# Copilot generates:
def generate_insights(df, dimension_cols, metric_cols):
"""Automatically generate key insights"""
insights = []
for metric in metric_cols:
# Overall statistics
stats = df.agg(
avg(col(metric)).alias("avg"),
min(col(metric)).alias("min"),
max(col(metric)).alias("max"),
stddev(col(metric)).alias("std")
).collect()[0]
insights.append({
"type": "statistic",
"metric": metric,
"message": f"Average {metric}: {stats['avg']:.2f} (range: {stats['min']:.2f} - {stats['max']:.2f})"
})
# Top performers by dimension
for dim in dimension_cols:
top = df.groupBy(dim).agg(
sum(col(metric)).alias("total")
).orderBy(col("total").desc()).first()
insights.append({
"type": "top_performer",
"dimension": dim,
"metric": metric,
"message": f"Top {dim} by {metric}: {top[dim]} ({top['total']:.2f})"
})
print("=== Key Insights ===")
for i, insight in enumerate(insights, 1):
print(f"{i}. {insight['message']}")
return insights
insights = generate_insights(
df,
dimension_cols=["region", "product_category"],
metric_cols=["amount", "quantity"]
)
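The insight list can also serve as the starting point for a lightweight report, tying back to the report-building capability listed at the top of this guide. The sketch below renders the insights as Markdown in the notebook; it assumes an IPython-based notebook environment, which Fabric notebooks provide.
# Follow-up prompt: "Summarize these insights as a short report"
# A sketch; assumes an IPython-based notebook for Markdown rendering
from IPython.display import Markdown, display

report_lines = ["## Key Insights"]
for i, insight in enumerate(insights, 1):
    report_lines.append(f"{i}. **{insight['type']}**: {insight['message']}")

display(Markdown("\n".join(report_lines)))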
Conclusion
Copilot-assisted data analysis accelerates insight discovery by generating code for common analytical tasks. The key is to ask specific questions and iterate on the generated analysis.