
# Copilot in Microsoft Fabric: AI-Powered Analytics

With Microsoft Fabric now generally available, Copilot capabilities have expanded significantly. The AI assistant changes how we interact with data: writing SQL queries, generating Spark and DAX code, building visualizations, and analyzing results.

## Copilot Capabilities in Fabric

```python
from enum import Enum
from dataclasses import dataclass
from typing import List

class CopilotCapability(Enum):
    SQL_GENERATION = "Generate SQL queries from natural language"
    SPARK_CODE = "Generate PySpark/Spark SQL code"
    DAX_FORMULAS = "Create DAX measures and calculations"
    DATA_ANALYSIS = "Analyze data and provide insights"
    VISUALIZATION = "Suggest and create visualizations"
    DOCUMENTATION = "Generate documentation"
    TROUBLESHOOTING = "Debug and fix issues"
    OPTIMIZATION = "Suggest performance improvements"

@dataclass
class CopilotWorkload:
    name: str
    capabilities: List[CopilotCapability]
    context_aware: bool

fabric_copilot_workloads = {
    "DataWarehouse": CopilotWorkload(
        name="Warehouse Copilot",
        capabilities=[
            CopilotCapability.SQL_GENERATION,
            CopilotCapability.DATA_ANALYSIS,
            CopilotCapability.OPTIMIZATION,
            CopilotCapability.DOCUMENTATION
        ],
        context_aware=True  # Understands your schema
    ),
    "Notebook": CopilotWorkload(
        name="Notebook Copilot",
        capabilities=[
            CopilotCapability.SPARK_CODE,
            CopilotCapability.DATA_ANALYSIS,
            CopilotCapability.TROUBLESHOOTING,
            CopilotCapability.DOCUMENTATION
        ],
        context_aware=True
    ),
    "PowerBI": CopilotWorkload(
        name="Power BI Copilot",
        capabilities=[
            CopilotCapability.DAX_FORMULAS,
            CopilotCapability.VISUALIZATION,
            CopilotCapability.DATA_ANALYSIS
        ],
        context_aware=True
    )
}
```

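As a quick illustration of how you might use this mapping (the `workloads_supporting` helper below is hypothetical, not part of any Fabric API), you can filter workloads by capability:

```python
def workloads_supporting(capability: CopilotCapability) -> List[str]:
    """Return the names of Fabric workloads whose Copilot supports a given capability."""
    return [
        workload.name
        for workload in fabric_copilot_workloads.values()
        if capability in workload.capabilities
    ]

# e.g. which Copilots can help with ad-hoc data analysis?
print(workloads_supporting(CopilotCapability.DATA_ANALYSIS))
# ['Warehouse Copilot', 'Notebook Copilot', 'Power BI Copilot']
```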
## SQL Generation in Warehouse

Copilot understands your schema and generates context-aware queries:

```python
# Example: Natural language to SQL with Copilot

user_request = "Show me the top 10 customers by revenue this quarter"

# Copilot understands your schema and generates:
generated_sql = """
SELECT
    c.customer_name,
    c.customer_segment,
    SUM(o.order_total) as total_revenue,
    COUNT(DISTINCT o.order_id) as order_count,
    AVG(o.order_total) as avg_order_value
FROM
    dim_customer c
    INNER JOIN fact_orders o ON c.customer_key = o.customer_key
    INNER JOIN dim_date d ON o.order_date_key = d.date_key
WHERE
    -- Assumes the fiscal calendar aligns with the calendar year
    d.fiscal_quarter = DATEPART(QUARTER, GETDATE())
    AND d.fiscal_year = YEAR(GETDATE())
GROUP BY
    c.customer_name,
    c.customer_segment
ORDER BY
    total_revenue DESC
OFFSET 0 ROWS FETCH NEXT 10 ROWS ONLY;
"""

# Copilot also explains the query:
explanation = """
This query:
1. Joins customers, orders, and date dimensions
2. Filters to current fiscal quarter
3. Aggregates revenue by customer
4. Returns top 10 by total revenue
5. Includes order count and average value for context
"""

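Before trusting generated SQL, it's worth executing it against the Warehouse's SQL endpoint and reviewing the output. Here is a minimal sketch using `pyodbc`; the server and database names are placeholders, and the exact connection options for your environment may differ:

```python
import pyodbc

# Placeholder connection details for a Fabric Warehouse SQL endpoint
conn_str = (
    "Driver={ODBC Driver 18 for SQL Server};"
    "Server=<your-workspace>.datawarehouse.fabric.microsoft.com;"
    "Database=<your-warehouse>;"
    "Authentication=ActiveDirectoryInteractive;"
)

with pyodbc.connect(conn_str) as conn:
    cursor = conn.cursor()
    cursor.execute(generated_sql)            # the Copilot-generated query above
    columns = [col[0] for col in cursor.description]
    rows = cursor.fetchmany(10)               # top-10 query, so at most 10 rows

print(columns)
for row in rows:
    print(row)
```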
## PySpark Code Generation

# User request: "Create a pipeline to clean customer data and detect duplicates"

# Copilot generates:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, trim, lower, regexp_replace,
    soundex, when, concat
)

def clean_customer_data(df):
    """
    Clean and standardize customer data.
    Generated by Copilot based on your data profile.
    """
    return df.select(
        col("customer_id"),
        trim(col("first_name")).alias("first_name"),
        trim(col("last_name")).alias("last_name"),
        lower(trim(col("email"))).alias("email"),
        regexp_replace(col("phone"), r'[^0-9]', '').alias("phone_cleaned"),
        trim(col("address")).alias("address"),
        col("created_date")
    ).filter(
        col("email").isNotNull() &
        col("email").rlike(r'^[\w\.-]+@[\w\.-]+\.\w+$')
    )

def detect_duplicates(df):
    """
    Detect potential duplicate customers using fuzzy matching.
    """
    # Create matching keys
    df_with_keys = df.withColumn(
        "name_soundex",
        soundex(col("first_name") + col("last_name"))
    ).withColumn(
        "email_domain",
        regexp_replace(col("email"), r'.*@', '')
    )

    # Self-join for comparison
    duplicates = df_with_keys.alias("a").join(
        df_with_keys.alias("b"),
        (col("a.customer_id") < col("b.customer_id")) &
        (
            (col("a.email") == col("b.email")) |
            (
                (col("a.name_soundex") == col("b.name_soundex")) &
                (col("a.phone_cleaned") == col("b.phone_cleaned"))
            )
        )
    ).select(
        col("a.customer_id").alias("customer_id_1"),
        col("b.customer_id").alias("customer_id_2"),
        col("a.email").alias("email_1"),
        col("b.email").alias("email_2"),
        when(col("a.email") == col("b.email"), "exact_email")
        .otherwise("fuzzy_match").alias("match_type")
    )

    return duplicates

# Usage in notebook
spark = SparkSession.builder.getOrCreate()

# Read from Lakehouse
raw_customers = spark.table("raw_customers")

# Clean data
cleaned = clean_customer_data(raw_customers)

# Find duplicates
duplicates = detect_duplicates(cleaned)

# Save results
cleaned.write.format("delta").mode("overwrite").saveAsTable("cleaned_customers")
duplicates.write.format("delta").mode("overwrite").saveAsTable("duplicate_candidates")
```

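One of the best habits with Copilot-generated code is to try it on a handful of rows before pointing it at the full table. A small sanity check along these lines (the sample values are made up) might look like:

```python
# Tiny synthetic sample to sanity-check the generated functions
sample = spark.createDataFrame(
    [
        (1, " Jane ", "Doe", "JANE.DOE@contoso.com", "(02) 9999-0001", "1 Pitt St", "2023-01-05"),
        (2, "Jane", "Doe", "jane.doe@contoso.com", "0299990001", "1 Pitt St", "2023-02-10"),
        (3, "Bob", "Lee", "not-an-email", "0400000000", "5 George St", "2023-03-01"),
    ],
    ["customer_id", "first_name", "last_name", "email", "phone", "address", "created_date"],
)

cleaned_sample = clean_customer_data(sample)
cleaned_sample.show(truncate=False)            # row 3 should be dropped (invalid email)

detect_duplicates(cleaned_sample).show(truncate=False)
# expect one exact_email match between customers 1 and 2
```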
## DAX Generation in Power BI

// User: "Create a measure for year-over-year revenue growth percentage"

// Copilot generates:
Revenue YoY Growth % =
VAR CurrentYearRevenue = [Total Revenue]
VAR PreviousYearRevenue =
    CALCULATE(
        [Total Revenue],
        SAMEPERIODLASTYEAR('Date'[Date])
    )
VAR Growth =
    IF(
        PreviousYearRevenue <> 0,
        DIVIDE(
            CurrentYearRevenue - PreviousYearRevenue,
            PreviousYearRevenue
        ),
        BLANK()
    )
RETURN
    Growth

// Copilot also suggests:
Revenue YoY Growth Display =
VAR GrowthPct = [Revenue YoY Growth %]
RETURN
    IF(
        ISBLANK(GrowthPct),
        "N/A",
        FORMAT(GrowthPct, "+0.0%;-0.0%;0.0%")
    )

// And a conditional formatting measure:
Revenue Growth Color =
VAR GrowthPct = [Revenue YoY Growth %]
RETURN
    SWITCH(
        TRUE(),
        GrowthPct >= 0.1, "#107C10",  // Strong growth - green
        GrowthPct >= 0, "#498205",     // Positive - light green
        GrowthPct >= -0.1, "#FF8C00",  // Slight decline - orange
        "#D13438"                       // Significant decline - red
    )
```

## Data Analysis with Copilot

````python
# Copilot can analyze your data and provide insights

# User: "Analyze the sales_data table and identify anomalies"

analysis_response = """
## Sales Data Analysis Summary

### Dataset Overview
- **Total Records**: 1,234,567
- **Date Range**: Jan 2022 - Nov 2023
- **Distinct Customers**: 45,678
- **Product Categories**: 12

### Key Findings

1. **Revenue Trend**: Q3 2023 shows 15% increase vs Q3 2022
   - Primary driver: Enterprise segment (+23%)
   - Lagging: SMB segment (-5%)

2. **Anomalies Detected**:
   - Oct 15, 2023: Unusual spike (3.2x normal daily average)
     - Cause: Black Friday early promotion
   - Sep 3, 2023: Sharp drop (0.4x normal)
     - Cause: System outage (confirmed via logs)

3. **Seasonal Patterns**:
   - Peak: November-December (holiday season)
   - Trough: January-February (post-holiday)

### Recommendations
1. Investigate SMB decline - potential market issue
2. Consider early promotions strategy success
3. Plan capacity for holiday surge

### Suggested Queries
```sql
-- Deep dive into SMB decline
SELECT month, segment, SUM(revenue)
FROM sales_data
WHERE segment = 'SMB'
GROUP BY month, segment
ORDER BY month;

"""

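Findings like the ones above are worth verifying yourself. Below is a rough sketch that flags days far from a trailing 28-day average; it assumes a `sales_data` table with `order_date` and `revenue` columns, which are illustrative names rather than a confirmed schema:

```python
from pyspark.sql import functions as F
from pyspark.sql.window import Window

daily = (
    spark.table("sales_data")
    .groupBy("order_date")
    .agg(F.sum("revenue").alias("daily_revenue"))
)

# Trailing 28-day baseline, excluding the current day
trailing = Window.orderBy("order_date").rowsBetween(-28, -1)

anomalies = (
    daily
    .withColumn("baseline", F.avg("daily_revenue").over(trailing))
    .withColumn("ratio", F.col("daily_revenue") / F.col("baseline"))
    .filter((F.col("ratio") > 3.0) | (F.col("ratio") < 0.5))
    .orderBy("order_date")
)

anomalies.show()
```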

## Best Practices for Copilot

```python
copilot_best_practices = {
    "context": [
        "Ensure your schema is well-documented",
        "Use descriptive column and table names",
        "Add comments to existing code"
    ],
    "prompts": [
        "Be specific about what you want",
        "Include business context when relevant",
        "Ask for explanations to learn"
    ],
    "validation": [
        "Always review generated code",
        "Test on sample data first",
        "Verify business logic accuracy"
    ],
    "iteration": [
        "Refine prompts based on results",
        "Ask follow-up questions",
        "Request optimizations explicitly"
    ]
}
```

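To make the "be specific and include business context" advice concrete, here is a hypothetical prompt-builder. It is not a Fabric API, just a pattern for structuring your own requests to Copilot:

```python
def build_copilot_prompt(task: str, tables: list[str],
                         business_context: str, constraints: list[str]) -> str:
    """Assemble a specific, context-rich prompt for Copilot."""
    lines = [
        f"Task: {task}",
        f"Tables involved: {', '.join(tables)}",
        f"Business context: {business_context}",
        "Constraints:",
        *[f"- {c}" for c in constraints],
        "Explain the generated code so I can review it.",
    ]
    return "\n".join(lines)

prompt = build_copilot_prompt(
    task="Show the top 10 customers by revenue for the current fiscal quarter",
    tables=["dim_customer", "fact_orders", "dim_date"],
    business_context="Revenue excludes refunds; fiscal calendar aligns with the calendar year",
    constraints=["Use the fiscal columns in dim_date",
                 "Return customer segment alongside revenue"],
)
print(prompt)
```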
Tomorrow, we’ll explore Azure AI Studio updates and how they complement your Fabric analytics!

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.