Microsoft Fabric Performance Tuning: From Notebooks to Reports
Performance issues in Fabric can originate from multiple layers - Spark notebooks, Delta tables, semantic models, or Power BI reports. This guide covers tuning strategies across the entire stack.
Spark Notebook Performance
1. Optimize Spark Configuration
# Set optimal Spark configurations for Fabric
spark.conf.set("spark.sql.shuffle.partitions", "200") # Adjust based on data size
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")
# For large data operations
spark.conf.set("spark.sql.files.maxPartitionBytes", "128mb")
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "100mb")
2. Data Reading Optimization
from pyspark.sql.functions import *
# NOTE: because Spark is lazy, these two produce the same optimized plan -
# the real anti-pattern is forcing an action (cache, count, collect) before filtering
df_bad = spark.read.table("large_table")
result = df_bad.filter(col("date") >= "2024-01-01")
# Prefer chaining the filter at read time so pushdown intent is explicit
df_good = spark.read.table("large_table").filter(col("date") >= "2024-01-01")
# BEST: Use partition pruning with Delta
df_best = spark.sql("""
SELECT * FROM large_table
WHERE date >= '2024-01-01'
AND region = 'NORTH'
""")
# Column pruning - only read needed columns
df_pruned = spark.read.table("wide_table").select("col1", "col2", "col3")
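To confirm that filters and column pruning actually reach the scan, inspect the physical plan. A quick check against the dataframe above (look for PartitionFilters/PushedFilters and a narrow ReadSchema on the scan node):
# The scan node should list the pushed predicates and only the selected columns
df_good.explain(mode="formatted")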
3. Join Optimization
from pyspark.sql.functions import broadcast
# Profile your tables first (count() and the size estimate both scan the data,
# so run this on samples or small/medium tables only)
def profile_table(table_name: str):
    df = spark.read.table(table_name)
    return {
        "rows": df.count(),
        "columns": len(df.columns),
        # Very rough estimate based on the string length of each row
        "size_estimate_mb": df.rdd.map(lambda x: len(str(x))).sum() / 1024 / 1024
    }
# Small table: Use broadcast
small_df = spark.read.table("dim_product") # 10K rows
large_df = spark.read.table("fact_sales") # 100M rows
# Broadcast join (small table sent to all workers)
result = large_df.join(broadcast(small_df), "product_id")
# For skewed joins, salt the keys
def salted_join(large_df, small_df, key_col, num_salts=10):
    """Join with key salting to spread a skewed key across num_salts buckets."""
    from pyspark.sql.functions import col, concat, lit, floor, rand

    # Add a random salt to each row of the large dataframe
    large_salted = large_df.withColumn(
        "salt",
        floor(rand() * num_salts).cast("int")
    ).withColumn(
        "salted_key",
        concat(col(key_col), lit("_"), col("salt"))
    )

    # Replicate the small dataframe once per salt value
    salt_df = spark.range(num_salts).withColumnRenamed("id", "salt")
    small_exploded = small_df.crossJoin(salt_df).withColumn(
        "salted_key",
        concat(col(key_col), lit("_"), col("salt"))
    ).drop(key_col, "salt")  # avoid duplicate key/salt columns after the join

    # Join on the salted key, then drop the helper columns
    result = large_salted.join(small_exploded, "salted_key")
    return result.drop("salt", "salted_key")
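Before reaching for salting, confirm that the join key is actually skewed. A quick check against the tables above, plus a hypothetical call to salted_join (the customer tables in the commented line are illustrative):
# A few keys holding most of the rows is the signature of skew
large_df.groupBy("product_id").count().orderBy(col("count").desc()).show(10)

# Illustrative usage: spread a skewed key across 20 salt buckets
# result = salted_join(fact_sales_df, dim_customer_df, "customer_id", num_salts=20)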
4. Aggregation Optimization
# BAD: Multiple passes over data
count = df.count()
avg_val = df.agg({"value": "avg"}).collect()[0][0]
max_val = df.agg({"value": "max"}).collect()[0][0]
# GOOD: Single pass
from pyspark.sql.functions import count, avg, max, min, sum, approx_count_distinct

stats = df.agg(
    count("*").alias("count"),
    avg("value").alias("avg_value"),
    max("value").alias("max_value"),
    min("value").alias("min_value"),
    sum("value").alias("total_value")
).collect()[0]

# For distinct counts, use approx_count_distinct when an exact answer isn't needed
approx_distinct = df.agg(approx_count_distinct("customer_id")).collect()[0][0]
Delta Table Optimization
1. File Size Management
from delta.tables import DeltaTable

def optimize_delta_table(table_name: str, z_order_cols: list = None):
    """Compact files, optionally Z-order, vacuum, and refresh statistics."""
    delta_table = DeltaTable.forName(spark, table_name)

    # Compact small files (Z-ordering also compacts while co-locating data)
    if z_order_cols:
        delta_table.optimize().executeZOrderBy(*z_order_cols)
    else:
        delta_table.optimize().executeCompaction()

    # Vacuum files older than the retention window (hours)
    delta_table.vacuum(168)  # 7 days

    # Update statistics used by the optimizer
    spark.sql(f"ANALYZE TABLE {table_name} COMPUTE STATISTICS FOR ALL COLUMNS")
    print(f"Optimization complete for {table_name}")
# Check file sizes
def check_delta_files(table_path: str):
    """Analyze Delta table file distribution (simplified sketch)."""
    files = spark.read.format("delta").load(table_path).inputFiles()
    file_sizes = []
    for f in files[:100]:  # Sample the first 100 files
        # Per-file sizes require the storage API (e.g. Hadoop FileSystem);
        # omitted here - see the DESCRIBE DETAIL alternative below
        pass
    return {
        "total_files": len(files),
        "avg_file_size_mb": sum(file_sizes) / len(file_sizes) if file_sizes else 0
    }
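A simpler way to get the same signal without touching storage APIs is Delta's DESCRIBE DETAIL, which reports file count and total size directly; a minimal sketch:
def delta_file_stats(table_name: str) -> dict:
    """Average file size from DESCRIBE DETAIL's numFiles and sizeInBytes columns."""
    detail = spark.sql(f"DESCRIBE DETAIL {table_name}").collect()[0]
    num_files = detail["numFiles"]
    total_mb = detail["sizeInBytes"] / 1024 / 1024
    return {
        "total_files": num_files,
        "avg_file_size_mb": total_mb / num_files if num_files else 0
    }
Average file sizes well under 128 MB usually mean the table is due for an OPTIMIZE run.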
2. Partitioning Strategy
# Good partitioning for time-series data
df.write \
.format("delta") \
.partitionBy("year", "month") \
.mode("overwrite") \
.saveAsTable("partitioned_sales")
# Check partition pruning is working
spark.sql("EXPLAIN SELECT * FROM partitioned_sales WHERE year = 2024 AND month = 1")
# Avoid over-partitioning (rule of thumb: each partition should hold at least ~1 GB of data)
def recommend_partitioning(df, potential_columns: list):
    """Recommend a partitioning strategy based on data distribution."""
    recommendations = []
    total_rows = df.count()  # count once, not per candidate column
    for col_name in potential_columns:
        distinct_values = df.select(col_name).distinct().count()
        avg_rows_per_partition = total_rows / distinct_values
        if distinct_values > 10000:
            recommendations.append({
                "column": col_name,
                "verdict": "Too many partitions",
                "distinct_values": distinct_values
            })
        elif avg_rows_per_partition < 1000000:
            recommendations.append({
                "column": col_name,
                "verdict": "Partitions too small",
                "avg_rows": avg_rows_per_partition
            })
        else:
            recommendations.append({
                "column": col_name,
                "verdict": "Good candidate",
                "distinct_values": distinct_values,
                "avg_rows": avg_rows_per_partition
            })
    return recommendations
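Hypothetical usage against the fact table from the join section (the candidate columns are illustrative):
# Evaluate candidate partition columns for the sales fact table
candidates = recommend_partitioning(
    spark.read.table("fact_sales"),
    ["year", "month", "region", "customer_id"]
)
for rec in candidates:
    print(rec)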
Power BI / Semantic Model Performance
1. DAX Query Optimization
// BAD: Row-by-row calculation
Bad Measure =
SUMX(
Sales,
Sales[Quantity] * RELATED(Products[Price])
)
// GOOD: Pre-calculated in model or using an efficient pattern
// (CALCULATE wraps the whole expression so the row context on ProductID
// filters both the quantity and the price)
Good Measure =
SUMX(
    VALUES(Products[ProductID]),
    CALCULATE(SUM(Sales[Quantity]) * MAX(Products[Price]))
)
// Avoid FILTER with large tables
// BAD
Bad Filter =
CALCULATE(
[Total Sales],
FILTER(ALL(Sales), Sales[Date] >= DATE(2024,1,1))
)
// GOOD
Good Filter =
CALCULATE(
[Total Sales],
Sales[Date] >= DATE(2024,1,1)
)
// Use variables for repeated calculations
Efficient Measure =
VAR TotalSales = SUM(Sales[Amount])
VAR TotalCost = SUM(Sales[Cost])
RETURN
DIVIDE(TotalSales - TotalCost, TotalCost)
2. Model Design
# Python script to analyze a semantic model
# (the get_* helpers are placeholders for metadata lookups, e.g. via semantic link or the XMLA endpoint)
def analyze_model_performance(workspace_id: str, dataset_id: str):
    """Analyze a semantic model for common performance issues."""
    issues = []

    # Check for too many columns (aim for < 500)
    column_count = get_column_count(workspace_id, dataset_id)
    if column_count > 500:
        issues.append(f"High column count: {column_count}")

    # Check for high-cardinality columns
    high_cardinality = get_high_cardinality_columns(workspace_id, dataset_id)
    for col in high_cardinality:
        if col["cardinality"] > 1000000:
            issues.append(f"High cardinality: {col['table']}.{col['column']} ({col['cardinality']:,})")

    # Check for calculated columns (prefer measures)
    calc_columns = get_calculated_columns(workspace_id, dataset_id)
    if len(calc_columns) > 20:
        issues.append(f"Many calculated columns: {len(calc_columns)} (consider measures)")

    # Check relationships
    relationships = get_relationships(workspace_id, dataset_id)
    for rel in relationships:
        if rel["cross_filter_direction"] == "Both":
            issues.append(f"Bi-directional filter: {rel['from_table']} <-> {rel['to_table']}")

    return issues
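In a Fabric notebook, those placeholder helpers can be backed by semantic link (sempy). A rough sketch of pulling the raw metadata, using illustrative dataset and workspace names; the exact column names in the returned frames depend on the library version, so inspect them before wiring up the checks:
import sempy.fabric as fabric  # semantic link, available in Fabric notebooks

# Pull model metadata as pandas DataFrames and inspect the available fields
columns_df = fabric.list_columns(dataset="Sales Model", workspace="Analytics")
relationships_df = fabric.list_relationships(dataset="Sales Model", workspace="Analytics")
print(columns_df.columns.tolist())
print(relationships_df.head())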
3. Report-Level Optimization
// Power BI report optimization checklist
// 1. Limit visuals per page (aim for < 10)
// 2. Use bookmarks instead of multiple similar pages
// 3. Disable interactions where not needed
// Illustrative summary of the target settings (a conceptual checklist,
// not a literal Power BI configuration schema):
{
    "filterPaneBehavior": "collapsed",
    "visualInteractions": "limited",
    "drillthrough": "enabled",
    "persistentFilters": true
}
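To check whether a slow visual is limited by its underlying query rather than rendering, you can time the DAX query itself from a Fabric notebook via semantic link. A sketch, assuming sempy is available and using illustrative dataset, workspace, and measure names:
import time
import sempy.fabric as fabric

# Time the query that backs a suspect visual
query = 'EVALUATE SUMMARIZECOLUMNS(Products[Category], "Total Sales", [Total Sales])'
start = time.time()
result = fabric.evaluate_dax(dataset="Sales Model", dax_string=query, workspace="Analytics")
print(f"{len(result)} rows in {time.time() - start:.2f}s")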
End-to-End Performance Testing
import json
import time
from dataclasses import dataclass
from typing import Optional

@dataclass
class PerformanceResult:
    operation: str
    duration_seconds: float
    rows_processed: int
    success: bool
    error: Optional[str] = None
class FabricPerformanceTester:
    def __init__(self):
        self.results = []

    def test_notebook_execution(self, notebook_path: str) -> PerformanceResult:
        """Test notebook execution time."""
        start = time.time()
        try:
            # notebook.run returns the notebook's exit value as a string;
            # here we assume the notebook exits with JSON such as {"rows": 12345}
            exit_value = mssparkutils.notebook.run(notebook_path)
            payload = {}
            if exit_value:
                try:
                    payload = json.loads(exit_value)
                except ValueError:
                    pass  # notebook did not exit with JSON; leave rows at 0
            duration = time.time() - start
            return PerformanceResult(
                operation=f"notebook:{notebook_path}",
                duration_seconds=duration,
                rows_processed=payload.get("rows", 0),
                success=True
            )
        except Exception as e:
            return PerformanceResult(
                operation=f"notebook:{notebook_path}",
                duration_seconds=time.time() - start,
                rows_processed=0,
                success=False,
                error=str(e)
            )
    def test_query_performance(self, query: str, name: str) -> PerformanceResult:
        """Test Spark SQL query performance."""
        start = time.time()
        try:
            df = spark.sql(query)
            count = df.count()  # Force execution
            duration = time.time() - start
            return PerformanceResult(
                operation=f"query:{name}",
                duration_seconds=duration,
                rows_processed=count,
                success=True
            )
        except Exception as e:
            return PerformanceResult(
                operation=f"query:{name}",
                duration_seconds=time.time() - start,
                rows_processed=0,
                success=False,
                error=str(e)
            )
    def generate_report(self) -> str:
        """Generate a markdown performance test report."""
        report = ["# Performance Test Results\n"]
        for result in self.results:
            status = "PASS" if result.success else "FAIL"
            report.append(f"## {result.operation}")
            report.append(f"- Status: {status}")
            report.append(f"- Duration: {result.duration_seconds:.2f}s")
            report.append(f"- Rows: {result.rows_processed:,}")
            if result.error:
                report.append(f"- Error: {result.error}")
            report.append("")
        return "\n".join(report)
# Usage
tester = FabricPerformanceTester()
tester.results.append(tester.test_query_performance(
"SELECT * FROM sales WHERE date >= '2024-01-01'",
"filtered_sales"
))
print(tester.generate_report())
Conclusion
Fabric performance tuning requires attention to:
- Spark configuration - Right-size shuffle partitions and enable adaptive features
- Data reading - Push down filters, prune columns
- Joins - Broadcast small tables, salt skewed keys
- Delta tables - Optimize file sizes, partition wisely
- DAX - Use efficient patterns, avoid row-by-row
- Models - Minimize cardinality, prefer measures
Start with profiling to identify bottlenecks, then apply targeted optimizations. Measure before and after to validate improvements.