V-Order: Microsoft Fabric's Performance Optimization
V-Order is Microsoft Fabric’s proprietary optimization that reorders Parquet data for maximum Direct Lake and Power BI performance. Let’s understand how it works and when to use it.
What is V-Order?
V-Order is a write-time optimization that physically reorders data within Parquet files to maximize compression and query performance, especially for Direct Lake semantic models.
Standard Parquet:
┌────────────────────────────────────────────────────────┐
│ Row Group: Data ordered by write sequence              │
│ ┌──────────────────────────────────┐                   │
│ │ USA, Product A, 100, 2024-01-01  │                   │
│ │ EUR, Product B, 200, 2024-01-02  │                   │
│ │ USA, Product A, 150, 2024-01-01  │                   │
│ │ APAC, Product C, 300, 2024-01-03 │                   │
│ │ EUR, Product A, 100, 2024-01-01  │                   │
│ └──────────────────────────────────┘                   │
│ Compression: Good                                      │
│ Query Skip: Limited                                    │
└────────────────────────────────────────────────────────┘
V-Ordered Parquet:
┌────────────────────────────────────────────────────────┐
│ Row Group: Data sorted by V-Order algorithm            │
│ ┌──────────────────────────────────┐                   │
│ │ APAC, Product C, 300, 2024-01-03 │                   │
│ │ EUR, Product A, 100, 2024-01-01  │                   │
│ │ EUR, Product B, 200, 2024-01-02  │                   │
│ │ USA, Product A, 100, 2024-01-01  │                   │
│ │ USA, Product A, 150, 2024-01-01  │                   │
│ └──────────────────────────────────┘                   │
│ Compression: Better (similar values clustered)         │
│ Query Skip: Excellent (min/max statistics work well)   │
└────────────────────────────────────────────────────────┘
Enabling V-Order
In Fabric Notebooks
# V-Order has historically been enabled by default in Microsoft Fabric,
# but the default can vary by runtime version and workspace settings.
# To explicitly enable it for the session:
spark.conf.set("spark.sql.parquet.vorder.enabled", "true")

# Write with V-Order
df.write.format("delta") \
    .mode("overwrite") \
    .option("parquet.vorder.enabled", "true") \
    .saveAsTable("sales_vorder")
At Table Creation
# Create table with V-Order enabled
spark.sql("""
    CREATE TABLE sales_optimized (
        order_id BIGINT,
        order_date DATE,
        region STRING,
        product STRING,
        amount DECIMAL(10,2)
    )
    USING DELTA
    TBLPROPERTIES (
        'delta.parquet.vorder.enabled' = 'true'
    )
""")

# Or alter an existing table
spark.sql("""
    ALTER TABLE sales
    SET TBLPROPERTIES ('delta.parquet.vorder.enabled' = 'true')
""")
Via Data Factory
# Data Factory Copy activity V-Order setting
# (property names are illustrative; check your pipeline's JSON schema)
copy_activity_config = {
    "name": "CopyToLakehouse",
    "type": "Copy",
    "sink": {
        "type": "LakehouseTableSink",
        "tableType": "Delta",
        "parquetWriterVersion": "V2",
        "enableVOrder": True  # Enable V-Order
    }
}
V-Order Benefits
1. Better Compression
class VOrderAnalyzer:
    """Analyze V-Order benefits."""

    def __init__(self, spark):
        self.spark = spark

    def compare_compression(
        self,
        df,
        base_path: str
    ) -> dict:
        """Compare file sizes with and without V-Order."""
        # Write without V-Order
        self.spark.conf.set("spark.sql.parquet.vorder.enabled", "false")
        df.write.format("delta") \
            .mode("overwrite") \
            .save(f"{base_path}/no_vorder")

        # Write with V-Order
        self.spark.conf.set("spark.sql.parquet.vorder.enabled", "true")
        df.write.format("delta") \
            .mode("overwrite") \
            .save(f"{base_path}/with_vorder")

        # Compare sizes
        no_vorder = self.spark.sql(
            f"DESCRIBE DETAIL delta.`{base_path}/no_vorder`"
        ).first().sizeInBytes
        with_vorder = self.spark.sql(
            f"DESCRIBE DETAIL delta.`{base_path}/with_vorder`"
        ).first().sizeInBytes

        compression_improvement = (no_vorder - with_vorder) / no_vorder * 100

        return {
            "without_vorder_mb": no_vorder / (1024 * 1024),
            "with_vorder_mb": with_vorder / (1024 * 1024),
            "compression_improvement_percent": compression_improvement
        }

# Usage
analyzer = VOrderAnalyzer(spark)
results = analyzer.compare_compression(sales_df, "Files/compression_test")

print(f"Without V-Order: {results['without_vorder_mb']:.1f}MB")
print(f"With V-Order: {results['with_vorder_mb']:.1f}MB")
print(f"Improvement: {results['compression_improvement_percent']:.1f}%")
2. Faster Direct Lake Queries
# V-Order improves Direct Lake performance by:
# 1. Better data skipping via improved statistics
# 2. More efficient memory usage
# 3. Faster decompression
direct_lake_benefits = {
    "data_skipping": {
        "description": "V-Order clusters similar values, improving min/max statistics",
        "impact": "Fewer row groups scanned for filtered queries"
    },
    "memory_efficiency": {
        "description": "Compressed data loads faster into VertiPaq",
        "impact": "Lower memory pressure, more data fits in cache"
    },
    "decompression": {
        "description": "Similar values compress better with RLE/dictionary",
        "impact": "Faster decompression during query execution"
    }
}
3. Improved Data Skipping
def demonstrate_data_skipping(table_path: str, filter_column: str):
    """Show how V-Order improves data skipping."""
    # Query with filter
    query = f"""
        SELECT COUNT(*)
        FROM delta.`{table_path}`
        WHERE {filter_column} = 'specific_value'
    """

    # Explain to see files scanned
    spark.sql(query).explain("formatted")

    # V-Order improves skipping because:
    # - Similar values are grouped in the same row groups
    # - Min/max statistics become more selective
    # - Fewer row groups contain the target value
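Data skipping relies on the per-file statistics Delta records in its transaction log. To see those min/max values directly, you can read the log JSON; a minimal sketch, assuming `table_path` points at a Delta table folder:

import json

# Each 'add' action in the Delta log carries per-file statistics as a JSON string
log_df = spark.read.json(f"{table_path}/_delta_log/*.json")
adds = log_df.where("add IS NOT NULL").select("add.path", "add.stats")

for row in adds.limit(5).collect():
    stats = json.loads(row.stats) if row.stats else {}
    print(row.path, stats.get("minValues"), stats.get("maxValues"))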
V-Order vs Z-Order
Understanding the Difference
comparison = {
    "V-Order": {
        "purpose": "Optimize for Direct Lake and compression",
        "when_applied": "During write (automatic sorting)",
        "columns": "All columns (algorithm decides order)",
        "use_case": "Power BI semantic models, general queries",
        "cost": "Minimal write overhead"
    },
    "Z-Order": {
        "purpose": "Optimize for specific column filters",
        "when_applied": "During OPTIMIZE operation",
        "columns": "User-specified columns (up to ~4)",
        "use_case": "Known query patterns with specific filters",
        "cost": "Requires explicit OPTIMIZE, rewrites files"
    }
}

# When to use which:
usage_guide = {
    "use_vorder_only": [
        "General analytics workloads",
        "Power BI Direct Lake models",
        "Unknown or varied query patterns"
    ],
    "use_zorder_only": [
        "Very specific, known filter columns",
        "Point lookups on specific columns",
        "When V-Order alone isn't sufficient"
    ],
    "use_both": [
        "Large tables with specific hot columns",
        "When you need both general optimization and specific filters",
        "Apply V-Order on write, Z-Order on important columns periodically"
    ]
}
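For reference, the Z-Order pass can also be run through Spark SQL rather than the Python API shown below; a one-line sketch with assumed table and column names:

# Z-Order an existing table on its hot filter columns
spark.sql("OPTIMIZE transactions ZORDER BY (transaction_date, customer_id)")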
Combined Strategy
from delta.tables import DeltaTable

class OrderingStrategy:
    """Combined V-Order and Z-Order strategy."""

    def __init__(self, spark):
        self.spark = spark

    def write_with_vorder(self, df, table_path: str):
        """Write new data with V-Order."""
        self.spark.conf.set("spark.sql.parquet.vorder.enabled", "true")
        df.write.format("delta") \
            .mode("append") \
            .save(table_path)

    def apply_zorder_on_hot_columns(
        self,
        table_path: str,
        columns: list[str]
    ):
        """Apply Z-Order on frequently filtered columns."""
        delta_table = DeltaTable.forPath(self.spark, table_path)
        delta_table.optimize().executeZOrderBy(*columns)

    def full_optimization(
        self,
        df,
        table_path: str,
        zorder_columns: list[str] | None = None
    ):
        """Full optimization: V-Order write + optional Z-Order."""
        # Step 1: Write with V-Order
        self.write_with_vorder(df, table_path)

        # Step 2: Z-Order if specific columns needed
        if zorder_columns:
            self.apply_zorder_on_hot_columns(table_path, zorder_columns)

# Usage
strategy = OrderingStrategy(spark)

# For most tables: just V-Order
strategy.write_with_vorder(sales_df, "Tables/sales")

# For heavily filtered tables: V-Order + Z-Order
strategy.full_optimization(
    df=large_transactions_df,
    table_path="Tables/transactions",
    zorder_columns=["transaction_date", "customer_id"]
)
V-Order for Direct Lake
Optimizing for Power BI
class DirectLakeOptimizer:
    """Optimize tables for Direct Lake semantic models."""

    def __init__(self, spark):
        self.spark = spark

    def optimize_for_direct_lake(self, table_path: str):
        """Apply all Direct Lake optimizations."""
        # Enable V-Order
        self.spark.sql(f"""
            ALTER TABLE delta.`{table_path}`
            SET TBLPROPERTIES (
                'delta.parquet.vorder.enabled' = 'true',
                'delta.autoOptimize.optimizeWrite' = 'true'
            )
        """)

        # Rewrite existing data with V-Order
        df = self.spark.read.format("delta").load(table_path)
        df.write.format("delta") \
            .mode("overwrite") \
            .option("parquet.vorder.enabled", "true") \
            .option("dataChange", "false") \
            .save(table_path)

    def get_direct_lake_recommendations(self, table_path: str) -> list[str]:
        """Get recommendations for Direct Lake optimization."""
        recommendations = []

        detail = self.spark.sql(f"DESCRIBE DETAIL delta.`{table_path}`").first()
        properties = self.spark.sql(f"SHOW TBLPROPERTIES delta.`{table_path}`").collect()
        prop_dict = {row.key: row.value for row in properties}

        # Check V-Order
        if prop_dict.get("delta.parquet.vorder.enabled") != "true":
            recommendations.append("Enable V-Order for better Direct Lake performance")

        # Check file sizes
        if detail.numFiles > 0:
            avg_size = detail.sizeInBytes / detail.numFiles
            if avg_size < 128 * 1024 * 1024:
                recommendations.append(
                    f"Files are small (avg {avg_size/(1024*1024):.0f}MB). "
                    "Run OPTIMIZE or enable optimized write"
                )
            elif avg_size > 512 * 1024 * 1024:
                recommendations.append(
                    f"Files are large (avg {avg_size/(1024*1024):.0f}MB). "
                    "Consider a smaller target file size"
                )

        if not recommendations:
            recommendations.append("Table is well optimized for Direct Lake")

        return recommendations

# Usage
optimizer = DirectLakeOptimizer(spark)

# Optimize table
optimizer.optimize_for_direct_lake("Tables/fact_sales")

# Get recommendations
recs = optimizer.get_direct_lake_recommendations("Tables/fact_sales")
for rec in recs:
    print(f"- {rec}")
Monitoring V-Order
Verifying V-Order is Applied
def check_vorder_status(table_path: str) -> dict:
    """Check if V-Order is enabled and applied."""
    # Check table properties
    properties = spark.sql(f"SHOW TBLPROPERTIES delta.`{table_path}`").collect()
    prop_dict = {row.key: row.value for row in properties}

    vorder_enabled = prop_dict.get("delta.parquet.vorder.enabled", "false") == "true"

    # Check session config
    session_vorder = spark.conf.get(
        "spark.sql.parquet.vorder.enabled",
        "true"  # Fabric's historical default; may vary by runtime version
    ) == "true"

    return {
        "table_property_enabled": vorder_enabled,
        "session_enabled": session_vorder,
        "effectively_enabled": vorder_enabled or session_vorder,
        "recommendation": (
            "V-Order is active" if vorder_enabled or session_vorder
            else "Consider enabling V-Order for better performance"
        )
    }

# Check status
status = check_vorder_status("Tables/sales")
print(f"V-Order Status: {status['recommendation']}")
Performance Benchmarks
Measuring V-Order Impact
import time

class VOrderBenchmark:
    """Benchmark V-Order performance impact."""

    def __init__(self, spark):
        self.spark = spark

    def benchmark_query(
        self,
        table_path: str,
        query: str,
        iterations: int = 5
    ) -> dict:
        """Benchmark query performance."""
        times = []
        for i in range(iterations):
            # Clear cache
            self.spark.catalog.clearCache()

            start = time.time()
            self.spark.sql(query.format(table=table_path)).collect()
            elapsed = time.time() - start

            if i > 0:  # Skip first run (cold)
                times.append(elapsed)

        return {
            "avg_time_seconds": sum(times) / len(times),
            "min_time_seconds": min(times),
            "max_time_seconds": max(times)
        }

    def compare_vorder_performance(
        self,
        df,
        base_path: str,
        test_query: str
    ) -> dict:
        """Compare query performance with and without V-Order."""
        # Create without V-Order
        self.spark.conf.set("spark.sql.parquet.vorder.enabled", "false")
        df.write.format("delta").mode("overwrite").save(f"{base_path}/no_vorder")

        # Create with V-Order
        self.spark.conf.set("spark.sql.parquet.vorder.enabled", "true")
        df.write.format("delta").mode("overwrite").save(f"{base_path}/with_vorder")

        # Benchmark both
        no_vorder_perf = self.benchmark_query(
            f"{base_path}/no_vorder",
            test_query
        )
        with_vorder_perf = self.benchmark_query(
            f"{base_path}/with_vorder",
            test_query
        )

        improvement = (
            (no_vorder_perf["avg_time_seconds"] - with_vorder_perf["avg_time_seconds"])
            / no_vorder_perf["avg_time_seconds"] * 100
        )

        return {
            "without_vorder": no_vorder_perf,
            "with_vorder": with_vorder_perf,
            "improvement_percent": improvement
        }

# Usage
benchmark = VOrderBenchmark(spark)
results = benchmark.compare_vorder_performance(
    df=large_sales_df,
    base_path="Files/benchmark",
    test_query="SELECT region, SUM(amount) FROM delta.`{table}` GROUP BY region"
)

print(f"Without V-Order: {results['without_vorder']['avg_time_seconds']:.2f}s")
print(f"With V-Order: {results['with_vorder']['avg_time_seconds']:.2f}s")
print(f"Improvement: {results['improvement_percent']:.1f}%")
Best Practices
- Enable by default: V-Order has minimal overhead
- Use for Direct Lake: Essential for Power BI performance
- Combine with optimized write: Comprehensive optimization (see the sketch after this list)
- Consider Z-Order additionally: For specific filter patterns
- Monitor compression: V-Order should improve compression ratios
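As a starting point, the session-level settings behind these practices might look like this; a sketch, assuming the optimized-write config name used by Fabric/Synapse Spark:

# Session defaults reflecting the practices above
spark.conf.set("spark.sql.parquet.vorder.enabled", "true")              # V-Order on write
spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true")   # optimized write (bin-packs files toward target size)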
Conclusion
V-Order is a key optimization feature in Microsoft Fabric that improves both storage efficiency and query performance. Enable it by default for all analytical tables, especially those backing Power BI Direct Lake semantic models.
The minimal write overhead is far outweighed by the compression and query performance benefits, making V-Order essential for any Fabric lakehouse.