Data Virtualization in Microsoft Fabric
Data virtualization lets you query data where it lives, without physically moving it. Microsoft Fabric's shortcut feature brings this capability to the lakehouse, enabling unified access across cloud storage providers.
What is Data Virtualization?
Traditional ETL moves data physically:
Source → Extract → Transform → Load → Query
Virtualization queries in place:
Source ←── Virtual Layer ←── Query
OneLake Shortcuts
Shortcuts create virtual pointers to external data:
from azure.identity import DefaultAzureCredential
import requests


class ShortcutManager:
    def __init__(self, workspace_id: str, lakehouse_id: str):
        self.workspace_id = workspace_id
        self.lakehouse_id = lakehouse_id
        self.base_url = "https://api.fabric.microsoft.com/v1"
        self._credential = DefaultAzureCredential()

    def _get_headers(self) -> dict:
        token = self._credential.get_token("https://api.fabric.microsoft.com/.default")
        return {
            "Authorization": f"Bearer {token.token}",
            "Content-Type": "application/json"
        }

    def _post_shortcut(self, payload: dict) -> dict:
        # Shortcuts are created through the generic items endpoint;
        # a lakehouse ID is a valid item ID.
        response = requests.post(
            f"{self.base_url}/workspaces/{self.workspace_id}"
            f"/items/{self.lakehouse_id}/shortcuts",
            headers=self._get_headers(),
            json=payload
        )
        response.raise_for_status()
        return response.json()

    def create_adls_shortcut(
        self,
        shortcut_name: str,
        storage_account: str,
        container: str,
        path: str,
        connection_id: str,
        target_path: str = "Tables"
    ) -> dict:
        """Create a shortcut to Azure Data Lake Storage Gen2."""
        payload = {
            "path": target_path,
            "name": shortcut_name,
            "target": {
                "adlsGen2": {
                    "location": f"https://{storage_account}.dfs.core.windows.net",
                    "subpath": f"/{container}/{path}",
                    "connectionId": connection_id  # Fabric connection to the account
                }
            }
        }
        return self._post_shortcut(payload)

    def create_s3_shortcut(
        self,
        shortcut_name: str,
        bucket: str,
        path: str,
        connection_id: str,
        region: str = "us-east-1",
        target_path: str = "Tables"
    ) -> dict:
        """Create a shortcut to Amazon S3."""
        payload = {
            "path": target_path,
            "name": shortcut_name,
            "target": {
                "amazonS3": {
                    # Region-qualified bucket endpoint
                    "location": f"https://{bucket}.s3.{region}.amazonaws.com",
                    "subpath": f"/{path}",
                    "connectionId": connection_id
                }
            }
        }
        return self._post_shortcut(payload)

    def create_gcs_shortcut(
        self,
        shortcut_name: str,
        bucket: str,
        path: str,
        connection_id: str,
        target_path: str = "Tables"
    ) -> dict:
        """Create a shortcut to Google Cloud Storage."""
        payload = {
            "path": target_path,
            "name": shortcut_name,
            "target": {
                "googleCloudStorage": {
                    "location": f"https://storage.googleapis.com/{bucket}",
                    "subpath": f"/{path}",
                    "connectionId": connection_id
                }
            }
        }
        return self._post_shortcut(payload)

    def create_onelake_shortcut(
        self,
        shortcut_name: str,
        source_workspace_id: str,
        source_item_id: str,
        source_path: str,
        target_path: str = "Tables"
    ) -> dict:
        """Create a shortcut to another OneLake location (no connection needed)."""
        payload = {
            "path": target_path,
            "name": shortcut_name,
            "target": {
                "oneLake": {
                    "workspaceId": source_workspace_id,
                    "itemId": source_item_id,
                    "path": source_path
                }
            }
        }
        return self._post_shortcut(payload)
# Usage
manager = ShortcutManager("workspace-id", "lakehouse-id")

# ADLS shortcut (requires a Fabric connection to the storage account)
manager.create_adls_shortcut(
    shortcut_name="external_sales",
    storage_account="mystorage",
    container="data",
    path="sales/parquet",
    connection_id="adls-connection-id"
)

# S3 shortcut (requires connection)
manager.create_s3_shortcut(
    shortcut_name="aws_data",
    bucket="my-data-bucket",
    path="analytics/",
    connection_id="s3-connection-id"
)
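Shortcuts can also be enumerated and removed over the same REST surface. A minimal sketch, assuming the list and delete endpoints mirror the create endpoint used above; verify the exact routes against the Fabric REST reference for your tenant:

import requests

def list_shortcuts(manager: ShortcutManager) -> list[dict]:
    """List all shortcuts on the lakehouse."""
    response = requests.get(
        f"{manager.base_url}/workspaces/{manager.workspace_id}"
        f"/items/{manager.lakehouse_id}/shortcuts",
        headers=manager._get_headers()
    )
    response.raise_for_status()
    return response.json().get("value", [])

def delete_shortcut(manager: ShortcutManager, target_path: str, name: str) -> None:
    """Delete a single shortcut; this removes the pointer, never the source data."""
    response = requests.delete(
        f"{manager.base_url}/workspaces/{manager.workspace_id}"
        f"/items/{manager.lakehouse_id}/shortcuts/{target_path}/{name}",
        headers=manager._get_headers()
    )
    response.raise_for_status()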
Querying Virtualized Data
Via SQL Endpoint
-- Shortcuts appear as regular tables
SELECT
s.product_id,
s.quantity,
s.sale_date,
p.product_name,
p.category
FROM external_sales s -- Shortcut to ADLS
JOIN products p ON s.product_id = p.product_id -- Local table
WHERE s.sale_date >= '2024-01-01';
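The same query can be run programmatically against the lakehouse's SQL analytics endpoint. A hedged sketch using pyodbc; the server and database names are placeholders (copy the real endpoint from the lakehouse settings), and the interactive Entra ID login is one of several supported authentication modes:

import pyodbc

# Placeholder server/database: substitute your lakehouse's SQL analytics endpoint
conn = pyodbc.connect(
    "Driver={ODBC Driver 18 for SQL Server};"
    "Server=your-endpoint.datawarehouse.fabric.microsoft.com;"
    "Database=your_lakehouse;"
    "Authentication=ActiveDirectoryInteractive;"
)
cursor = conn.cursor()
cursor.execute(
    "SELECT s.product_id, SUM(s.quantity) AS units "
    "FROM external_sales s "
    "WHERE s.sale_date >= '2024-01-01' "
    "GROUP BY s.product_id"
)
for row in cursor.fetchmany(10):
    print(row)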
Via Spark
# Shortcuts are accessible via standard paths
external_sales = spark.read.format("delta").load(
    "Tables/external_sales"  # Shortcut to external data
)
local_products = spark.read.table("products")  # Local table

# Join across sources
combined = external_sales.join(
    local_products,
    external_sales.product_id == local_products.product_id
)
display(combined)
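Because both frames share the key name, joining on a column list avoids carrying a duplicate product_id column, and the result can be persisted like any Delta table. The table name below is illustrative:

# Join on the shared column name to avoid a duplicated key column
combined = external_sales.join(local_products, on="product_id")

# Persist the joined result as a regular (materialized) Delta table;
# "sales_enriched" is an illustrative name
combined.write.format("delta").mode("overwrite").saveAsTable("sales_enriched")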
Cross-Cloud Data Federation
class CrossCloudFederation:
    """Federate data across multiple clouds."""

    def __init__(self, lakehouse_manager: ShortcutManager):
        self.manager = lakehouse_manager

    def setup_multi_cloud_lakehouse(self):
        """Create shortcuts to multiple cloud sources."""
        # Azure source
        self.manager.create_adls_shortcut(
            shortcut_name="azure_transactions",
            storage_account="azurestorage",
            container="datalake",
            path="transactions",
            connection_id="adls-connection"
        )

        # AWS source
        self.manager.create_s3_shortcut(
            shortcut_name="aws_customer_data",
            bucket="customer-data-lake",
            path="customers/",
            connection_id="aws-connection"
        )

        # GCP source
        self.manager.create_gcs_shortcut(
            shortcut_name="gcp_analytics",
            bucket="analytics-bucket",
            path="processed/",
            connection_id="gcp-connection"
        )
        print("Multi-cloud lakehouse configured")

    def query_federated_data(self):
        """Query across all cloud sources (run inside a Fabric notebook,
        where `spark` is provided as a global)."""
        query = """
            SELECT
                t.transaction_id,
                t.amount,
                c.customer_name,
                c.region,
                a.segment
            FROM azure_transactions t
            JOIN aws_customer_data c ON t.customer_id = c.customer_id
            JOIN gcp_analytics a ON c.customer_id = a.customer_id
            WHERE t.transaction_date >= '2024-01-01'
        """
        return spark.sql(query)
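Setup scripts like this tend to be re-run, so a small idempotency guard keeps them from failing when a shortcut already exists. A sketch under one assumption: that a duplicate surfaces as a 400- or 409-level response. Inspect the actual error payload in your environment and tighten the check accordingly:

from requests import HTTPError

def ensure_shortcut(create_fn, **kwargs) -> bool:
    """Call a create_*_shortcut method, tolerating an already-existing shortcut.

    Returns True if created, False if it already existed. The duplicate
    detection is an assumption; check the real error code your tenant returns.
    """
    try:
        create_fn(**kwargs)
        return True
    except HTTPError as exc:
        if exc.response is not None and exc.response.status_code in (400, 409):
            return False  # assume "already exists"; anything else re-raises
        raise

# Usage
ensure_shortcut(
    manager.create_s3_shortcut,
    shortcut_name="aws_customer_data",
    bucket="customer-data-lake",
    path="customers/",
    connection_id="aws-connection"
)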
Performance Optimization
Caching Strategy
class VirtualizationOptimizer:
    def analyze_query_patterns(self, query_history: list[dict]) -> dict:
        """Analyze which virtual data is accessed frequently."""
        access_counts = {}
        for query in query_history:
            for table in query.get("tables_accessed", []):
                if table.startswith("shortcut_"):
                    access_counts[table] = access_counts.get(table, 0) + 1
        return {
            "frequent_shortcuts": sorted(
                access_counts.items(),
                key=lambda x: x[1],
                reverse=True
            )[:10],
            "recommendation": self._recommend_materialization(access_counts)
        }

    def _recommend_materialization(self, access_counts: dict) -> list[str]:
        """Recommend which shortcuts should be materialized."""
        recommendations = []
        for table, count in access_counts.items():
            if count > 100:  # Threshold
                recommendations.append(
                    f"Consider materializing {table} - accessed {count} times"
                )
        return recommendations

    def create_cached_view(
        self,
        shortcut_name: str,
        cache_table_name: str,
        partition_columns: list[str] | None = None
    ):
        """Create a cached copy of frequently accessed virtual data."""
        source_df = spark.read.format("delta").load(f"Tables/{shortcut_name}")
        writer = source_df.write.format("delta").mode("overwrite")
        if partition_columns:
            writer = writer.partitionBy(*partition_columns)
        writer.saveAsTable(cache_table_name)

        # Point a stable view name at the cached copy so queries
        # don't have to change when the cache is refreshed
        spark.sql(f"""
            CREATE OR REPLACE VIEW {shortcut_name}_optimized AS
            SELECT * FROM {cache_table_name}
        """)
Predicate Pushdown
# Pushdown-related settings (most are enabled by default in recent Spark)
spark.conf.set("spark.sql.optimizer.dynamicPartitionPruning.enabled", "true")
spark.conf.set("spark.sql.parquet.filterPushdown", "true")
spark.conf.set("spark.sql.parquet.recordLevelFilter.enabled", "true")

# Query with pushdown-friendly predicates
efficient_query = """
    SELECT *
    FROM external_sales
    WHERE sale_date = '2024-08-09'  -- Partition column
      AND region = 'US'             -- Filter pushed to source
"""
Security Considerations
# Shortcuts inherit security from the connection they were created with
security_model = {
    "authentication": {
        "adls": "Organizational account, service principal, or workspace identity",
        "s3": "IAM access key and secret (stored in the Fabric connection)",
        "gcs": "Service account HMAC key (stored in the Fabric connection)"
    },
    "authorization": {
        "workspace_access": "Controls who can see and use shortcuts",
        "data_access": "Enforced at the source via the connection's identity",
        "row_level": "Apply in views or in the semantic model (Direct Lake)"
    },
    "encryption": {
        "in_transit": "TLS 1.2+",
        "at_rest": "Source-dependent (SSE, CMK)"
    }
}
Best Practices
- Use for read-heavy workloads: Virtualization adds latency
- Materialize hot data: Cache frequently accessed shortcuts
- Design partitions wisely: Enable effective predicate pushdown
- Monitor cross-cloud costs: Data transfer can be expensive
- Test query performance: Compare virtualized vs materialized (see the timing sketch below)
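A rough harness for that last point. `external_sales_cache` is an illustrative name for a materialized copy of the shortcut:

import time

def time_query(sql: str, label: str) -> float:
    """Run a query to completion and report wall-clock time (rough benchmark)."""
    start = time.perf_counter()
    spark.sql(sql).collect()  # force full execution
    elapsed = time.perf_counter() - start
    print(f"{label}: {elapsed:.2f}s")
    return elapsed

# Same query against the shortcut and its materialized copy
time_query("SELECT COUNT(*) FROM external_sales WHERE region = 'US'", "virtualized")
time_query("SELECT COUNT(*) FROM external_sales_cache WHERE region = 'US'", "materialized")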
Conclusion
Data virtualization through shortcuts enables flexible, multi-cloud analytics without massive data movement. Use it to federate data across your cloud estate while maintaining a unified query interface.
Start with shortcuts for exploratory work, then materialize frequently accessed data for production workloads.