OneLake Optimization: Storage Patterns and Best Practices
OneLake is the unified storage layer for Microsoft Fabric. Optimizing how you organize and access data in OneLake directly impacts performance, cost, and governance. Here’s how to get the most out of it.
OneLake Architecture
OneLake provides:
- Single namespace across all Fabric workloads
- Delta Lake format as the default
- Shortcuts for virtual access to external data
- ADLS Gen2 compatible APIs
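Because the endpoints are ADLS Gen2 compatible, existing Azure Storage tooling works against OneLake without changes. Below is a minimal sketch using the Azure SDK (azure-identity plus azure-storage-file-datalake); the workspace name "my-workspace" and lakehouse "my_lakehouse" are placeholders, not values from this article.
# List lakehouse contents through the ADLS Gen2-compatible endpoint
# Assumes: pip install azure-identity azure-storage-file-datalake
from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient

service = DataLakeServiceClient(
    account_url="https://onelake.dfs.fabric.microsoft.com",
    credential=DefaultAzureCredential()
)

# The workspace maps to a filesystem (container); items sit beneath it
fs = service.get_file_system_client("my-workspace")
for path in fs.get_paths(path="my_lakehouse.Lakehouse/Files", recursive=False):
    print(path.name)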
Storage Organization Patterns
1. Medallion Architecture
The recommended pattern for data lakes:
OneLake
└── Lakehouse: analytics
    ├── Tables/
    │   ├── bronze/
    │   │   ├── raw_sales_data
    │   │   ├── raw_customer_data
    │   │   └── raw_product_data
    │   ├── silver/
    │   │   ├── cleaned_sales
    │   │   ├── cleaned_customers
    │   │   └── cleaned_products
    │   └── gold/
    │       ├── sales_summary
    │       ├── customer_360
    │       └── product_performance
    └── Files/
        ├── landing/
        │   └── {source}/{date}/
        ├── archive/
        │   └── {year}/{month}/
        └── reference/
            └── lookups/
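To make the flow between layers concrete, here is a minimal bronze-to-silver promotion sketch; the column names (order_id, amount) are assumptions for illustration, not part of the layout above.
# Illustrative bronze -> silver promotion (column names are assumed)
from pyspark.sql import functions as F

bronze_df = spark.read.format("delta").load("Tables/bronze/raw_sales_data")

silver_df = (
    bronze_df
    .dropDuplicates(["order_id"])                      # assumed business key
    .filter(F.col("amount").isNotNull())               # drop incomplete rows
    .withColumn("ingested_at", F.current_timestamp())  # audit column
)

silver_df.write.format("delta").mode("overwrite").save("Tables/silver/cleaned_sales")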
2. Domain-Based Organization
For large organizations with multiple domains:
# Python notebook for domain-based structure setup
domains = ["sales", "marketing", "finance", "operations"]
layers = ["bronze", "silver", "gold"]

def create_domain_structure(domain: str):
    """Create standardized folder structure for a domain."""
    base_path = f"abfss://analytics@onelake.dfs.fabric.microsoft.com/{domain}"

    # Create layer folders
    for layer in layers:
        table_path = f"{base_path}/Tables/{layer}"
        files_path = f"{base_path}/Files/{layer}"

        # Create directories using mssparkutils (built into Fabric notebooks)
        mssparkutils.fs.mkdirs(table_path)
        mssparkutils.fs.mkdirs(files_path)
        print(f"Created: {table_path}")
        print(f"Created: {files_path}")

# Initialize all domains
for domain in domains:
    create_domain_structure(domain)
Shortcuts Strategy
1. External Data Access
# Create shortcut to external ADLS storage
def create_external_shortcut(
    lakehouse_path: str,
    shortcut_name: str,
    external_storage_url: str,
    sas_token: str,
    access_token: str
):
    """Create shortcut to external Azure storage."""
    import requests

    # Using Fabric REST API
    payload = {
        "path": f"{lakehouse_path}/Files/{shortcut_name}",
        "target": {
            "type": "AzureBlobStorage",
            "url": external_storage_url,
            "credentials": {
                "sasToken": sas_token
            }
        }
    }

    # API call to create shortcut
    # (endpoint and payload shape are illustrative; confirm against the
    #  Fabric REST API reference for shortcuts)
    response = requests.post(
        "https://api.fabric.microsoft.com/v1/shortcuts",
        json=payload,
        headers={"Authorization": f"Bearer {access_token}"}
    )
    return response.json()

# Example: Link to partner data
create_external_shortcut(
    lakehouse_path="/workspaces/analytics/lakehouses/main",
    shortcut_name="partner_data",
    external_storage_url="https://partnerstorage.blob.core.windows.net/data",
    sas_token="sv=2021-06-08&ss=b&srt=co&sp=rl...",
    access_token=access_token  # a Fabric API bearer token acquired beforehand
)
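Once the shortcut exists, the partner data reads like any other path in the lakehouse, with no copy made. A quick check, assuming the partner files are Parquet:
# Read through the shortcut from a notebook attached to the lakehouse
partner_df = spark.read.parquet("Files/partner_data")
partner_df.printSchema()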
2. Cross-Workspace Shortcuts
# Create shortcut between Fabric workspaces
def create_internal_shortcut(
    source_workspace: str,
    source_lakehouse: str,
    source_path: str,
    target_workspace: str,
    target_lakehouse: str,
    shortcut_name: str
):
    """Create shortcut between Fabric lakehouses."""
    payload = {
        "path": f"/workspaces/{target_workspace}/lakehouses/{target_lakehouse}/Files/{shortcut_name}",
        "target": {
            "type": "OneLake",
            "workspaceId": source_workspace,
            "itemId": source_lakehouse,
            "path": source_path
        }
    }
    # Create via API
    # This avoids data duplication while enabling access
    return payload

# Example: Share curated data across workspaces
create_internal_shortcut(
    source_workspace="data-engineering",
    source_lakehouse="curated",
    source_path="Tables/gold",
    target_workspace="data-science",
    target_lakehouse="ml-workspace",
    shortcut_name="curated_data"
)
File Format Optimization
1. Delta Table Settings
# Optimal Delta table configuration
def create_optimized_delta_table(
    df,
    table_name: str,
    partition_cols: list = None,
    z_order_cols: list = None
):
    """Create a Delta table with optimal settings."""
    writer = df.write.format("delta")

    # Enable optimized writes and auto-compaction
    writer = writer.option("delta.autoOptimize.optimizeWrite", "true")
    writer = writer.option("delta.autoOptimize.autoCompact", "true")

    # Set target file size (128 MB is a good default for most cases)
    writer = writer.option("delta.targetFileSize", "134217728")

    # Enable deletion vectors for faster updates
    writer = writer.option("delta.enableDeletionVectors", "true")

    # Partitioning
    if partition_cols:
        writer = writer.partitionBy(*partition_cols)

    # Save table
    writer.mode("overwrite").saveAsTable(table_name)

    # Apply Z-ordering if specified
    if z_order_cols:
        from delta.tables import DeltaTable
        delta_table = DeltaTable.forName(spark, table_name)
        delta_table.optimize().executeZOrderBy(*z_order_cols)

    print(f"Created optimized table: {table_name}")

# Usage
create_optimized_delta_table(
    df=sales_df,
    table_name="gold_sales",
    partition_cols=["year", "month"],
    z_order_cols=["customer_id", "product_id"]
)
2. Parquet for Raw Files
# Optimize Parquet file writes
def write_optimized_parquet(
    df,
    output_path: str,
    compression: str = "snappy",
    row_group_size: int = 128 * 1024 * 1024  # 128 MB
):
    """Write Parquet files with optimal settings."""
    df.write \
        .option("compression", compression) \
        .option("parquet.block.size", row_group_size) \
        .mode("overwrite") \
        .parquet(output_path)

    # Verify output
    files = mssparkutils.fs.ls(output_path)
    print(f"Wrote {len(files)} files to {output_path}")

    # Check file sizes
    for f in files[:5]:
        print(f"  {f.name}: {f.size / 1024 / 1024:.2f} MB")
Access Patterns
1. Direct OneLake Access
# Access patterns for different scenarios

# Pattern 1: Spark (notebooks)
df = spark.read.format("delta").load("Tables/silver_sales")

# Pattern 2: Pandas via fsspec
import pandas as pd
# Install: pip install fsspec adlfs

storage_options = {
    "account_name": "onelake",
    "account_host": "onelake.dfs.fabric.microsoft.com"
}
df_pandas = pd.read_parquet(
    "abfss://workspace@onelake.dfs.fabric.microsoft.com/lakehouse/Files/data.parquet",
    storage_options=storage_options
)

# Pattern 3: REST API
import requests

def read_onelake_file(workspace: str, lakehouse: str, path: str, token: str):
    """Read file from OneLake via REST API."""
    url = f"https://onelake.dfs.fabric.microsoft.com/{workspace}/{lakehouse}/Files/{path}"
    response = requests.get(
        url,
        headers={"Authorization": f"Bearer {token}"}
    )
    return response.content
2. Caching Strategy
# Implement caching for frequently accessed data
class OneLakeCache:
    def __init__(self, cache_path: str = "/tmp/onelake_cache"):
        self.cache_path = cache_path
        self.cache_index = {}

    def get_or_load(
        self,
        table_path: str,
        cache_ttl_hours: int = 1
    ):
        """Get from cache or load from OneLake."""
        from datetime import datetime, timedelta

        cache_key = table_path.replace("/", "_")
        cache_file = f"{self.cache_path}/{cache_key}"

        # Check if cached and fresh
        if cache_key in self.cache_index:
            cached_time = self.cache_index[cache_key]
            if datetime.utcnow() - cached_time < timedelta(hours=cache_ttl_hours):
                print(f"Cache hit: {table_path}")
                return spark.read.parquet(cache_file)

        # Load from OneLake
        print(f"Cache miss: {table_path}")
        df = spark.read.format("delta").load(table_path)

        # Cache locally as Parquet
        df.write.mode("overwrite").parquet(cache_file)
        self.cache_index[cache_key] = datetime.utcnow()
        return df

# Usage
cache = OneLakeCache()
df = cache.get_or_load("Tables/gold_sales")
Governance and Security
1. Access Control
# OneLake inherits Fabric workspace permissions.
# Additional granular control is available via:
#   1. Row-level security in semantic models
#   2. Column-level security via views
#   3. Object-level permissions on tables

def create_secure_view(
    source_table: str,
    view_name: str,
    allowed_columns: list,
    row_filter: str = None
):
    """Create a secure view with column/row restrictions."""
    columns_sql = ", ".join(allowed_columns)

    view_sql = f"""
        CREATE OR REPLACE VIEW {view_name} AS
        SELECT {columns_sql}
        FROM {source_table}
    """
    if row_filter:
        view_sql += f" WHERE {row_filter}"

    spark.sql(view_sql)
    print(f"Created secure view: {view_name}")

# Example: Create a view hiding PII
create_secure_view(
    source_table="silver_customers",
    view_name="customers_no_pii",
    allowed_columns=["customer_id", "segment", "region", "signup_date"],
    row_filter="is_active = true"
)
2. Data Lineage
# Track data lineage in OneLake
from datetime import datetime

def log_data_lineage(
    source_tables: list,
    target_table: str,
    operation: str,
    row_count: int
):
    """Log data lineage for an audit trail."""
    lineage_record = {
        "timestamp": datetime.utcnow().isoformat(),
        "source_tables": source_tables,
        "target_table": target_table,
        "operation": operation,
        "row_count": row_count,
        "user": spark.sparkContext.sparkUser(),
        "notebook": mssparkutils.runtime.context.get("notebookPath", "unknown")
    }

    # Append to the lineage log
    lineage_df = spark.createDataFrame([lineage_record])
    lineage_df.write \
        .format("delta") \
        .mode("append") \
        .saveAsTable("_metadata.data_lineage")

# Usage
log_data_lineage(
    source_tables=["bronze_sales", "silver_customers"],
    target_table="gold_customer_sales",
    operation="JOIN_AGGREGATE",
    row_count=result_df.count()
)
Monitoring and Maintenance
# OneLake storage monitoring
def analyze_storage_usage(lakehouse_path: str):
    """Analyze storage usage in a lakehouse."""
    tables_path = f"{lakehouse_path}/Tables"
    files_path = f"{lakehouse_path}/Files"

    results = {
        "tables": [],
        "files": []
    }

    # Analyze Delta tables
    for table_dir in mssparkutils.fs.ls(tables_path):
        if table_dir.isDir:
            table_files = mssparkutils.fs.ls(f"{tables_path}/{table_dir.name}")
            total_size = sum(f.size for f in table_files if not f.isDir)
            results["tables"].append({
                "name": table_dir.name,
                "size_mb": total_size / 1024 / 1024,
                "file_count": len([f for f in table_files if not f.isDir])
            })

    # Analyze files
    for file_dir in mssparkutils.fs.ls(files_path):
        if file_dir.isDir:
            dir_files = mssparkutils.fs.ls(f"{files_path}/{file_dir.name}")
            total_size = sum(f.size for f in dir_files if not f.isDir)
            results["files"].append({
                "name": file_dir.name,
                "size_mb": total_size / 1024 / 1024,
                "file_count": len([f for f in dir_files if not f.isDir])
            })

    return results

# Generate report
usage = analyze_storage_usage("abfss://analytics@onelake.dfs.fabric.microsoft.com/main")
print("=== Tables ===")
for t in sorted(usage["tables"], key=lambda x: x["size_mb"], reverse=True):
    print(f"{t['name']}: {t['size_mb']:.2f} MB ({t['file_count']} files)")
Conclusion
OneLake optimization requires:
- Thoughtful organization - Medallion architecture or domain-based
- Strategic shortcuts - Virtual access without duplication
- Optimal file formats - Delta with proper settings
- Efficient access patterns - Caching and proper APIs
- Strong governance - Security and lineage tracking
Treat OneLake as your single source of truth and design around that principle. The unified storage model is powerful when properly architected.