Microsoft Fabric Best Practices for Production
Moving from proof of concept to production takes more than working code: workspace layout, naming, lakehouse design, and security all need deliberate decisions. Let's explore proven patterns for Fabric production deployments.
Workspace Organization
from dataclasses import dataclass
from typing import List, Dict

@dataclass
class WorkspacePattern:
    pattern_name: str
    description: str
    workspace_structure: Dict[str, List[str]]
    pros: List[str]
    cons: List[str]
    best_for: str

workspace_patterns = {
    "environment_based": WorkspacePattern(
        pattern_name="Environment-Based",
        description="Separate workspaces for dev, test, prod",
        workspace_structure={
            "Sales-Dev": ["Development notebooks", "Test data"],
            "Sales-Test": ["Integration testing", "UAT"],
            "Sales-Prod": ["Production workloads", "Live data"],
        },
        pros=["Clear separation", "Easy access control", "Standard CI/CD"],
        cons=["Workspace proliferation", "Cross-environment testing complex"],
        best_for="Traditional enterprise deployments",
    ),
    "domain_based": WorkspacePattern(
        pattern_name="Domain-Based",
        description="Workspaces aligned with business domains",
        workspace_structure={
            "Sales-Domain": ["All sales workloads"],
            "Marketing-Domain": ["All marketing workloads"],
            "Finance-Domain": ["All finance workloads"],
            "Platform-Shared": ["Shared data products"],
        },
        pros=["Domain autonomy", "Clear ownership", "Data mesh aligned"],
        cons=["Need cross-domain governance", "Potential duplication"],
        best_for="Data mesh implementations",
    ),
    "hybrid": WorkspacePattern(
        pattern_name="Hybrid (Domain + Environment)",
        description="Combine domain and environment separation",
        workspace_structure={
            "Sales-Dev": ["Development"],
            "Sales-Prod": ["Production"],
            "Marketing-Dev": ["Development"],
            "Marketing-Prod": ["Production"],
            "Platform-Shared": ["Cross-domain shared"],
        },
        pros=["Full isolation", "Domain ownership", "Environment control"],
        cons=["Many workspaces to manage", "Complex governance"],
        best_for="Large enterprises with strict requirements",
    ),
}

def recommend_workspace_pattern(requirements: Dict) -> str:
    """Recommend workspace pattern based on requirements."""
    if requirements.get("data_mesh"):
        return "domain_based"
    elif requirements.get("strict_environments"):
        return "environment_based"
    elif requirements.get("large_enterprise"):
        return "hybrid"
    else:
        return "environment_based"
Naming Conventions
naming_conventions = {
    "workspaces": {
        "pattern": "{Domain}-{Environment}",
        "examples": ["Sales-Prod", "Marketing-Dev", "Finance-UAT"],
        "rules": [
            "Use PascalCase",
            "Keep under 50 characters",
            "Include environment suffix for non-prod",
        ],
    },
    "lakehouses": {
        "pattern": "lh_{domain}_{purpose}",
        "examples": ["lh_sales_raw", "lh_sales_curated", "lh_shared_reference"],
        "rules": [
            "Use snake_case",
            "Prefix with 'lh_'",
            "Include layer (raw/curated/serving)",
        ],
    },
    "warehouses": {
        "pattern": "wh_{domain}_{purpose}",
        "examples": ["wh_sales_analytics", "wh_finance_reporting"],
        "rules": [
            "Use snake_case",
            "Prefix with 'wh_'",
            "Indicate primary use case",
        ],
    },
    "notebooks": {
        "pattern": "{category}_{action}_{subject}",
        "examples": ["etl_load_customers", "analysis_monthly_sales", "transform_dedupe_records"],
        "rules": [
            "Use snake_case",
            "Start with category (etl, analysis, transform, test)",
            "Be descriptive but concise",
        ],
    },
    "pipelines": {
        "pattern": "pl_{source}_{target}_{frequency}",
        "examples": ["pl_sqlserver_lakehouse_daily", "pl_api_bronze_hourly"],
        "rules": [
            "Prefix with 'pl_'",
            "Include source and target",
            "Include frequency if scheduled",
        ],
    },
    "semantic_models": {
        "pattern": "{domain} {subject} Model",
        "examples": ["Sales Performance Model", "Customer Analytics Model"],
        "rules": [
            "Use Title Case with spaces",
            "Include domain context",
            "End with 'Model'",
        ],
    },
    "reports": {
        "pattern": "{domain} - {report_name}",
        "examples": ["Sales - Executive Dashboard", "Finance - Monthly Close Report"],
        "rules": [
            "Start with domain",
            "Use dash separator",
            "Be user-friendly",
        ],
    },
}
def validate_name(item_type: str, name: str) -> Dict:
    """Validate name against conventions."""
    convention = naming_conventions.get(item_type, {})
    pattern = convention.get("pattern", "")
    issues = []

    # Check basic rules
    if len(name) > 100:
        issues.append("Name too long (max 100 characters)")

    # snake_case item types must be lowercase with no spaces
    if item_type in ["lakehouses", "warehouses", "notebooks", "pipelines"]:
        if " " in name:
            issues.append("Should not contain spaces (use snake_case)")
        if name != name.lower():
            issues.append("Should be lowercase")

    # Check required prefix
    if item_type == "lakehouses" and not name.startswith("lh_"):
        issues.append("Should start with 'lh_' prefix")
    elif item_type == "warehouses" and not name.startswith("wh_"):
        issues.append("Should start with 'wh_' prefix")
    elif item_type == "pipelines" and not name.startswith("pl_"):
        issues.append("Should start with 'pl_' prefix")

    return {
        "name": name,
        "item_type": item_type,
        "valid": len(issues) == 0,
        "issues": issues,
        "pattern": pattern,
    }
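Running the validator against a good and a bad lakehouse name shows how the rules compose:

print(validate_name("lakehouses", "lh_sales_raw")["valid"])
# True

bad = validate_name("lakehouses", "Sales Raw Data")
print(bad["issues"])
# ["Should not contain spaces (use snake_case)", "Should be lowercase",
#  "Should start with 'lh_' prefix"]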
Data Lakehouse Best Practices
lakehouse_best_practices = {
    "layer_organization": {
        "bronze_layer": {
            "purpose": "Raw data landing zone",
            "format": "Delta (preserve original schema)",
            "retention": "Keep full history",
            "naming": "{source}_{table}_raw",
            "tips": [
                "Preserve source schema exactly",
                "Add ingestion metadata columns",
                "Partition by ingestion date",
            ],
        },
        "silver_layer": {
            "purpose": "Cleansed, validated data",
            "format": "Delta (standardized schema)",
            "retention": "Keep relevant history",
            "naming": "{domain}_{entity}_clean",
            "tips": [
                "Apply data quality rules",
                "Standardize data types",
                "Handle duplicates and nulls",
            ],
        },
        "gold_layer": {
            "purpose": "Business-ready aggregates",
            "format": "Delta (optimized for queries)",
            "retention": "As needed by business",
            "naming": "{domain}_{subject}_agg",
            "tips": [
                "Pre-aggregate common metrics",
                "Optimize for Power BI DirectQuery",
                "Apply V-Order optimization",
            ],
        },
    },
    "delta_table_optimization": {
        "v_order": {
            "description": "Column-based sorting for faster reads",
            "how_to_enable": "spark.conf.set('spark.sql.parquet.vorder.enabled', 'true')",
            "best_for": "Tables frequently queried in Power BI",
        },
        "optimize_write": {
            "description": "Automatic file sizing",
            "how_to_enable": "spark.conf.set('spark.microsoft.delta.optimizeWrite.enabled', 'true')",
            "best_for": "All tables",
        },
        "partitioning": {
            "description": "Partition large tables",
            "guidance": "Partition by date for time-series, by key for lookups",
            "avoid": "Over-partitioning (< 1GB per partition is too small)",
        },
        "z_ordering": {
            "description": "Cluster data by query columns",
            "how_to": "OPTIMIZE table ZORDER BY (column)",
            "best_for": "Tables with common filter columns",
        },
    },
}
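In a Fabric notebook, those optimizations boil down to two session settings plus a periodic maintenance command; the table and column names below are illustrative:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Session-level: V-Order for Power BI read performance,
# Optimize Write for automatic file sizing
spark.conf.set("spark.sql.parquet.vorder.enabled", "true")
spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true")

# Periodic maintenance: compact small files and cluster rows by a
# common filter column (illustrative table/column names)
spark.sql("OPTIMIZE sales_gold.daily_sales_agg ZORDER BY (customer_id)")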
def generate_lakehouse_template(domain: str, layers: List[str]) -> str:
    """Generate lakehouse setup code."""
    code = f"""
# Lakehouse Setup for {domain} Domain
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Enable optimizations
spark.conf.set("spark.sql.parquet.vorder.enabled", "true")
spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true")

# Create schemas for each layer
"""
    for layer in layers:
        code += f"""
# {layer.upper()} Layer Tables
spark.sql("CREATE SCHEMA IF NOT EXISTS {domain}_{layer}")
"""
    # f-string so the domain is baked into the generated code;
    # {{table_name}} is escaped and interpolated by the generated function
    code += f"""
# Example: Create bronze table with metadata
def create_bronze_table(df, table_name, source_name):
    from pyspark.sql.functions import current_timestamp, lit

    df_with_metadata = df.withColumn("_ingested_at", current_timestamp()) \\
        .withColumn("_source", lit(source_name))

    df_with_metadata.write \\
        .format("delta") \\
        .mode("append") \\
        .option("mergeSchema", "true") \\
        .saveAsTable(f"{domain}_bronze.{{table_name}}")
"""
    return code
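Generating and inspecting the setup script for a sales domain with the standard medallion layers:

setup_code = generate_lakehouse_template("sales", ["bronze", "silver", "gold"])
print(setup_code)  # review the output, then paste it into a notebook cell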
Security Best Practices
security_best_practices = {
    "access_control": {
        "workspace_level": [
            "Use Azure AD groups for workspace access",
            "Assign minimum required roles",
            "Review access quarterly",
        ],
        "item_level": [
            "Apply item permissions for sensitive assets",
            "Use Row-Level Security for multi-tenant data",
            "Implement Object-Level Security for sensitive columns",
        ],
    },
    "data_protection": {
        "encryption": [
            "Data encrypted at rest by default",
            "Use customer-managed keys for sensitive workloads",
            "Ensure TLS 1.2+ in transit",
        ],
        "masking": [
            "Apply dynamic data masking for PII",
            "Use column-level encryption for highly sensitive data",
        ],
    },
    "audit_logging": {
        "what_to_log": [
            "User access to workspaces",
            "Data reads and exports",
            "Schema changes",
            "Permission changes",
        ],
        "how_to": "Enable and export the Microsoft 365 Unified Audit Log",
    },
}
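To make the quarterly access review concrete, here is a minimal sketch that lists workspace role assignments via the Power BI REST API (which also serves Fabric workspaces). It assumes you have already acquired an Azure AD access token with the appropriate scope; the placeholders are hypothetical:

import requests

ACCESS_TOKEN = "<azure-ad-token>"   # acquisition omitted; placeholder
WORKSPACE_ID = "<workspace-guid>"   # hypothetical workspace id

resp = requests.get(
    f"https://api.powerbi.com/v1.0/myorg/groups/{WORKSPACE_ID}/users",
    headers={"Authorization": f"Bearer {ACCESS_TOKEN}"},
    timeout=30,
)
resp.raise_for_status()

# Print each principal and its workspace role for review
for user in resp.json()["value"]:
    identity = user.get("emailAddress") or user.get("displayName")
    print(identity, "->", user.get("groupUserAccessRight"))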
Tomorrow, we’ll explore Fabric architecture patterns in depth!