Microsoft Fabric and Databricks: Coexistence Strategies
Many organizations adopting Microsoft Fabric already have substantial Databricks investments. The good news is that the two platforms can coexist and complement each other through their shared foundation: Delta Lake.
The Integration Landscape
┌──────────────────────────────────────────────────────────────┐
│                   Enterprise Data Platform                    │
│                                                                │
│  ┌─────────────────────┐        ┌─────────────────────┐      │
│  │     Databricks      │        │  Microsoft Fabric   │      │
│  │  ┌───────────────┐  │        │  ┌───────────────┐  │      │
│  │  │ ML Workloads  │  │        │  │   Power BI    │  │      │
│  │  │ Data Science  │  │        │  │ SQL Analytics │  │      │
│  │  │ Advanced ETL  │  │        │  │  Direct Lake  │  │      │
│  │  └───────────────┘  │        │  └───────────────┘  │      │
│  └──────────┬──────────┘        └──────────┬──────────┘      │
│             │                              │                  │
│             │         Delta Lake           │                  │
│             └──────────┬───────────────────┘                  │
│                        │                                      │
│              ┌─────────▼─────────┐                           │
│              │    Azure ADLS     │                           │
│              │   (Shared Data)   │                           │
│              └───────────────────┘                           │
└──────────────────────────────────────────────────────────────┘
Integration Patterns
Pattern 1: Shared Delta Lake Storage
# Databricks writes to ADLS
# databricks_notebook.py
from delta.tables import DeltaTable
# Databricks job writes Delta tables to ADLS
output_path = "abfss://gold@datalake.dfs.core.windows.net/sales/customers"
customers_df.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .save(output_path)
# Optimize for downstream consumers
delta_table = DeltaTable.forPath(spark, output_path)
delta_table.optimize().executeCompaction()
delta_table.vacuum(168) # 7 days retention
# Fabric reads via shortcut
# fabric_notebook.py
# Create a shortcut to the Databricks-managed Delta table via the Fabric REST API
import requests
from azure.identity import DefaultAzureCredential
credential = DefaultAzureCredential()
token = credential.get_token("https://api.fabric.microsoft.com/.default")
headers = {
    "Authorization": f"Bearer {token.token}",
    "Content-Type": "application/json"
}
workspace_id = "your-workspace-id"
lakehouse_id = "your-lakehouse-id"
# Create an ADLS shortcut via the Fabric REST API.
# The adlsGen2 target takes the storage account's DFS endpoint, a subpath,
# and the ID of a pre-created cloud connection with access to the account.
shortcut_payload = {
    "path": "Tables",
    "name": "databricks_customers",
    "target": {
        "adlsGen2": {
            "location": "https://datalake.dfs.core.windows.net",
            "subpath": "/gold/sales/customers",
            "connectionId": "your-connection-id"
        }
    }
}
url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/items/{lakehouse_id}/shortcuts"
response = requests.post(url, headers=headers, json=shortcut_payload)
print(f"Shortcut created: {response.json()}")
# Now queryable in Fabric Spark (with the lakehouse attached as the default)
customers = spark.read.format("delta").load("Tables/databricks_customers")
Pattern 2: Medallion Architecture Split
class MedallionArchitecture:
    """
    Split medallion layers between Databricks and Fabric.
    Bronze/Silver: Databricks (complex transformations, ML)
    Gold: Fabric (BI, reporting, governed access)
    """
    def __init__(self, adls_account: str):
        self.adls_account = adls_account
        self.base_path = f"abfss://{{container}}@{adls_account}.dfs.core.windows.net"
    # Databricks handles Bronze and Silver
    def databricks_bronze_path(self, table: str) -> str:
        return f"{self.base_path.format(container='bronze')}/{table}"
    def databricks_silver_path(self, table: str) -> str:
        return f"{self.base_path.format(container='silver')}/{table}"
    # Fabric handles Gold
    def fabric_gold_path(self, table: str) -> str:
        return f"{self.base_path.format(container='gold')}/{table}"
    def sync_silver_to_gold(self, table_name: str, transformations: list = None):
        """Sync Silver (Databricks) to Gold (Fabric)."""
        silver_df = spark.read.format("delta").load(
            self.databricks_silver_path(table_name)
        )
        # Apply any Fabric-specific transformations
        if transformations:
            for transform in transformations:
                silver_df = transform(silver_df)
        # Write to Gold layer
        silver_df.write.format("delta") \
            .mode("overwrite") \
            .save(self.fabric_gold_path(table_name))
# Example usage
architecture = MedallionArchitecture("mydatalake")
# Databricks writes to Silver
# (done in Databricks job)
# Fabric syncs to Gold with business transformations
# (map_region_codes is a user-defined function/UDF mapping region codes to names)
architecture.sync_silver_to_gold(
    table_name="sales_transactions",
    transformations=[
        lambda df: df.filter("is_valid = true"),
        lambda df: df.withColumn("region_name", map_region_codes("region_code"))
    ]
)
Pattern 3: Workload-Based Routing
class WorkloadRouter:
    """Route workloads to the optimal platform."""
    def __init__(self):
        self.workload_rules = {
            "ml_training": {
                "platform": "databricks",
                "reason": "MLflow integration, GPU clusters"
            },
            "streaming_etl": {
                "platform": "databricks",
                "reason": "Structured Streaming, Auto Loader"
            },
            "bi_reporting": {
                "platform": "fabric",
                "reason": "Direct Lake, Power BI integration"
            },
            "ad_hoc_sql": {
                "platform": "fabric",
                "reason": "SQL endpoint, warehouse experience"
            },
            "data_science_exploration": {
                "platform": "databricks",
                "reason": "Interactive notebooks, ML libraries"
            },
            "semantic_modeling": {
                "platform": "fabric",
                "reason": "Native semantic model support"
            }
        }
    def get_platform(self, workload_type: str) -> dict:
        """Determine optimal platform for workload."""
        if workload_type in self.workload_rules:
            return self.workload_rules[workload_type]
        # Default to Fabric for unknown workloads
        return {
            "platform": "fabric",
            "reason": "Default platform for general workloads"
        }
    def analyze_workload(self, workload_description: str) -> dict:
        """Analyze workload and recommend platform."""
        ml_keywords = ["training", "model", "ml", "machine learning", "inference"]
        bi_keywords = ["report", "dashboard", "visualization", "power bi"]
        streaming_keywords = ["streaming", "real-time", "kafka", "event"]
        description_lower = workload_description.lower()
        if any(kw in description_lower for kw in ml_keywords):
            return self.get_platform("ml_training")
        elif any(kw in description_lower for kw in bi_keywords):
            return self.get_platform("bi_reporting")
        elif any(kw in description_lower for kw in streaming_keywords):
            return self.get_platform("streaming_etl")
        else:
            return self.get_platform("ad_hoc_sql")
# Usage
router = WorkloadRouter()
# Route based on workload type
ml_platform = router.get_platform("ml_training")
print(f"ML workloads: {ml_platform['platform']} - {ml_platform['reason']}")
# Analyze workload description
recommendation = router.analyze_workload(
    "Build a customer churn prediction model"
)
print(f"Recommended: {recommendation['platform']}")
Unity Catalog and Fabric Catalog Integration
Reading Unity Catalog Tables in Fabric
# Access Unity Catalog tables from Fabric via shortcuts or external connections
# Option 1: Shortcut to the Unity Catalog table's underlying storage location
unity_catalog_path = "abfss://unity-catalog@storage.dfs.core.windows.net/catalog/schema/table"
# In Fabric Lakehouse
# ('manager' is a thin helper around the Fabric shortcuts API, as in Pattern 1)
manager.create_adls_shortcut(
    shortcut_name="unity_customers",
    storage_account="storage",
    container="unity-catalog",
    path="catalog/schema/table"
)
# Option 2: Use a Databricks SQL endpoint as an external source
# (via Data Factory or a direct connection; see the sketch below)
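For Option 2, one approach is to query the table over a Databricks SQL warehouse using the databricks-sql-connector package. The following is a minimal sketch, not a drop-in implementation; the hostname, HTTP path, access token, and table name are placeholders.
# Sketch: read a Unity Catalog table through a Databricks SQL warehouse.
# Placeholders: server_hostname, http_path, access_token, table name.
from databricks import sql  # pip install databricks-sql-connector
import pandas as pd
with sql.connect(
    server_hostname="adb-1234567890123456.7.azuredatabricks.net",
    http_path="/sql/1.0/warehouses/your-warehouse-id",
    access_token="your-databricks-pat"
) as connection:
    with connection.cursor() as cursor:
        cursor.execute("SELECT * FROM catalog.schema.table LIMIT 1000")
        columns = [c[0] for c in cursor.description]
        rows = cursor.fetchall()
# Convert to a Spark DataFrame for use in the Fabric notebook
pdf = pd.DataFrame(rows, columns=columns)
df = spark.createDataFrame(pdf)
This pulls data through the SQL endpoint rather than reading the files directly, so it suits smaller, governed extracts; Option 1 (shortcuts) avoids the data movement entirely.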
Syncing Metadata
class CatalogSync:
    """Sync metadata between Unity Catalog and Fabric."""
    def __init__(
        self,
        databricks_workspace_url: str,
        databricks_token: str,
        fabric_workspace_id: str
    ):
        self.databricks_url = databricks_workspace_url
        self.databricks_token = databricks_token
        self.fabric_workspace_id = fabric_workspace_id
    def get_unity_catalog_tables(self, catalog: str, schema: str) -> list:
        """List tables from Unity Catalog."""
        import requests
        headers = {"Authorization": f"Bearer {self.databricks_token}"}
        response = requests.get(
            f"{self.databricks_url}/api/2.1/unity-catalog/tables",
            headers=headers,
            params={"catalog_name": catalog, "schema_name": schema}
        )
        return response.json().get("tables", [])
    def create_fabric_shortcuts(
        self,
        catalog: str,
        schema: str,
        lakehouse_id: str
    ):
        """Create Fabric shortcuts for Unity Catalog tables."""
        tables = self.get_unity_catalog_tables(catalog, schema)
        for table in tables:
            storage_location = table.get("storage_location")
            table_name = table.get("name")
            if storage_location and storage_location.startswith("abfss://"):
                # Parse ADLS path and create shortcut
                self._create_shortcut_for_table(
                    table_name=table_name,
                    storage_path=storage_location,
                    lakehouse_id=lakehouse_id
                )
    def _create_shortcut_for_table(
        self,
        table_name: str,
        storage_path: str,
        lakehouse_id: str
    ):
        """Create individual shortcut."""
        # Parse the abfss:// path and create the shortcut
        # (implementation depends on path structure; see the parsing sketch below)
        pass
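The parsing step left as pass above can be filled in roughly as follows. This is a sketch under the same assumptions as Pattern 1 (a Fabric bearer token and a pre-created cloud connection are available); the helper name is hypothetical and the output maps onto the adlsGen2 shortcut payload shown earlier.
# Sketch: split an abfss:// storage location into the fields a shortcut needs
from urllib.parse import urlparse
def parse_abfss_path(storage_path: str) -> dict:
    """Split abfss://container@account.dfs.core.windows.net/sub/path into shortcut fields."""
    parsed = urlparse(storage_path)  # netloc = "container@account.dfs.core.windows.net"
    container, account_host = parsed.netloc.split("@", 1)
    return {
        "location": f"https://{account_host}",   # DFS endpoint of the storage account
        "subpath": f"/{container}{parsed.path}"  # container plus the table's folder
    }
# Example: the pieces needed for the adlsGen2 shortcut target
target = parse_abfss_path(
    "abfss://unity-catalog@storage.dfs.core.windows.net/catalog/schema/table"
)
# {'location': 'https://storage.dfs.core.windows.net',
#  'subpath': '/unity-catalog/catalog/schema/table'}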
Performance Optimization
Optimizing for Both Platforms
from delta.tables import DeltaTable
class DualPlatformOptimizer:
    """Optimize Delta tables for both Databricks and Fabric."""
    def optimize_table(self, table_path: str, optimize_for: list = None):
        """Optimize table for specified platforms."""
        if optimize_for is None:
            optimize_for = ["databricks", "fabric"]
        delta_table = DeltaTable.forPath(spark, table_path)
        # Common optimizations
        delta_table.optimize().executeCompaction()
        if "fabric" in optimize_for:
            # Larger, well-compacted files help Direct Lake; V-Order itself is
            # applied only when the rewrite runs on Fabric Spark
            spark.conf.set(
                "spark.databricks.delta.optimizeWrite.enabled",
                "true"
            )
            spark.conf.set(
                "spark.microsoft.delta.optimizeWrite.binSize",
                "1073741824"  # 1GB target file size
            )
            # Rewrite the table with the optimized write settings
            df = spark.read.format("delta").load(table_path)
            df.write.format("delta") \
                .mode("overwrite") \
                .option("dataChange", "false") \
                .save(table_path)
        if "databricks" in optimize_for:
            # Z-Order on frequently filtered columns
            # This benefits both platforms
            delta_table.optimize().executeZOrderBy("date", "region")
    def set_compatible_properties(self, table_path: str):
        """Set table properties compatible with both platforms."""
        spark.sql(f"""
            ALTER TABLE delta.`{table_path}`
            SET TBLPROPERTIES (
                'delta.minReaderVersion' = '1',
                'delta.minWriterVersion' = '2',
                'delta.autoOptimize.optimizeWrite' = 'true',
                'delta.autoOptimize.autoCompact' = 'true'
            )
        """)
File Format Considerations
# Best practices for cross-platform Delta tables
write_config = {
    # File size
    "spark.databricks.delta.optimizeWrite.enabled": "true",
    "spark.databricks.delta.autoCompact.enabled": "true",
    "spark.sql.parquet.compression.codec": "snappy",
    # Compatibility
    "spark.databricks.delta.properties.defaults.minReaderVersion": "1",
    "spark.databricks.delta.properties.defaults.minWriterVersion": "2",
    # Avoid features not supported in Fabric
    "spark.databricks.delta.properties.defaults.enableChangeDataFeed": "false",
    # Target file size for good performance in both
    "spark.databricks.delta.targetFileSize": "134217728"  # 128MB
}
for key, value in write_config.items():
    spark.conf.set(key, value)
Governance and Security
Unified Access Control
class UnifiedAccessControl:
    """Manage access across Databricks and Fabric."""
    def __init__(self):
        self.role_mapping = {
            # Map organizational roles to platform-specific roles
            "data_engineer": {
                "databricks": ["workspace_user", "sql_access"],
                "fabric": ["contributor"]
            },
            "data_scientist": {
                "databricks": ["workspace_user", "cluster_create", "ml_access"],
                "fabric": ["viewer"]
            },
            "analyst": {
                "databricks": ["sql_access"],
                "fabric": ["viewer", "build"]
            },
            "admin": {
                "databricks": ["workspace_admin"],
                "fabric": ["admin"]
            }
        }
    def grant_access(self, user_email: str, role: str) -> dict:
        """Grant access based on organizational role."""
        if role not in self.role_mapping:
            raise ValueError(f"Unknown role: {role}")
        permissions = self.role_mapping[role]
        return {
            "user": user_email,
            "role": role,
            "databricks_permissions": permissions["databricks"],
            "fabric_permissions": permissions["fabric"],
            "action": "Grant access in both platforms"
        }
    def audit_access(self, user_email: str) -> dict:
        """Audit user access across platforms."""
        # Query both platforms for user permissions
        return {
            "user": user_email,
            "databricks_access": self._get_databricks_access(user_email),
            "fabric_access": self._get_fabric_access(user_email),
            "storage_access": self._get_storage_access(user_email)
        }
    def _get_databricks_access(self, user_email: str) -> list:
        # Query Databricks SCIM API
        pass
    def _get_fabric_access(self, user_email: str) -> list:
        # Query Fabric API
        pass
    def _get_storage_access(self, user_email: str) -> list:
        # Query Azure RBAC
        pass
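As a quick usage note (the e-mail address is a placeholder), grant_access returns the intended permission set for each platform; actually applying it would call the Databricks SCIM API, the Fabric role assignment API, and Azure RBAC:
# Map an organizational role to concrete per-platform permissions
access = UnifiedAccessControl()
grant = access.grant_access("analyst@contoso.com", "analyst")
print(grant["databricks_permissions"])  # ['sql_access']
print(grant["fabric_permissions"])      # ['viewer', 'build']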
Migration Strategies
Gradual Migration to Fabric
class MigrationPlanner:
    """Plan gradual migration from Databricks to Fabric."""
    def __init__(self):
        self.migration_phases = []
    def add_phase(
        self,
        name: str,
        workloads: list[str],
        timeline_weeks: int
    ):
        self.migration_phases.append({
            "name": name,
            "workloads": workloads,
            "timeline_weeks": timeline_weeks
        })
    def generate_plan(self) -> list:
        """Generate the migration plan: custom phases if added, else a reference plan."""
        if self.migration_phases:
            return self.migration_phases
        return [
            {
                "phase": 1,
                "name": "BI and Reporting",
                "workloads": ["Power BI dashboards", "Ad-hoc SQL"],
                "approach": "Create shortcuts, migrate semantic models",
                "timeline": "4 weeks"
            },
            {
                "phase": 2,
                "name": "Data Warehousing",
                "workloads": ["SQL-based ETL", "Data marts"],
                "approach": "Migrate SQL workloads to Fabric warehouse",
                "timeline": "8 weeks"
            },
            {
                "phase": 3,
                "name": "Batch ETL",
                "workloads": ["Spark jobs", "Data pipelines"],
                "approach": "Migrate non-ML Spark to Fabric notebooks",
                "timeline": "12 weeks"
            },
            {
                "phase": 4,
                "name": "ML Workloads",
                "workloads": ["ML training", "Feature engineering"],
                "approach": "Evaluate: migrate or keep in Databricks",
                "timeline": "Ongoing"
            }
        ]
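As a usage sketch (the custom phase names, workloads, and timelines below are illustrative), the planner returns the reference plan by default or a custom one built with add_phase:
# Reference plan
planner = MigrationPlanner()
for phase in planner.generate_plan():
    print(phase["name"], "-", phase["timeline"])
# Custom plan
custom = MigrationPlanner()
custom.add_phase("BI and Reporting", ["Power BI dashboards"], timeline_weeks=6)
custom.add_phase("Batch ETL", ["Nightly Spark jobs"], timeline_weeks=10)
print(custom.generate_plan())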
Best Practices
- Use shared storage: ADLS as the common data layer
- Standardize on Delta Lake: Compatible format for both
- Optimize for readers: Both platforms benefit from compaction
- Unified governance: Single source of truth for access
- Right tool for the job: Use each platform’s strengths
Conclusion
Databricks and Fabric can effectively coexist through their shared Delta Lake foundation. Use Databricks for advanced ML and complex data engineering, and Fabric for BI, semantic modeling, and governed analytics.
The key is establishing clear workload boundaries and maintaining consistent data formats across both platforms.