OneLake Interoperability: Connecting Your Data Ecosystem
OneLake is the unified storage layer for Microsoft Fabric, but its true power lies in interoperability. Let’s explore how OneLake connects with external tools, services, and platforms.
OneLake Architecture
┌────────────────────────────────────────────────────────────────┐
│ External Access │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────────────┐ │
│ │ Azure │ │ Power BI │ │ Spark │ │ Third-Party │ │
│ │ Storage │ │ Desktop │ │ Clusters │ │ Tools │ │
│ │ Explorer │ │ │ │ │ │ (Tableau, etc.) │ │
│ └────┬─────┘ └────┬─────┘ └────┬─────┘ └────────┬─────────┘ │
│ │ │ │ │ │
│ └────────────┴────────────┴────────────────┘ │
│ │ │
│ ┌────────▼────────┐ │
│ │ OneLake │ │
│ │ ADLS Gen2 │ │
│ │ Compatible │ │
│ └────────┬────────┘ │
│ │ │
│ ┌────────────────────┼────────────────────┐ │
│ │ │ │ │
│ ┌────▼────┐ ┌─────▼─────┐ ┌────▼────┐ │
│ │Lakehouse│ │ Warehouse │ │ KQL DB │ │
│ │ Data │ │ Data │ │ Data │ │
│ └─────────┘ └───────────┘ └─────────┘ │
└────────────────────────────────────────────────────────────────┘
ADLS Gen2 Compatibility
OneLake Paths
# OneLake uses ADLS Gen2 compatible paths
# Format: https://onelake.dfs.fabric.microsoft.com/{workspace}/{item}
class OneLakePaths:
"""Generate OneLake-compatible paths."""
def __init__(self, workspace_name: str):
self.workspace = workspace_name
self.base_url = "https://onelake.dfs.fabric.microsoft.com"
def get_lakehouse_path(
self,
lakehouse_name: str,
folder: str = "Tables"
) -> dict:
"""Get paths for lakehouse access."""
return {
"https": f"{self.base_url}/{self.workspace}/{lakehouse_name}.Lakehouse/{folder}",
"abfss": f"abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/{folder}",
"wasbs": f"wasbs://{self.workspace}@onelake.blob.fabric.microsoft.com/{lakehouse_name}.Lakehouse/{folder}"
}
def get_warehouse_path(self, warehouse_name: str) -> dict:
"""Get paths for warehouse access."""
return {
"https": f"{self.base_url}/{self.workspace}/{warehouse_name}.Warehouse",
"abfss": f"abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/{warehouse_name}.Warehouse"
}
def get_file_path(
self,
lakehouse_name: str,
file_path: str
) -> str:
"""Get path to specific file."""
return f"{self.base_url}/{self.workspace}/{lakehouse_name}.Lakehouse/Files/{file_path}"
# Usage
paths = OneLakePaths("my-workspace")
lakehouse_paths = paths.get_lakehouse_path("sales-lakehouse")
print(f"HTTPS: {lakehouse_paths['https']}")
print(f"ABFSS: {lakehouse_paths['abfss']}")
Azure Storage Explorer Access
# Connect to OneLake via Azure Storage Explorer
# Use Azure AD authentication
connection_config = {
"account_name": "onelake",
"account_url": "https://onelake.dfs.fabric.microsoft.com",
"authentication": "Azure AD",
"container": "workspace-name", # Workspace appears as container
"path": "lakehouse-name.Lakehouse/Tables"
}
# In Azure Storage Explorer:
# 1. Add connection -> Azure Data Lake Storage Gen2
# 2. Use Azure AD authentication
# 3. Enter: https://onelake.dfs.fabric.microsoft.com
# 4. Browse workspaces and items
External Spark Cluster Access
Connecting from Databricks
# Databricks notebook connecting to OneLake
# Configure authentication
spark.conf.set(
"fs.azure.account.auth.type.onelake.dfs.fabric.microsoft.com",
"OAuth"
)
spark.conf.set(
"fs.azure.account.oauth.provider.type.onelake.dfs.fabric.microsoft.com",
"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"
)
spark.conf.set(
"fs.azure.account.oauth2.client.id.onelake.dfs.fabric.microsoft.com",
dbutils.secrets.get(scope="fabric", key="client-id")
)
spark.conf.set(
"fs.azure.account.oauth2.client.secret.onelake.dfs.fabric.microsoft.com",
dbutils.secrets.get(scope="fabric", key="client-secret")
)
# Tenant ID for your Azure AD tenant (assumed here to be stored as a Databricks secret)
tenant_id = dbutils.secrets.get(scope="fabric", key="tenant-id")
spark.conf.set(
    "fs.azure.account.oauth2.client.endpoint.onelake.dfs.fabric.microsoft.com",
    f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"
)
# Read from OneLake
onelake_path = "abfss://workspace@onelake.dfs.fabric.microsoft.com/lakehouse.Lakehouse/Tables/customers"
customers_df = spark.read.format("delta").load(onelake_path)
display(customers_df)
# Write back to OneLake (if permissions allow); assumes processed_df is a DataFrame prepared earlier
output_path = "abfss://workspace@onelake.dfs.fabric.microsoft.com/lakehouse.Lakehouse/Tables/processed_data"
processed_df.write.format("delta") \
.mode("overwrite") \
.save(output_path)
Connecting from Synapse
# Synapse Spark notebook
# OneLake can be reached over abfss using the identity the Spark session runs as
# (credential passthrough or the workspace managed identity), provided that
# identity has been granted access to the Fabric workspace
from pyspark.sql import SparkSession
# Read OneLake data
onelake_path = "abfss://my-workspace@onelake.dfs.fabric.microsoft.com/my-lakehouse.Lakehouse/Tables/sales"
sales_df = spark.read.format("delta").load(onelake_path)
# Process and write back
aggregated = sales_df.groupBy("region").agg(
{"amount": "sum", "order_id": "count"}
)
aggregated.write.format("delta") \
.mode("overwrite") \
.save(onelake_path.replace("sales", "sales_aggregated"))
Third-Party Tool Integration
Tableau Connection
# Tableau connects through the lakehouse or warehouse SQL analytics endpoint
# Use an ODBC connection with Azure AD authentication
tableau_connection = {
"server": "your-workspace.datawarehouse.fabric.microsoft.com",
"port": 1433,
"database": "lakehouse-sql-endpoint",
"authentication": "Azure Active Directory",
"driver": "ODBC Driver 18 for SQL Server",
"connection_string": (
"Driver={ODBC Driver 18 for SQL Server};"
"Server=your-workspace.datawarehouse.fabric.microsoft.com,1433;"
"Database=lakehouse-sql-endpoint;"
"Authentication=ActiveDirectoryInteractive;"
"Encrypt=yes;"
"TrustServerCertificate=no;"
)
}
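Before wiring up Tableau, it can help to validate the SQL analytics endpoint programmatically. Here is a minimal sketch using pyodbc with the same placeholder connection string; the server and database names are examples, and it assumes a customers table exists on the endpoint.
# Hypothetical connectivity check against the SQL analytics endpoint
# Server and database names are placeholders; replace with your endpoint values
import pyodbc

conn_str = (
    "Driver={ODBC Driver 18 for SQL Server};"
    "Server=your-workspace.datawarehouse.fabric.microsoft.com,1433;"
    "Database=lakehouse-sql-endpoint;"
    "Authentication=ActiveDirectoryInteractive;"
    "Encrypt=yes;"
)

with pyodbc.connect(conn_str) as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT TOP 5 * FROM customers")  # assumes a customers table
    for row in cursor.fetchall():
        print(row)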
dbt Integration
# dbt profile for Fabric
# profiles.yml
fabric_project:
target: dev
outputs:
dev:
type: fabric
driver: 'ODBC Driver 18 for SQL Server'
server: your-workspace.datawarehouse.fabric.microsoft.com
port: 1433
database: your-warehouse
schema: dbo
authentication: ActiveDirectoryInteractive
# Or use service principal
# authentication: ActiveDirectoryServicePrincipal
# client_id: "{{ env_var('FABRIC_CLIENT_ID') }}"
# client_secret: "{{ env_var('FABRIC_CLIENT_SECRET') }}"
# tenant_id: "{{ env_var('FABRIC_TENANT_ID') }}"
-- dbt model example
-- models/sales_summary.sql
{{ config(materialized='table') }}
SELECT
    DATETRUNC(month, order_date) AS month,
region,
COUNT(*) AS order_count,
SUM(amount) AS total_revenue
FROM {{ source('raw', 'orders') }}
WHERE order_date >= '2024-01-01'
GROUP BY DATETRUNC(month, order_date), region
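To run this model outside the dbt CLI, dbt also exposes a programmatic entry point. A minimal sketch, assuming dbt-core 1.5+ with the dbt-fabric adapter installed and the fabric_project profile above configured:
# Programmatic dbt invocation (dbt-core 1.5+); assumes the fabric_project
# profile above is available in profiles.yml
from dbt.cli.main import dbtRunner

runner = dbtRunner()
result = runner.invoke(["run", "--select", "sales_summary"])
print("success:", result.success)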
Python SDK Access
# Access OneLake with Azure SDK
from azure.identity import DefaultAzureCredential
from azure.storage.filedatalake import DataLakeServiceClient
class OneLakeClient:
"""Python client for OneLake operations."""
def __init__(self, workspace_name: str):
self.workspace = workspace_name
self.account_url = "https://onelake.dfs.fabric.microsoft.com"
self.credential = DefaultAzureCredential()
self.service_client = DataLakeServiceClient(
account_url=self.account_url,
credential=self.credential
)
def list_items(self) -> list:
"""List all items in workspace."""
file_system_client = self.service_client.get_file_system_client(
file_system=self.workspace
)
items = []
for path in file_system_client.get_paths():
items.append({
"name": path.name,
"is_directory": path.is_directory,
"last_modified": path.last_modified
})
return items
def read_file(self, lakehouse: str, file_path: str) -> bytes:
"""Read file from OneLake."""
file_system_client = self.service_client.get_file_system_client(
file_system=self.workspace
)
file_client = file_system_client.get_file_client(
f"{lakehouse}.Lakehouse/Files/{file_path}"
)
download = file_client.download_file()
return download.readall()
def upload_file(
self,
lakehouse: str,
file_path: str,
data: bytes
):
"""Upload file to OneLake."""
file_system_client = self.service_client.get_file_system_client(
file_system=self.workspace
)
file_client = file_system_client.get_file_client(
f"{lakehouse}.Lakehouse/Files/{file_path}"
)
file_client.upload_data(data, overwrite=True)
def read_delta_table(self, lakehouse: str, table_name: str):
"""Read Delta table using deltalake library."""
import deltalake
table_path = (
f"abfss://{self.workspace}@onelake.dfs.fabric.microsoft.com/"
f"{lakehouse}.Lakehouse/Tables/{table_name}"
)
dt = deltalake.DeltaTable(
table_path,
storage_options={
"azure_storage_account_name": "onelake",
"azure_use_azure_cli": "true"
}
)
return dt.to_pandas()
# Usage
client = OneLakeClient("my-workspace")
# List items
items = client.list_items()
for item in items[:5]:
print(f"{item['name']} - Directory: {item['is_directory']}")
# Read Delta table
df = client.read_delta_table("sales-lakehouse", "customers")
print(df.head())
REST API Access
OneLake REST API
import requests
from azure.identity import DefaultAzureCredential
class OneLakeAPI:
"""OneLake REST API client."""
def __init__(self):
self.base_url = "https://api.fabric.microsoft.com/v1"
self.credential = DefaultAzureCredential()
def _get_headers(self):
token = self.credential.get_token(
"https://api.fabric.microsoft.com/.default"
)
return {
"Authorization": f"Bearer {token.token}",
"Content-Type": "application/json"
}
def list_workspaces(self) -> list:
"""List all workspaces."""
response = requests.get(
f"{self.base_url}/workspaces",
headers=self._get_headers()
)
response.raise_for_status()
return response.json().get("value", [])
def list_lakehouses(self, workspace_id: str) -> list:
"""List lakehouses in workspace."""
response = requests.get(
f"{self.base_url}/workspaces/{workspace_id}/lakehouses",
headers=self._get_headers()
)
response.raise_for_status()
return response.json().get("value", [])
def get_lakehouse_tables(
self,
workspace_id: str,
lakehouse_id: str
) -> list:
"""Get tables in lakehouse."""
response = requests.get(
f"{self.base_url}/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/tables",
headers=self._get_headers()
)
response.raise_for_status()
return response.json().get("data", [])
def run_sql_query(
self,
workspace_id: str,
lakehouse_id: str,
query: str
) -> list:
"""Run SQL query against lakehouse SQL endpoint."""
payload = {
"query": query,
"maxRows": 10000
}
response = requests.post(
f"{self.base_url}/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/sqlQueries",
headers=self._get_headers(),
json=payload
)
response.raise_for_status()
return response.json()
# Usage
api = OneLakeAPI()
# List workspaces
workspaces = api.list_workspaces()
for ws in workspaces:
print(f"Workspace: {ws['displayName']} ({ws['id']})")
# List lakehouses
lakehouses = api.list_lakehouses(workspaces[0]['id'])
for lh in lakehouses:
print(f"Lakehouse: {lh['displayName']}")
# Run query
results = api.run_sql_query(
workspace_id=workspaces[0]['id'],
lakehouse_id=lakehouses[0]['id'],
query="SELECT TOP 10 * FROM customers"
)
Data Export Patterns
Export to External Systems
class OneLakeExporter:
"""Export data from OneLake to external systems."""
def __init__(self, spark):
self.spark = spark
def export_to_adls(
self,
source_table: str,
target_account: str,
target_container: str,
target_path: str,
format: str = "delta"
):
"""Export to external ADLS account."""
df = self.spark.read.format("delta").table(source_table)
target_url = f"abfss://{target_container}@{target_account}.dfs.core.windows.net/{target_path}"
df.write.format(format) \
.mode("overwrite") \
.save(target_url)
def export_to_s3(
self,
source_table: str,
bucket: str,
path: str,
aws_access_key: str,
aws_secret_key: str,
format: str = "parquet"
):
"""Export to AWS S3."""
# Configure S3 access
self.spark.conf.set("fs.s3a.access.key", aws_access_key)
self.spark.conf.set("fs.s3a.secret.key", aws_secret_key)
df = self.spark.read.format("delta").table(source_table)
df.write.format(format) \
.mode("overwrite") \
.save(f"s3a://{bucket}/{path}")
def export_to_database(
self,
source_table: str,
jdbc_url: str,
target_table: str,
username: str,
password: str,
mode: str = "overwrite"
):
"""Export to external database."""
df = self.spark.read.format("delta").table(source_table)
df.write.format("jdbc") \
.option("url", jdbc_url) \
.option("dbtable", target_table) \
.option("user", username) \
.option("password", password) \
.mode(mode) \
.save()
# Usage
exporter = OneLakeExporter(spark)
# Export to external ADLS
exporter.export_to_adls(
source_table="customers",
target_account="externalstorage",
target_container="exports",
target_path="customers/2024"
)
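The JDBC export follows the same pattern; here is a hypothetical call against a SQL Server target. The URL, table name, and credentials are placeholders, and the password should come from a secret store in practice.
# Export to an external database over JDBC (illustrative values only)
exporter.export_to_database(
    source_table="customers",
    jdbc_url="jdbc:sqlserver://external-db.example.com:1433;databaseName=analytics",
    target_table="dbo.customers",
    username="etl_user",
    password="<retrieve-from-secret-store>",
    mode="overwrite"
)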
Security Considerations
Authentication Methods
auth_methods = {
"azure_ad_interactive": {
"description": "User signs in interactively",
"use_case": "Development, ad-hoc access",
"example": "Power BI Desktop, Azure Storage Explorer"
},
"service_principal": {
"description": "App registration with client secret",
"use_case": "Automated pipelines, external applications",
"requires": ["Client ID", "Client Secret", "Tenant ID"]
},
"managed_identity": {
"description": "Azure resource identity",
"use_case": "Azure services (Synapse, Databricks, Functions)",
"requires": ["Managed Identity enabled on resource"]
},
"sas_token": {
"description": "Shared Access Signature",
"use_case": "Not supported for OneLake",
"note": "Use Azure AD methods instead"
}
}
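For the service_principal entry above, here is a minimal sketch that plugs a ClientSecretCredential into the same DataLakeServiceClient used by OneLakeClient. The environment variable names are assumptions, and the service principal must already have been granted access to the Fabric workspace.
import os
from azure.identity import ClientSecretCredential
from azure.storage.filedatalake import DataLakeServiceClient

# Service principal credentials supplied via environment variables (assumed names)
credential = ClientSecretCredential(
    tenant_id=os.environ["FABRIC_TENANT_ID"],
    client_id=os.environ["FABRIC_CLIENT_ID"],
    client_secret=os.environ["FABRIC_CLIENT_SECRET"],
)

service_client = DataLakeServiceClient(
    account_url="https://onelake.dfs.fabric.microsoft.com",
    credential=credential,
)

# The workspace is exposed as a file system, just as in OneLakeClient above
fs_client = service_client.get_file_system_client(file_system="my-workspace")
for path in fs_client.get_paths():
    print(path.name)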
Access Control
# OneLake uses Fabric workspace permissions
# Plus item-level permissions for fine-grained control
access_levels = {
"workspace_admin": {
"fabric_permissions": "Full control over workspace",
"onelake_access": "Read/Write all items"
},
"workspace_member": {
"fabric_permissions": "Create and manage items",
"onelake_access": "Read/Write items they own or have access to"
},
"workspace_contributor": {
"fabric_permissions": "Edit existing items",
"onelake_access": "Read/Write items with explicit permission"
},
"workspace_viewer": {
"fabric_permissions": "View items only",
"onelake_access": "Read-only access to items with permission"
}
}
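Workspace roles can also be granted programmatically. The sketch below follows the shape of the Fabric workspace role-assignment API; the workspace ID and principal object ID are placeholders, and the payload should be checked against the current REST reference before relying on it.
import requests
from azure.identity import DefaultAzureCredential

credential = DefaultAzureCredential()
token = credential.get_token("https://api.fabric.microsoft.com/.default")

workspace_id = "<workspace-guid>"   # placeholder
principal_id = "<aad-object-id>"    # placeholder: Azure AD object ID of the user

# Grant the Contributor role on the workspace (roles: Admin, Member, Contributor, Viewer)
response = requests.post(
    f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/roleAssignments",
    headers={
        "Authorization": f"Bearer {token.token}",
        "Content-Type": "application/json",
    },
    json={
        "principal": {"id": principal_id, "type": "User"},
        "role": "Contributor",
    },
)
response.raise_for_status()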
Best Practices
- Use managed identity: Preferred for Azure services
- Leverage shortcuts: Reference data in place instead of copying it (see the sketch after this list)
- Optimize file sizes: 128MB-1GB for best performance
- Monitor access patterns: Track who accesses what
- Use SQL endpoints: For BI tool connectivity
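As an example of the shortcuts recommendation, the sketch below creates a OneLake shortcut that points a lakehouse at data in an external ADLS Gen2 account instead of copying it. It follows the shape of the OneLake shortcuts REST API; the IDs, connection ID, and paths are placeholders, so verify the payload against the current API reference.
import requests
from azure.identity import DefaultAzureCredential

credential = DefaultAzureCredential()
token = credential.get_token("https://api.fabric.microsoft.com/.default")

workspace_id = "<workspace-guid>"   # placeholder
lakehouse_id = "<lakehouse-guid>"   # placeholder

response = requests.post(
    f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}"
    f"/items/{lakehouse_id}/shortcuts",
    headers={
        "Authorization": f"Bearer {token.token}",
        "Content-Type": "application/json",
    },
    json={
        "path": "Files",            # where the shortcut appears in the lakehouse
        "name": "external_sales",   # shortcut name (placeholder)
        "target": {
            "adlsGen2": {
                "location": "https://externalstorage.dfs.core.windows.net",
                "subpath": "/exports/sales",
                "connectionId": "<connection-guid>",  # Fabric connection to that account
            }
        },
    },
)
response.raise_for_status()
print(response.json())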
Conclusion
OneLake’s ADLS Gen2 compatibility makes it interoperable with virtually any tool that can access Azure storage. Whether connecting from Databricks, external Spark clusters, or BI tools like Tableau, OneLake serves as a universal data layer.
Use the appropriate authentication method for your use case, and leverage shortcuts to avoid unnecessary data movement.