External Data Access Patterns in Microsoft Fabric
Accessing external data is essential for hybrid and multi-cloud scenarios. Today I’m exploring external data access patterns in Microsoft Fabric.
External Data Options
External Data Access:
├── OneLake Shortcuts
│   ├── Azure Data Lake Storage Gen2
│   ├── Amazon S3
│   ├── Google Cloud Storage
│   └── Dataverse
├── Data Pipelines (Copy)
├── Dataflows Gen2
├── Spark External Tables
└── Mirroring (Preview)
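Throughout the examples below I use a thin fabric_client wrapper to keep the snippets short. Under the hood, creating a shortcut is a call to the OneLake shortcuts REST API; here is a minimal sketch with requests (token acquisition is assumed, and the exact target payload shape should be checked against the current API docs):
import requests

def create_shortcut_raw(token: str, workspace_id: str, lakehouse_id: str, payload: dict) -> dict:
    """Create a OneLake shortcut by calling the Fabric REST API directly."""
    url = (
        f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}"
        f"/items/{lakehouse_id}/shortcuts"
    )
    response = requests.post(
        url,
        headers={"Authorization": f"Bearer {token}"},
        json=payload,
    )
    response.raise_for_status()
    return response.json()

# Illustrative payload for an ADLS Gen2 shortcut under the lakehouse Files area;
# field names follow the documented create-shortcut body, the connection ID is a placeholder.
payload = {
    "path": "Files",
    "name": "external_sales_data",
    "target": {
        "adlsGen2": {
            "location": "https://companydatalake.dfs.core.windows.net",
            "subpath": "/sales/processed/2024",
            "connectionId": "<connection-guid>"
        }
    }
}
# create_shortcut_raw(token, "ws-analytics", "lh-central", payload)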
ADLS Gen2 Shortcuts
class ADLSShortcutManager:
    """Manage shortcuts to Azure Data Lake Storage."""

    def __init__(self, fabric_client):
        self.client = fabric_client

    def create_adls_shortcut(
        self,
        workspace_id: str,
        lakehouse_id: str,
        shortcut_name: str,
        storage_account: str,
        container: str,
        path: str = "",
        auth_type: str = "organizational"
    ):
        """Create shortcut to ADLS Gen2."""
        shortcut_config = {
            "name": shortcut_name,
            "target": {
                "type": "AzureDataLakeStorageGen2",
                "url": f"https://{storage_account}.dfs.core.windows.net/{container}",
                "subpath": path
            }
        }
        if auth_type == "organizational":
            # Uses Entra ID authentication
            shortcut_config["target"]["authentication"] = {
                "type": "OrganizationalAccount"
            }
        elif auth_type == "service_principal":
            shortcut_config["target"]["authentication"] = {
                "type": "ServicePrincipal",
                "connectionId": self._get_connection_id(storage_account)
            }
        return self.client.shortcuts.create(
            workspace_id=workspace_id,
            item_id=lakehouse_id,
            shortcut=shortcut_config
        )

    def create_hierarchical_shortcuts(
        self,
        workspace_id: str,
        lakehouse_id: str,
        storage_account: str,
        container: str,
        partitions: list
    ):
        """Create shortcuts for partitioned data."""
        shortcuts = []
        for partition in partitions:
            shortcut = self.create_adls_shortcut(
                workspace_id=workspace_id,
                lakehouse_id=lakehouse_id,
                shortcut_name=f"data_{partition['name']}",
                storage_account=storage_account,
                container=container,
                path=partition['path']
            )
            shortcuts.append(shortcut)
        return shortcuts

# Usage
adls_manager = ADLSShortcutManager(fabric_client)

# Create shortcut to external data lake
adls_manager.create_adls_shortcut(
    workspace_id="ws-analytics",
    lakehouse_id="lh-central",
    shortcut_name="external_sales_data",
    storage_account="companydatalake",
    container="sales",
    path="processed/2024"
)

# Create shortcuts for regional partitions
adls_manager.create_hierarchical_shortcuts(
    workspace_id="ws-analytics",
    lakehouse_id="lh-central",
    storage_account="companydatalake",
    container="sales",
    partitions=[
        {"name": "north_america", "path": "regions/na"},
        {"name": "europe", "path": "regions/eu"},
        {"name": "asia_pacific", "path": "regions/apac"}
    ]
)
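Once the shortcut exists it behaves like any other OneLake path. A minimal read sketch, assuming the shortcut landed under the lakehouse Files area, the data is Parquet, and the lakehouse is attached as the notebook's default:
# Relative paths resolve against the default lakehouse attached to the notebook.
sales_df = spark.read.parquet("Files/external_sales_data")
sales_df.printSchema()

# The same data addressed through the OneLake ABFS endpoint (workspace and
# lakehouse names mirror the example above).
sales_df = spark.read.parquet(
    "abfss://ws-analytics@onelake.dfs.fabric.microsoft.com/"
    "lh-central.Lakehouse/Files/external_sales_data"
)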
Amazon S3 Integration
class S3ShortcutManager:
    """Manage shortcuts to Amazon S3."""

    def __init__(self, fabric_client):
        self.client = fabric_client

    def create_s3_connection(
        self,
        name: str,
        access_key_id: str,
        secret_access_key: str,
        region: str = "us-east-1"
    ):
        """Create connection to S3."""
        return self.client.connections.create(
            name=name,
            connection_type="AmazonS3",
            credentials={
                "credentialType": "Key",
                "accessKeyId": access_key_id,
                "secretAccessKey": secret_access_key
            },
            properties={
                "region": region
            }
        )

    def create_s3_shortcut(
        self,
        workspace_id: str,
        lakehouse_id: str,
        shortcut_name: str,
        bucket: str,
        path: str,
        connection_id: str
    ):
        """Create shortcut to S3 bucket."""
        return self.client.shortcuts.create(
            workspace_id=workspace_id,
            item_id=lakehouse_id,
            shortcut={
                "name": shortcut_name,
                "target": {
                    "type": "AmazonS3",
                    "location": f"s3://{bucket}/{path}",
                    "connectionId": connection_id
                }
            }
        )

    def sync_s3_catalog(
        self,
        workspace_id: str,
        lakehouse_id: str,
        bucket: str,
        connection_id: str,
        prefix: str = ""
    ):
        """Create shortcuts for all top-level folders in S3."""
        # List S3 folders
        s3_client = self._get_s3_client(connection_id)
        folders = self._list_folders(s3_client, bucket, prefix)
        shortcuts = []
        for folder in folders:
            shortcut = self.create_s3_shortcut(
                workspace_id=workspace_id,
                lakehouse_id=lakehouse_id,
                shortcut_name=f"s3_{folder['name']}",
                bucket=bucket,
                path=folder['path'],
                connection_id=connection_id
            )
            shortcuts.append(shortcut)
        return shortcuts

# Usage
s3_manager = S3ShortcutManager(fabric_client)

# Create connection
connection = s3_manager.create_s3_connection(
    name="aws-analytics-bucket",
    access_key_id="AKIAIOSFODNN7EXAMPLE",
    secret_access_key="wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY",
    region="us-west-2"
)

# Create shortcut
s3_manager.create_s3_shortcut(
    workspace_id="ws-analytics",
    lakehouse_id="lh-central",
    shortcut_name="aws_clickstream",
    bucket="company-clickstream-data",
    path="processed/2024",
    connection_id=connection.id
)
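Those access keys are the placeholder credentials from the AWS documentation; in a real notebook, keep keys out of source and load them at runtime. A sketch using environment variables (a Key Vault-backed secret would work the same way):
import os

# Load the S3 keys from the environment (or a secret store) instead of
# hard-coding them in the notebook.
connection = s3_manager.create_s3_connection(
    name="aws-analytics-bucket",
    access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
    region="us-west-2"
)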
Google Cloud Storage
import json

class GCSShortcutManager:
    """Manage shortcuts to Google Cloud Storage."""

    def __init__(self, fabric_client):
        self.client = fabric_client

    def create_gcs_connection(
        self,
        name: str,
        service_account_key: dict
    ):
        """Create connection to GCS."""
        return self.client.connections.create(
            name=name,
            connection_type="GoogleCloudStorage",
            credentials={
                "credentialType": "ServiceAccountKey",
                "serviceAccountKey": json.dumps(service_account_key)
            }
        )

    def create_gcs_shortcut(
        self,
        workspace_id: str,
        lakehouse_id: str,
        shortcut_name: str,
        bucket: str,
        path: str,
        connection_id: str
    ):
        """Create shortcut to GCS bucket."""
        return self.client.shortcuts.create(
            workspace_id=workspace_id,
            item_id=lakehouse_id,
            shortcut={
                "name": shortcut_name,
                "target": {
                    "type": "GoogleCloudStorage",
                    "location": f"gs://{bucket}/{path}",
                    "connectionId": connection_id
                }
            }
        )

# Usage
gcs_manager = GCSShortcutManager(fabric_client)

# Create shortcut to BigQuery export
gcs_manager.create_gcs_shortcut(
    workspace_id="ws-analytics",
    lakehouse_id="lh-central",
    shortcut_name="gcp_bigquery_export",
    bucket="company-bq-exports",
    path="daily_exports/customers",
    connection_id="gcs-connection-id"
)
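Whatever the target cloud, it is worth confirming what actually got created. A hedged sketch against the list-shortcuts endpoint (response field names are my assumption of the documented shape):
import requests

def list_shortcuts(token: str, workspace_id: str, lakehouse_id: str) -> list:
    """List the shortcuts defined on a lakehouse via the Fabric REST API."""
    url = (
        f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}"
        f"/items/{lakehouse_id}/shortcuts"
    )
    response = requests.get(url, headers={"Authorization": f"Bearer {token}"})
    response.raise_for_status()
    return response.json().get("value", [])

# 'token' is an Entra ID bearer token for the Fabric API (acquisition not shown).
for shortcut in list_shortcuts(token, "ws-analytics", "lh-central"):
    print(shortcut["name"], shortcut.get("target"))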
Spark External Tables
# PySpark - create an external (unmanaged) Delta table pointing at cloud storage.
# S3 credentials belong in the Spark/Hadoop configuration; Delta does not pick
# them up from table-level OPTIONS. Load the keys from a secret store, not literals.
spark.conf.set("fs.s3a.access.key", aws_access_key)
spark.conf.set("fs.s3a.secret.key", aws_secret_key)

# A table created with an explicit LOCATION is unmanaged: dropping it leaves the data in S3.
spark.sql("""
    CREATE TABLE IF NOT EXISTS external_catalog.aws_events (
        event_id STRING,
        event_type STRING,
        user_id STRING,
        event_data STRING,
        event_timestamp TIMESTAMP
    )
    USING DELTA
    LOCATION 's3a://company-events/delta/events'
""")

# Query the external table
df = spark.sql("""
    SELECT
        event_type,
        COUNT(*) AS event_count,
        COUNT(DISTINCT user_id) AS unique_users
    FROM external_catalog.aws_events
    WHERE event_timestamp >= current_date() - INTERVAL 7 DAYS
    GROUP BY event_type
    ORDER BY event_count DESC
""")
Dataverse Integration
class DataverseConnector:
    """Connect to Dataverse from Fabric."""

    def __init__(self, fabric_client):
        self.client = fabric_client

    def create_dataverse_shortcut(
        self,
        workspace_id: str,
        lakehouse_id: str,
        shortcut_name: str,
        environment_url: str,
        table_name: str
    ):
        """Create shortcut to Dataverse table."""
        return self.client.shortcuts.create(
            workspace_id=workspace_id,
            item_id=lakehouse_id,
            shortcut={
                "name": shortcut_name,
                "target": {
                    "type": "Dataverse",
                    "environmentUrl": environment_url,
                    "tableName": table_name
                }
            }
        )

    def sync_dataverse_tables(
        self,
        workspace_id: str,
        lakehouse_id: str,
        environment_url: str,
        tables: list
    ):
        """Create shortcuts for multiple Dataverse tables."""
        shortcuts = []
        for table in tables:
            shortcut = self.create_dataverse_shortcut(
                workspace_id=workspace_id,
                lakehouse_id=lakehouse_id,
                shortcut_name=f"dv_{table.lower()}",
                environment_url=environment_url,
                table_name=table
            )
            shortcuts.append(shortcut)
        return shortcuts

# Usage
dataverse = DataverseConnector(fabric_client)

# Sync CRM tables
dataverse.sync_dataverse_tables(
    workspace_id="ws-sales",
    lakehouse_id="lh-crm",
    environment_url="https://org.crm.dynamics.com",
    tables=["account", "contact", "opportunity", "lead"]
)
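A quick sanity check after the sync: count rows in each shortcut. This assumes the dv_* shortcuts surface as Spark-queryable tables in the lakehouse:
# Spot-check row counts for the synced Dataverse shortcuts.
for table in ["dv_account", "dv_contact", "dv_opportunity", "dv_lead"]:
    count = spark.sql(f"SELECT COUNT(*) AS n FROM {table}").collect()[0]["n"]
    print(f"{table}: {count} rows")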
Multi-Cloud Data Federation
class MultiCloudFederation:
    """Federate data across cloud providers."""

    def __init__(self, fabric_client):
        self.client = fabric_client
        self.adls = ADLSShortcutManager(fabric_client)
        self.s3 = S3ShortcutManager(fabric_client)
        self.gcs = GCSShortcutManager(fabric_client)

    def create_unified_view(
        self,
        workspace_id: str,
        lakehouse_id: str,
        sources: list
    ):
        """Create unified view across cloud sources."""
        # Create shortcuts for each source
        for source in sources:
            if source["type"] == "adls":
                self.adls.create_adls_shortcut(
                    workspace_id=workspace_id,
                    lakehouse_id=lakehouse_id,
                    shortcut_name=source["name"],
                    storage_account=source["storage_account"],
                    container=source["container"],
                    path=source.get("path", "")
                )
            elif source["type"] == "s3":
                self.s3.create_s3_shortcut(
                    workspace_id=workspace_id,
                    lakehouse_id=lakehouse_id,
                    shortcut_name=source["name"],
                    bucket=source["bucket"],
                    path=source.get("path", ""),
                    connection_id=source["connection_id"]
                )
            elif source["type"] == "gcs":
                self.gcs.create_gcs_shortcut(
                    workspace_id=workspace_id,
                    lakehouse_id=lakehouse_id,
                    shortcut_name=source["name"],
                    bucket=source["bucket"],
                    path=source.get("path", ""),
                    connection_id=source["connection_id"]
                )
        # Create unified SQL view
        self._create_unified_sql_view(workspace_id, lakehouse_id, sources)

    def _create_unified_sql_view(
        self,
        workspace_id: str,
        lakehouse_id: str,
        sources: list
    ):
        """Create SQL view combining all sources."""
        union_queries = []
        for source in sources:
            union_queries.append(f"""
                SELECT
                    '{source['cloud']}' as source_cloud,
                    '{source['name']}' as source_name,
                    *
                FROM {source['name']}
            """)
        view_sql = f"""
            CREATE OR REPLACE VIEW unified_data AS
            {' UNION ALL '.join(union_queries)}
        """
        self.client.sql.execute(workspace_id, lakehouse_id, view_sql)

# Usage
federation = MultiCloudFederation(fabric_client)

# Create unified customer view across clouds
federation.create_unified_view(
    workspace_id="ws-analytics",
    lakehouse_id="lh-central",
    sources=[
        {
            "name": "azure_customers",
            "type": "adls",
            "cloud": "azure",
            "storage_account": "companydatalake",
            "container": "customers",
            "path": "processed"
        },
        {
            "name": "aws_customers",
            "type": "s3",
            "cloud": "aws",
            "bucket": "company-customer-data",
            "path": "customers",
            "connection_id": "s3-conn-id"
        },
        {
            "name": "gcp_customers",
            "type": "gcs",
            "cloud": "gcp",
            "bucket": "company-customers-gcp",
            "path": "exported",
            "connection_id": "gcs-conn-id"
        }
    ]
)
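With the shortcuts and the view in place, a single query can span all three clouds. A sketch (this only unions cleanly if the three customer datasets share a schema):
per_cloud = spark.sql("""
    SELECT source_cloud, COUNT(*) AS customer_count
    FROM unified_data
    GROUP BY source_cloud
    ORDER BY customer_count DESC
""")
per_cloud.show()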
Best Practices
- Use shortcuts over copies - zero-copy access avoids duplicate storage and keeps a single source of truth
- Secure connections - prefer organizational (Entra ID) authentication or managed identities over long-lived access keys
- Monitor performance - cross-cloud queries add network latency and can incur egress charges
- Cache hot data - materialize frequently accessed external data into local Delta tables (see the sketch after this list)
- Plan for failures - external sources can be unavailable, so build retries and fallbacks into pipelines
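For the caching point, a minimal sketch of materializing a hot shortcut into a local Delta table (path and format are assumptions carried over from the S3 example):
# Materialize a frequently queried shortcut into a local Delta table on a
# schedule, so hot queries read OneLake instead of the remote cloud store.
(
    spark.read.parquet("Files/aws_clickstream")
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable("clickstream_cached")
)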
What’s Next
Tomorrow I’ll cover security fundamentals in Microsoft Fabric.