Cross-Cloud Analytics with Microsoft Fabric
Cross-cloud analytics enables organizations to query and analyze data across Azure, AWS, and GCP from a unified platform. Microsoft Fabric’s OneLake shortcuts make this possible without data movement.
The Multi-Cloud Reality
Most enterprises use multiple clouds:
┌─────────────────────────────────────────────────────────────┐
│ Enterprise Data Estate │
├─────────────────┬─────────────────┬─────────────────────────┤
│ Azure │ AWS │ GCP │
│ ┌───────────┐ │ ┌───────────┐ │ ┌───────────────────┐ │
│ │ ADLS Gen2 │ │ │ S3 │ │ │ Cloud Storage │ │
│ │ Cosmos DB │ │ │ Redshift │ │ │ BigQuery │ │
│ │ Azure SQL │ │ │ RDS │ │ │ Spanner │ │
│ └───────────┘ │ └───────────┘ │ └───────────────────┘ │
└─────────────────┴─────────────────┴─────────────────────────┘
│
▼
┌─────────────────────────┐
│ Microsoft Fabric │
│ OneLake │
│ (Unified Analytics) │
└─────────────────────────┘
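Once a shortcut is in place (covered below), the remote data is addressable like any other OneLake path. As a quick illustration from a Fabric Spark notebook, here is a minimal sketch; the workspace, lakehouse, and table names are placeholders:

# Read a shortcut-backed Delta table through its OneLake URI
df = spark.read.format("delta").load(
    "abfss://MyWorkspace@onelake.dfs.fabric.microsoft.com/MyLakehouse.Lakehouse/Tables/aws_transactions"
)
df.show(5)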
Setting Up Cross-Cloud Connections
AWS S3 Connection
from azure.identity import DefaultAzureCredential
import requests
class CrossCloudConnector:
def __init__(self, workspace_id: str):
self.workspace_id = workspace_id
self.base_url = "https://api.fabric.microsoft.com/v1"
def _get_headers(self):
credential = DefaultAzureCredential()
token = credential.get_token("https://api.fabric.microsoft.com/.default")
return {
"Authorization": f"Bearer {token.token}",
"Content-Type": "application/json"
}
def create_s3_connection(
self,
connection_name: str,
aws_access_key_id: str,
aws_secret_access_key: str,
region: str = "us-east-1"
) -> dict:
"""Create connection to AWS S3."""
payload = {
"displayName": connection_name,
"connectionDetails": {
"type": "AmazonS3",
"parameters": {
"region": region
}
},
"credentialDetails": {
"credentials": {
"credentialType": "Key",
"accessKeyId": aws_access_key_id,
"secretAccessKey": aws_secret_access_key
}
}
}
response = requests.post(
f"{self.base_url}/connections",
headers=self._get_headers(),
json=payload
)
response.raise_for_status()
return response.json()
def create_gcs_connection(
self,
connection_name: str,
service_account_key: dict
) -> dict:
"""Create connection to Google Cloud Storage."""
import base64
import json
payload = {
"displayName": connection_name,
"connectionDetails": {
"type": "GoogleCloudStorage",
"parameters": {}
},
"credentialDetails": {
"credentials": {
"credentialType": "ServiceAccountKey",
"key": base64.b64encode(
json.dumps(service_account_key).encode()
).decode()
}
}
}
response = requests.post(
f"{self.base_url}/connections",
headers=self._get_headers(),
json=payload
)
response.raise_for_status()
return response.json()
# Usage
connector = CrossCloudConnector("workspace-id")
# AWS connection
aws_conn = connector.create_s3_connection(
connection_name="AWS-DataLake",
aws_access_key_id="AKIA...",
aws_secret_access_key="secret...",
region="us-west-2"
)
# GCP connection
gcp_conn = connector.create_gcs_connection(
connection_name="GCP-DataLake",
service_account_key={
"type": "service_account",
"project_id": "my-project",
# ... rest of service account JSON
}
)
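To confirm the connections were registered, the same client can list existing connections. This sketch assumes the Fabric REST API's list endpoint at GET /v1/connections and a paginated response with a "value" array; verify both against the current API reference:

# List registered connections (response shape assumed: {"value": [...]})
resp = requests.get(
    f"{connector.base_url}/connections",
    headers=connector._get_headers()
)
resp.raise_for_status()
for conn in resp.json().get("value", []):
    print(conn.get("id"), conn.get("displayName"))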
Creating Cross-Cloud Shortcuts
class CrossCloudLakehouse:
def __init__(self, workspace_id: str, lakehouse_id: str):
self.workspace_id = workspace_id
self.lakehouse_id = lakehouse_id
self.shortcuts = []
def add_azure_shortcut(
self,
name: str,
storage_account: str,
container: str,
path: str
):
"""Add shortcut to Azure Data Lake Storage."""
shortcut = {
"path": f"Tables/{name}",
"target": {
"type": "AdlsGen2",
"adlsGen2": {
"location": f"https://{storage_account}.dfs.core.windows.net",
"subpath": f"/{container}/{path}"
}
}
}
self._create_shortcut(shortcut)
self.shortcuts.append({"name": name, "cloud": "Azure"})
def add_aws_shortcut(
self,
name: str,
bucket: str,
path: str,
connection_id: str
):
"""Add shortcut to AWS S3."""
shortcut = {
"path": f"Tables/{name}",
"target": {
"type": "S3",
"s3": {
"location": f"https://{bucket}.s3.amazonaws.com",
"subpath": f"/{path}",
"connectionId": connection_id
}
}
}
self._create_shortcut(shortcut)
self.shortcuts.append({"name": name, "cloud": "AWS"})
def add_gcp_shortcut(
self,
name: str,
bucket: str,
path: str,
connection_id: str
):
"""Add shortcut to Google Cloud Storage."""
shortcut = {
"path": f"Tables/{name}",
"target": {
"type": "GoogleCloudStorage",
"googleCloudStorage": {
"location": f"https://storage.googleapis.com/{bucket}",
"subpath": f"/{path}",
"connectionId": connection_id
}
}
}
self._create_shortcut(shortcut)
self.shortcuts.append({"name": name, "cloud": "GCP"})
    def _create_shortcut(self, shortcut: dict):
        """Create the shortcut via the Fabric OneLake shortcuts API.

        Reuses DefaultAzureCredential and requests from the connector example
        above. The request bodies built by the add_* methods are illustrative;
        verify their shape against the current API reference.
        """
        credential = DefaultAzureCredential()
        token = credential.get_token("https://api.fabric.microsoft.com/.default")
        response = requests.post(
            f"https://api.fabric.microsoft.com/v1/workspaces/{self.workspace_id}"
            f"/items/{self.lakehouse_id}/shortcuts",
            headers={
                "Authorization": f"Bearer {token.token}",
                "Content-Type": "application/json"
            },
            json=shortcut
        )
        response.raise_for_status()
def list_shortcuts(self) -> list:
"""List all cross-cloud shortcuts."""
return self.shortcuts
# Build cross-cloud lakehouse
lakehouse = CrossCloudLakehouse("workspace-id", "lakehouse-id")
# Azure data
lakehouse.add_azure_shortcut(
name="azure_customers",
storage_account="azuredatalake",
container="gold",
path="customers"
)
# AWS data
lakehouse.add_aws_shortcut(
name="aws_transactions",
bucket="company-data-lake",
path="transactions/parquet",
connection_id="aws-connection-id"
)
# GCP data
lakehouse.add_gcp_shortcut(
name="gcp_analytics",
bucket="analytics-bucket",
path="processed/events",
connection_id="gcp-connection-id"
)
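A quick sanity check that all three clouds are represented:

# Inspect the registered shortcuts
for shortcut in lakehouse.list_shortcuts():
    print(f"{shortcut['cloud']}: {shortcut['name']}")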
Querying Across Clouds
Unified SQL Queries
-- Query data from three different clouds in one statement
SELECT
c.customer_id,
c.customer_name,
c.region AS customer_region,
COUNT(t.transaction_id) AS transaction_count,
SUM(t.amount) AS total_spend,
AVG(a.session_duration) AS avg_session_minutes
FROM azure_customers c -- Azure ADLS
LEFT JOIN aws_transactions t -- AWS S3
ON c.customer_id = t.customer_id
AND t.transaction_date >= '2024-01-01'
LEFT JOIN gcp_analytics a -- GCP Cloud Storage
ON c.customer_id = a.user_id
AND a.event_date >= '2024-01-01'
GROUP BY c.customer_id, c.customer_name, c.region
ORDER BY total_spend DESC;
Spark Cross-Cloud Analytics
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum, max, avg, countDistinct

# In a Fabric notebook, `spark` is already initialized.
# All shortcut data is accessible through OneLake Tables/ paths.
azure_customers = spark.read.format("delta").load("Tables/azure_customers")
aws_transactions = spark.read.format("delta").load("Tables/aws_transactions")
gcp_analytics = spark.read.format("delta").load("Tables/gcp_analytics")
# Cross-cloud customer 360 view
customer_360 = azure_customers.alias("c") \
.join(
aws_transactions.alias("t"),
col("c.customer_id") == col("t.customer_id"),
"left"
) \
.join(
gcp_analytics.alias("a"),
col("c.customer_id") == col("a.user_id"),
"left"
) \
.groupBy(
col("c.customer_id"),
col("c.customer_name"),
col("c.segment"),
col("c.region")
) \
.agg(
# Transaction metrics (AWS)
count("t.transaction_id").alias("transaction_count"),
sum("t.amount").alias("total_spend"),
max("t.transaction_date").alias("last_transaction"),
# Engagement metrics (GCP)
countDistinct("a.session_id").alias("total_sessions"),
avg("a.session_duration").alias("avg_session_duration"),
sum("a.page_views").alias("total_page_views")
)
# Save consolidated view locally in Fabric
customer_360.write.format("delta") \
.mode("overwrite") \
.saveAsTable("customer_360_view")
Cross-Cloud Data Patterns
Pattern 1: Cloud-Specific Workloads
class CloudWorkloadRouter:
"""Route analytics workloads to optimal cloud."""
def __init__(self):
self.workload_mapping = {
"ml_training": "aws", # Use SageMaker
"streaming": "gcp", # Use Dataflow
"bi_reporting": "azure", # Use Power BI
"data_warehouse": "azure" # Use Fabric
}
def get_data_for_workload(
self,
workload_type: str,
required_tables: list[str]
) -> dict:
"""Prepare data for cloud-specific workload."""
target_cloud = self.workload_mapping.get(workload_type, "azure")
if target_cloud == "azure":
# Data already in Fabric, use directly
return {
"location": "fabric",
"tables": required_tables,
"action": "query_in_place"
}
else:
# Need to export data to target cloud
return {
"location": target_cloud,
"tables": required_tables,
"action": "export_required",
"export_format": "delta"
}
def export_to_cloud(
self,
tables: list[str],
target_cloud: str,
target_path: str
):
"""Export data to another cloud for processing."""
for table in tables:
df = spark.read.format("delta").table(table)
if target_cloud == "aws":
df.write.format("delta") \
.mode("overwrite") \
.save(f"s3a://{target_path}/{table}")
elif target_cloud == "gcp":
df.write.format("delta") \
.mode("overwrite") \
.save(f"gs://{target_path}/{table}")
Pattern 2: Follow-the-Sun Processing
class FollowTheSunProcessor:
"""Process data in the region where it's daytime."""
def __init__(self):
self.regional_config = {
"americas": {
"cloud": "aws",
"region": "us-east-1",
"active_hours": (9, 17), # EST
"timezone": "America/New_York"
},
"europe": {
"cloud": "azure",
"region": "westeurope",
"active_hours": (9, 17), # CET
"timezone": "Europe/Paris"
},
"asia": {
"cloud": "gcp",
"region": "asia-east1",
"active_hours": (9, 17), # JST
"timezone": "Asia/Tokyo"
}
}
def get_active_region(self) -> str:
"""Determine which region is currently in business hours."""
from datetime import datetime
import pytz
for region, config in self.regional_config.items():
tz = pytz.timezone(config["timezone"])
local_time = datetime.now(tz)
if config["active_hours"][0] <= local_time.hour < config["active_hours"][1]:
return region
return "americas" # Default
def route_query(self, query: str) -> dict:
"""Route query to active regional processing."""
active_region = self.get_active_region()
config = self.regional_config[active_region]
return {
"region": active_region,
"cloud": config["cloud"],
"execute_in": config["region"],
"query": query
}
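And a small example of routing a query to whichever region is currently in business hours:

# Route a query to the currently active region
processor = FollowTheSunProcessor()
routed = processor.route_query(
    "SELECT region, SUM(amount) FROM aws_transactions GROUP BY region"
)
print(f"Execute on {routed['cloud']} in {routed['execute_in']} ({routed['region']})")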
Cost Optimization
Cross-Cloud Data Transfer Costs
class CrossCloudCostOptimizer:
"""Optimize costs for cross-cloud analytics."""
def __init__(self):
# Approximate egress costs per GB
self.egress_costs = {
"azure": 0.087, # Azure to internet
"aws": 0.09, # AWS to internet
"gcp": 0.12 # GCP to internet
}
        # Illustrative per-GB rates for cloud-to-cloud transfers
        # (actual egress pricing varies by region and route)
self.inter_cloud_costs = {
("azure", "aws"): 0.02,
("azure", "gcp"): 0.02,
("aws", "azure"): 0.02,
("aws", "gcp"): 0.02,
("gcp", "azure"): 0.02,
("gcp", "aws"): 0.02
}
def estimate_query_cost(
self,
tables_accessed: list[dict],
destination_cloud: str
) -> dict:
"""Estimate data transfer cost for a cross-cloud query."""
total_cost = 0.0
transfer_details = []
for table in tables_accessed:
source_cloud = table["cloud"]
estimated_size_gb = table["estimated_size_gb"]
if source_cloud != destination_cloud:
cost_per_gb = self.inter_cloud_costs.get(
(source_cloud, destination_cloud),
0.02
)
transfer_cost = estimated_size_gb * cost_per_gb
total_cost += transfer_cost
transfer_details.append({
"table": table["name"],
"from": source_cloud,
"to": destination_cloud,
"size_gb": estimated_size_gb,
"cost": transfer_cost
})
return {
"total_estimated_cost": total_cost,
"transfers": transfer_details,
"recommendation": self._get_recommendation(transfer_details)
}
def _get_recommendation(self, transfers: list) -> str:
"""Provide cost optimization recommendation."""
if not transfers:
return "Query is local, no optimization needed"
total_size = sum(t["size_gb"] for t in transfers)
if total_size > 100:
return "Consider materializing frequently joined data locally"
elif total_size > 10:
return "Consider caching query results"
else:
return "Transfer cost is minimal"
# Usage
optimizer = CrossCloudCostOptimizer()
cost_estimate = optimizer.estimate_query_cost(
tables_accessed=[
{"name": "azure_customers", "cloud": "azure", "estimated_size_gb": 5},
{"name": "aws_transactions", "cloud": "aws", "estimated_size_gb": 50},
{"name": "gcp_analytics", "cloud": "gcp", "estimated_size_gb": 20}
],
destination_cloud="azure"
)
print(f"Estimated cost: ${cost_estimate['total_estimated_cost']:.2f}")
print(f"Recommendation: {cost_estimate['recommendation']}")
Materialization Strategy
def optimize_cross_cloud_joins(
query_history: list[dict],
materialization_threshold_gb: float = 10
) -> list[dict]:
"""Identify cross-cloud joins that should be materialized."""
join_patterns = {}
for query in query_history:
tables = query.get("tables_joined", [])
if len(tables) > 1:
key = tuple(sorted(tables))
if key not in join_patterns:
join_patterns[key] = {
"tables": tables,
"count": 0,
"total_data_transferred_gb": 0
}
join_patterns[key]["count"] += 1
join_patterns[key]["total_data_transferred_gb"] += query.get(
"data_transferred_gb", 0
)
recommendations = []
for pattern, stats in join_patterns.items():
if stats["total_data_transferred_gb"] > materialization_threshold_gb:
recommendations.append({
"tables": stats["tables"],
"query_count": stats["count"],
"total_transfer_gb": stats["total_data_transferred_gb"],
"action": "Create materialized view in Fabric",
"estimated_savings": f"${stats['total_data_transferred_gb'] * 0.02:.2f}/month"
})
return recommendations
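A usage sketch with a hypothetical query log, shaped to match the fields the function reads:

# query_history entries are illustrative
history = [
    {"tables_joined": ["azure_customers", "aws_transactions"], "data_transferred_gb": 12.5},
    {"tables_joined": ["azure_customers", "aws_transactions"], "data_transferred_gb": 14.0},
    {"tables_joined": ["gcp_analytics"], "data_transferred_gb": 3.0},
]
for rec in optimize_cross_cloud_joins(history):
    print(rec["tables"], "->", rec["action"], rec["estimated_savings"])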
Best Practices
- Minimize cross-cloud joins: Materialize frequently joined data
- Use shortcuts wisely: Shortcuts give live, zero-copy access, so favor them for data that changes frequently; copy static datasets once instead of reading them repeatedly across clouds
- Consider latency: Cross-cloud queries have higher latency
- Monitor egress costs: Data transfer can be expensive
- Cache hot data locally: Keep frequently accessed data in Fabric (a minimal refresh sketch follows this list)
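The caching sketch reuses the aws_transactions shortcut from earlier; the 90-day window and local table name are illustrative, and the job would typically run on a schedule (for example, a notebook activity in a Fabric pipeline):

# Keep a local copy of recent, frequently joined transactions
hot = spark.read.format("delta").load("Tables/aws_transactions") \
    .filter("transaction_date >= date_sub(current_date(), 90)")
hot.write.format("delta").mode("overwrite").saveAsTable("aws_transactions_local")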
Conclusion
Cross-cloud analytics in Microsoft Fabric enables true multi-cloud data strategies. Use OneLake shortcuts to create a unified view of your data estate, but be mindful of data transfer costs and latency.
Start with read-only analytics across clouds, then materialize high-value, frequently accessed data locally for optimal performance and cost.