
Microsoft Fabric Notebooks: Collaborative Data Science at Scale

Microsoft Fabric notebooks combine the interactivity of Jupyter with enterprise-grade collaboration and Spark compute. They are the ideal environment for data exploration, feature engineering, and model development.

Environment Setup

# Fabric notebooks come pre-configured with common libraries
# Read tables from the attached Lakehouse directly through Spark

from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

# Spark session is automatically available
spark = SparkSession.builder.getOrCreate()

# Read from Lakehouse tables directly
df = spark.read.table("lakehouse.sales_data")
print(f"Loaded {df.count()} rows")

Efficient Data Processing

from pyspark.sql import functions as F
from pyspark.sql.window import Window

def create_features(df):
    """Create time-series features for ML models."""

    # Row-based windows over each customer's order history (last 7 / last 30 orders,
    # ordered by order_date), used to approximate rolling aggregates
    window_7d = Window.partitionBy("customer_id").orderBy("order_date").rowsBetween(-6, 0)
    window_30d = Window.partitionBy("customer_id").orderBy("order_date").rowsBetween(-29, 0)

    features_df = df \
        .withColumn("rolling_7d_avg", F.avg("amount").over(window_7d)) \
        .withColumn("rolling_30d_avg", F.avg("amount").over(window_30d)) \
        .withColumn("rolling_7d_count", F.count("order_id").over(window_7d)) \
        .withColumn("rolling_30d_count", F.count("order_id").over(window_30d)) \
        .withColumn("days_since_first_order",
            F.datediff(F.col("order_date"),
                      F.min("order_date").over(Window.partitionBy("customer_id")))
        ) \
        .withColumn("day_of_week", F.dayofweek("order_date")) \
        .withColumn("month", F.month("order_date")) \
        .withColumn("is_weekend",
            F.when(F.col("day_of_week").isin(1, 7), 1).otherwise(0)
        )

    return features_df

# Apply feature engineering
features = create_features(df)
features.write.format("delta").mode("overwrite").saveAsTable("gold.customer_features")
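
Before downstream reports or models pick up the new table, it is worth a quick sanity check on what was written. A minimal sketch, reusing the gold.customer_features name from the write above:

# Read the feature table back and spot-check it
features_check = spark.read.table("gold.customer_features")
print(f"Feature rows: {features_check.count()}")

# Count unexpected nulls in the engineered columns
features_check.select(
    [F.count(F.when(F.col(c).isNull(), 1)).alias(f"null_{c}")
     for c in ["rolling_7d_avg", "rolling_30d_avg", "rolling_30d_count"]]
).show()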

Interactive Visualization

# Fabric notebooks support rich visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# Convert to pandas for visualization (sample for large datasets)
sample_df = features.sample(0.1).toPandas()

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Distribution of order amounts
axes[0, 0].hist(sample_df["amount"], bins=50, edgecolor="black")
axes[0, 0].set_title("Distribution of Order Amounts")
axes[0, 0].set_xlabel("Amount")

# Orders by day of week
daily_orders = sample_df.groupby("day_of_week")["order_id"].count()
axes[0, 1].bar(daily_orders.index, daily_orders.values)
axes[0, 1].set_title("Orders by Day of Week")

# Rolling average trend
sample_df_sorted = sample_df.sort_values("order_date")
axes[1, 0].plot(sample_df_sorted["order_date"], sample_df_sorted["rolling_7d_avg"])
axes[1, 0].set_title("7-Day Rolling Average")

# Customer segmentation
axes[1, 1].scatter(sample_df["rolling_30d_count"],
                   sample_df["rolling_30d_avg"],
                   alpha=0.5)
axes[1, 1].set_title("Customer Activity vs Spend")

plt.tight_layout()
plt.show()

MLflow Integration

import mlflow
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Fabric integrates with MLflow for experiment tracking
mlflow.set_experiment("customer_lifetime_value")

with mlflow.start_run(run_name="rf_baseline"):
    # Prepare data
    X = sample_df[["rolling_7d_avg", "rolling_30d_avg", "rolling_7d_count",
                   "days_since_first_order", "day_of_week", "is_weekend"]]
    y = sample_df["amount"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Train model
    model = RandomForestRegressor(n_estimators=100, max_depth=10)
    model.fit(X_train, y_train)

    # Log parameters and metrics
    mlflow.log_params({"n_estimators": 100, "max_depth": 10})
    mlflow.log_metric("train_score", model.score(X_train, y_train))
    mlflow.log_metric("test_score", model.score(X_test, y_test))

    # Log model
    mlflow.sklearn.log_model(model, "model")
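
To close the loop from experimentation to scoring, the logged model can be loaded back from its run and applied to a fresh batch of features. A minimal sketch, assuming the run ID is retrieved with mlflow.last_active_run() (available in recent MLflow versions); the runs:/ URI format is standard MLflow:

import mlflow.pyfunc

# Identify the run that was just completed and build its model URI
run_id = mlflow.last_active_run().info.run_id
model_uri = f"runs:/{run_id}/model"

# Load the model and score a new batch of engineered features
scoring_model = mlflow.pyfunc.load_model(model_uri)

feature_cols = ["rolling_7d_avg", "rolling_30d_avg", "rolling_7d_count",
                "days_since_first_order", "day_of_week", "is_weekend"]
new_batch = spark.read.table("gold.customer_features").limit(1000).toPandas()
predictions = scoring_model.predict(new_batch[feature_cols])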

Fabric notebooks provide a seamless path from exploration to production, with built-in governance, collaboration, and scalable compute.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.