4 min read
Azure Databricks April 2024 Updates: AI and Analytics Convergence
Azure Databricks continues to evolve with powerful AI/BI features. April 2024 brings significant updates that blur the line between data engineering and business intelligence.
Key Updates Overview
# Reference data only: summary of the April 2024 Azure Databricks release,
# grouped by product area. Each area maps to the headline features covered
# in the sections below.
DATABRICKS_APRIL_2024 = {
"ai_bi": {
"features": [
"Genie - Natural language data exploration",
"AI/BI Dashboards - AI-powered visualizations",
"Lakeview Dashboards - Lightweight BI"
]
},
"ai_functions": {
"features": [
"ai_query() for inline LLM calls",
"ai_analyze_sentiment()",
"ai_summarize()",
"ai_translate()"
]
},
"vector_search": {
"features": [
"Managed vector search indexes",
"Automatic embedding generation",
"Similarity search at scale"
]
},
"model_serving": {
"features": [
"Serverless model endpoints",
"Foundation model APIs",
"Provisioned throughput"
]
}
}
Connecting to Azure Databricks
from databricks import sql
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.sql import StatementState
# Connection using Databricks SDK
# NOTE(review): host and token values below are placeholders; in real code
# load them from environment variables or a Databricks secret scope rather
# than hardcoding a PAT in source.
client = WorkspaceClient(
host="https://adb-xxxxx.azuredatabricks.net",
token="dapi_xxxxx"
)
# SQL connection for queries
# Direct SQL warehouse connection via the databricks-sql-connector;
# http_path identifies the warehouse to run statements on.
connection = sql.connect(
server_hostname="adb-xxxxx.azuredatabricks.net",
http_path="/sql/1.0/warehouses/xxxxx",
access_token="dapi_xxxxx"
)
cursor = connection.cursor()
# Fetch a small sample from a Unity Catalog table (three-level name).
cursor.execute("SELECT * FROM catalog.schema.table LIMIT 10")
results = cursor.fetchall()
# NOTE(review): cursor and connection are never closed in this snippet —
# use try/finally or contextlib.closing() in production code.
Using SQL AI Functions
-- AI Functions in Databricks SQL
-- Sentiment analysis: returns a sentiment label for each review.
SELECT
review_id,
review_text,
ai_analyze_sentiment(review_text) as sentiment
FROM reviews
WHERE date >= '2024-04-01';
-- Text summarization: the second argument caps the summary length (words).
SELECT
article_id,
ai_summarize(article_content, 100) as summary
FROM articles
WHERE category = 'technology';
-- Translation: ai_translate takes the text and the TARGET language only
-- (the source language is detected automatically), so translating the
-- Spanish messages to English needs just 'en' — the original three-argument
-- call ai_translate(text, 'es', 'en') does not match the function signature.
SELECT
message_id,
original_text,
ai_translate(original_text, 'en') as english_text
FROM customer_messages
WHERE language = 'Spanish';
-- Custom AI query: the first argument names a model serving endpoint
-- available in the workspace.
SELECT
product_id,
description,
ai_query(
'claude-3-haiku',
CONCAT('Extract key features from this product description: ', description)
) as features
FROM products;
Working with Genie (AI/BI)
# Genie enables natural language data exploration
# Example: Using Genie programmatically via API
import requests
class GenieClient:
"""Client for Databricks Genie API"""
def __init__(self, workspace_url: str, token: str):
self.base_url = f"{workspace_url}/api/2.0/genie"
self.headers = {
"Authorization": f"Bearer {token}",
"Content-Type": "application/json"
}
def ask(
self,
space_id: str,
question: str,
context: dict = None
) -> dict:
"""Ask Genie a question"""
payload = {
"space_id": space_id,
"message": question
}
if context:
payload["context"] = context
response = requests.post(
f"{self.base_url}/conversations/messages",
headers=self.headers,
json=payload
)
return response.json()
def get_sql(self, conversation_id: str, message_id: str) -> str:
"""Get the SQL generated by Genie"""
response = requests.get(
f"{self.base_url}/conversations/{conversation_id}/messages/{message_id}/query",
headers=self.headers
)
return response.json().get("query_text")
# Usage
# Instantiate the client with a workspace URL and personal access token
# (placeholder values shown).
genie = GenieClient(
workspace_url="https://adb-xxxxx.azuredatabricks.net",
token="dapi_xxxxx"
)
# Ask a question
# Sends a natural-language question to the given Genie space; the raw
# response payload is printed as-is.
result = genie.ask(
space_id="your-genie-space-id",
question="What were total sales by region last month?"
)
print(result)
Vector Search Integration
from databricks.vector_search.client import VectorSearchClient

# Initialize client (credentials are picked up from the environment /
# notebook context).
vsc = VectorSearchClient()

# Create a Delta-Sync vector index: Databricks keeps the index in sync with
# the source Delta table and generates embeddings from the `text` column
# using the hosted BGE model endpoint.
index = vsc.create_delta_sync_index(
    endpoint_name="vector_search_endpoint",
    source_table_name="catalog.schema.documents",
    index_name="catalog.schema.doc_index",
    primary_key="doc_id",
    embedding_source_column="text",
    embedding_model_endpoint_name="databricks-bge-large-en",
    pipeline_type="TRIGGERED"  # sync runs on demand rather than continuously
)

# Search for similar documents.
results = index.similarity_search(
    query_text="How do I configure auto-scaling in Databricks?",
    columns=["doc_id", "title", "text"],
    num_results=5
)

# Each result row holds the requested columns in order with the similarity
# score appended as the LAST element, so the title is doc[1] and the score
# is doc[-1]. (The original printed doc[0] — the doc_id — as the score and
# doc[2] — the text — as the title.)
for doc in results["result"]["data_array"]:
    print(f"Score: {doc[-1]:.3f} - {doc[1]}")  # score and title
Model Serving for AI Applications
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import (
    EndpointCoreConfigInput,
    ServedEntityInput
)

# Authenticates from the environment (DATABRICKS_HOST / DATABRICKS_TOKEN
# or a configuration profile).
client = WorkspaceClient()

# Create a model serving endpoint and block until it is ready.
endpoint = client.serving_endpoints.create_and_wait(
    name="my-ai-endpoint",
    config=EndpointCoreConfigInput(
        served_entities=[
            ServedEntityInput(
                entity_name="catalog.schema.my_model",  # UC three-level name
                entity_version="1",
                workload_size="Small",
                scale_to_zero_enabled=True  # idle endpoint scales to zero
            )
        ]
    )
)

# Query the endpoint. Serving endpoints are invoked at
#   {workspace_host}/serving-endpoints/{endpoint_name}/invocations
# The original snippet referenced an undefined `token` variable and a
# deployment_url field; reuse the SDK client's host and token instead.
import requests
response = requests.post(
    f"{client.config.host}/serving-endpoints/{endpoint.name}/invocations",
    headers={"Authorization": f"Bearer {client.config.token}"},
    json={"inputs": {"text": "Analyze this customer feedback..."}},
    timeout=60  # model inference can be slow; avoid hanging forever
)
print(response.json())
Unity Catalog ML Integration
import mlflow
from mlflow.tracking import MlflowClient

# Point the MLflow model registry at Unity Catalog instead of the
# workspace-local registry.
mlflow.set_registry_uri("databricks-uc")

# Log and register a model under a three-level Unity Catalog name.
# NOTE(review): `model` must be a fitted scikit-learn estimator defined
# earlier in the notebook/session.
with mlflow.start_run():
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name="catalog.schema.my_model"
    )

# Load the model back from Unity Catalog. UC registered models do NOT
# support registry stages, so "models:/.../Production" (the original URI)
# fails — reference a version number or an alias such as @champion instead.
model = mlflow.pyfunc.load_model("models:/catalog.schema.my_model@champion")

# Score new data with the loaded model.
predictions = model.predict(data)
Creating AI/BI Dashboards
# Dashboard creation via API
def create_dashboard(
    workspace_client: "WorkspaceClient",
    name: str,
    warehouse_id: str,
    queries: list
) -> dict:
    """Build a Lakeview dashboard specification with one widget per query.

    Args:
        workspace_client: Workspace client that would perform the create
            call (the call itself is left as pseudo-code below).
        name: Display name for the dashboard.
        warehouse_id: SQL warehouse the dashboard's queries run on.
        queries: List of dicts. Entries with ``type == "text"`` must carry
            a ``text`` key (markdown); all other entries must carry a
            ``sql`` key.

    Returns:
        The dashboard specification dict (single "Overview" page).
    """
    dashboard_spec = {
        "displayName": name,
        "warehouseId": warehouse_id,
        "pages": [{
            "name": "Overview",
            "widgets": []
        }]
    }
    widgets = dashboard_spec["pages"][0]["widgets"]
    # Text entries become textboxes; everything else becomes a query widget.
    # (Rewritten from a conditional-dict-key expression for readability.)
    for i, query in enumerate(queries):
        if query["type"] == "text":
            spec_key, spec_value = "textbox_spec", {"text": query["text"]}
        else:
            spec_key, spec_value = "query_spec", {"query": query["sql"]}
        widgets.append({"name": f"widget_{i}", spec_key: spec_value})
    # Create dashboard (pseudo-code - actual API may vary)
    # response = workspace_client.dashboards.create(dashboard_spec)
    return dashboard_spec
# Example usage
queries = [
{"type": "kpi", "sql": "SELECT SUM(revenue) as total_revenue FROM sales"},
{"type": "chart", "sql": "SELECT region, SUM(revenue) FROM sales GROUP BY region"},
{"type": "text", "text": "## Sales Overview\nKey metrics for the current period."}
]
dashboard = create_dashboard(client, "Sales Dashboard", "warehouse_id", queries)
Conclusion
Azure Databricks April 2024 updates bring AI capabilities directly into the data platform. From SQL AI functions to Genie’s natural language interface, these features make advanced analytics accessible to all users.