4 min read
Azure Databricks April 2024 Updates: AI and Analytics Convergence
Azure Databricks continues to evolve with powerful AI/BI features. April 2024 brings significant updates that blur the line between data engineering and business intelligence.
Key Updates Overview
# Reference data only: summary of the April 2024 Azure Databricks release,
# grouped by product area. Each area maps to the headline features covered
# in the sections below.
DATABRICKS_APRIL_2024 = {
"ai_bi": {
"features": [
"Genie - Natural language data exploration",
"AI/BI Dashboards - AI-powered visualizations",
"Lakeview Dashboards - Lightweight BI"
]
},
"ai_functions": {
"features": [
"ai_query() for inline LLM calls",
"ai_analyze_sentiment()",
"ai_summarize()",
"ai_translate()"
]
},
"vector_search": {
"features": [
"Managed vector search indexes",
"Automatic embedding generation",
"Similarity search at scale"
]
},
"model_serving": {
"features": [
"Serverless model endpoints",
"Foundation model APIs",
"Provisioned throughput"
]
}
}
Connecting to Azure Databricks
from databricks import sql
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.sql import StatementState
# Connection using Databricks SDK
# NOTE(review): host and token values below are placeholders; in real code
# load them from environment variables or a Databricks secret scope rather
# than hardcoding a PAT in source.
client = WorkspaceClient(
host="https://adb-xxxxx.azuredatabricks.net",
token="dapi_xxxxx"
)
# SQL connection for queries
# Direct SQL warehouse connection via the databricks-sql-connector;
# http_path identifies the warehouse to run statements on.
connection = sql.connect(
server_hostname="adb-xxxxx.azuredatabricks.net",
http_path="/sql/1.0/warehouses/xxxxx",
access_token="dapi_xxxxx"
)
cursor = connection.cursor()
# Fetch a small sample from a Unity Catalog table (three-level name).
cursor.execute("SELECT * FROM catalog.schema.table LIMIT 10")
results = cursor.fetchall()
# NOTE(review): cursor and connection are never closed in this snippet —
# use try/finally or contextlib.closing() in production code.
Using SQL AI Functions
-- AI Functions in Databricks SQL
-- Sentiment analysis: returns a sentiment label for each review.
SELECT
review_id,
review_text,
ai_analyze_sentiment(review_text) as sentiment
FROM reviews
WHERE date >= '2024-04-01';
-- Text summarization: the second argument caps the summary length (words).
SELECT
article_id,
ai_summarize(article_content, 100) as summary
FROM articles
WHERE category = 'technology';
-- Translation: ai_translate takes the text and the TARGET language only
-- (the source language is detected automatically), so translating the
-- Spanish messages to English needs just 'en' — the original three-argument
-- call ai_translate(text, 'es', 'en') does not match the function signature.
SELECT
message_id,
original_text,
ai_translate(original_text, 'en') as english_text
FROM customer_messages
WHERE language = 'Spanish';
-- Custom AI query: the first argument names a model serving endpoint
-- available in the workspace.
SELECT
product_id,
description,
ai_query(
'claude-3-haiku',
CONCAT('Extract key features from this product description: ', description)
) as features
FROM products;
Working with Genie (AI/BI)
# Genie enables natural language data exploration
# Example: Using Genie programmatically via API
import requests
class GenieClient:
"""Client for Databricks Genie API"""
def __init__(self, workspace_url: str, token: str):
self.base_url = f"{workspace_url}/api/2.0/genie"
self.headers = {
"Authorization": f"Bearer {token}",
"Content-Type": "application/json"
}
def ask(
self,
space_id: str,
question: str,
context: dict = None
) -> dict:
"""Ask Genie a question"""
payload = {
"space_id": space_id,
"message": question
}
if context:
payload["context"] = context
response = requests.post(
f"{self.base_url}/conversations/messages",
headers=self.headers,
json=payload
)
return response.json()
def get_sql(self, conversation_id: str, message_id: str) -> str:
"""Get the SQL generated by Genie"""
response = requests.get(
f"{self.base_url}/conversations/{conversation_id}/messages/{message_id}/query",
headers=self.headers
)
return response.json().get("query_text")
# Usage
# Instantiate the client with a workspace URL and personal access token
# (placeholder values shown).
genie = GenieClient(
workspace_url="https://adb-xxxxx.azuredatabricks.net",
token="dapi_xxxxx"
)
# Ask a question
# Sends a natural-language question to the given Genie space; the raw
# response payload is printed as-is.
result = genie.ask(
space_id="your-genie-space-id",
question="What were total sales by region last month?"
)
print(result)
Vector Search Integration
from databricks.vector_search.client import VectorSearchClient

# Initialize client (credentials are picked up from the environment /
# notebook context).
vsc = VectorSearchClient()

# Create a Delta-Sync vector index: Databricks keeps the index in sync with
# the source Delta table and generates embeddings from the `text` column
# using the hosted BGE model endpoint.
index = vsc.create_delta_sync_index(
    endpoint_name="vector_search_endpoint",
    source_table_name="catalog.schema.documents",
    index_name="catalog.schema.doc_index",
    primary_key="doc_id",
    embedding_source_column="text",
    embedding_model_endpoint_name="databricks-bge-large-en",
    pipeline_type="TRIGGERED"  # sync runs on demand rather than continuously
)

# Search for similar documents.
results = index.similarity_search(
    query_text="How do I configure auto-scaling in Databricks?",
    columns=["doc_id", "title", "text"],
    num_results=5
)

# Each result row holds the requested columns in order with the similarity
# score appended as the LAST element, so the title is doc[1] and the score
# is doc[-1]. (The original printed doc[0] — the doc_id — as the score and
# doc[2] — the text — as the title.)
for doc in results["result"]["data_array"]:
    print(f"Score: {doc[-1]:.3f} - {doc[1]}")  # score and title
Model Serving for AI Applications
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import (
    EndpointCoreConfigInput,
    ServedEntityInput
)

# Authenticates from the environment (DATABRICKS_HOST / DATABRICKS_TOKEN
# or a configuration profile).
client = WorkspaceClient()

# Create a model serving endpoint and block until it is ready.
endpoint = client.serving_endpoints.create_and_wait(
    name="my-ai-endpoint",
    config=EndpointCoreConfigInput(
        served_entities=[
            ServedEntityInput(
                entity_name="catalog.schema.my_model",  # UC three-level name
                entity_version="1",
                workload_size="Small",
                scale_to_zero_enabled=True  # idle endpoint scales to zero
            )
        ]
    )
)

# Query the endpoint. Serving endpoints are invoked at
#   {workspace_host}/serving-endpoints/{endpoint_name}/invocations
# The original snippet referenced an undefined `token` variable and a
# deployment_url field; reuse the SDK client's host and token instead.
import requests
response = requests.post(
    f"{client.config.host}/serving-endpoints/{endpoint.name}/invocations",
    headers={"Authorization": f"Bearer {client.config.token}"},
    json={"inputs": {"text": "Analyze this customer feedback..."}},
    timeout=60  # model inference can be slow; avoid hanging forever
)
print(response.json())
Unity Catalog ML Integration
import mlflow
from mlflow.tracking import MlflowClient

# Point the MLflow model registry at Unity Catalog instead of the
# workspace-local registry.
mlflow.set_registry_uri("databricks-uc")

# Log and register a model under a three-level Unity Catalog name.
# NOTE(review): `model` must be a fitted scikit-learn estimator defined
# earlier in the notebook/session.
with mlflow.start_run():
    mlflow.sklearn.log_model(
        model,
        "model",
        registered_model_name="catalog.schema.my_model"
    )

# Load the model back from Unity Catalog. UC registered models do NOT
# support registry stages, so "models:/.../Production" (the original URI)
# fails — reference a version number or an alias such as @champion instead.
model = mlflow.pyfunc.load_model("models:/catalog.schema.my_model@champion")

# Score new data with the loaded model.
predictions = model.predict(data)
Creating AI/BI Dashboards
# Dashboard creation via API
def create_dashboard(
    workspace_client: "WorkspaceClient",
    name: str,
    warehouse_id: str,
    queries: list
) -> dict:
    """Build a Lakeview dashboard specification with one widget per query.

    Args:
        workspace_client: Workspace client that would perform the create
            call (the call itself is left as pseudo-code below).
        name: Display name for the dashboard.
        warehouse_id: SQL warehouse the dashboard's queries run on.
        queries: List of dicts. Entries with ``type == "text"`` must carry
            a ``text`` key (markdown); all other entries must carry a
            ``sql`` key.

    Returns:
        The dashboard specification dict (single "Overview" page).
    """
    dashboard_spec = {
        "displayName": name,
        "warehouseId": warehouse_id,
        "pages": [{
            "name": "Overview",
            "widgets": []
        }]
    }
    widgets = dashboard_spec["pages"][0]["widgets"]
    # Text entries become textboxes; everything else becomes a query widget.
    # (Rewritten from a conditional-dict-key expression for readability.)
    for i, query in enumerate(queries):
        if query["type"] == "text":
            spec_key, spec_value = "textbox_spec", {"text": query["text"]}
        else:
            spec_key, spec_value = "query_spec", {"query": query["sql"]}
        widgets.append({"name": f"widget_{i}", spec_key: spec_value})
    # Create dashboard (pseudo-code - actual API may vary)
    # response = workspace_client.dashboards.create(dashboard_spec)
    return dashboard_spec
# Example usage
queries = [
{"type": "kpi", "sql": "SELECT SUM(revenue) as total_revenue FROM sales"},
{"type": "chart", "sql": "SELECT region, SUM(revenue) FROM sales GROUP BY region"},
{"type": "text", "text": "## Sales Overview\nKey metrics for the current period."}
]
dashboard = create_dashboard(client, "Sales Dashboard", "warehouse_id", queries)
Conclusion
Azure Databricks April 2024 updates bring AI capabilities directly into the data platform. From SQL AI functions to Genie’s natural language interface, these features make advanced analytics accessible to all users.