5 min read
Databricks AI/BI: Intelligent Analytics Platform
Databricks AI/BI: Intelligent Analytics Platform
Databricks AI/BI combines the power of the lakehouse with AI-driven analytics. This guide explores how to leverage AI/BI for intelligent data analysis.
AI/BI Components
AI_BI_COMPONENTS = {
"genie": {
"description": "Natural language interface for data exploration",
"capabilities": [
"Conversational data queries",
"Automatic SQL generation",
"Context-aware responses",
"Multi-turn conversations"
]
},
"dashboards": {
"description": "AI-powered visualization and reporting",
"capabilities": [
"Auto-generated visualizations",
"Smart formatting",
"Natural language filters",
"Embedded AI insights"
]
},
"ai_functions": {
"description": "SQL functions powered by LLMs",
"capabilities": [
"Text analysis",
"Classification",
"Summarization",
"Custom queries"
]
}
}
Setting Up a Genie Space
# Genie Space configuration for a sales analytics use case
GENIE_SPACE_CONFIG = {
"name": "Sales Analytics Genie",
"description": "Natural language interface for sales data analysis",
"tables": [
{
"catalog": "main",
"schema": "sales",
"table": "fact_sales",
"description": "Sales transactions with amounts, dates, and dimensions"
},
{
"catalog": "main",
"schema": "sales",
"table": "dim_customer",
"description": "Customer information including segments and regions"
},
{
"catalog": "main",
"schema": "sales",
"table": "dim_product",
"description": "Product catalog with categories and pricing"
}
],
"instructions": """
This Genie space is for analyzing sales performance.
Key concepts:
- Revenue = SUM(amount)
- Units = SUM(quantity)
- Average Order Value = Revenue / COUNT(DISTINCT order_id)
When users ask about:
- "sales" they usually mean revenue
- "customers" refer to unique customer counts
- "products" refer to product performance
Always include appropriate filters for date ranges.
""",
"sample_questions": [
"What were total sales last month?",
"Show me the top 10 products by revenue",
"How do sales compare across regions?",
"Which customer segment is growing fastest?"
]
}
Implementing AI-Enhanced Queries
from databricks import sql
import json
class AIEnhancedQuery:
"""Execute AI-enhanced queries in Databricks"""
def __init__(self, connection):
self.connection = connection
self.cursor = connection.cursor()
def analyze_sentiment_batch(
self,
table: str,
text_column: str,
id_column: str,
batch_size: int = 1000
) -> dict:
"""Analyze sentiment for text data"""
query = f"""
SELECT
{id_column},
{text_column},
ai_analyze_sentiment({text_column}) as sentiment
FROM {table}
LIMIT {batch_size}
"""
self.cursor.execute(query)
results = self.cursor.fetchall()
# Aggregate results
sentiment_counts = {"positive": 0, "negative": 0, "neutral": 0}
for row in results:
sentiment = row[2].lower()
if sentiment in sentiment_counts:
sentiment_counts[sentiment] += 1
return {
"total_analyzed": len(results),
"sentiment_distribution": sentiment_counts,
"samples": results[:5]
}
def extract_entities(
self,
table: str,
text_column: str,
entity_types: list
) -> list:
"""Extract named entities from text"""
entity_prompt = f"Extract these entity types: {', '.join(entity_types)}"
query = f"""
SELECT
{text_column},
ai_query(
'databricks-meta-llama-3-70b-instruct',
CONCAT('{entity_prompt}. Text: ', {text_column})
) as entities
FROM {table}
LIMIT 100
"""
self.cursor.execute(query)
return self.cursor.fetchall()
def generate_summaries(
self,
table: str,
text_column: str,
group_column: str,
max_length: int = 100
) -> dict:
"""Generate summaries grouped by category"""
query = f"""
SELECT
{group_column},
ai_summarize(
CONCAT_WS(' ', COLLECT_LIST({text_column})),
{max_length}
) as summary
FROM {table}
GROUP BY {group_column}
"""
self.cursor.execute(query)
results = self.cursor.fetchall()
return {row[0]: row[1] for row in results}
def classify_text(
self,
table: str,
text_column: str,
categories: list
) -> list:
"""Classify text into predefined categories"""
categories_str = ", ".join(categories)
query = f"""
SELECT
{text_column},
ai_query(
'databricks-meta-llama-3-70b-instruct',
CONCAT(
'Classify this text into one of these categories: {categories_str}. ',
'Text: ', {text_column},
'. Return only the category name.'
)
) as category
FROM {table}
LIMIT 500
"""
self.cursor.execute(query)
return self.cursor.fetchall()
Building AI/BI Dashboards
# Dashboard with AI-generated insights
AIBI_DASHBOARD_SPEC = {
"name": "Sales Intelligence Dashboard",
"pages": [
{
"name": "Executive Summary",
"layout": "grid",
"widgets": [
{
"type": "kpi",
"position": {"row": 0, "col": 0, "width": 3, "height": 1},
"query": "SELECT SUM(revenue) as total_revenue FROM sales WHERE date >= DATE_SUB(CURRENT_DATE(), 30)",
"format": {"prefix": "$", "decimals": 0}
},
{
"type": "kpi",
"position": {"row": 0, "col": 3, "width": 3, "height": 1},
"query": "SELECT COUNT(DISTINCT customer_id) as active_customers FROM sales WHERE date >= DATE_SUB(CURRENT_DATE(), 30)",
"format": {"decimals": 0}
},
{
"type": "ai_insight",
"position": {"row": 1, "col": 0, "width": 6, "height": 2},
"query": """
SELECT
ai_query(
'databricks-meta-llama-3-70b-instruct',
CONCAT(
'Analyze these sales metrics and provide 3 key insights: ',
'Total Revenue: ', CAST(SUM(revenue) AS STRING),
', YoY Growth: ', CAST(AVG(yoy_growth) AS STRING), '%',
', Top Region: ', MAX(CASE WHEN rank = 1 THEN region END)
)
) as insight
FROM sales_summary
""",
"style": {"background": "light-blue"}
},
{
"type": "chart",
"chart_type": "line",
"position": {"row": 3, "col": 0, "width": 6, "height": 3},
"query": """
SELECT date, SUM(revenue) as revenue
FROM sales
WHERE date >= DATE_SUB(CURRENT_DATE(), 90)
GROUP BY date
ORDER BY date
"""
}
]
},
{
"name": "Customer Insights",
"widgets": [
{
"type": "ai_analysis",
"title": "Customer Sentiment Analysis",
"query": """
SELECT
segment,
ai_analyze_sentiment(feedback) as sentiment,
COUNT(*) as count
FROM customer_feedback
GROUP BY segment, ai_analyze_sentiment(feedback)
"""
}
]
}
]
}
Conversational Analytics with Genie
class GenieConversation:
"""Manage conversational analytics with Genie"""
def __init__(self, genie_client, space_id: str):
self.client = genie_client
self.space_id = space_id
self.conversation_id = None
self.history = []
def start_conversation(self) -> str:
"""Start a new conversation"""
response = self.client.create_conversation(self.space_id)
self.conversation_id = response["conversation_id"]
return self.conversation_id
def ask(self, question: str) -> dict:
"""Ask a question in the conversation"""
if not self.conversation_id:
self.start_conversation()
response = self.client.send_message(
conversation_id=self.conversation_id,
message=question
)
self.history.append({
"question": question,
"response": response
})
return {
"answer": response.get("text_response"),
"sql": response.get("generated_sql"),
"data": response.get("query_results"),
"visualization": response.get("suggested_chart")
}
def drill_down(self, dimension: str) -> dict:
"""Drill down on a dimension"""
return self.ask(f"Break that down by {dimension}")
def compare(self, comparison: str) -> dict:
"""Compare to another period or segment"""
return self.ask(f"Compare that to {comparison}")
def get_insights(self) -> dict:
"""Get AI-generated insights on current view"""
return self.ask("What insights can you give me about this data?")
# Usage example
# genie = GenieConversation(genie_client, "space_id")
# genie.start_conversation()
#
# result = genie.ask("What were sales last month?")
# print(result["answer"])
#
# result = genie.drill_down("region")
# print(result["answer"])
Conclusion
Databricks AI/BI brings intelligent analytics to the lakehouse. From natural language queries with Genie to AI-enhanced SQL functions, these capabilities make advanced analytics accessible to everyone.