SQL AI Functions in Databricks: LLMs Meet SQL
Databricks SQL AI functions bring large language model capabilities directly into SQL queries. Process text, generate insights, and enrich data without leaving SQL.
Available AI Functions
-- Overview of Databricks SQL AI Functions
-- 1. ai_analyze_sentiment() - Sentiment analysis
SELECT
review_id,
review_text,
ai_analyze_sentiment(review_text) as sentiment
FROM product_reviews;
-- 2. ai_classify() - Text classification
SELECT
ticket_id,
description,
ai_classify(description, ARRAY('bug', 'feature', 'question', 'other')) as category
FROM support_tickets;
-- 3. ai_extract() - Entity extraction
SELECT
email_id,
ai_extract(email_body, ARRAY('person_name', 'company', 'date')) as entities
FROM emails;
-- 4. ai_fix_grammar() - Grammar correction
SELECT
document_id,
ai_fix_grammar(content) as corrected_content
FROM documents;
-- 5. ai_gen() - Text generation
SELECT
product_id,
product_name,
ai_gen(CONCAT('Write a short marketing description for: ', product_name)) as description
FROM products;
-- 6. ai_mask() - PII masking
SELECT
customer_id,
ai_mask(notes, ARRAY('email', 'phone', 'ssn')) as masked_notes
FROM customer_notes;
-- 7. ai_query() - Custom LLM queries
SELECT
data_id,
ai_query('databricks-meta-llama-3-70b-instruct', custom_prompt) as result
FROM data_for_analysis;
-- 8. ai_similarity() - Semantic similarity
SELECT
doc1_id,
doc2_id,
ai_similarity(doc1_text, doc2_text) as similarity_score
FROM document_pairs;
-- 9. ai_summarize() - Text summarization
SELECT
article_id,
ai_summarize(article_content, 100) as summary
FROM articles;
-- 10. ai_translate() - Translation
SELECT
message_id,
original_text,
ai_translate(original_text, 'en') as english_translation
FROM messages;
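Before wiring these into pipelines, it helps to smoke-test a function against literal values from the SQL editor; the sketch below assumes AI Functions are enabled on your workspace and that you are on a supported SQL warehouse.
-- Quick smoke test on literal inputs; no tables required
SELECT
  ai_analyze_sentiment('The delivery was late but the product itself is great') as sentiment,
  ai_classify('My invoice total looks wrong', ARRAY('billing', 'shipping', 'other')) as category;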
Practical Use Cases
-- Use Case 1: Customer Feedback Analysis Pipeline
-- Create a view that enriches feedback with AI analysis
CREATE OR REPLACE VIEW enriched_feedback AS
SELECT
feedback_id,
customer_id,
feedback_text,
feedback_date,
-- Sentiment analysis
ai_analyze_sentiment(feedback_text) as sentiment,
-- Categorization
ai_classify(
feedback_text,
ARRAY('product_quality', 'shipping', 'customer_service', 'pricing', 'other')
) as category,
-- Extract mentioned products
ai_extract(feedback_text, ARRAY('product_name')) as mentioned_products,
-- Generate response suggestion
ai_gen(
CONCAT(
'As a customer service agent, write a brief, professional response to this feedback: ',
feedback_text
)
) as suggested_response
FROM customer_feedback
WHERE feedback_date >= DATE_SUB(CURRENT_DATE, 30);
-- Query the enriched view
SELECT
category,
COUNT(*) as count,
AVG(CASE WHEN sentiment = 'positive' THEN 1 ELSE 0 END) as positive_rate
FROM enriched_feedback
GROUP BY category
ORDER BY count DESC;
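Because the view re-invokes the AI functions on every query, a common variation (sketched here with an illustrative table name) is to materialize the results on a schedule so dashboards read plain columns instead of triggering new model calls.
-- Materialize the enriched view so each feedback row is scored once per run
-- (enriched_feedback_snapshot is an illustrative table name)
CREATE OR REPLACE TABLE enriched_feedback_snapshot AS
SELECT * FROM enriched_feedback;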
-- Use Case 2: Document Processing Pipeline
-- Process legal documents
WITH document_analysis AS (
SELECT
document_id,
document_type,
content,
-- Summarize document
ai_summarize(content, 200) as summary,
-- Extract key entities
ai_extract(
content,
ARRAY('party_names', 'dates', 'monetary_values', 'legal_terms')
) as entities,
-- Classify document type
ai_classify(
content,
ARRAY('contract', 'amendment', 'nda', 'agreement', 'notice')
) as classified_type,
-- Check for specific clauses
ai_query(
'databricks-meta-llama-3-70b-instruct',
CONCAT(
'Does this document contain a non-compete clause? Answer YES or NO only. Document: ',
SUBSTRING(content, 1, 3000)
)
) as has_noncompete
FROM legal_documents
WHERE ingestion_date >= DATE_SUB(CURRENT_DATE, 7)
)
SELECT
document_id,
document_type,
classified_type,
summary,
entities,
CASE WHEN UPPER(has_noncompete) LIKE '%YES%' THEN TRUE ELSE FALSE END as has_noncompete_clause
FROM document_analysis;
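A natural follow-up, assuming the analysis above is persisted to a table (legal_document_analysis is an illustrative name), is to flag documents whose stored type disagrees with the model's classification so they can be reviewed manually.
-- Surface documents where the stored type and the AI classification disagree
SELECT
  document_id,
  document_type,
  classified_type
FROM legal_document_analysis
WHERE document_type <> classified_type;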
-- Use Case 3: Product Catalog Enrichment
-- Enrich product data with AI-generated content
CREATE OR REPLACE TABLE enriched_products AS
SELECT
product_id,
product_name,
raw_description,
category,
-- Generate marketing description
ai_gen(
CONCAT(
'Write an engaging 2-sentence product description for an e-commerce website. ',
'Product: ', product_name, '. ',
'Category: ', category, '. ',
'Features: ', raw_description
)
) as marketing_description,
-- Generate SEO keywords
ai_query(
'databricks-meta-llama-3-70b-instruct',
CONCAT(
'List 5 SEO keywords for this product, separated by commas: ',
product_name, ' - ', raw_description
)
) as seo_keywords,
-- Translate to Spanish
ai_translate(raw_description, 'es') as description_es,
-- Translate to French
ai_translate(raw_description, 'fr') as description_fr
FROM products
WHERE needs_enrichment = TRUE;
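If the enrichment runs incrementally, one option (a sketch that reuses the flag column from the example above) is to clear needs_enrichment for rows that were just processed, so the next run only picks up new products.
-- Reset the flag after enrichment; Delta tables support UPDATE directly
UPDATE products
SET needs_enrichment = FALSE
WHERE product_id IN (SELECT product_id FROM enriched_products);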
Performance Optimization
-- Optimizing AI function usage
-- 1. Batch processing with windowing
WITH batched AS (
SELECT
id,
text,
ROW_NUMBER() OVER (ORDER BY id) as row_num,
FLOOR((ROW_NUMBER() OVER (ORDER BY id) - 1) / 100) as batch_id
FROM large_text_table
)
SELECT
id,
text,
ai_analyze_sentiment(text) as sentiment
FROM batched
WHERE batch_id = 0; -- Process one batch at a time
-- 2. Caching results in a delta table
CREATE TABLE IF NOT EXISTS sentiment_cache (
text_hash STRING,
text STRING,
sentiment STRING,
analyzed_at TIMESTAMP
);
-- Insert only new texts
INSERT INTO sentiment_cache
SELECT
md5(text) as text_hash,
text,
ai_analyze_sentiment(text) as sentiment,
CURRENT_TIMESTAMP as analyzed_at
FROM source_table s
WHERE NOT EXISTS (
SELECT 1 FROM sentiment_cache c
WHERE c.text_hash = md5(s.text)
);
-- 3. Limit text length to reduce tokens
SELECT
id,
ai_summarize(SUBSTRING(long_text, 1, 5000), 100) as summary
FROM documents;
-- 4. Use appropriate models for task
-- For simple tasks, use faster models
SELECT
ai_query(
'databricks-meta-llama-3-8b-instruct', -- Faster, smaller model
simple_prompt
) as result
FROM simple_tasks;
-- For complex tasks, use larger models
SELECT
ai_query(
'databricks-meta-llama-3-70b-instruct', -- More capable model
complex_prompt
) as result
FROM complex_tasks;
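To actually serve results from the cache built in tip 2, a lookup join on the text hash avoids re-calling the model for text that has already been scored; this sketch uses the tables defined above.
-- Read sentiment from the cache instead of re-invoking the AI function
SELECT
  s.text,
  c.sentiment
FROM source_table s
JOIN sentiment_cache c
  ON c.text_hash = md5(s.text);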
Error Handling
-- Handle potential AI function failures
SELECT
id,
text,
CASE
WHEN text IS NULL OR LENGTH(TRIM(text)) = 0 THEN 'EMPTY_INPUT'
WHEN LENGTH(text) > 10000 THEN ai_analyze_sentiment(SUBSTRING(text, 1, 10000))
ELSE ai_analyze_sentiment(text)
END as sentiment
FROM texts;
-- Compute the result once, then derive a status from it (AI functions return NULL for NULL input)
WITH scored AS (
SELECT
id,
text,
ai_analyze_sentiment(text) as sentiment
FROM texts
)
SELECT
id,
text,
sentiment,
CASE WHEN sentiment IS NULL THEN 'FAILED' ELSE 'SUCCESS' END as status
FROM scored;
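For ai_query specifically, Databricks documents an optional failOnError argument that surfaces model errors in the response instead of failing the whole statement; the sketch below assumes the argument is available on your runtime version.
-- Sketch: with failOnError => false, ai_query returns a struct that carries an
-- error message field on failure rather than aborting the query
-- (verify availability on your Databricks runtime / SQL warehouse)
SELECT
  id,
  ai_query(
    'databricks-meta-llama-3-70b-instruct',
    text,
    failOnError => false
  ) as response
FROM texts;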
Cost Management
# Rough, illustrative per-call cost estimates; actual billing is token-based
# and varies by model, input length, and pricing tier
AI_FUNCTION_COSTS = {
"ai_analyze_sentiment": "~0.001 per call",
"ai_classify": "~0.001 per call",
"ai_summarize": "~0.002-0.01 depending on length",
"ai_query": "Varies by model and tokens",
"ai_translate": "~0.002 per call"
}
# Best practices for cost management:
COST_TIPS = [
"Cache results for repeated text",
"Batch process during off-peak hours",
"Use smaller models for simple tasks",
"Limit input text length",
"Sample data for exploration, full data for production"
]
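If system tables are enabled on the account, the billing usage table gives a rough view of the serverless compute behind these calls; the sketch below is an assumption to adapt, since the relevant SKU names vary by workspace.
-- Rough daily usage by SKU from the billing system table (requires system
-- tables to be enabled; add a SKU filter appropriate to your environment)
SELECT
  usage_date,
  sku_name,
  SUM(usage_quantity) as total_usage
FROM system.billing.usage
WHERE usage_date >= DATE_SUB(CURRENT_DATE, 30)
GROUP BY usage_date, sku_name
ORDER BY usage_date, sku_name;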
Conclusion
SQL AI functions bring LLM capabilities directly into your data workflows. Use them for text analysis, content generation, and data enrichment while following best practices for performance and cost.