1 min read
SQL AI Functions in Databricks: LLMs Meet SQL
I wrote “SQL AI Functions in Databricks: LLMs Meet SQL” to share practical, production-minded guidance on this topic.
Databricks SQL AI functions bring large language model capabilities directly into SQL queries. Process text, generate insights, and enrich data without leaving SQL.
Available AI Functions
-- Overview of Databricks SQL AI Functions
-- 1. ai_analyze_sentiment() - Sentiment analysis
SELECT
review_id,
review_text,
ai_analyze_sentiment(review_text) as sentiment
FROM product_reviews;
-- 2. ai_classify() - Text classification
SELECT
ticket_id,
description,
ai_classify(description, ARRAY('bug', 'feature', 'question', 'other')) as category
FROM support_tickets;
-- 3. ai_extract() - Entity extraction
SELECT
email_id,
ai_extract(email_body, ARRAY('person_name', 'company', 'date')) as entities
FROM emails;
-- 4. ai_fix_grammar() - Grammar correction
SELECT
document_id,
ai_fix_grammar(content) as corrected_content
FROM documents;
-- 5. ai_gen() - Text generation
SELECT
product_id,
product_name,
ai_gen(CONCAT('Write a short marketing description for: ', product_name)) as description
FROM products;
-- 6. ai_mask() - PII masking
SELECT
customer_id,
ai_mask(notes, ARRAY('email', 'phone', 'ssn')) as masked_notes
FROM customer_notes;
-- 7. ai_query() - Custom LLM queries
SELECT
data_id,
ai_query('databricks-meta-llama-3-70b-instruct', custom_prompt) as result
FROM data_for_analysis;
-- 8. ai_similarity() - Semantic similarity
SELECT
doc1_id,
doc2_id,
ai_similarity(doc1_text, doc2_text) as similarity_score
FROM document_pairs;
-- 9. ai_summarize() - Text summarization
SELECT
article_id,
ai_summarize(article_content, 100) as summary
FROM articles;
-- 10. ai_translate() - Translation
SELECT
message_id,
original_text,
ai_translate(original_text, 'es', 'en') as english_translation
FROM messages;
Practical Use Cases
-- Use Case 1: Customer Feedback Analysis Pipeline
-- Create a view that enriches feedback with AI analysis
CREATE OR REPLACE VIEW enriched_feedback AS
SELECT
feedback_id,
customer_id,
feedback_text,
feedback_date,
-- Sentiment analysis
ai_analyze_sentiment(feedback_text) as sentiment,
-- Categorization
ai_classify(
feedback_text,
ARRAY('product_quality', 'shipping', 'customer_service', 'pricing', 'other')
) as category,
-- Extract mentioned products
ai_extract(feedback_text, ARRAY('product_name')) as mentioned_products,
-- Generate response suggestion
ai_gen(
CONCAT(
'As a customer service agent, write a brief, professional response to this feedback: ',
feedback_text
)
) as suggested_response
FROM customer_feedback
WHERE feedback_date >= DATE_SUB(CURRENT_DATE, 30);
-- Query the enriched view
SELECT
sentiment,
category,
COUNT(*) as count,
AVG(CASE WHEN sentiment = 'positive' THEN 1 ELSE 0 END) as positive_rate
FROM enriched_feedback
GROUP BY sentiment, category
ORDER BY count DESC;
-- Use Case 2: Document Processing Pipeline
-- Process legal documents
WITH document_analysis AS (
SELECT
document_id,
document_type,
content,
-- Summarize document
ai_summarize(content, 200) as summary,
-- Extract key entities
ai_extract(
content,
ARRAY('party_names', 'dates', 'monetary_values', 'legal_terms')
) as entities,
-- Classify document type
ai_classify(
content,
ARRAY('contract', 'amendment', 'nda', 'agreement', 'notice')
) as classified_type,
-- Check for specific clauses
ai_query(
'databricks-meta-llama-3-70b-instruct',
CONCAT(
'Does this document contain a non-compete clause? Answer YES or NO only. Document: ',
SUBSTRING(content, 1, 3000)
)
) as has_noncompete
FROM legal_documents
WHERE ingestion_date >= DATE_SUB(CURRENT_DATE, 7)
)
SELECT
document_id,
document_type,
classified_type,
summary,
entities,
CASE WHEN UPPER(has_noncompete) LIKE '%YES%' THEN TRUE ELSE FALSE END as has_noncompete_clause
FROM document_analysis;
-- Use Case 3: Product Catalog Enrichment
-- Enrich product data with AI-generated content
CREATE OR REPLACE TABLE enriched_products AS
SELECT
product_id,
product_name,
raw_description,
category,
-- Generate marketing description
ai_gen(
CONCAT(
'Write an engaging 2-sentence product description for an e-commerce website. ',
'Product: ', product_name, '. ',
'Category: ', category, '. ',
'Features: ', raw_description
)
) as marketing_description,
-- Generate SEO keywords
ai_query(
'databricks-meta-llama-3-70b-instruct',
CONCAT(
'List 5 SEO keywords for this product, separated by commas: ',
product_name, ' - ', raw_description
)
) as seo_keywords,
-- Translate to Spanish
ai_translate(raw_description, 'en', 'es') as description_es,
-- Translate to French
ai_translate(raw_description, 'en', 'fr') as description_fr
FROM products
WHERE needs_enrichment = TRUE;
Performance Optimization
-- Optimizing AI function usage
-- 1. Batch processing with windowing
WITH batched AS (
SELECT
id,
text,
ROW_NUMBER() OVER (ORDER BY id) as row_num,
FLOOR((ROW_NUMBER() OVER (ORDER BY id) - 1) / 100) as batch_id
FROM large_text_table
)
SELECT
id,
text,
ai_analyze_sentiment(text) as sentiment
FROM batched
WHERE batch_id = 0 -- Process one batch at a time
;
-- 2. Caching results in a delta table
CREATE TABLE IF NOT EXISTS sentiment_cache (
text_hash STRING,
text STRING,
sentiment STRING,
analyzed_at TIMESTAMP
);
-- Insert only new texts
INSERT INTO sentiment_cache
SELECT
md5(text) as text_hash,
text,
ai_analyze_sentiment(text) as sentiment,
CURRENT_TIMESTAMP as analyzed_at
FROM source_table s
WHERE NOT EXISTS (
SELECT 1 FROM sentiment_cache c
WHERE c.text_hash = md5(s.text)
);
-- 3. Limit text length to reduce tokens
SELECT
id,
ai_summarize(SUBSTRING(long_text, 1, 5000), 100) as summary
FROM documents;
-- 4. Use appropriate models for task
-- For simple tasks, use faster models
SELECT
ai_query(
'databricks-meta-llama-3-8b-instruct', -- Faster, smaller model
simple_prompt
) as result
FROM simple_tasks;
-- For complex tasks, use larger models
SELECT
ai_query(
'databricks-meta-llama-3-70b-instruct', -- More capable model
complex_prompt
) as result
FROM complex_tasks;
Error Handling
-- Handle potential AI function failures
SELECT
id,
text,
CASE
WHEN text IS NULL OR LENGTH(TRIM(text)) = 0 THEN 'EMPTY_INPUT'
WHEN LENGTH(text) > 10000 THEN ai_analyze_sentiment(SUBSTRING(text, 1, 10000))
ELSE ai_analyze_sentiment(text)
END as sentiment
FROM texts;
-- Using TRY for graceful failure handling
SELECT
id,
text,
TRY(ai_analyze_sentiment(text)) as sentiment,
CASE WHEN TRY(ai_analyze_sentiment(text)) IS NULL THEN 'FAILED' ELSE 'SUCCESS' END as status
FROM texts;
Cost Management
# Monitor AI function costs
AI_FUNCTION_COSTS = {
"ai_analyze_sentiment": "~0.001 per call",
"ai_classify": "~0.001 per call",
"ai_summarize": "~0.002-0.01 depending on length",
"ai_query": "Varies by model and tokens",
"ai_translate": "~0.002 per call"
}
# Best practices for cost management:
COST_TIPS = [
"Cache results for repeated text",
"Batch process during off-peak hours",
"Use smaller models for simple tasks",
"Limit input text length",
"Sample data for exploration, full data for production"
]
Conclusion
SQL AI functions bring LLM capabilities directly into your data workflows. Use them for text analysis, content generation, and data enrichment while following best practices for performance and cost.