SQL AI Functions in Databricks: LLMs Meet SQL

Databricks SQL AI functions bring large language model capabilities directly into SQL queries. Process text, generate insights, and enrich data without leaving SQL.
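
If you have never used these functions, the quickest sanity check is a one-line query. A minimal sketch, assuming AI Functions are enabled in your workspace (they run on Databricks SQL warehouses backed by Foundation Model APIs, which are available only in supported regions):

-- Smoke test: a greeting back means AI Functions are wired up
SELECT ai_gen('Say hello in one short sentence') as greeting;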

Available AI Functions

-- Overview of Databricks SQL AI Functions

-- 1. ai_analyze_sentiment() - Sentiment analysis
SELECT
    review_id,
    review_text,
    ai_analyze_sentiment(review_text) as sentiment
FROM product_reviews;

-- 2. ai_classify() - Text classification
SELECT
    ticket_id,
    description,
    ai_classify(description, ARRAY('bug', 'feature', 'question', 'other')) as category
FROM support_tickets;

-- 3. ai_extract() - Entity extraction
SELECT
    email_id,
    ai_extract(email_body, ARRAY('person_name', 'company', 'date')) as entities
FROM emails;

-- 4. ai_fix_grammar() - Grammar correction
SELECT
    document_id,
    ai_fix_grammar(content) as corrected_content
FROM documents;

-- 5. ai_gen() - Text generation
SELECT
    product_id,
    product_name,
    ai_gen(CONCAT('Write a short marketing description for: ', product_name)) as description
FROM products;

-- 6. ai_mask() - PII masking
SELECT
    customer_id,
    ai_mask(notes, ARRAY('email', 'phone', 'ssn')) as masked_notes
FROM customer_notes;

-- 7. ai_query() - Custom LLM queries
SELECT
    data_id,
    ai_query('databricks-meta-llama-3-70b-instruct', custom_prompt) as result
FROM data_for_analysis;

-- 8. ai_similarity() - Semantic similarity
SELECT
    doc1_id,
    doc2_id,
    ai_similarity(doc1_text, doc2_text) as similarity_score
FROM document_pairs;

-- 9. ai_summarize() - Text summarization
SELECT
    article_id,
    ai_summarize(article_content, 100) as summary
FROM articles;

-- 10. ai_translate() - Translation
SELECT
    message_id,
    original_text,
    ai_translate(original_text, 'en') as english_translation  -- signature is ai_translate(content, to_lang)
FROM messages;

Practical Use Cases

-- Use Case 1: Customer Feedback Analysis Pipeline

-- Create a view that enriches feedback with AI analysis
CREATE OR REPLACE VIEW enriched_feedback AS
SELECT
    feedback_id,
    customer_id,
    feedback_text,
    feedback_date,

    -- Sentiment analysis
    ai_analyze_sentiment(feedback_text) as sentiment,

    -- Categorization
    ai_classify(
        feedback_text,
        ARRAY('product_quality', 'shipping', 'customer_service', 'pricing', 'other')
    ) as category,

    -- Extract mentioned products
    ai_extract(feedback_text, ARRAY('product_name')) as mentioned_products,

    -- Generate response suggestion
    ai_gen(
        CONCAT(
            'As a customer service agent, write a brief, professional response to this feedback: ',
            feedback_text
        )
    ) as suggested_response

FROM customer_feedback
WHERE feedback_date >= DATE_SUB(CURRENT_DATE, 30);

-- Query the enriched view
SELECT
    category,
    COUNT(*) as feedback_count,
    -- Group by category only, so positive_rate is meaningful per group
    AVG(CASE WHEN sentiment = 'positive' THEN 1 ELSE 0 END) as positive_rate
FROM enriched_feedback
GROUP BY category
ORDER BY feedback_count DESC;
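
Note that every query against the view re-runs the AI functions, and therefore the LLM calls behind them. When the same rows are read repeatedly, it is usually cheaper to snapshot the view into a table on a schedule. A minimal sketch (the snapshot table name is illustrative):

-- Materialize once; downstream queries read the table, not the LLM
CREATE OR REPLACE TABLE enriched_feedback_snapshot AS
SELECT * FROM enriched_feedback;
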
-- Use Case 2: Document Processing Pipeline

-- Process legal documents
WITH document_analysis AS (
    SELECT
        document_id,
        document_type,
        content,

        -- Summarize document
        ai_summarize(content, 200) as summary,

        -- Extract key entities
        ai_extract(
            content,
            ARRAY('party_names', 'dates', 'monetary_values', 'legal_terms')
        ) as entities,

        -- Classify document type
        ai_classify(
            content,
            ARRAY('contract', 'amendment', 'nda', 'agreement', 'notice')
        ) as classified_type,

        -- Check for specific clauses
        ai_query(
            'databricks-meta-llama-3-70b-instruct',
            CONCAT(
                'Does this document contain a non-compete clause? Answer YES or NO only. Document: ',
                SUBSTRING(content, 1, 3000)
            )
        ) as has_noncompete

    FROM legal_documents
    WHERE ingestion_date >= DATE_SUB(CURRENT_DATE, 7)
)
SELECT
    document_id,
    document_type,
    classified_type,
    summary,
    entities,
    CASE WHEN UPPER(has_noncompete) LIKE '%YES%' THEN TRUE ELSE FALSE END as has_noncompete_clause
FROM document_analysis;
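
A useful follow-up is flagging rows where the model's classification disagrees with the recorded metadata, since those are the documents worth a human look. A small sketch, assuming document_analysis has been persisted first (a CTE lives for one statement only, so save it as a view or table):

-- Surface documents where the AI classification disagrees with metadata
SELECT document_id, document_type, classified_type
FROM document_analysis
WHERE classified_type <> document_type;
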
-- Use Case 3: Product Catalog Enrichment

-- Enrich product data with AI-generated content
CREATE OR REPLACE TABLE enriched_products AS
SELECT
    product_id,
    product_name,
    raw_description,
    category,

    -- Generate marketing description
    ai_gen(
        CONCAT(
            'Write an engaging 2-sentence product description for an e-commerce website. ',
            'Product: ', product_name, '. ',
            'Category: ', category, '. ',
            'Features: ', raw_description
        )
    ) as marketing_description,

    -- Generate SEO keywords
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        CONCAT(
            'List 5 SEO keywords for this product, separated by commas: ',
            product_name, ' - ', raw_description
        )
    ) as seo_keywords,

    -- Translate to Spanish (ai_translate takes the target language only;
    -- verify your runtime supports each target, as early releases covered
    -- a limited language set)
    ai_translate(raw_description, 'es') as description_es,

    -- Translate to French
    ai_translate(raw_description, 'fr') as description_fr

FROM products
WHERE needs_enrichment = TRUE;
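
Because the statement above filters on needs_enrichment, it pairs naturally with a flag update after each run so the next run only picks up new rows. A minimal sketch using MERGE (assumes product_id is unique in both tables):

-- Mark enriched rows so the next run skips them
MERGE INTO products p
USING enriched_products e
ON p.product_id = e.product_id
WHEN MATCHED THEN UPDATE SET p.needs_enrichment = FALSE;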

Performance Optimization

-- Optimizing AI function usage

-- 1. Batch processing with windowing
WITH batched AS (
    SELECT
        id,
        text,
        ROW_NUMBER() OVER (ORDER BY id) as row_num,
        FLOOR((ROW_NUMBER() OVER (ORDER BY id) - 1) / 100) as batch_id
    FROM large_text_table
)
SELECT
    id,
    text,
    ai_analyze_sentiment(text) as sentiment
FROM batched
WHERE batch_id = 0;  -- Process one batch at a time
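
To walk through the remaining batches, parameterize the filter instead of editing the query by hand. A sketch using a named query parameter (:batch_id), which the Databricks SQL editor prompts for at run time:

-- Re-run with :batch_id = 1, 2, ... until every batch is processed
WITH batched AS (
    SELECT
        id,
        text,
        FLOOR((ROW_NUMBER() OVER (ORDER BY id) - 1) / 100) as batch_id
    FROM large_text_table
)
SELECT id, ai_analyze_sentiment(text) as sentiment
FROM batched
WHERE batch_id = :batch_id;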

-- 2. Caching results in a delta table
CREATE TABLE IF NOT EXISTS sentiment_cache (
    text_hash STRING,
    text STRING,
    sentiment STRING,
    analyzed_at TIMESTAMP
);

-- Insert only new texts
INSERT INTO sentiment_cache
SELECT
    md5(text) as text_hash,
    text,
    ai_analyze_sentiment(text) as sentiment,
    CURRENT_TIMESTAMP as analyzed_at
FROM source_table s
WHERE NOT EXISTS (
    SELECT 1 FROM sentiment_cache c
    WHERE c.text_hash = md5(s.text)
);
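
The read path then serves results from the cache instead of calling the model again; rows with no match are exactly the ones still waiting for analysis. A minimal sketch:

-- A NULL sentiment here means the text has not been analyzed yet
SELECT
    s.text,
    c.sentiment
FROM source_table s
LEFT JOIN sentiment_cache c
    ON c.text_hash = md5(s.text);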

-- 3. Limit text length to reduce tokens
SELECT
    id,
    ai_summarize(SUBSTRING(long_text, 1, 5000), 100) as summary
FROM documents;

-- 4. Use appropriate models for task
-- For simple tasks, use faster models
SELECT
    ai_query(
        'databricks-meta-llama-3-8b-instruct',  -- Faster, smaller model
        simple_prompt
    ) as result
FROM simple_tasks;

-- For complex tasks, use larger models
SELECT
    ai_query(
        'databricks-meta-llama-3-70b-instruct',  -- More capable model
        complex_prompt
    ) as result
FROM complex_tasks;
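
On recent runtimes, ai_query also accepts a modelParameters argument for sampling controls, which helps keep outputs short, cheap, and repeatable. This is a hedged sketch; verify the argument and its fields against your workspace's documentation:

-- A low temperature and a token cap constrain cost and variability
SELECT
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        complex_prompt,
        modelParameters => named_struct('temperature', 0.0, 'max_tokens', 150)
    ) as result
FROM complex_tasks;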

Error Handling

-- Handle potential AI function failures

SELECT
    id,
    text,
    CASE
        WHEN text IS NULL OR LENGTH(TRIM(text)) = 0 THEN 'EMPTY_INPUT'
        WHEN LENGTH(text) > 10000 THEN ai_analyze_sentiment(SUBSTRING(text, 1, 10000))
        ELSE ai_analyze_sentiment(text)
    END as sentiment
FROM texts;

-- Databricks SQL has no generic TRY() wrapper; compute the result once in a
-- subquery, guard bad inputs, and derive a status from the single call
WITH scored AS (
    SELECT id, text,
        CASE WHEN text IS NULL OR LENGTH(TRIM(text)) = 0 THEN NULL
             ELSE ai_analyze_sentiment(SUBSTRING(text, 1, 10000)) END as sentiment
    FROM texts
)
SELECT id, text, sentiment,
    CASE WHEN sentiment IS NULL THEN 'SKIPPED' ELSE 'SUCCESS' END as status
FROM scored;
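
For ai_query specifically, recent runtimes expose a failOnError argument that surfaces errors in the returned struct instead of failing the whole statement. A hedged sketch; check the exact struct field names (such as errorMessage) on your runtime before depending on them:

-- With failOnError => false, a failed call yields an error field, not an exception
SELECT
    id,
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        text,
        failOnError => false
    ) as result_struct
FROM texts;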

Cost Management

# Monitor AI function costs
# (the figures below are rough illustrations, not published pricing; actual
# cost depends on the model, token counts, and your billing plan)

AI_FUNCTION_COSTS = {
    "ai_analyze_sentiment": "~0.001 per call",
    "ai_classify": "~0.001 per call",
    "ai_summarize": "~0.002-0.01 depending on length",
    "ai_query": "Varies by model and tokens",
    "ai_translate": "~0.002 per call"
}

# Best practices for cost management:
COST_TIPS = [
    "Cache results for repeated text",
    "Batch process during off-peak hours",
    "Use smaller models for simple tasks",
    "Limit input text length",
    "Sample data for exploration, full data for production"
]
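
For actual spend rather than rules of thumb, Databricks exposes billing data through system tables once an account admin enables them. A sketch against system.billing.usage; adjust the SKU filter to match the warehouses or endpoints you use:

-- Daily usage in DBUs (multiply by your contracted rate for spend)
SELECT
    usage_date,
    sku_name,
    SUM(usage_quantity) as dbus
FROM system.billing.usage
WHERE sku_name ILIKE '%serverless%'
GROUP BY usage_date, sku_name
ORDER BY usage_date DESC;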

Conclusion

SQL AI functions bring LLM capabilities directly into your data workflows. Use them for text analysis, content generation, and data enrichment while following best practices for performance and cost.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.