Back to Blog
5 min read

The ai_query Function: Custom LLM Calls in Databricks SQL

The ai_query Function: Custom LLM Calls in Databricks SQL

The ai_query() function enables custom LLM interactions directly in SQL. Unlike specialized functions, it allows you to craft any prompt and get intelligent responses.

Understanding ai_query

-- Basic syntax
-- ai_query() is evaluated per row; with literal arguments it makes a single model call.
SELECT ai_query(
    'model_name',           -- The LLM model to use
    'your_prompt_here'      -- Your custom prompt
) as result;

-- Available models (as of April 2024)
-- NOTE(review): model endpoint names change over time — verify against your
-- workspace's Foundation Model serving endpoints before relying on this list.
-- 'databricks-meta-llama-3-70b-instruct'  - Most capable
-- 'databricks-meta-llama-3-8b-instruct'   - Faster, cost-effective
-- 'databricks-dbrx-instruct'              - Databricks' own model
-- 'databricks-mixtral-8x7b-instruct'      - Good balance

Crafting Effective Prompts

-- Pattern 1: Simple extraction
-- Constrains the model to a closed set of answers ("morning/afternoon/evening/none")
-- so the output is usable directly as a categorical column without post-processing.
SELECT
    order_notes,
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        CONCAT(
            'Extract the delivery preference from this note. ',
            'Return only: morning, afternoon, evening, or none. ',
            'Note: ', order_notes
        )
    ) as delivery_preference
FROM orders;

-- Pattern 2: Structured output (JSON)
-- Spells out the exact JSON schema in the prompt and ends with an explicit
-- "only valid JSON, no explanation" instruction so the result can be parsed
-- downstream (see the GET_JSON_OBJECT example later in this article).
SELECT
    customer_feedback,
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        CONCAT(
            'Analyze this customer feedback and return JSON with these fields: ',
            '{"sentiment": "positive/negative/neutral", ',
            '"main_topic": "the primary topic", ',
            '"action_needed": true/false, ',
            '"urgency": "low/medium/high"}. ',
            'Feedback: ', customer_feedback,
            '. Return only valid JSON, no explanation.'
        )
    ) as analysis_json
FROM feedback;

-- Pattern 3: Few-shot learning
-- Provides worked examples inside the prompt so the model infers the desired
-- input -> output mapping; useful when the category rules are hard to describe.
SELECT
    product_description,
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        CONCAT(
            'Categorize products based on these examples:\n',
            'Example 1: "Running shoes with cushioned sole" -> Sports & Fitness\n',
            'Example 2: "Wireless bluetooth headphones" -> Electronics\n',
            'Example 3: "Organic green tea bags" -> Food & Beverage\n',
            '\nNow categorize: "', product_description, '"\n',
            'Return only the category name.'
        )
    ) as category
FROM products;

-- Pattern 4: Chain of thought reasoning
-- Enumerates explicit steps in the prompt to guide the model through a
-- structured analysis; the per-step sentence limit keeps the output compact.
SELECT
    financial_data,
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        CONCAT(
            'Analyze this financial data step by step:\n',
            financial_data, '\n\n',
            'Step 1: Identify the key metrics\n',
            'Step 2: Calculate growth rates\n',
            'Step 3: Identify any concerns\n',
            'Step 4: Provide a brief summary\n\n',
            'Keep each step to one sentence.'
        )
    ) as analysis
FROM quarterly_reports;

Advanced Use Cases

-- Use Case 1: Intelligent Data Quality Checks
-- Uses the smaller/cheaper 8b model because validation is a simple task, and
-- filters with WHERE needs_validation so only flagged rows incur an LLM call.
SELECT
    record_id,
    customer_name,
    address,
    ai_query(
        'databricks-meta-llama-3-8b-instruct',
        CONCAT(
            'Is this address valid and complete? ',
            'Check for: street, city, state/province, postal code, country. ',
            'Address: ', address,
            '. Return: VALID, INCOMPLETE (list missing), or INVALID (explain why).'
        )
    ) as address_validation
FROM customer_addresses
WHERE needs_validation = TRUE;

-- Use Case 2: Anomaly Explanation
-- Finds metrics that deviate more than 20% from expectation, then asks the
-- model for plausible business reasons for each deviation.
WITH anomalies AS (
    -- NULLIF guards both divisions: when expected_value = 0 the deviation
    -- becomes NULL instead of raising a division error / returning garbage,
    -- and such rows are dropped by the WHERE predicate (NULL is not > 0.2).
    SELECT
        date,
        metric_name,
        value,
        expected_value,
        (value - expected_value) / NULLIF(expected_value, 0) * 100 as deviation_pct
    FROM metrics
    WHERE ABS((value - expected_value) / NULLIF(expected_value, 0)) > 0.2
)
SELECT
    date,
    metric_name,
    value,
    expected_value,
    deviation_pct,
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        CONCAT(
            'On ', CAST(date AS STRING), ', the metric "', metric_name,
            '" had a value of ', CAST(value AS STRING),
            ' but we expected around ', CAST(expected_value AS STRING),
            -- Explicit cast: keeps the CONCAT arguments uniformly STRING
            -- instead of relying on implicit numeric-to-string coercion.
            ' (', CAST(ROUND(deviation_pct, 1) AS STRING), '% deviation). ',
            'List 3 possible business reasons for this deviation. ',
            'Be specific and practical.'
        )
    ) as possible_explanations
FROM anomalies;

-- Use Case 3: Automated Report Narratives
-- BUG FIX: the original filtered the CTE to a single month, so
-- LAG(...) OVER (ORDER BY month) only ever saw one row per partition and
-- prev_revenue was always NULL — which then propagated through CONCAT and
-- made the entire prompt (and summary) NULL. We now aggregate the last two
-- months, compute LAG across them, and filter to the report month last.
WITH monthly_stats AS (
    -- Per-category aggregates for the prior two calendar months.
    SELECT
        product_category,
        month,
        SUM(revenue) as total_revenue,
        SUM(revenue) / COUNT(DISTINCT order_id) as avg_order_value,
        COUNT(DISTINCT customer_id) as unique_customers
    FROM sales
    WHERE month >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL 2 MONTH)
      AND month < DATE_TRUNC('month', CURRENT_DATE)
    GROUP BY product_category, month
),
with_prior AS (
    -- Attach the previous month's revenue; NULL for a category's first month.
    SELECT
        *,
        LAG(total_revenue) OVER (PARTITION BY product_category ORDER BY month) as prev_revenue
    FROM monthly_stats
)
SELECT
    product_category,
    total_revenue,
    avg_order_value,
    unique_customers,
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        CONCAT(
            'Write a 2-sentence executive summary for the ', product_category, ' category. ',
            'Revenue: $', FORMAT_NUMBER(total_revenue, 0),
            -- NULLIF avoids division by zero; COALESCE keeps a NULL change
            -- (no prior month) from nulling out the whole CONCAT.
            ', Change from last month: ',
            COALESCE(CAST(ROUND((total_revenue - prev_revenue) / NULLIF(prev_revenue, 0) * 100, 1) AS STRING), 'n/a'), '%, ',
            'Average order: $', FORMAT_NUMBER(avg_order_value, 2),
            ', Unique customers: ', FORMAT_NUMBER(unique_customers, 0), '. ',
            'Highlight the most important insight.'
        )
    ) as summary
FROM with_prior
WHERE month = DATE_TRUNC('month', CURRENT_DATE - INTERVAL 1 MONTH);

-- Use Case 4: Smart Data Matching
-- The SOUNDEX equality in WHERE prunes the CROSS JOIN to phonetically similar
-- names before any LLM call is made — without it, this would invoke the model
-- |A| x |B| times. The cheap 8b model suffices for a SAME/DIFFERENT judgment.
SELECT
    a.company_name as name_a,
    b.company_name as name_b,
    ai_query(
        'databricks-meta-llama-3-8b-instruct',
        CONCAT(
            'Are these the same company? Consider abbreviations, legal suffixes, and typos. ',
            'Company A: "', a.company_name, '". ',
            'Company B: "', b.company_name, '". ',
            'Answer: SAME, DIFFERENT, or UNCERTAIN with brief reason.'
        )
    ) as match_result
FROM company_list_a a
CROSS JOIN company_list_b b
WHERE SOUNDEX(a.company_name) = SOUNDEX(b.company_name);

Building Dynamic Prompts

-- Create a prompt template table
-- Reusable prompt library: consuming queries substitute the {text} placeholder
-- at run time (see "Use templates dynamically" below).
-- IF NOT EXISTS makes re-running this setup script idempotent.
CREATE TABLE IF NOT EXISTS prompt_templates (
    template_id STRING,            -- stable key referenced by consuming queries
    template_name STRING,          -- human-readable label
    template_text STRING,          -- prompt body containing the {text} placeholder
    model_recommendation STRING    -- endpoint name passed as ai_query()'s first argument
);

-- Seed the template library.
-- Explicit column list added: INSERT ... VALUES without one binds values by
-- position and breaks silently if the table's column order ever changes.
INSERT INTO prompt_templates
    (template_id, template_name, template_text, model_recommendation)
VALUES
    ('sentiment_detailed',
     'Detailed Sentiment Analysis',
     'Analyze the sentiment of this text in detail. Provide: overall sentiment (positive/negative/neutral), confidence (0-100), key phrases that influenced the analysis, and any sarcasm detection. Text: {text}',
     'databricks-meta-llama-3-70b-instruct'),

    ('summarize_technical',
     'Technical Document Summary',
     'Summarize this technical document for a non-technical audience. Focus on: what it does, why it matters, and key takeaways. Keep it under 100 words. Document: {text}',
     'databricks-meta-llama-3-70b-instruct');

-- Use templates dynamically
-- REPLACE() substitutes the document body into the template's {text}
-- placeholder; the template row also supplies which model to call, so
-- prompt text and model choice are managed as data, not code.
-- The CROSS JOIN is safe here: the template_id filter pins it to one row.
SELECT
    d.document_id,
    d.content,
    ai_query(
        t.model_recommendation,
        REPLACE(t.template_text, '{text}', d.content)
    ) as analysis
FROM documents d
CROSS JOIN prompt_templates t
WHERE t.template_id = 'summarize_technical'
  AND d.document_type = 'technical';

Output Parsing

-- Parse JSON output from ai_query
-- Two-stage pattern: get the raw model response in a CTE, then extract typed
-- fields with GET_JSON_OBJECT. The LIKE '{%}' filter drops responses that
-- are not bare JSON objects (e.g. answers wrapped in explanatory prose),
-- since GET_JSON_OBJECT would return NULL for malformed input anyway.
WITH ai_results AS (
    SELECT
        feedback_id,
        ai_query(
            'databricks-meta-llama-3-70b-instruct',
            CONCAT(
                'Return JSON: {"sentiment": "pos/neg/neu", "score": 1-10, "topics": ["topic1"]}. ',
                'Analyze: ', feedback_text
            )
        ) as raw_json
    FROM feedback
)
SELECT
    feedback_id,
    raw_json,
    GET_JSON_OBJECT(raw_json, '$.sentiment') as sentiment,
    -- CAST yields NULL (not an error) if the model returned a non-numeric score.
    CAST(GET_JSON_OBJECT(raw_json, '$.score') AS INT) as score,
    GET_JSON_OBJECT(raw_json, '$.topics') as topics
FROM ai_results
WHERE raw_json IS NOT NULL
  AND raw_json LIKE '{%}';  -- Basic JSON validation

Best Practices

AI_QUERY_BEST_PRACTICES = {
    "prompt_engineering": [
        "Be specific about desired output format",
        "Use examples (few-shot) for complex tasks",
        "Include constraints (word limits, format)",
        "Ask for reasoning when needed"
    ],
    "performance": [
        "Use smaller models for simple tasks",
        "Cache results for repeated queries",
        "Batch process large datasets",
        "Limit input text length"
    ],
    "reliability": [
        "Validate JSON outputs before parsing",
        "Use TRY() for error handling",
        "Have fallback logic for failures",
        "Log and monitor failure rates"
    ]
}

Conclusion

The ai_query() function is your gateway to custom LLM capabilities in SQL. Master prompt engineering and output parsing to unlock powerful data enrichment and analysis workflows.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.