Back to Blog
5 min read

The ai_query Function: Custom LLM Calls in Databricks SQL

The ai_query Function: Custom LLM Calls in Databricks SQL

The ai_query() function enables custom LLM interactions directly in SQL. Unlike specialized functions, it allows you to craft any prompt and get intelligent responses.

Understanding ai_query

-- Basic syntax
-- ai_query() is evaluated per row; with literal arguments it makes a single model call.
SELECT ai_query(
    'model_name',           -- The LLM model to use
    'your_prompt_here'      -- Your custom prompt
) as result;

-- Available models (as of April 2024)
-- NOTE(review): model endpoint names change over time — verify against your
-- workspace's Foundation Model serving endpoints before relying on this list.
-- 'databricks-meta-llama-3-70b-instruct'  - Most capable
-- 'databricks-meta-llama-3-8b-instruct'   - Faster, cost-effective
-- 'databricks-dbrx-instruct'              - Databricks' own model
-- 'databricks-mixtral-8x7b-instruct'      - Good balance

Crafting Effective Prompts

-- Pattern 1: Simple extraction
-- Constrains the model to a closed set of answers ("morning/afternoon/evening/none")
-- so the output is usable directly as a categorical column without post-processing.
SELECT
    order_notes,
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        CONCAT(
            'Extract the delivery preference from this note. ',
            'Return only: morning, afternoon, evening, or none. ',
            'Note: ', order_notes
        )
    ) as delivery_preference
FROM orders;

-- Pattern 2: Structured output (JSON)
-- Spells out the exact JSON schema in the prompt and ends with an explicit
-- "only valid JSON, no explanation" instruction so the result can be parsed
-- downstream (see the GET_JSON_OBJECT example later in this article).
SELECT
    customer_feedback,
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        CONCAT(
            'Analyze this customer feedback and return JSON with these fields: ',
            '{"sentiment": "positive/negative/neutral", ',
            '"main_topic": "the primary topic", ',
            '"action_needed": true/false, ',
            '"urgency": "low/medium/high"}. ',
            'Feedback: ', customer_feedback,
            '. Return only valid JSON, no explanation.'
        )
    ) as analysis_json
FROM feedback;

-- Pattern 3: Few-shot learning
-- Provides worked examples inside the prompt so the model infers the desired
-- input -> output mapping; useful when the category rules are hard to describe.
SELECT
    product_description,
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        CONCAT(
            'Categorize products based on these examples:\n',
            'Example 1: "Running shoes with cushioned sole" -> Sports & Fitness\n',
            'Example 2: "Wireless bluetooth headphones" -> Electronics\n',
            'Example 3: "Organic green tea bags" -> Food & Beverage\n',
            '\nNow categorize: "', product_description, '"\n',
            'Return only the category name.'
        )
    ) as category
FROM products;

-- Pattern 4: Chain of thought reasoning
-- Enumerates explicit steps in the prompt to guide the model through a
-- structured analysis; the per-step sentence limit keeps the output compact.
SELECT
    financial_data,
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        CONCAT(
            'Analyze this financial data step by step:\n',
            financial_data, '\n\n',
            'Step 1: Identify the key metrics\n',
            'Step 2: Calculate growth rates\n',
            'Step 3: Identify any concerns\n',
            'Step 4: Provide a brief summary\n\n',
            'Keep each step to one sentence.'
        )
    ) as analysis
FROM quarterly_reports;

Advanced Use Cases

-- Use Case 1: Intelligent Data Quality Checks
-- Uses the smaller/cheaper 8b model because validation is a simple task, and
-- filters with WHERE needs_validation so only flagged rows incur an LLM call.
SELECT
    record_id,
    customer_name,
    address,
    ai_query(
        'databricks-meta-llama-3-8b-instruct',
        CONCAT(
            'Is this address valid and complete? ',
            'Check for: street, city, state/province, postal code, country. ',
            'Address: ', address,
            '. Return: VALID, INCOMPLETE (list missing), or INVALID (explain why).'
        )
    ) as address_validation
FROM customer_addresses
WHERE needs_validation = TRUE;

-- Use Case 2: Anomaly Explanation
-- Finds metrics that deviate more than 20% from expectation, then asks the
-- model for plausible business reasons for each deviation.
WITH anomalies AS (
    -- NULLIF guards both divisions: when expected_value = 0 the deviation
    -- becomes NULL instead of raising a division error / returning garbage,
    -- and such rows are dropped by the WHERE predicate (NULL is not > 0.2).
    SELECT
        date,
        metric_name,
        value,
        expected_value,
        (value - expected_value) / NULLIF(expected_value, 0) * 100 as deviation_pct
    FROM metrics
    WHERE ABS((value - expected_value) / NULLIF(expected_value, 0)) > 0.2
)
SELECT
    date,
    metric_name,
    value,
    expected_value,
    deviation_pct,
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        CONCAT(
            'On ', CAST(date AS STRING), ', the metric "', metric_name,
            '" had a value of ', CAST(value AS STRING),
            ' but we expected around ', CAST(expected_value AS STRING),
            -- Explicit cast: keeps the CONCAT arguments uniformly STRING
            -- instead of relying on implicit numeric-to-string coercion.
            ' (', CAST(ROUND(deviation_pct, 1) AS STRING), '% deviation). ',
            'List 3 possible business reasons for this deviation. ',
            'Be specific and practical.'
        )
    ) as possible_explanations
FROM anomalies;

-- Use Case 3: Automated Report Narratives
-- BUG FIX: the original filtered the CTE to a single month, so
-- LAG(...) OVER (ORDER BY month) only ever saw one row per partition and
-- prev_revenue was always NULL — which then propagated through CONCAT and
-- made the entire prompt (and summary) NULL. We now aggregate the last two
-- months, compute LAG across them, and filter to the report month last.
WITH monthly_stats AS (
    -- Per-category aggregates for the prior two calendar months.
    SELECT
        product_category,
        month,
        SUM(revenue) as total_revenue,
        SUM(revenue) / COUNT(DISTINCT order_id) as avg_order_value,
        COUNT(DISTINCT customer_id) as unique_customers
    FROM sales
    WHERE month >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL 2 MONTH)
      AND month < DATE_TRUNC('month', CURRENT_DATE)
    GROUP BY product_category, month
),
with_prior AS (
    -- Attach the previous month's revenue; NULL for a category's first month.
    SELECT
        *,
        LAG(total_revenue) OVER (PARTITION BY product_category ORDER BY month) as prev_revenue
    FROM monthly_stats
)
SELECT
    product_category,
    total_revenue,
    avg_order_value,
    unique_customers,
    ai_query(
        'databricks-meta-llama-3-70b-instruct',
        CONCAT(
            'Write a 2-sentence executive summary for the ', product_category, ' category. ',
            'Revenue: $', FORMAT_NUMBER(total_revenue, 0),
            -- NULLIF avoids division by zero; COALESCE keeps a NULL change
            -- (no prior month) from nulling out the whole CONCAT.
            ', Change from last month: ',
            COALESCE(CAST(ROUND((total_revenue - prev_revenue) / NULLIF(prev_revenue, 0) * 100, 1) AS STRING), 'n/a'), '%, ',
            'Average order: $', FORMAT_NUMBER(avg_order_value, 2),
            ', Unique customers: ', FORMAT_NUMBER(unique_customers, 0), '. ',
            'Highlight the most important insight.'
        )
    ) as summary
FROM with_prior
WHERE month = DATE_TRUNC('month', CURRENT_DATE - INTERVAL 1 MONTH);

-- Use Case 4: Smart Data Matching
-- The SOUNDEX equality in WHERE prunes the CROSS JOIN to phonetically similar
-- names before any LLM call is made — without it, this would invoke the model
-- |A| x |B| times. The cheap 8b model suffices for a SAME/DIFFERENT judgment.
SELECT
    a.company_name as name_a,
    b.company_name as name_b,
    ai_query(
        'databricks-meta-llama-3-8b-instruct',
        CONCAT(
            'Are these the same company? Consider abbreviations, legal suffixes, and typos. ',
            'Company A: "', a.company_name, '". ',
            'Company B: "', b.company_name, '". ',
            'Answer: SAME, DIFFERENT, or UNCERTAIN with brief reason.'
        )
    ) as match_result
FROM company_list_a a
CROSS JOIN company_list_b b
WHERE SOUNDEX(a.company_name) = SOUNDEX(b.company_name);

Building Dynamic Prompts

-- Create a prompt template table
-- Reusable prompt library: consuming queries substitute the {text} placeholder
-- at run time (see "Use templates dynamically" below).
-- IF NOT EXISTS makes re-running this setup script idempotent.
CREATE TABLE IF NOT EXISTS prompt_templates (
    template_id STRING,            -- stable key referenced by consuming queries
    template_name STRING,          -- human-readable label
    template_text STRING,          -- prompt body containing the {text} placeholder
    model_recommendation STRING    -- endpoint name passed as ai_query()'s first argument
);

-- Seed the template library.
-- Explicit column list added: INSERT ... VALUES without one binds values by
-- position and breaks silently if the table's column order ever changes.
INSERT INTO prompt_templates
    (template_id, template_name, template_text, model_recommendation)
VALUES
    ('sentiment_detailed',
     'Detailed Sentiment Analysis',
     'Analyze the sentiment of this text in detail. Provide: overall sentiment (positive/negative/neutral), confidence (0-100), key phrases that influenced the analysis, and any sarcasm detection. Text: {text}',
     'databricks-meta-llama-3-70b-instruct'),

    ('summarize_technical',
     'Technical Document Summary',
     'Summarize this technical document for a non-technical audience. Focus on: what it does, why it matters, and key takeaways. Keep it under 100 words. Document: {text}',
     'databricks-meta-llama-3-70b-instruct');

-- Use templates dynamically
-- REPLACE() substitutes the document body into the template's {text}
-- placeholder; the template row also supplies which model to call, so
-- prompt text and model choice are managed as data, not code.
-- The CROSS JOIN is safe here: the template_id filter pins it to one row.
SELECT
    d.document_id,
    d.content,
    ai_query(
        t.model_recommendation,
        REPLACE(t.template_text, '{text}', d.content)
    ) as analysis
FROM documents d
CROSS JOIN prompt_templates t
WHERE t.template_id = 'summarize_technical'
  AND d.document_type = 'technical';

Output Parsing

-- Parse JSON output from ai_query
-- Two-stage pattern: get the raw model response in a CTE, then extract typed
-- fields with GET_JSON_OBJECT. The LIKE '{%}' filter drops responses that
-- are not bare JSON objects (e.g. answers wrapped in explanatory prose),
-- since GET_JSON_OBJECT would return NULL for malformed input anyway.
WITH ai_results AS (
    SELECT
        feedback_id,
        ai_query(
            'databricks-meta-llama-3-70b-instruct',
            CONCAT(
                'Return JSON: {"sentiment": "pos/neg/neu", "score": 1-10, "topics": ["topic1"]}. ',
                'Analyze: ', feedback_text
            )
        ) as raw_json
    FROM feedback
)
SELECT
    feedback_id,
    raw_json,
    GET_JSON_OBJECT(raw_json, '$.sentiment') as sentiment,
    -- CAST yields NULL (not an error) if the model returned a non-numeric score.
    CAST(GET_JSON_OBJECT(raw_json, '$.score') AS INT) as score,
    GET_JSON_OBJECT(raw_json, '$.topics') as topics
FROM ai_results
WHERE raw_json IS NOT NULL
  AND raw_json LIKE '{%}';  -- Basic JSON validation

Best Practices

AI_QUERY_BEST_PRACTICES = {
    "prompt_engineering": [
        "Be specific about desired output format",
        "Use examples (few-shot) for complex tasks",
        "Include constraints (word limits, format)",
        "Ask for reasoning when needed"
    ],
    "performance": [
        "Use smaller models for simple tasks",
        "Cache results for repeated queries",
        "Batch process large datasets",
        "Limit input text length"
    ],
    "reliability": [
        "Validate JSON outputs before parsing",
        "Use TRY() for error handling",
        "Have fallback logic for failures",
        "Log and monitor failure rates"
    ]
}

Conclusion

The ai_query() function is your gateway to custom LLM capabilities in SQL. Master prompt engineering and output parsing to unlock powerful data enrichment and analysis workflows.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.