
Foundation Model APIs in Databricks: Enterprise LLM Access

Databricks Foundation Model APIs provide enterprise-ready access to state-of-the-art LLMs served directly from your workspace, with governance handled by the platform. This guide covers calling the APIs over REST, building a data analysis assistant, wiring the models into a RAG pipeline, and optimizing for cost and latency.
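
The chat and embedding endpoints follow an OpenAI-compatible request format, so a quick way to smoke-test access is to point the OpenAI Python SDK at your workspace's serving endpoints. A minimal sketch, assuming openai>=1.0, with the workspace URL and token as placeholders:

from openai import OpenAI

# Placeholders: substitute your workspace URL and a personal access token
openai_client = OpenAI(
    api_key="your-databricks-token",
    base_url="https://adb-xxx.azuredatabricks.net/serving-endpoints"
)

response = openai_client.chat.completions.create(
    model="databricks-meta-llama-3-70b-instruct",
    messages=[{"role": "user", "content": "What is a lakehouse?"}],
    max_tokens=200
)
print(response.choices[0].message.content)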

Available Foundation Models

DATABRICKS_FOUNDATION_MODELS = {
    "llm_models": {
        "databricks-meta-llama-3-70b-instruct": {
            "description": "Meta's Llama 3 70B for instruction following",
            "context_window": 8192,
            "best_for": ["General tasks", "Code generation", "Analysis"]
        },
        "databricks-meta-llama-3-8b-instruct": {
            "description": "Smaller, faster Llama 3 model",
            "context_window": 8192,
            "best_for": ["Simple tasks", "High throughput", "Cost-sensitive"]
        },
        "databricks-dbrx-instruct": {
            "description": "Databricks' own MoE model",
            "context_window": 32768,
            "best_for": ["Long context", "Complex reasoning"]
        },
        "databricks-mixtral-8x7b-instruct": {
            "description": "Mistral's MoE model",
            "context_window": 32768,
            "best_for": ["Multilingual", "Balanced performance"]
        }
    },
    "embedding_models": {
        "databricks-bge-large-en": {
            "description": "BGE embeddings for English",
            "dimensions": 1024,
            "best_for": ["Semantic search", "RAG", "Similarity"]
        },
        "databricks-gte-large-en": {
            "description": "GTE embeddings",
            "dimensions": 1024,
            "best_for": ["Text classification", "Clustering"]
        }
    }
}
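
Since the catalog above is just a dictionary, model selection can be scripted against it. A small, hypothetical helper that picks the chat model with the smallest context window that still fits a request:

def pick_chat_model(min_context: int) -> str:
    """Return the chat model with the smallest context window that still fits."""
    candidates = {
        name: spec["context_window"]
        for name, spec in DATABRICKS_FOUNDATION_MODELS["llm_models"].items()
        if spec["context_window"] >= min_context
    }
    if not candidates:
        raise ValueError(f"No available model supports a {min_context}-token context window")
    return min(candidates, key=candidates.get)

print(pick_chat_model(16000))  # e.g. databricks-dbrx-instruct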

Using Foundation Model APIs

import requests
from typing import List, Dict, Optional

class FoundationModelClient:
    """Client for Databricks Foundation Model APIs"""

    def __init__(self, workspace_url: str, token: str):
        self.workspace_url = workspace_url
        self.headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json"
        }

    def chat_completion(
        self,
        model: str,
        messages: List[Dict],
        max_tokens: int = 1000,
        temperature: float = 0.7,
        top_p: float = 0.9
    ) -> Dict:
        """Generate chat completion"""

        url = f"{self.workspace_url}/serving-endpoints/{model}/invocations"

        payload = {
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p
        }

        response = requests.post(url, headers=self.headers, json=payload)
        response.raise_for_status()

        return response.json()

    def generate_embeddings(
        self,
        model: str,
        texts: List[str]
    ) -> List[List[float]]:
        """Generate embeddings for texts"""

        url = f"{self.workspace_url}/serving-endpoints/{model}/invocations"

        payload = {
            "input": texts
        }

        response = requests.post(url, headers=self.headers, json=payload)
        response.raise_for_status()

        result = response.json()
        return [item["embedding"] for item in result["data"]]

    def stream_completion(
        self,
        model: str,
        messages: List[Dict],
        max_tokens: int = 1000
    ):
        """Stream chat completion responses"""

        url = f"{self.workspace_url}/serving-endpoints/{model}/invocations"

        payload = {
            "messages": messages,
            "max_tokens": max_tokens,
            "stream": True
        }

        with requests.post(
            url,
            headers=self.headers,
            json=payload,
            stream=True
        ) as response:
            response.raise_for_status()
            # Each chunk arrives as a server-sent event line (typically "data: {...}")
            for line in response.iter_lines():
                if line:
                    yield line.decode('utf-8')

# Usage
client = FoundationModelClient(
    workspace_url="https://adb-xxx.azuredatabricks.net",
    token="your-token"
)

# Chat completion
response = client.chat_completion(
    model="databricks-meta-llama-3-70b-instruct",
    messages=[
        {"role": "system", "content": "You are a helpful data analyst."},
        {"role": "user", "content": "Explain what a lakehouse is."}
    ]
)
print(response["choices"][0]["message"]["content"])

# Embeddings
embeddings = client.generate_embeddings(
    model="databricks-bge-large-en",
    texts=["What is Databricks?", "Explain data engineering."]
)
print(f"Embedding dimensions: {len(embeddings[0])}")

Building Applications with Foundation Models

class DataAnalysisAssistant:
    """AI assistant for data analysis using foundation models"""

    def __init__(self, client: FoundationModelClient):
        self.client = client
        self.model = "databricks-meta-llama-3-70b-instruct"

    def analyze_data_question(self, question: str, schema: dict) -> dict:
        """Analyze a natural language question about data"""

        system_prompt = f"""You are a data analyst assistant.
        You help users understand and query their data.

        Available data schema:
        {self._format_schema(schema)}

        Provide:
        1. A SQL query to answer the question
        2. An explanation of the approach
        3. Suggestions for follow-up analysis
        """

        response = self.client.chat_completion(
            model=self.model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question}
            ],
            temperature=0.3  # Lower for more deterministic SQL
        )

        return self._parse_response(response)

    def explain_query_results(
        self,
        query: str,
        results: list,
        question: str
    ) -> str:
        """Generate natural language explanation of query results"""

        prompt = f"""The user asked: "{question}"

        This SQL query was executed:
        ```sql
        {query}
        ```

        Results:
        {self._format_results(results)}

        Provide a clear, business-friendly explanation of these results.
        Include key insights and any notable patterns."""

        response = self.client.chat_completion(
            model=self.model,
            messages=[{"role": "user", "content": prompt}]
        )

        return response["choices"][0]["message"]["content"]

    def suggest_visualizations(self, data_summary: dict) -> list:
        """Suggest appropriate visualizations for data"""

        prompt = f"""Given this data summary:
        {data_summary}

        Suggest 3 appropriate visualizations with:
        1. Chart type
        2. What to show on each axis
        3. Why this visualization is useful

        Return as a numbered list."""

        response = self.client.chat_completion(
            model=self.model,
            messages=[{"role": "user", "content": prompt}]
        )

        return response["choices"][0]["message"]["content"]

    def _format_schema(self, schema: dict) -> str:
        return "\n".join(
            f"Table: {table}\nColumns: {', '.join(cols)}"
            for table, cols in schema.items()
        )

    def _format_results(self, results: list) -> str:
        if not results:
            return "No results"
        return str(results[:10])  # Show first 10 rows

    def _parse_response(self, response: dict) -> dict:
        content = response["choices"][0]["message"]["content"]
        return {
            "response": content,
            "usage": response.get("usage", {})
        }

# Usage
assistant = DataAnalysisAssistant(client)

schema = {
    "sales": ["order_id", "customer_id", "product_id", "amount", "order_date"],
    "customers": ["customer_id", "name", "segment", "region"],
    "products": ["product_id", "name", "category", "price"]
}

result = assistant.analyze_data_question(
    question="What are the top 5 customers by total revenue this year?",
    schema=schema
)
print(result["response"])

RAG with Foundation Models

from databricks.vector_search.client import VectorSearchClient

class FoundationModelRAG:
    """RAG system using Databricks Foundation Models"""

    def __init__(
        self,
        fm_client: FoundationModelClient,
        vector_search_endpoint: str,
        index_name: str
    ):
        self.fm = fm_client
        self.vsc = VectorSearchClient()
        self.index = self.vsc.get_index(vector_search_endpoint, index_name)
        self.embedding_model = "databricks-bge-large-en"
        self.llm_model = "databricks-meta-llama-3-70b-instruct"

    def query(
        self,
        question: str,
        num_context: int = 5,
        filters: dict = None
    ) -> dict:
        """Complete RAG query pipeline"""

        # Retrieve relevant documents
        search_results = self.index.similarity_search(
            query_text=question,
            columns=["doc_id", "title", "content"],
            num_results=num_context,
            filters=filters
        )

        context = self._format_context(search_results)

        # Generate answer
        answer = self._generate_answer(question, context)

        return {
            "question": question,
            "answer": answer,
            "sources": [
                {"title": row[2], "score": row[0]}
                for row in search_results["result"]["data_array"]
            ]
        }

    def _format_context(self, search_results: dict) -> str:
        """Format search results as context"""
        contexts = []
        for row in search_results["result"]["data_array"]:
            title = row[1]
            content = row[2]
            contexts.append(f"### {title}\n{content}")
        return "\n\n".join(contexts)

    def _generate_answer(self, question: str, context: str) -> str:
        """Generate answer using LLM"""

        system_prompt = """You are a helpful assistant that answers questions
        based on the provided context. If the context doesn't contain
        the answer, say so. Always cite which source you're using."""

        response = self.fm.chat_completion(
            model=self.llm_model,
            messages=[
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": f"""Context:
{context}

Question: {question}

Answer based on the context above:"""
                }
            ],
            temperature=0.3
        )

        return response["choices"][0]["message"]["content"]

# Usage
rag = FoundationModelRAG(
    fm_client=client,
    vector_search_endpoint="vector-search-endpoint",
    index_name="main.docs.knowledge_base_index"
)

result = rag.query("How do I create a Delta table in Databricks?")
print(result["answer"])
print("\nSources:")
for source in result["sources"]:
    print(f"  - {source['title']} (score: {source['score']:.3f})")

Cost and Performance Optimization

class OptimizedFoundationModelClient:
    """Cost and performance optimized foundation model client"""

    def __init__(self, client: FoundationModelClient):
        self.client = client
        self.cache = {}

    def select_model(
        self,
        task_complexity: str,
        latency_requirement: str
    ) -> str:
        """Select appropriate model based on requirements"""

        model_selection = {
            ("simple", "low"): "databricks-meta-llama-3-8b-instruct",
            ("simple", "normal"): "databricks-meta-llama-3-8b-instruct",
            ("moderate", "low"): "databricks-mixtral-8x7b-instruct",
            ("moderate", "normal"): "databricks-meta-llama-3-70b-instruct",
            ("complex", "low"): "databricks-dbrx-instruct",
            ("complex", "normal"): "databricks-meta-llama-3-70b-instruct"
        }

        return model_selection.get(
            (task_complexity, latency_requirement),
            "databricks-meta-llama-3-70b-instruct"
        )

    def cached_completion(
        self,
        messages: List[Dict],
        cache_key: Optional[str] = None,
        **kwargs
    ) -> Dict:
        """Completion with caching"""

        if cache_key is None:
            cache_key = str(hash(str(messages)))

        if cache_key in self.cache:
            return self.cache[cache_key]

        model = kwargs.pop("model", "databricks-meta-llama-3-70b-instruct")
        result = self.client.chat_completion(model, messages, **kwargs)

        self.cache[cache_key] = result
        return result

    def batch_embeddings(
        self,
        texts: List[str],
        batch_size: int = 100
    ) -> List[List[float]]:
        """Generate embeddings in optimized batches"""

        all_embeddings = []

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            embeddings = self.client.generate_embeddings(
                "databricks-bge-large-en",
                batch
            )
            all_embeddings.extend(embeddings)

        return all_embeddings
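
A short usage sketch for the optimized client, reusing the FoundationModelClient created earlier:

# Usage
optimized = OptimizedFoundationModelClient(client)

# Route a simple, latency-sensitive task to the smaller model
model = optimized.select_model(task_complexity="simple", latency_requirement="low")

response = optimized.cached_completion(
    messages=[{"role": "user", "content": "What is a Delta table?"}],
    model=model
)
print(response["choices"][0]["message"]["content"])

# An identical call is served from the in-memory cache instead of the endpoint
cached = optimized.cached_completion(
    messages=[{"role": "user", "content": "What is a Delta table?"}],
    model=model
)

# Embed a corpus in batches
doc_embeddings = optimized.batch_embeddings(
    texts=["What is Databricks?", "Explain data engineering."],
    batch_size=50
)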

Conclusion

Databricks Foundation Model APIs give you enterprise-ready access to state-of-the-art LLMs with built-in governance and scalability, and no model infrastructure to manage. Use them to build intelligent applications, RAG systems, and data analysis assistants directly against your lakehouse data.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.