Building LLM Applications with Azure OpenAI: Patterns and Practices
As Azure OpenAI Service expands availability following Ignite 2022, teams are starting to build production applications on top of large language models. This post walks through patterns and practices for making those applications robust.
Building Blocks for LLM Applications
When building applications with GPT-3 and other LLMs, you need to think about:
- Prompt design and management
- Chaining multiple LLM calls
- Testing and evaluation
- Error handling and fallbacks
- Cost optimization
Designing LLM Workflows
Intent Classification
import openai

def classify_intent(user_message: str) -> str:
    """Classify the intent of a user message."""
    prompt = f"""Classify the intent of this message into one of these categories:
- question: User is asking a question
- complaint: User is expressing dissatisfaction
- request: User wants something done
- feedback: User is providing feedback
- other: None of the above
Message: {user_message}
Intent:"""
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=20,
        temperature=0
    )
    return response.choices[0].text.strip().lower()
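These examples call the completions endpoint through the openai Python library. When targeting Azure OpenAI rather than the public OpenAI endpoint, the library also needs Azure-specific settings, and engine refers to the name of your model deployment. A minimal configuration sketch; the endpoint and api_version values below are illustrative placeholders, not part of the original examples:

import openai

# Point the library at an Azure OpenAI resource (values are placeholders)
openai.api_type = "azure"
openai.api_base = "https://your-resource.openai.azure.com/"  # your resource endpoint
openai.api_version = "2022-12-01"  # check your resource for the supported API version
openai.api_key = "your-azure-openai-key"

# With Azure, `engine` is the deployment name you created for the model,
# e.g. a deployment of text-davinci-002.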
Context Retrieval with Azure Cognitive Search
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential

def retrieve_context(intent: str, query: str) -> str:
    """Retrieve relevant context from Azure Cognitive Search."""
    search_client = SearchClient(
        endpoint="https://your-search.search.windows.net",
        index_name="knowledge-base",
        credential=AzureKeyCredential("your-key")
    )
    # Search based on intent and query
    filter_expression = f"category eq '{intent}'" if intent != "other" else None
    results = search_client.search(
        search_text=query,
        filter=filter_expression,
        top=3,
        select=["content", "title"]
    )
    context_parts = []
    for result in results:
        context_parts.append(f"## {result['title']}\n{result['content']}")
    return "\n\n".join(context_parts) if context_parts else "No relevant information found."
Response Generation
def generate_response(query: str, context: str, intent: str) -> str:
    """Generate a response using retrieved context."""
    prompt = f"""You are a helpful customer service assistant. Use the provided context to answer the user's query.
If the context doesn't contain relevant information, say so politely.
Intent detected: {intent}
Context:
{context}
User query: {query}
Response:"""
    response = openai.Completion.create(
        engine="text-davinci-002",
        prompt=prompt,
        max_tokens=500,
        temperature=0.7
    )
    return response.choices[0].text.strip()
Building a Complete Pipeline
from dataclasses import dataclass
from typing import List, Optional

@dataclass
class QueryResult:
    query: str
    intent: str
    context: str
    response: str
    sources: List[str]

class CustomerServicePipeline:
    """End-to-end pipeline for customer service queries."""

    def __init__(self, search_endpoint: str, search_key: str, openai_key: str):
        # Client kept on the pipeline for extensions such as source extraction;
        # the helper functions above construct their own clients in these examples.
        self.search_client = SearchClient(
            endpoint=search_endpoint,
            index_name="knowledge-base",
            credential=AzureKeyCredential(search_key)
        )
        openai.api_key = openai_key

    def process_query(self, user_query: str) -> QueryResult:
        """Process a user query through the full pipeline."""
        # Step 1: Classify intent
        intent = classify_intent(user_query)
        # Step 2: Retrieve context
        context = retrieve_context(intent, user_query)
        # Step 3: Generate response
        response = generate_response(user_query, context, intent)
        return QueryResult(
            query=user_query,
            intent=intent,
            context=context,
            response=response,
            sources=[]  # Could extract from search results
        )
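For illustration, here is one way to exercise the pipeline end to end; the endpoint, key, and query values are placeholders:

# Hypothetical wiring for local testing; replace placeholders with real values
pipeline = CustomerServicePipeline(
    search_endpoint="https://your-search.search.windows.net",
    search_key="your-search-key",
    openai_key="your-openai-key"
)

result = pipeline.process_query("How do I reset my password?")
print(result.intent)     # e.g. "question"
print(result.response)   # answer generated from the retrieved context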
Testing and Evaluation
from typing import List, Dict
import json

class LLMEvaluator:
    """Evaluate LLM responses for quality."""

    def __init__(self, test_cases: List[Dict]):
        self.test_cases = test_cases
        self.results = []

    def evaluate_response(self, expected: str, actual: str) -> Dict:
        """Evaluate a single response."""
        # Simple metrics
        exact_match = expected.lower() == actual.lower()
        contains_key_info = all(
            keyword in actual.lower()
            for keyword in self._extract_keywords(expected)
        )
        # Use LLM to evaluate relevance
        relevance_score = self._llm_evaluate(expected, actual)
        return {
            "exact_match": exact_match,
            "contains_key_info": contains_key_info,
            "relevance_score": relevance_score
        }

    def _extract_keywords(self, text: str) -> List[str]:
        """Extract important keywords from text."""
        # Simple implementation - could use NLP
        common_words = {"the", "a", "an", "is", "are", "was", "were", "to", "for"}
        words = text.lower().split()
        return [w for w in words if w not in common_words and len(w) > 3][:5]

    def _llm_evaluate(self, expected: str, actual: str) -> float:
        """Use GPT to evaluate response quality."""
        prompt = f"""Rate how well the actual response addresses the expected response.
Score from 0 to 1, where 1 is perfect.
Expected: {expected}
Actual: {actual}
Score (0-1):"""
        response = openai.Completion.create(
            engine="text-davinci-002",
            prompt=prompt,
            max_tokens=10,
            temperature=0
        )
        try:
            return float(response.choices[0].text.strip())
        except ValueError:
            # The model returned something that isn't a number; fall back to a neutral score
            return 0.5

    def run_evaluation(self, pipeline: CustomerServicePipeline) -> Dict:
        """Run evaluation on all test cases."""
        for test_case in self.test_cases:
            result = pipeline.process_query(test_case["query"])
            evaluation = self.evaluate_response(
                test_case["expected_response"],
                result.response
            )
            self.results.append({
                "query": test_case["query"],
                "expected": test_case["expected_response"],
                "actual": result.response,
                **evaluation
            })
        return self._aggregate_results()

    def _aggregate_results(self) -> Dict:
        """Aggregate evaluation results."""
        n = len(self.results)
        return {
            "total_tests": n,
            "exact_match_rate": sum(r["exact_match"] for r in self.results) / n,
            "key_info_rate": sum(r["contains_key_info"] for r in self.results) / n,
            "avg_relevance": sum(r["relevance_score"] for r in self.results) / n
        }
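The evaluator expects test cases as dictionaries with query and expected_response keys. A small illustrative run might look like this; the queries and expected answers are made-up placeholders:

# Illustrative test cases; in practice these come from a curated dataset
test_cases = [
    {
        "query": "How do I reset my password?",
        "expected_response": "You can reset your password from the account settings page."
    },
    {
        "query": "What are your support hours?",
        "expected_response": "Support is available Monday through Friday, 9am to 5pm."
    }
]

evaluator = LLMEvaluator(test_cases)
summary = evaluator.run_evaluation(pipeline)
print(json.dumps(summary, indent=2))  # exact_match_rate, key_info_rate, avg_relevance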
Error Handling and Fallbacks
from tenacity import retry, stop_after_attempt, wait_exponential
import logging

class RobustLLMClient:
    """LLM client with error handling and fallbacks."""

    def __init__(self):
        self.fallback_responses = {
            "question": "I apologize, but I'm unable to answer your question at the moment. Please try again later or contact our support team.",
            "complaint": "I apologize for the inconvenience. Your feedback has been noted. Please contact our support team for immediate assistance.",
            "default": "I apologize, but I'm experiencing technical difficulties. Please try again later."
        }

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=60)
    )
    def call_with_retry(self, prompt: str, **kwargs) -> str:
        """Call OpenAI API with retry logic."""
        try:
            response = openai.Completion.create(
                engine="text-davinci-002",
                prompt=prompt,
                **kwargs
            )
            return response.choices[0].text.strip()
        except openai.error.RateLimitError:
            logging.warning("Rate limited, retrying...")
            raise
        except openai.error.APIError as e:
            logging.error(f"API error: {e}")
            raise

    def generate_with_fallback(self, prompt: str, intent: str = "default", **kwargs) -> str:
        """Generate response with fallback on failure."""
        try:
            return self.call_with_retry(prompt, **kwargs)
        except Exception as e:
            logging.error(f"All retries failed: {e}")
            return self.fallback_responses.get(intent, self.fallback_responses["default"])
Cost Optimization
import tiktoken
from functools import lru_cache

class CostOptimizedClient:
    """LLM client optimized for cost."""

    def __init__(self, model: str = "text-davinci-002"):
        self.model = model
        self.encoding = tiktoken.encoding_for_model(model)
        self.token_count = 0
        self.cost_per_1k = 0.02  # Davinci pricing

    def count_tokens(self, text: str) -> int:
        """Count tokens in text."""
        return len(self.encoding.encode(text))

    @lru_cache(maxsize=1000)
    def cached_completion(self, prompt: str, max_tokens: int = 200) -> str:
        """Cache completions to avoid redundant API calls."""
        response = openai.Completion.create(
            engine=self.model,
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=0  # Use temperature=0 for cache consistency
        )
        tokens_used = response["usage"]["total_tokens"]
        self.token_count += tokens_used
        return response.choices[0].text.strip()

    def choose_model(self, task_complexity: str) -> str:
        """Choose appropriate model based on task."""
        model_map = {
            "simple": "text-ada-001",      # $0.0004/1K
            "medium": "text-curie-001",    # $0.002/1K
            "complex": "text-davinci-002"  # $0.02/1K
        }
        return model_map.get(task_complexity, "text-curie-001")

    def get_estimated_cost(self) -> float:
        """Get estimated cost so far."""
        return (self.token_count / 1000) * self.cost_per_1k

    def truncate_prompt(self, prompt: str, max_tokens: int = 3000) -> str:
        """Truncate prompt to fit within token limits."""
        tokens = self.encoding.encode(prompt)
        if len(tokens) <= max_tokens:
            return prompt
        return self.encoding.decode(tokens[:max_tokens])
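A brief sketch of how these pieces fit together, counting tokens before a call and reusing cached results for repeated prompts; the prompt text is a placeholder:

client = CostOptimizedClient()

prompt = "Summarize the following support ticket in one sentence: ..."
prompt = client.truncate_prompt(prompt)                     # keep the prompt within the token budget
print(client.count_tokens(prompt))                          # tokens the prompt will consume

answer = client.cached_completion(prompt, max_tokens=100)   # identical prompts are served from cache
print(client.get_estimated_cost())                          # running cost estimate in dollars

# client.choose_model("simple") returns a cheaper model name that could back
# a separate deployment for easy tasks.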
Production Deployment
from flask import Flask, request, jsonify
from azure.monitor.opentelemetry import configure_azure_monitor
import logging

app = Flask(__name__)

# Configure monitoring
configure_azure_monitor(
    connection_string="InstrumentationKey=your-key"
)

# Initialize pipeline
pipeline = CustomerServicePipeline(
    search_endpoint="https://your-search.search.windows.net",
    search_key="your-search-key",
    openai_key="your-openai-key"
)

@app.route("/api/query", methods=["POST"])
def handle_query():
    """Handle customer service queries."""
    data = request.get_json(silent=True) or {}
    query = data.get("query", "")
    if not query:
        return jsonify({"error": "Query is required"}), 400
    try:
        result = pipeline.process_query(query)
        return jsonify({
            "response": result.response,
            "intent": result.intent,
            "sources": result.sources
        })
    except Exception as e:
        logging.error(f"Error processing query: {e}")
        return jsonify({
            "error": "Unable to process query",
            "fallback": "Please contact our support team for assistance."
        }), 500

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8080)
Best Practices
- Prompt versioning: Track changes to prompts like code (a minimal sketch follows this list)
- Test extensively: Use diverse test cases and edge cases
- Monitor in production: Track latency, costs, and quality metrics
- Implement fallbacks: Handle LLM failures gracefully
- Cache when possible: Reduce costs and latency for repeated queries
- Use appropriate models: Match model capability to task complexity
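Prompt versioning can be as simple as keeping prompts in a registry next to the code that uses them, so a wording change shows up in code review and can be rolled back. A minimal sketch; the structure and field names are illustrative, not a prescribed format:

from dataclasses import dataclass

@dataclass(frozen=True)
class PromptTemplate:
    name: str
    version: str
    template: str

# Registry of versioned prompts, reviewed and changed like any other code
PROMPTS = {
    ("classify_intent", "v2"): PromptTemplate(
        name="classify_intent",
        version="v2",
        template="Classify the intent of this message into one of these categories:\n"
                 "- question\n- complaint\n- request\n- feedback\n- other\n"
                 "Message: {message}\nIntent:"
    )
}

def get_prompt(name: str, version: str) -> PromptTemplate:
    return PROMPTS[(name, version)]

# Log the prompt version next to each response so quality regressions can be
# traced back to a specific prompt change.
prompt = get_prompt("classify_intent", "v2").template.format(message="Where is my order?")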
Conclusion
Building production LLM applications requires more than just calling the API. You need structured pipelines, robust error handling, evaluation frameworks, and cost optimization. As Azure OpenAI Service matures, expect to see more tooling emerge to support these patterns.