
GPT-3.5 on Azure: Understanding the Model and Its Capabilities

With Azure OpenAI Service approaching general availability, let's dive into GPT-3.5 - the family of models behind ChatGPT - and how to use it effectively in your applications.

Understanding GPT-3.5 Models

GPT-3.5 is not a single model but a family of models with different capabilities:

Model            | Best For                     | Max Tokens | Cost (per 1K tokens)
-----------------|------------------------------|------------|---------------------
text-davinci-003 | Complex tasks, longer output | 4,097      | $0.02
text-curie-001   | Balanced performance         | 2,049      | $0.002
text-babbage-001 | Straightforward tasks        | 2,049      | $0.0005
text-ada-001     | Simple classification        | 2,049      | $0.0004
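
If you want to choose a model programmatically, the table above is easy to encode. Here's a minimal sketch (MODEL_SPECS and pick_model are names I'm using for illustration, not part of any SDK) that picks the cheapest model whose context window fits the request:

# Encode the model table so code can pick the cheapest model whose
# context window (prompt + completion tokens) fits a request.
# These names are illustrative, not part of the openai SDK.
MODEL_SPECS = {
    "text-davinci-003": {"max_tokens": 4097, "cost_per_1k": 0.02},
    "text-curie-001":   {"max_tokens": 2049, "cost_per_1k": 0.002},
    "text-babbage-001": {"max_tokens": 2049, "cost_per_1k": 0.0005},
    "text-ada-001":     {"max_tokens": 2049, "cost_per_1k": 0.0004},
}

def pick_model(required_tokens: int) -> str:
    """Return the cheapest model whose context window fits the request."""
    candidates = [
        (spec["cost_per_1k"], name)
        for name, spec in MODEL_SPECS.items()
        if spec["max_tokens"] >= required_tokens
    ]
    if not candidates:
        raise ValueError(f"No model supports {required_tokens} tokens")
    return min(candidates)[1]  # cheapest capable model

print(pick_model(3000))  # text-davinci-003

In practice you'd also weigh capability, not just cost: ada is priced for simple classification, not complex generation.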

Deploying GPT-3.5 on Azure

First, deploy your model using Azure CLI:

# Create the Azure OpenAI resource
az cognitiveservices account create \
    --name my-openai-resource \
    --resource-group my-rg \
    --kind OpenAI \
    --sku S0 \
    --location eastus

# Deploy GPT-3.5 model
az cognitiveservices account deployment create \
    --name my-openai-resource \
    --resource-group my-rg \
    --deployment-name gpt35 \
    --model-name text-davinci-003 \
    --model-version "1" \
    --model-format OpenAI \
    --scale-settings-scale-type "Standard"
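
Once the deployment succeeds, it's worth a quick smoke test before writing any application code. A minimal sketch, assuming AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_KEY are exported and the deployment name matches the CLI command above:

# Smoke test for the new deployment (assumes the env vars and
# deployment name used elsewhere in this post).
import os
import openai

openai.api_type = "azure"
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")  # e.g. https://my-openai-resource.openai.azure.com/
openai.api_version = "2022-12-01"
openai.api_key = os.getenv("AZURE_OPENAI_KEY")

response = openai.Completion.create(
    engine="gpt35",  # the --deployment-name from the CLI step
    prompt="Say hello in one short sentence.",
    max_tokens=20,
)
print(response.choices[0].text.strip())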

Working with GPT-3.5 in Python

Here’s a comprehensive example showing various GPT-3.5 capabilities:

import json
import os
from typing import Optional, List, Dict

import openai

class GPT35Client:
    """Client for working with GPT-3.5 on Azure."""

    def __init__(self):
        openai.api_type = "azure"
        openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
        openai.api_version = "2022-12-01"
        openai.api_key = os.getenv("AZURE_OPENAI_KEY")
        self.deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt35")

    def complete(
        self,
        prompt: str,
        max_tokens: int = 500,
        temperature: float = 0.7,
        stop: Optional[List[str]] = None
    ) -> str:
        """Generate a completion."""
        response = openai.Completion.create(
            engine=self.deployment,
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            stop=stop
        )
        return response.choices[0].text.strip()

    def summarize(self, text: str, max_length: int = 100) -> str:
        """Summarize text."""
        prompt = f"""Summarize the following text in approximately {max_length} words:

Text: {text}

Summary:"""
        return self.complete(prompt, max_tokens=max_length * 2)

    def extract_entities(self, text: str) -> Dict:
        """Extract named entities from text."""
        prompt = f"""Extract named entities from the following text.
Return as JSON with categories: persons, organizations, locations, dates.

Text: {text}

JSON:"""
        result = self.complete(prompt, temperature=0.1)
        try:
            return json.loads(result)
        except json.JSONDecodeError:
            # The model occasionally emits malformed JSON; return the raw
            # text so the caller can decide how to recover.
            return {"raw": result}

    def translate(self, text: str, target_language: str) -> str:
        """Translate text to target language."""
        prompt = f"""Translate the following text to {target_language}:

Text: {text}

Translation:"""
        return self.complete(prompt, temperature=0.3)

    def generate_code(self, description: str, language: str = "python") -> str:
        """Generate code from description."""
        prompt = f"""Write {language} code that does the following:
{description}

Only output the code, no explanations.

```{language}"""
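        # Ending the prompt with an opening code fence steers the model
        # toward emitting bare code; stop=["```"] cuts generation at the
        # closing fence.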
        result = self.complete(prompt, temperature=0.2, stop=["```"])
        return result

# Usage examples
client = GPT35Client()

# Summarization
summary = client.summarize("""
Azure OpenAI Service provides REST API access to OpenAI's powerful language
models including GPT-3.5, Codex and DALL-E. These models can be easily adapted
to your specific task including content generation, summarization, semantic
search, and natural language to code translation.
""")
print(f"Summary: {summary}")

# Entity extraction
entities = client.extract_entities(
    "Microsoft CEO Satya Nadella announced in Seattle on January 2023 "
    "that Azure OpenAI Service is now generally available."
)
print(f"Entities: {entities}")

# Code generation
code = client.generate_code(
    "Read a CSV file and calculate the average of a numeric column",
    language="python"
)
print(f"Generated code:\n{code}")

Temperature and Creativity Control

Temperature controls randomness in outputs:

def demonstrate_temperature(prompt: str):
    """Show how temperature affects output."""

    temperatures = [0.0, 0.5, 1.0]

    for temp in temperatures:
        print(f"\n--- Temperature: {temp} ---")
        response = openai.Completion.create(
            engine="gpt35",
            prompt=prompt,
            max_tokens=100,
            temperature=temp,
            n=3  # Generate 3 completions
        )

        for i, choice in enumerate(response.choices):
            print(f"Completion {i+1}: {choice.text.strip()[:100]}...")

# Lower temperature = more deterministic
# Higher temperature = more creative/random
demonstrate_temperature("Write a tagline for a cloud computing company:")

Token Management

Understanding and managing tokens is crucial for staying within context limits and keeping costs predictable:

import tiktoken

def count_tokens(text: str, model: str = "text-davinci-003") -> int:
    """Count tokens in text."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

def optimize_prompt(prompt: str, max_prompt_tokens: int = 3000) -> str:
    """Truncate prompt to fit within token limit."""
    encoding = tiktoken.encoding_for_model("text-davinci-003")
    tokens = encoding.encode(prompt)

    if len(tokens) <= max_prompt_tokens:
        return prompt

    # Truncate and decode
    truncated_tokens = tokens[:max_prompt_tokens]
    return encoding.decode(truncated_tokens) + "..."

def estimate_cost(prompt: str, max_tokens: int, model: str = "text-davinci-003") -> float:
    """Estimate cost of a completion request."""
    prompt_tokens = count_tokens(prompt, model)
    total_tokens = prompt_tokens + max_tokens

    # Pricing per 1K tokens
    pricing = {
        "text-davinci-003": 0.02,
        "text-curie-001": 0.002,
        "text-babbage-001": 0.0005,
        "text-ada-001": 0.0004
    }

    rate = pricing.get(model, 0.02)
    return (total_tokens / 1000) * rate

# Example
prompt = "Explain quantum computing in simple terms:"
print(f"Token count: {count_tokens(prompt)}")
print(f"Estimated cost: ${estimate_cost(prompt, 500):.4f}")

Practical Use Cases

Document Classification

def classify_document(document: str, categories: List[str]) -> str:
    """Classify a document into one of the given categories."""
    categories_str = ", ".join(categories)

    prompt = f"""Classify the following document into one of these categories: {categories_str}

Document: {document}

Category:"""

    response = openai.Completion.create(
        engine="gpt35",
        prompt=prompt,
        max_tokens=20,
        temperature=0.1
    )

    return response.choices[0].text.strip()

# Usage
result = classify_document(
    "The quarterly revenue exceeded expectations with a 15% YoY growth...",
    ["Finance", "Technology", "Healthcare", "Marketing"]
)
print(f"Classification: {result}")  # Output: Finance

SQL Query Generation

def generate_sql(question: str, schema: str) -> str:
    """Generate SQL query from natural language question."""
    prompt = f"""Given the following SQL schema:
{schema}

Write a SQL query to answer: {question}

SQL Query:"""

    response = openai.Completion.create(
        engine="gpt35",
        prompt=prompt,
        max_tokens=200,
        temperature=0.1,
        stop=[";"]
    )

    return response.choices[0].text.strip() + ";"

# Usage
schema = """
CREATE TABLE orders (
    order_id INT PRIMARY KEY,
    customer_id INT,
    order_date DATE,
    total_amount DECIMAL(10,2)
);

CREATE TABLE customers (
    customer_id INT PRIMARY KEY,
    name VARCHAR(100),
    email VARCHAR(100)
);
"""

query = generate_sql(
    "Find the top 5 customers by total order amount",
    schema
)
print(query)

Best Practices

  1. Use the right model size: Start with smaller models and only upgrade if needed
  2. Set appropriate temperature: Lower for factual tasks, higher for creative
  3. Implement token counting: Prevent exceeding context limits
  4. Cache responses: Cache results for repeated queries (see the sketch after this list)
  5. Handle rate limits: Implement exponential backoff (see the sketch after this list)
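
Items 4 and 5 combine naturally in a single helper. Here's a minimal sketch with an in-memory cache and exponential backoff; complete_with_retry and the cache policy are illustrative choices, and openai.error.RateLimitError is the rate-limit exception raised by the openai package used throughout this post:

import time

import openai

_cache = {}

def complete_with_retry(
    prompt: str,
    engine: str = "gpt35",
    max_tokens: int = 200,
    temperature: float = 0.0,
    max_retries: int = 5,
) -> str:
    """Completion with a naive in-memory cache and exponential backoff."""
    # Only cache deterministic requests: sampled output varies between calls.
    cache_key = (prompt, engine, max_tokens, temperature)
    if temperature == 0.0 and cache_key in _cache:
        return _cache[cache_key]

    delay = 1.0
    for attempt in range(max_retries):
        try:
            response = openai.Completion.create(
                engine=engine,
                prompt=prompt,
                max_tokens=max_tokens,
                temperature=temperature,
            )
            result = response.choices[0].text.strip()
            if temperature == 0.0:
                _cache[cache_key] = result
            return result
        except openai.error.RateLimitError:
            if attempt == max_retries - 1:
                raise  # give up after the last retry
            time.sleep(delay)
            delay *= 2  # back off: 1s, 2s, 4s, ...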

What’s Next

GPT-3.5 is powerful, but the real game-changer is coming: ChatGPT-style conversations with the Chat Completion API. Stay tuned for my next post on ChatGPT integration patterns.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.