
GPT-3.5 on Azure: Understanding the Model and Its Capabilities

With Azure OpenAI Service approaching general availability, let's dive into GPT-3.5 - the family of models behind ChatGPT - and how to use it effectively in your applications.

Understanding GPT-3.5 Models

GPT-3.5 is not a single model but a family of models with different capabilities:

Model            | Best For                     | Max Tokens | Cost (per 1K tokens)
-----------------|------------------------------|------------|---------------------
text-davinci-003 | Complex tasks, longer output | 4,097      | $0.02
text-curie-001   | Balanced performance         | 2,049      | $0.002
text-babbage-001 | Straightforward tasks        | 2,049      | $0.0005
text-ada-001     | Simple classification        | 2,049      | $0.0004
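
If you want to choose a model programmatically, the table above is easy to encode. Here's a minimal sketch (MODEL_SPECS and pick_model are names I'm using for illustration, not part of any SDK) that picks the cheapest model whose context window fits the request:

# Encode the model table so code can pick the cheapest model whose
# context window (prompt + completion tokens) fits a request.
# These names are illustrative, not part of the openai SDK.
MODEL_SPECS = {
    "text-davinci-003": {"max_tokens": 4097, "cost_per_1k": 0.02},
    "text-curie-001":   {"max_tokens": 2049, "cost_per_1k": 0.002},
    "text-babbage-001": {"max_tokens": 2049, "cost_per_1k": 0.0005},
    "text-ada-001":     {"max_tokens": 2049, "cost_per_1k": 0.0004},
}

def pick_model(required_tokens: int) -> str:
    """Return the cheapest model whose context window fits the request."""
    candidates = [
        (spec["cost_per_1k"], name)
        for name, spec in MODEL_SPECS.items()
        if spec["max_tokens"] >= required_tokens
    ]
    if not candidates:
        raise ValueError(f"No model supports {required_tokens} tokens")
    return min(candidates)[1]  # cheapest capable model

print(pick_model(3000))  # text-davinci-003

In practice you'd also weigh capability, not just cost: ada is priced for simple classification, not complex generation.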

Deploying GPT-3.5 on Azure

First, deploy your model using Azure CLI:

# Create the Azure OpenAI resource
az cognitiveservices account create \
    --name my-openai-resource \
    --resource-group my-rg \
    --kind OpenAI \
    --sku S0 \
    --location eastus

# Deploy GPT-3.5 model
az cognitiveservices account deployment create \
    --name my-openai-resource \
    --resource-group my-rg \
    --deployment-name gpt35 \
    --model-name text-davinci-003 \
    --model-version "1" \
    --model-format OpenAI \
    --scale-settings-scale-type "Standard"
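
Once the deployment succeeds, it's worth a quick smoke test before writing any application code. A minimal sketch, assuming AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_KEY are exported and the deployment name matches the CLI command above:

# Smoke test for the new deployment (assumes the env vars and
# deployment name used elsewhere in this post).
import os
import openai

openai.api_type = "azure"
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")  # e.g. https://my-openai-resource.openai.azure.com/
openai.api_version = "2022-12-01"
openai.api_key = os.getenv("AZURE_OPENAI_KEY")

response = openai.Completion.create(
    engine="gpt35",  # the --deployment-name from the CLI step
    prompt="Say hello in one short sentence.",
    max_tokens=20,
)
print(response.choices[0].text.strip())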

Working with GPT-3.5 in Python

Here’s a comprehensive example showing various GPT-3.5 capabilities:

import json
import os
from typing import Optional, List, Dict

import openai

class GPT35Client:
    """Client for working with GPT-3.5 on Azure."""

    def __init__(self):
        openai.api_type = "azure"
        openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
        openai.api_version = "2022-12-01"
        openai.api_key = os.getenv("AZURE_OPENAI_KEY")
        self.deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT", "gpt35")

    def complete(
        self,
        prompt: str,
        max_tokens: int = 500,
        temperature: float = 0.7,
        stop: Optional[List[str]] = None
    ) -> str:
        """Generate a completion."""
        response = openai.Completion.create(
            engine=self.deployment,
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            stop=stop
        )
        return response.choices[0].text.strip()

    def summarize(self, text: str, max_length: int = 100) -> str:
        """Summarize text."""
        prompt = f"""Summarize the following text in approximately {max_length} words:

Text: {text}

Summary:"""
        return self.complete(prompt, max_tokens=max_length * 2)

    def extract_entities(self, text: str) -> Dict:
        """Extract named entities from text."""
        prompt = f"""Extract named entities from the following text.
Return as JSON with categories: persons, organizations, locations, dates.

Text: {text}

JSON:"""
        result = self.complete(prompt, temperature=0.1)
        try:
            return json.loads(result)
        except json.JSONDecodeError:
            # The model occasionally emits malformed JSON; return the raw
            # text so the caller can decide how to recover.
            return {"raw": result}

    def translate(self, text: str, target_language: str) -> str:
        """Translate text to target language."""
        prompt = f"""Translate the following text to {target_language}:

Text: {text}

Translation:"""
        return self.complete(prompt, temperature=0.3)

    def generate_code(self, description: str, language: str = "python") -> str:
        """Generate code from description."""
        prompt = f"""Write {language} code that does the following:
{description}

Only output the code, no explanations.

```{language}"""
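        # Ending the prompt with an opening code fence steers the model
        # toward emitting bare code; stop=["```"] cuts generation at the
        # closing fence.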
        result = self.complete(prompt, temperature=0.2, stop=["```"])
        return result

# Usage examples
client = GPT35Client()

# Summarization
summary = client.summarize("""
Azure OpenAI Service provides REST API access to OpenAI's powerful language
models including GPT-3.5, Codex and DALL-E. These models can be easily adapted
to your specific task including content generation, summarization, semantic
search, and natural language to code translation.
""")
print(f"Summary: {summary}")

# Entity extraction
entities = client.extract_entities(
    "Microsoft CEO Satya Nadella announced in Seattle on January 2023 "
    "that Azure OpenAI Service is now generally available."
)
print(f"Entities: {entities}")

# Code generation
code = client.generate_code(
    "Read a CSV file and calculate the average of a numeric column",
    language="python"
)
print(f"Generated code:\n{code}")

Temperature and Creativity Control

Temperature controls randomness in outputs:

def demonstrate_temperature(prompt: str):
    """Show how temperature affects output."""

    temperatures = [0.0, 0.5, 1.0]

    for temp in temperatures:
        print(f"\n--- Temperature: {temp} ---")
        response = openai.Completion.create(
            engine="gpt35",
            prompt=prompt,
            max_tokens=100,
            temperature=temp,
            n=3  # Generate 3 completions
        )

        for i, choice in enumerate(response.choices):
            print(f"Completion {i+1}: {choice.text.strip()[:100]}...")

# Lower temperature = more deterministic
# Higher temperature = more creative/random
demonstrate_temperature("Write a tagline for a cloud computing company:")

Token Management

Understanding and managing tokens is crucial for staying within context limits and keeping costs predictable:

import tiktoken

def count_tokens(text: str, model: str = "text-davinci-003") -> int:
    """Count tokens in text."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))

def optimize_prompt(prompt: str, max_prompt_tokens: int = 3000) -> str:
    """Truncate prompt to fit within token limit."""
    encoding = tiktoken.encoding_for_model("text-davinci-003")
    tokens = encoding.encode(prompt)

    if len(tokens) <= max_prompt_tokens:
        return prompt

    # Truncate and decode
    truncated_tokens = tokens[:max_prompt_tokens]
    return encoding.decode(truncated_tokens) + "..."

def estimate_cost(prompt: str, max_tokens: int, model: str = "text-davinci-003") -> float:
    """Estimate cost of a completion request."""
    prompt_tokens = count_tokens(prompt, model)
    total_tokens = prompt_tokens + max_tokens

    # Pricing per 1K tokens
    pricing = {
        "text-davinci-003": 0.02,
        "text-curie-001": 0.002,
        "text-babbage-001": 0.0005,
        "text-ada-001": 0.0004
    }

    rate = pricing.get(model, 0.02)
    return (total_tokens / 1000) * rate

# Example
prompt = "Explain quantum computing in simple terms:"
print(f"Token count: {count_tokens(prompt)}")
print(f"Estimated cost: ${estimate_cost(prompt, 500):.4f}")

Practical Use Cases

Document Classification

def classify_document(document: str, categories: List[str]) -> str:
    """Classify a document into one of the given categories."""
    categories_str = ", ".join(categories)

    prompt = f"""Classify the following document into one of these categories: {categories_str}

Document: {document}

Category:"""

    response = openai.Completion.create(
        engine="gpt35",
        prompt=prompt,
        max_tokens=20,
        temperature=0.1
    )

    return response.choices[0].text.strip()

# Usage
result = classify_document(
    "The quarterly revenue exceeded expectations with a 15% YoY growth...",
    ["Finance", "Technology", "Healthcare", "Marketing"]
)
print(f"Classification: {result}")  # Output: Finance

SQL Query Generation

def generate_sql(question: str, schema: str) -> str:
    """Generate SQL query from natural language question."""
    prompt = f"""Given the following SQL schema:
{schema}

Write a SQL query to answer: {question}

SQL Query:"""

    response = openai.Completion.create(
        engine="gpt35",
        prompt=prompt,
        max_tokens=200,
        temperature=0.1,
        stop=[";"]
    )

    return response.choices[0].text.strip() + ";"

# Usage
schema = """
CREATE TABLE orders (
    order_id INT PRIMARY KEY,
    customer_id INT,
    order_date DATE,
    total_amount DECIMAL(10,2)
);

CREATE TABLE customers (
    customer_id INT PRIMARY KEY,
    name VARCHAR(100),
    email VARCHAR(100)
);
"""

query = generate_sql(
    "Find the top 5 customers by total order amount",
    schema
)
print(query)

Best Practices

  1. Use the right model size: Start with smaller models and only upgrade if needed
  2. Set appropriate temperature: Lower for factual tasks, higher for creative
  3. Implement token counting: Prevent exceeding context limits
  4. Cache responses: Cache results for repeated queries (see the sketch after this list)
  5. Handle rate limits: Implement exponential backoff (see the sketch after this list)
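
Items 4 and 5 combine naturally in a single helper. Here's a minimal sketch with an in-memory cache and exponential backoff; complete_with_retry and the cache policy are illustrative choices, and openai.error.RateLimitError is the rate-limit exception raised by the openai package used throughout this post:

import time

import openai

_cache = {}

def complete_with_retry(
    prompt: str,
    engine: str = "gpt35",
    max_tokens: int = 200,
    temperature: float = 0.0,
    max_retries: int = 5,
) -> str:
    """Completion with a naive in-memory cache and exponential backoff."""
    # Only cache deterministic requests: sampled output varies between calls.
    cache_key = (prompt, engine, max_tokens, temperature)
    if temperature == 0.0 and cache_key in _cache:
        return _cache[cache_key]

    delay = 1.0
    for attempt in range(max_retries):
        try:
            response = openai.Completion.create(
                engine=engine,
                prompt=prompt,
                max_tokens=max_tokens,
                temperature=temperature,
            )
            result = response.choices[0].text.strip()
            if temperature == 0.0:
                _cache[cache_key] = result
            return result
        except openai.error.RateLimitError:
            if attempt == max_retries - 1:
                raise  # give up after the last retry
            time.sleep(delay)
            delay *= 2  # back off: 1s, 2s, 4s, ...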

What’s Next

GPT-3.5 is powerful, but the real game-changer is coming: ChatGPT-style conversations with the Chat Completion API. Stay tuned for my next post on ChatGPT integration patterns.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.