Cost Management for Azure AI Services: Budgeting and Optimization

AI services can quickly become expensive without proper cost management. Understanding Azure AI pricing models and implementing optimization strategies ensures sustainable AI adoption.

Understanding AI Costs

Azure AI costs depend on multiple factors: model selection, token usage, deployment type (serverless vs. provisioned), and regional pricing. Visibility into these components enables informed decisions.
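
To get an intuition for how deployment type drives the bill, the sketch below compares a pay-per-token estimate against a flat provisioned rate. The per-1K-token and hourly figures are placeholder assumptions for illustration, not current Azure prices; substitute the rates for your region and model from the Azure pricing page.

# Illustrative comparison only: the rates below are placeholder assumptions,
# not current Azure prices
SERVERLESS_COST_PER_1K_TOKENS = 0.01   # assumed blended input/output rate, USD
PROVISIONED_COST_PER_HOUR = 6.00       # assumed flat hourly rate, USD

def compare_monthly_cost(tokens_per_day: int) -> dict:
    """Rough monthly cost of serverless vs. provisioned at a given volume."""
    serverless = tokens_per_day / 1000 * SERVERLESS_COST_PER_1K_TOKENS * 30
    provisioned = PROVISIONED_COST_PER_HOUR * 24 * 30
    return {
        "serverless": round(serverless, 2),
        "provisioned": round(provisioned, 2),
        "cheaper": "serverless" if serverless < provisioned else "provisioned",
    }

print(compare_monthly_cost(tokens_per_day=2_000_000))    # low volume: serverless wins
print(compare_monthly_cost(tokens_per_day=50_000_000))   # sustained high volume: provisioned wins

At low or spiky volume, pay-per-token pricing usually wins; once traffic is high and sustained, a provisioned deployment can become the cheaper option.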

Cost Tracking Implementation

Build cost monitoring on top of the Cost Management SDK:

from azure.mgmt.costmanagement import CostManagementClient
from azure.identity import DefaultAzureCredential
from dataclasses import dataclass
from datetime import datetime, timedelta

@dataclass
class CostBreakdown:
    service: str
    resource: str
    daily_cost: float
    monthly_projection: float
    trend: str

class AICostTracker:
    def __init__(self, subscription_id: str):
        self.client = CostManagementClient(
            credential=DefaultAzureCredential(),
            subscription_id=subscription_id
        )
        self.subscription_id = subscription_id

    def get_ai_costs(self, days: int = 30) -> list[CostBreakdown]:
        """Get cost breakdown for AI services."""

        scope = f"/subscriptions/{self.subscription_id}"

        query = {
            "type": "ActualCost",
            "timeframe": "Custom",
            "timePeriod": {
                "from": (datetime.now() - timedelta(days=days)).isoformat(),
                "to": datetime.now().isoformat()
            },
            "dataset": {
                "granularity": "Daily",
                "aggregation": {
                    "totalCost": {
                        "name": "Cost",
                        "function": "Sum"
                    }
                },
                "grouping": [
                    {"type": "Dimension", "name": "ServiceName"},
                    {"type": "Dimension", "name": "ResourceId"}
                ],
                "filter": {
                    "or": [
                        {"dimensions": {"name": "ServiceName", "operator": "In",
                                       "values": ["Azure OpenAI", "Cognitive Services",
                                                 "Azure AI Search"]}},
                        {"tags": {"name": "category", "operator": "In",
                                 "values": ["ai", "ml"]}}
                    ]
                }
            }
        }

        result = self.client.query.usage(scope=scope, parameters=query)

        # Column order in the response is not fixed, so resolve indices
        # from the returned column metadata
        columns = [col.name for col in result.columns]
        cost_idx = columns.index("Cost")
        service_idx = columns.index("ServiceName")
        resource_idx = columns.index("ResourceId")

        # With Daily granularity each row is one day's cost for a
        # service/resource pair, so sum per resource before projecting
        totals: dict[tuple[str, str], float] = {}
        for row in result.rows:
            key = (row[service_idx], row[resource_idx])
            totals[key] = totals.get(key, 0.0) + row[cost_idx]

        costs = []
        for (service, resource), total_cost in totals.items():
            daily_avg = total_cost / days
            monthly_proj = daily_avg * 30

            # Trend: compare the last 7 days against the period average
            # (_get_recent_average is a helper not shown here)
            recent_avg = self._get_recent_average(resource, 7)
            trend = "increasing" if recent_avg > daily_avg else "decreasing"

            costs.append(CostBreakdown(
                service=service,
                resource=resource,
                daily_cost=daily_avg,
                monthly_projection=monthly_proj,
                trend=trend
            ))

        return costs

    def set_budget_alert(
        self,
        resource_group: str,
        monthly_budget: float,
        alert_thresholds: list[int] = [50, 75, 90, 100]
    ):
        """Create budget with alerts for AI resources."""

        from azure.mgmt.consumption import ConsumptionManagementClient

        consumption_client = ConsumptionManagementClient(
            credential=DefaultAzureCredential(),
            subscription_id=self.subscription_id
        )

        scope = f"/subscriptions/{self.subscription_id}/resourceGroups/{resource_group}"

        notifications = {}
        for threshold in alert_thresholds:
            notifications[f"alert_{threshold}"] = {
                "enabled": True,
                "operator": "GreaterThan",
                "threshold": threshold,
                "contactEmails": ["ai-team@company.com"],
                "contactRoles": ["Owner", "Contributor"]
            }

        budget = {
            "category": "Cost",
            "amount": monthly_budget,
            "timeGrain": "Monthly",
            "timePeriod": {
                "startDate": datetime.now().replace(day=1).isoformat(),
                "endDate": (datetime.now().replace(day=1) + timedelta(days=365)).isoformat()
            },
            "filter": {
                "tags": {
                    "name": "category",
                    "operator": "In",
                    "values": ["ai"]
                }
            },
            "notifications": notifications
        }

        consumption_client.budgets.create_or_update(
            scope=scope,
            budget_name=f"{resource_group}-ai-budget",
            parameters=budget
        )

Token Usage Optimization

Implement strategies to reduce token consumption:

import tiktoken

class TokenOptimizer:
    def __init__(self):
        self.tiktoken = tiktoken.encoding_for_model("gpt-4")

    def estimate_cost(self, messages: list[dict], model: str) -> dict:
        """Estimate request cost before execution."""

        pricing = {
            "gpt-4": {"input": 0.03, "output": 0.06},
            "gpt-4-turbo": {"input": 0.01, "output": 0.03},
            "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015}
        }

        input_tokens = sum(
            len(self.tiktoken.encode(m["content"]))
            for m in messages
        )

        # Estimate output (typically 2-3x input for conversations)
        estimated_output = input_tokens * 2

        model_pricing = pricing.get(model, pricing["gpt-4"])

        return {
            "input_tokens": input_tokens,
            "estimated_output_tokens": estimated_output,
            "estimated_cost": (
                (input_tokens / 1000) * model_pricing["input"] +
                (estimated_output / 1000) * model_pricing["output"]
            )
        }

    def optimize_prompt(self, prompt: str, max_tokens: int) -> str:
        """Reduce prompt size while preserving meaning."""

        current_tokens = len(self.tiktoken.encode(prompt))

        if current_tokens <= max_tokens:
            return prompt

        # Compression strategies
        # 1. Remove redundant whitespace
        prompt = " ".join(prompt.split())

        # 2. Use abbreviations for common terms
        abbreviations = {
            "for example": "e.g.",
            "that is": "i.e.",
            "and so on": "etc."
        }
        for full, abbrev in abbreviations.items():
            prompt = prompt.replace(full, abbrev)

        # Light compression only: the result may still exceed max_tokens,
        # in which case the caller should truncate or summarise further
        return prompt
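
A natural way to apply the optimizer is to estimate cost before each call and route expensive requests to a cheaper model. The snippet below is a sketch; the $0.05 threshold is an arbitrary example value, not a recommendation.

optimizer = TokenOptimizer()

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Summarise the quarterly report in five bullet points."}
]

estimate = optimizer.estimate_cost(messages, model="gpt-4")

# Route to a cheaper model when the estimated cost crosses the (example) threshold
model = "gpt-4" if estimate["estimated_cost"] < 0.05 else "gpt-3.5-turbo"
print(f"~{estimate['input_tokens']} input tokens, estimated ${estimate['estimated_cost']:.4f}, using {model}")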

Effective cost management combines visibility, budgeting, and optimization. Regular review of AI spending patterns helps identify opportunities to reduce costs without compromising functionality.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.