Skip to content
Back to Blog
1 min read

Cost Management for Azure AI Services: Budgeting and Optimization

I wrote “Cost Management for Azure AI Services: Budgeting and Optimization” to share practical, production-minded guidance on this topic.

Understanding AI Costs

Azure AI costs depend on multiple factors: model selection, token usage, deployment type (serverless vs. provisioned), and regional pricing. Visibility into these components enables informed decisions.

Cost Tracking Implementation

Build comprehensive cost monitoring:

from azure.mgmt.costmanagement import CostManagementClient
from azure.identity import DefaultAzureCredential
from dataclasses import dataclass
from datetime import datetime, timedelta
import pandas as pd

@dataclass
class CostBreakdown:
    """Cost summary for a single AI resource, as produced by the cost tracker."""

    # Service name the cost is attributed to.
    service: str
    # Identifier of the resource the cost is attributed to.
    resource: str
    # Average cost per day over the queried window.
    daily_cost: float
    # Daily average extrapolated to a 30-day month.
    monthly_projection: float
    # "increasing" or "decreasing", comparing recent spend to the window average.
    trend: str

class AICostTracker:
    """Query Azure AI spend for one subscription and create budget alerts.

    Attributes:
        subscription_id: Subscription whose costs are queried and budgeted.
        credential: Azure credential shared by every management client.
        client: Cost Management client used for usage queries.
    """

    # Trailing window (in days) used for the "recent" side of the trend check.
    RECENT_WINDOW_DAYS = 7
    # Percent-of-budget thresholds used when the caller supplies none.
    DEFAULT_ALERT_THRESHOLDS = (50, 75, 90, 100)
    # Notification recipients used when the caller supplies none.
    DEFAULT_CONTACT_EMAILS = ("ai-team@company.com",)

    def __init__(self, subscription_id: str):
        """Create the Cost Management client for ``subscription_id``."""
        self.subscription_id = subscription_id
        # Kept on the instance so set_budget_alert can reuse it instead of
        # resolving a second credential chain.
        self.credential = DefaultAzureCredential()
        self.client = CostManagementClient(
            credential=self.credential,
            subscription_id=subscription_id
        )

    def get_ai_costs(self, days: int = 30) -> list[CostBreakdown]:
        """Get cost breakdown for AI services.

        Runs a daily-granularity usage query grouped by service and resource,
        then folds the per-day rows into one ``CostBreakdown`` per
        (service, resource) pair.

        Args:
            days: Length of the look-back window in days; must be positive.

        Returns:
            One ``CostBreakdown`` per (service, resource) pair that has cost
            rows in the window; an empty list when the query returns no rows.

        Raises:
            ValueError: If ``days`` is not positive, or the query result lacks
                one of the columns needed to interpret its rows.
        """
        if days <= 0:
            raise ValueError("days must be a positive integer")

        scope = f"/subscriptions/{self.subscription_id}"

        # Take "now" once so both ends of the period come from the same instant.
        window_end = datetime.now()
        window_start = window_end - timedelta(days=days)

        query = {
            "type": "ActualCost",
            "timeframe": "Custom",
            "timePeriod": {
                "from": window_start.isoformat(),
                "to": window_end.isoformat()
            },
            "dataset": {
                "granularity": "Daily",
                "aggregation": {
                    "totalCost": {
                        "name": "Cost",
                        "function": "Sum"
                    }
                },
                "grouping": [
                    {"type": "Dimension", "name": "ServiceName"},
                    {"type": "Dimension", "name": "ResourceId"}
                ],
                "filter": {
                    "or": [
                        {"dimensions": {"name": "ServiceName", "operator": "In",
                                       "values": ["Azure OpenAI", "Cognitive Services",
                                                 "Azure AI Search"]}},
                        {"tags": {"name": "category", "operator": "In",
                                 "values": ["ai", "ml"]}}
                    ]
                }
            }
        }

        result = self.client.query.usage(scope=scope, parameters=query)

        rows = result.rows or []
        if not rows:
            return []

        # The row layout is described by result.columns; resolve positions by
        # name instead of assuming a fixed column order.
        column_names = [column.name for column in (result.columns or [])]
        cost_idx = self._column_index(
            column_names, ("Cost", "totalCost", "PreTaxCost")
        )
        date_idx = self._column_index(column_names, ("UsageDate",))
        service_idx = self._column_index(column_names, ("ServiceName",))
        resource_idx = self._column_index(column_names, ("ResourceId",))

        recent_days = min(self.RECENT_WINDOW_DAYS, days)
        recent_cutoff = (window_end - timedelta(days=recent_days)).date()

        # Daily granularity yields one row per day per (service, resource), so
        # accumulate [window total, recent total] for each pair.
        totals: dict = {}
        for row in rows:
            key = (row[service_idx], row[resource_idx])
            cost = float(row[cost_idx] or 0)
            bucket = totals.setdefault(key, [0.0, 0.0])
            bucket[0] += cost
            if self._parse_usage_date(row[date_idx]) > recent_cutoff:
                bucket[1] += cost

        costs = []
        for (service, resource), (total, recent_total) in totals.items():
            # Days without a row count as zero spend in both averages.
            daily_avg = total / days
            recent_avg = recent_total / recent_days
            trend = "increasing" if recent_avg > daily_avg else "decreasing"

            costs.append(CostBreakdown(
                service=service,
                resource=resource,
                daily_cost=daily_avg,
                monthly_projection=daily_avg * 30,
                trend=trend
            ))

        return costs

    @staticmethod
    def _column_index(column_names, candidates) -> int:
        """Return the position of the first candidate present in column_names."""
        lowered = [str(name).lower() for name in column_names]
        for candidate in candidates:
            if candidate.lower() in lowered:
                return lowered.index(candidate.lower())
        raise ValueError(
            f"Cost query result has none of the columns {list(candidates)}; "
            f"got {list(column_names)}"
        )

    @staticmethod
    def _parse_usage_date(value):
        """Convert a usage-date cell (20240115 or ISO-8601 text) to a date."""
        digits = str(value)[:10].replace("-", "")
        return datetime.strptime(digits[:8], "%Y%m%d").date()

    def set_budget_alert(
        self,
        resource_group: str,
        monthly_budget: float,
        alert_thresholds: "list[int] | None" = None,
        contact_emails: "list[str] | None" = None
    ):
        """Create budget with alerts for AI resources.

        Creates or updates a monthly cost budget named
        ``"<resource_group>-ai-budget"`` on the resource group, filtered to
        resources tagged ``category=ai``.

        Args:
            resource_group: Resource group the budget is scoped to.
            monthly_budget: Budget amount per month.
            alert_thresholds: Percent-of-budget levels that each get a
                notification; defaults to 50, 75, 90 and 100.
            contact_emails: Addresses notified for each threshold; defaults
                to ``ai-team@company.com``.
        """

        from azure.mgmt.consumption import ConsumptionManagementClient

        # None sentinels avoid sharing one mutable default list across calls.
        if alert_thresholds is None:
            alert_thresholds = list(self.DEFAULT_ALERT_THRESHOLDS)
        if contact_emails is None:
            contact_emails = list(self.DEFAULT_CONTACT_EMAILS)

        consumption_client = ConsumptionManagementClient(
            credential=self.credential,
            subscription_id=self.subscription_id
        )

        scope = f"/subscriptions/{self.subscription_id}/resourceGroups/{resource_group}"

        notifications = {
            f"alert_{threshold}": {
                "enabled": True,
                "operator": "GreaterThan",
                "threshold": threshold,
                "contactEmails": list(contact_emails),
                "contactRoles": ["Owner", "Contributor"]
            }
            for threshold in alert_thresholds
        }

        # Start at midnight on the first of the current month rather than
        # carrying the current time of day into the budget period.
        period_start = datetime.now().replace(
            day=1, hour=0, minute=0, second=0, microsecond=0
        )
        period_end = period_start + timedelta(days=365)

        budget = {
            "category": "Cost",
            "amount": monthly_budget,
            "timeGrain": "Monthly",
            "timePeriod": {
                "startDate": period_start.isoformat(),
                "endDate": period_end.isoformat()
            },
            "filter": {
                "tags": {
                    "name": "category",
                    "operator": "In",
                    "values": ["ai"]
                }
            },
            "notifications": notifications
        }

        consumption_client.budgets.create_or_update(
            scope=scope,
            budget_name=f"{resource_group}-ai-budget",
            parameters=budget
        )

Token Usage Optimization

Implement strategies to reduce token consumption:

class TokenOptimizer:
    """Estimate request cost and shrink prompts using token counts.

    Attributes:
        tiktoken: Encoder used for every token count (the "gpt-4" encoding,
            regardless of the model being priced).
    """

    # Price per 1,000 tokens for each model (currency presumably USD; confirm
    # against the current price sheet before relying on these figures).
    PRICING = {
        "gpt-4": {"input": 0.03, "output": 0.06},
        "gpt-4-turbo": {"input": 0.01, "output": 0.03},
        "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015}
    }
    # Model whose prices are used when the requested model is not listed.
    FALLBACK_MODEL = "gpt-4"
    # Longer phrases and the shorter forms substituted by optimize_prompt.
    ABBREVIATIONS = {
        "for example": "e.g.",
        "that is": "i.e.",
        "and so on": "etc."
    }

    def __init__(self):
        """Load the encoder used for token counting."""
        # Imported here so the class brings its own dependency into scope.
        import tiktoken

        self.tiktoken = tiktoken.encoding_for_model("gpt-4")

    def _content_tokens(self, content) -> int:
        """Count tokens in one message ``content`` value.

        Accepts plain text, ``None`` (counted as zero), or a list of parts in
        which only each dict part's ``"text"`` entry is counted.
        """
        if content is None:
            return 0
        if isinstance(content, str):
            return len(self.tiktoken.encode(content))
        if isinstance(content, list):
            return sum(
                self._content_tokens(
                    part.get("text") if isinstance(part, dict) else part
                )
                for part in content
            )
        return len(self.tiktoken.encode(str(content)))

    def estimate_cost(
        self,
        messages: list[dict],
        model: str,
        output_ratio: float = 2
    ) -> dict:
        """Estimate request cost before execution.

        Args:
            messages: Chat messages; each message's ``"content"`` is counted.
                Messages with missing or ``None`` content contribute zero.
            model: Model name used to look up prices; unknown models are
                priced as ``"gpt-4"``.
            output_ratio: Assumed output tokens per input token.

        Returns:
            A dict with ``"input_tokens"``, ``"estimated_output_tokens"`` and
            ``"estimated_cost"``.
        """
        input_tokens = sum(
            self._content_tokens(message.get("content"))
            for message in messages
        )

        # Output length is unknown before the call, so it is approximated as a
        # multiple of the input length.
        estimated_output = int(input_tokens * output_ratio)

        model_pricing = self.PRICING.get(model, self.PRICING[self.FALLBACK_MODEL])

        return {
            "input_tokens": input_tokens,
            "estimated_output_tokens": estimated_output,
            "estimated_cost": (
                (input_tokens / 1000) * model_pricing["input"] +
                (estimated_output / 1000) * model_pricing["output"]
            )
        }

    def optimize_prompt(self, prompt: str, max_tokens: int) -> str:
        """Reduce prompt size while preserving meaning.

        A prompt already within ``max_tokens`` is returned unchanged.
        Otherwise whitespace runs are collapsed and a few common phrases are
        abbreviated. The text is never truncated, so the result may still
        exceed ``max_tokens``.

        Args:
            prompt: Prompt text to shrink.
            max_tokens: Token count at or below which the prompt is left as is.

        Returns:
            The original or compressed prompt.
        """
        if len(self.tiktoken.encode(prompt)) <= max_tokens:
            return prompt

        # 1. Collapse every run of whitespace (including newlines) to one space.
        compressed = " ".join(prompt.split())

        # 2. Swap common phrases for their abbreviations (case-sensitive).
        for phrase, short_form in self.ABBREVIATIONS.items():
            compressed = compressed.replace(phrase, short_form)

        return compressed

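Effective cost management combines visibility, budgeting, and optimization. Regular review of AI spending patterns helps identify opportunities to reduce costs without compromising functionality.

## Takeaways

- Track AI spend per service and per resource so that cost drivers are visible before they become surprises.
- Put a monthly budget on AI resources and alert at several thresholds (for example 50%, 75%, 90%, and 100%) rather than only when the budget is exhausted.
- Estimate token cost before sending a request, and trim prompts where that does not change their meaning.

Next steps: tag AI resources consistently so cost queries and budgets can filter on them, then review the cost breakdown on a regular schedule.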

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.