Back to Blog
6 min read

Quota Management: Controlling AI Resource Consumption

Quotas help control resource consumption and costs in AI applications. Let’s explore strategies for implementing and managing quotas effectively.

Quota System Design

from dataclasses import dataclass, field
from typing import Dict, Optional
from datetime import datetime, timedelta
from enum import Enum
import threading

class QuotaPeriod(Enum):
    """Length of the recurring window over which a quota's usage is counted."""

    HOURLY = "hourly"
    DAILY = "daily"
    WEEKLY = "weekly"
    MONTHLY = "monthly"

@dataclass
class Quota:
    """A named usage limit that applies over one recurring period.

    Attributes:
        name: Identifier the quota is registered and looked up under.
        limit: Maximum amount that may be used within one period.
        period: Window after which the accumulated usage starts over.
        resource_type: What is being counted: "requests", "tokens" or "cost".
    """

    name: str
    limit: int
    period: QuotaPeriod
    resource_type: str

@dataclass
class QuotaUsage:
    """Running usage counter for one quota within its current period.

    Attributes:
        quota: The quota definition this counter is measured against.
        used: Amount consumed so far in the current period.
        period_start: When the current period began; defaults to the moment
            the counter is created.
    """

    quota: Quota
    used: int = 0
    period_start: datetime = field(default_factory=datetime.now)

    def is_exceeded(self) -> bool:
        """Return True once usage has reached or passed the limit."""
        return self.used >= self.quota.limit

    def remaining(self) -> int:
        """Return the unused amount, never less than zero."""
        return max(0, self.quota.limit - self.used)

    def percentage_used(self) -> float:
        """Return usage as a percentage of the limit.

        A quota whose limit is zero (or negative) has no capacity at all, so
        it is reported as 100% used rather than dividing by zero. That agrees
        with is_exceeded() (True) and remaining() (0) for the same quota.
        """
        limit = self.quota.limit
        if limit <= 0:
            return 100.0
        return (self.used / limit) * 100

class QuotaManager:
    """Registry of quota definitions plus per-entity usage counters.

    Both dictionaries are guarded by a single ``threading.Lock``. That lock is
    not reentrant, so every public method acquires it exactly once and the
    underscore-prefixed helpers expect the caller to already hold it.
    """

    def __init__(self):
        self.quotas: Dict[str, Quota] = {}  # quota_name -> definition
        self.usage: Dict[str, Dict[str, QuotaUsage]] = {}  # entity_id -> quota_name -> usage
        self._lock = threading.Lock()

    def define_quota(self, quota: Quota):
        """Register ``quota`` under its name, replacing any earlier definition."""
        with self._lock:
            self.quotas[quota.name] = quota

    def get_usage(self, entity_id: str, quota_name: str) -> Optional[QuotaUsage]:
        """Return the usage record for an entity, or None if none exists yet.

        This is a plain read: it neither creates a record nor rolls the
        period over.
        """
        with self._lock:
            return self.usage.get(entity_id, {}).get(quota_name)

    def check_quota(self, entity_id: str, quota_name: str, amount: int = 1) -> tuple[bool, str]:
        """Report whether ``amount`` more units fit within the quota.

        Nothing is consumed. The usage record is created on first use and
        reset first if its period has elapsed.

        Returns:
            ``(allowed, message)``. An undefined quota is always allowed.
        """
        with self._lock:
            usage = self._usage_for(entity_id, quota_name)
            if usage is None:
                return True, "No quota defined"

            quota = usage.quota
            if usage.used + amount > quota.limit:
                return False, f"Quota exceeded: {usage.used}/{quota.limit} {quota.resource_type}"

            return True, f"Quota available: {usage.remaining()} remaining"

    def consume(self, entity_id: str, quota_name: str, amount: int = 1):
        """Add ``amount`` to the entity's usage of ``quota_name``.

        The usage record is created on first use. Consuming against a quota
        that was never defined is a no-op, mirroring check_quota(), which
        treats such quotas as unlimited.
        """
        with self._lock:
            # Initialise through the lock-free helper: calling check_quota()
            # here would try to take self._lock a second time and block forever.
            usage = self._usage_for(entity_id, quota_name)
            if usage is None:
                return
            usage.used += amount

    def _usage_for(self, entity_id: str, quota_name: str) -> Optional[QuotaUsage]:
        """Return the up-to-date usage record, creating it if needed.

        Caller must hold ``self._lock``. Returns None, without creating
        anything, when ``quota_name`` has not been defined.
        """
        quota = self.quotas.get(quota_name)
        if quota is None:
            return None

        per_entity = self.usage.setdefault(entity_id, {})
        usage = per_entity.get(quota_name)
        if usage is None:
            usage = per_entity[quota_name] = QuotaUsage(quota=quota)
        elif usage.quota is not quota:
            # The quota was redefined after this record was created; point the
            # record at the current definition so remaining() and the usage
            # reports use the same limit as check_quota().
            usage.quota = quota

        self._maybe_reset_period(usage)
        return usage

    def _maybe_reset_period(self, usage: QuotaUsage):
        """Zero the counter and restart the period once the window has elapsed.

        Windows are fixed durations measured from ``period_start``; a month is
        counted as 30 days.
        """
        window = {
            QuotaPeriod.HOURLY: timedelta(hours=1),
            QuotaPeriod.DAILY: timedelta(days=1),
            QuotaPeriod.WEEKLY: timedelta(weeks=1),
            QuotaPeriod.MONTHLY: timedelta(days=30),
        }.get(usage.quota.period)

        now = datetime.now()
        if window is not None and (now - usage.period_start) > window:
            usage.used = 0
            usage.period_start = now

    def get_all_usage(self, entity_id: str) -> Dict[str, dict]:
        """Return a plain-dict summary of every usage record for an entity.

        The result maps quota name to ``used``, ``limit``, ``remaining``,
        ``percentage`` and ``period``; it is empty for an unknown entity.
        """
        with self._lock:
            return {
                name: {
                    "used": u.used,
                    "limit": u.quota.limit,
                    "remaining": u.remaining(),
                    "percentage": u.percentage_used(),
                    "period": u.quota.period.value
                }
                for name, u in self.usage.get(entity_id, {}).items()
            }

# Usage: one shared manager with three example quotas registered on it.
quota_manager = QuotaManager()

# Request and token budgets that start over every day.
quota_manager.define_quota(
    Quota("daily_requests", 1000, QuotaPeriod.DAILY, "requests")
)
quota_manager.define_quota(
    Quota("daily_tokens", 1_000_000, QuotaPeriod.DAILY, "tokens")
)

# Monthly spend cap, tracked in cents: 10000 cents is $100.
quota_manager.define_quota(
    Quota("monthly_cost", 10000, QuotaPeriod.MONTHLY, "cost")
)

User-Level Quotas

@dataclass
class UserTier:
    """Limits and model access granted to users on one subscription tier.

    Attributes:
        name: Tier identifier.
        daily_requests: Requests allowed per day.
        daily_tokens: Tokens allowed per day.
        monthly_cost_cents: Monthly spend figure for the tier, in cents.
        max_tokens_per_request: Largest token count a single request may use.
        allowed_models: Model names users on this tier may call.
    """

    name: str
    daily_requests: int
    daily_tokens: int
    monthly_cost_cents: int
    max_tokens_per_request: int
    allowed_models: list[str]

# Tier catalogue keyed by tier name; each key is taken from the tier itself so
# the two can never drift apart.
USER_TIERS = {
    tier.name: tier
    for tier in (
        UserTier(
            name="free",
            daily_requests=50,
            daily_tokens=50_000,
            monthly_cost_cents=0,  # Covered by platform
            max_tokens_per_request=1000,
            allowed_models=["gpt-4o-mini"],
        ),
        UserTier(
            name="basic",
            daily_requests=500,
            daily_tokens=500_000,
            monthly_cost_cents=2000,  # $20
            max_tokens_per_request=4000,
            allowed_models=["gpt-4o-mini", "gpt-4o"],
        ),
        UserTier(
            name="pro",
            daily_requests=5000,
            daily_tokens=5_000_000,
            monthly_cost_cents=10000,  # $100
            max_tokens_per_request=8000,
            allowed_models=["gpt-4o-mini", "gpt-4o", "o1-preview"],
        ),
    )
}

class UserQuotaManager:
    """Apply tier-based request and token limits to individual users.

    Each user gets two daily quotas, named ``{user_id}_daily_requests`` and
    ``{user_id}_daily_tokens``, sized from their tier in ``USER_TIERS``. Users
    that were never given a tier are enrolled in "free" the first time they
    are seen, so the free-tier limits are actually enforced for them.
    """

    def __init__(self):
        self.quota_manager = QuotaManager()
        self.user_tiers: Dict[str, str] = {}  # user_id -> tier_name

    def set_user_tier(self, user_id: str, tier_name: str):
        """Assign a tier to the user and (re)define their daily quotas.

        Raises:
            ValueError: If ``tier_name`` is not a key of ``USER_TIERS``.
        """
        if tier_name not in USER_TIERS:
            raise ValueError(f"Unknown tier: {tier_name}")

        self.user_tiers[user_id] = tier_name
        tier = USER_TIERS[tier_name]

        # Define user's quotas based on tier
        self.quota_manager.define_quota(Quota(
            name=f"{user_id}_daily_requests",
            limit=tier.daily_requests,
            period=QuotaPeriod.DAILY,
            resource_type="requests"
        ))

        self.quota_manager.define_quota(Quota(
            name=f"{user_id}_daily_tokens",
            limit=tier.daily_tokens,
            period=QuotaPeriod.DAILY,
            resource_type="tokens"
        ))

    def _tier_for(self, user_id: str) -> UserTier:
        """Return the user's tier, enrolling unknown users in "free" first."""
        if user_id not in self.user_tiers:
            # Without this the user would be judged by the free tier's model
            # and size rules but have no quotas defined, and an undefined
            # quota is always allowed.
            self.set_user_tier(user_id, "free")
        return USER_TIERS[self.user_tiers[user_id]]

    def check_request(self, user_id: str, model: str,
                     estimated_tokens: int) -> tuple[bool, str]:
        """Decide whether the user may make this request.

        Checks, in order: model access, per-request token cap, daily request
        quota, daily token quota. Nothing is consumed.

        Returns:
            ``(allowed, message)``; the message names the first failed check.
        """
        tier = self._tier_for(user_id)
        tier_name = self.user_tiers[user_id]

        # Check model access
        if model not in tier.allowed_models:
            return False, f"Model {model} not available in {tier_name} tier"

        # Check token limit per request
        if estimated_tokens > tier.max_tokens_per_request:
            return False, f"Request exceeds max tokens ({tier.max_tokens_per_request})"

        # Check daily request quota
        allowed, msg = self.quota_manager.check_quota(
            user_id, f"{user_id}_daily_requests"
        )
        if not allowed:
            return False, msg

        # Check daily token quota
        allowed, msg = self.quota_manager.check_quota(
            user_id, f"{user_id}_daily_tokens", estimated_tokens
        )
        if not allowed:
            return False, msg

        return True, "Request allowed"

    def record_usage(self, user_id: str, tokens_used: int, cost_cents: int):
        """Record one completed request and the tokens it used.

        NOTE(review): ``cost_cents`` is accepted but not recorded against any
        quota; set_user_tier defines no per-user cost quota even though each
        tier carries ``monthly_cost_cents``.
        """
        self._tier_for(user_id)

        charges = (
            (f"{user_id}_daily_requests", 1),
            (f"{user_id}_daily_tokens", tokens_used),
        )
        for quota_name, amount in charges:
            # A zero-amount check creates the usage record if this is the
            # first touch, so consume() only ever increments an existing one.
            self.quota_manager.check_quota(user_id, quota_name, 0)
            self.quota_manager.consume(user_id, quota_name, amount)

Quota Enforcement Middleware

from functools import wraps

class QuotaExceededError(Exception):
    """Signals that a request was refused by a quota check.

    Attributes:
        quota_name: Label supplied by the raiser for what was exceeded.
        usage: Usage snapshot supplied by the raiser.
    """

    def __init__(self, message: str, quota_name: str, usage: dict):
        super().__init__(message)
        self.usage = usage
        self.quota_name = quota_name

def enforce_quota(user_quota_manager: UserQuotaManager, *,
                  chars_per_token: int = 4,
                  completion_token_reserve: int = 1000):
    """Build a decorator that gates a model call behind the user's quotas.

    The wrapped function is called as ``func(user_id, prompt, model, ...)``
    and must return a dict. Before the call the token cost is estimated as
    ``len(prompt) // chars_per_token + completion_token_reserve`` and passed
    to ``check_request``; after the call the actual usage is recorded.

    Args:
        user_quota_manager: Manager used for the check and the recording.
        chars_per_token: Characters assumed per prompt token in the estimate.
        completion_token_reserve: Tokens added to the estimate for the reply.
            With the default of 1000 the estimate is never below 1000, so a
            tier whose per-request cap is 1000 rejects any prompt of
            ``chars_per_token`` characters or more; pass a smaller reserve
            for such tiers.

    Raises:
        ValueError: If ``chars_per_token`` is not positive.
        QuotaExceededError: From the wrapper, when the check refuses the
            request; it carries the user's current usage summary.
    """
    if chars_per_token <= 0:
        raise ValueError("chars_per_token must be positive")

    def decorator(func):
        @wraps(func)
        def wrapper(user_id: str, prompt: str, model: str = "gpt-4o", *args, **kwargs):
            # Estimate tokens
            estimated_tokens = len(prompt) // chars_per_token + completion_token_reserve

            # Check quota
            allowed, message = user_quota_manager.check_request(
                user_id, model, estimated_tokens
            )

            if not allowed:
                usage = user_quota_manager.quota_manager.get_all_usage(user_id)
                raise QuotaExceededError(message, "request", usage)

            # Execute
            result = func(user_id, prompt, model, *args, **kwargs)

            # Prefer the token count reported in the result; fall back to the
            # estimate when the result has no usage info (missing or None).
            reported = result.get("usage") or {}
            tokens_used = reported.get("total_tokens", estimated_tokens)
            # calculate_cost_cents is not defined in this snippet; it must be
            # provided elsewhere in the module.
            cost_cents = calculate_cost_cents(model, tokens_used)
            user_quota_manager.record_usage(user_id, tokens_used, cost_cents)

            return result

        return wrapper
    return decorator

# Usage: enroll an example user on the "basic" tier before any calls are made.
user_quota_mgr = UserQuotaManager()
user_quota_mgr.set_user_tier(user_id="user123", tier_name="basic")

@enforce_quota(user_quota_mgr)
def generate_response(user_id: str, prompt: str, model: str = "gpt-4o") -> dict:
    """Send ``prompt`` as a single user message and return text plus usage.

    ``user_id`` is not used in the body; the quota decorator reads it.
    ``client`` is not defined in this snippet - presumably an OpenAI-style
    chat client created elsewhere; confirm before running.

    Returns:
        A dict with ``content`` (text of the first choice) and ``usage``
        (the response's usage object dumped to a dict).
    """
    messages = [{"role": "user", "content": prompt}]
    response = client.chat.completions.create(model=model, messages=messages)
    first_choice = response.choices[0]
    return {
        "content": first_choice.message.content,
        "usage": response.usage.model_dump(),
    }

# The decorator runs the quota checks first and raises instead of calling the
# wrapped function when a check fails.
try:
    result = generate_response("user123", "Hello, world!")
except QuotaExceededError as quota_error:
    print(f"Quota exceeded: {quota_error}")
    print(f"Current usage: {quota_error.usage}")

Quota Alerts

class QuotaAlertManager:
    """Produce one alert per usage threshold per quota period.

    Thresholds are fractions of the quota limit. Each call to
    check_and_alert() reports at most one alert: the highest threshold that
    has been crossed since the last alert for that entity and quota.
    """

    def __init__(self, quota_manager: QuotaManager):
        self.quota_manager = quota_manager
        self.alert_thresholds = [0.5, 0.75, 0.9, 1.0]  # 50%, 75%, 90%, 100%
        self.sent_alerts: Dict[str, set] = {}  # "entity_id:quota_name" -> thresholds already alerted
        self._period_starts: Dict[str, datetime] = {}  # "entity_id:quota_name" -> period last seen

    def check_and_alert(self, entity_id: str, quota_name: str) -> Optional[dict]:
        """Return an alert if usage crossed a threshold not yet reported.

        When several thresholds are crossed at once (for example usage jumps
        from 40% to 95%), only the highest is reported and the lower ones are
        marked as sent, so the most severe alert is never delayed behind less
        severe ones.

        Returns:
            The alert dict, or None if there is no usage record or no new
            threshold has been crossed.
        """
        usage = self.quota_manager.get_usage(entity_id, quota_name)
        if usage is None:
            return None

        limit = usage.quota.limit
        # A quota with no capacity counts as fully used; this also avoids
        # dividing by zero.
        fraction = usage.used / limit if limit > 0 else 1.0

        key = f"{entity_id}:{quota_name}"
        sent = self.sent_alerts.setdefault(key, set())

        # A changed period_start means the usage counter was reset, so the
        # thresholds reported during the previous period may fire again.
        last_start = self._period_starts.get(key)
        if last_start is not None and last_start != usage.period_start:
            sent.clear()
        self._period_starts[key] = usage.period_start

        newly_crossed = [
            threshold
            for threshold in self.alert_thresholds
            if fraction >= threshold and threshold not in sent
        ]
        if not newly_crossed:
            return None

        sent.update(newly_crossed)
        threshold = max(newly_crossed)

        return {
            "entity_id": entity_id,
            "quota_name": quota_name,
            "threshold": threshold,
            "current_percentage": fraction * 100,
            "used": usage.used,
            "limit": limit,
            "remaining": usage.remaining(),
            "severity": self._get_severity(threshold)
        }

    def _get_severity(self, threshold: float) -> str:
        """Map a threshold fraction to "critical", "warning", "notice" or "info"."""
        if threshold >= 1.0:
            return "critical"
        elif threshold >= 0.9:
            return "warning"
        elif threshold >= 0.75:
            return "notice"
        else:
            return "info"

    def reset_alerts(self, entity_id: str, quota_name: str):
        """Forget which thresholds were already reported for this quota."""
        key = f"{entity_id}:{quota_name}"
        if key in self.sent_alerts:
            self.sent_alerts[key].clear()

# Integration with notification system
async def notify_quota_alert(alert: dict):
    """Route an alert to a channel chosen by its severity.

    "critical" alerts are emailed to the admin address and "warning" alerts
    are posted to the #alerts Slack channel; any other severity is ignored.
    send_email, get_admin_email and send_slack_message are not defined in this
    snippet and must be provided elsewhere.
    """
    severity = alert["severity"]

    if severity == "critical":
        # Send immediate alert
        await send_email(
            to=get_admin_email(),
            subject=f"CRITICAL: Quota exceeded for {alert['entity_id']}",
            body=f"Quota {alert['quota_name']} has reached {alert['current_percentage']:.1f}%"
        )
        return

    if severity == "warning":
        # Send warning
        await send_slack_message(
            channel="#alerts",
            message=f"Warning: {alert['entity_id']} quota at {alert['current_percentage']:.1f}%"
        )

Effective quota management protects both your users and your budget. Implement quotas at multiple levels — per request, per user, and per organization — to maintain control over resource consumption.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.