Enterprise AI: Building Secure API Gateways for LLM Access

Exposing LLM capabilities across the enterprise requires robust API gateways that handle authentication, rate limiting, content filtering, and cost allocation. Azure API Management provides the foundation for secure, scalable LLM access.

Gateway Architecture

from dataclasses import dataclass, field
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
import hashlib
import re

@dataclass
class APIPolicy:
    rate_limit_per_minute: int
    daily_token_budget: int
    allowed_models: List[str]
    content_filtering_enabled: bool
    pii_detection_enabled: bool

@dataclass
class ClientContext:
    client_id: str
    department: str
    policy: APIPolicy
    tokens_used_today: int = 0
    requests_this_minute: int = 0
    minute_window_start: datetime = field(default_factory=datetime.now)

class LLMGateway:
    def __init__(self, backend_client, content_filter):
        self.backend_client = backend_client
        self.content_filter = content_filter
        # Client contexts keyed by a SHA-256 hash of the API key
        self.clients: Dict[str, ClientContext] = {}
        self.request_log: List[Dict] = []

    def register_client(self, api_key: str, client: ClientContext):
        """Register a client context under a hash of its API key."""
        key_hash = hashlib.sha256(api_key.encode()).hexdigest()
        self.clients[key_hash] = client

    def _authenticate(self, api_key: str) -> Optional[ClientContext]:
        """Resolve an API key to a registered client context."""
        key_hash = hashlib.sha256(api_key.encode()).hexdigest()
        return self.clients.get(key_hash)

    async def process_request(self, api_key: str, request: Dict) -> Dict:
        """Process LLM request through gateway policies."""

        # Authenticate and get client context
        client = self._authenticate(api_key)
        if not client:
            return {"error": "Invalid API key", "status": 401}

        # Rate limiting
        if not self._check_rate_limit(client):
            return {"error": "Rate limit exceeded", "status": 429}

        # Token budget check
        estimated_tokens = self._estimate_tokens(request)
        if client.tokens_used_today + estimated_tokens > client.policy.daily_token_budget:
            return {"error": "Daily token budget exceeded", "status": 429}

        # Model authorization
        model = request.get("model", "gpt-4o")
        if model not in client.policy.allowed_models:
            return {"error": f"Model {model} not authorized", "status": 403}

        # Content filtering
        if client.policy.content_filtering_enabled:
            filter_result = await self.content_filter.check(request.get("messages", []))
            if not filter_result["safe"]:
                self._log_blocked_request(client, request, filter_result)
                return {"error": "Content policy violation", "status": 400}

        # PII detection
        if client.policy.pii_detection_enabled:
            pii_result = await self._detect_pii(request)
            if pii_result["pii_detected"]:
                return {"error": "PII detected in request", "status": 400,
                        "details": pii_result["types"]}

        # Forward to the backend (assumes an AsyncOpenAI-compatible async client)
        response = await self.backend_client.chat.completions.create(**request)

        # Update usage tracking
        actual_tokens = response.usage.total_tokens
        client.tokens_used_today += actual_tokens

        # Log for billing
        self._log_request(client, request, response, actual_tokens)

        return {
            "response": response,
            "usage": {
                "tokens_used": actual_tokens,
                "daily_remaining": client.policy.daily_token_budget - client.tokens_used_today
            }
        }

    def _check_rate_limit(self, client: ClientContext) -> bool:
        """Check if the client is within its per-minute rate limit."""
        now = datetime.now()
        # Reset the counter once the one-minute window has elapsed
        if now - client.minute_window_start >= timedelta(minutes=1):
            client.minute_window_start = now
            client.requests_this_minute = 0
        client.requests_this_minute += 1
        return client.requests_this_minute <= client.policy.rate_limit_per_minute

    def _estimate_tokens(self, request: Dict) -> int:
        """Estimate token count for budget checking."""
        messages = request.get("messages", [])
        text = " ".join(m.get("content", "") for m in messages)
        return len(text) // 4  # Rough heuristic: ~4 characters per token for English text

    def _log_request(self, client: ClientContext, request: Dict,
                     response: Any, tokens: int):
        """Log request for billing and audit."""
        self.request_log.append({
            "timestamp": datetime.now().isoformat(),
            "client_id": client.client_id,
            "department": client.department,
            "model": request.get("model"),
            "tokens": tokens,
            "latency_ms": getattr(response, "_response_ms", 0)
        })

    def _log_blocked_request(self, client: ClientContext, request: Dict,
                             filter_result: Dict):
        """Record blocked requests for security auditing."""
        self.request_log.append({
            "timestamp": datetime.now().isoformat(),
            "client_id": client.client_id,
            "department": client.department,
            "model": request.get("model"),
            "tokens": 0,
            "blocked": True,
            "reason": filter_result.get("reason", "content_policy")
        })

    async def _detect_pii(self, request: Dict) -> Dict:
        """Scan message content for common PII patterns (a minimal heuristic)."""
        text = " ".join(m.get("content", "") for m in request.get("messages", []))
        types = []
        if re.search(r"[\w.+-]+@[\w-]+\.[\w.]+", text):
            types.append("email")
        if re.search(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b", text):
            types.append("phone")
        return {"pii_detected": bool(types), "types": types}
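
The gateway can be exercised end to end with a small driver. Here is a minimal sketch, assuming the openai package's AsyncOpenAI as the backend client (it expects an OPENAI_API_KEY environment variable); AllowAllFilter, the API key, and the policy values are hypothetical placeholders for illustration.

import asyncio
from openai import AsyncOpenAI

class AllowAllFilter:
    """Placeholder content filter; swap in Azure AI Content Safety or similar."""
    async def check(self, messages):
        return {"safe": True}

async def main():
    gateway = LLMGateway(AsyncOpenAI(), AllowAllFilter())
    gateway.register_client("secret-key-123", ClientContext(
        client_id="finance-app-01",
        department="Finance",
        policy=APIPolicy(
            rate_limit_per_minute=60,
            daily_token_budget=100_000,
            allowed_models=["gpt-4o", "gpt-4o-mini"],
            content_filtering_enabled=True,
            pii_detection_enabled=True,
        ),
    ))

    result = await gateway.process_request("secret-key-123", {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": "Summarise Q3 revenue drivers."}],
    })
    # Prints remaining budget on success, or the error payload on rejection
    print(result.get("usage") or result)

asyncio.run(main())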

Cost Allocation by Department

def generate_billing_report(gateway: LLMGateway, period_start: datetime) -> Dict:
    """Generate a department-level billing report for the given period."""
    costs_by_dept = {}

    for log in gateway.request_log:
        if log.get("blocked"):
            continue  # Blocked requests consumed no backend tokens
        if datetime.fromisoformat(log["timestamp"]) >= period_start:
            dept = log["department"]
            if dept not in costs_by_dept:
                costs_by_dept[dept] = {"tokens": 0, "requests": 0}

            costs_by_dept[dept]["tokens"] += log["tokens"]
            costs_by_dept[dept]["requests"] += 1

    return costs_by_dept
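
Token counts become chargeback figures once a per-token rate is applied. A sketch follows; the blended rate is an illustrative assumption, since real prices vary by model and deployment.

# Illustrative blended rate; actual pricing differs per model and region
COST_PER_1K_TOKENS = 0.01

def add_costs(report: Dict) -> Dict:
    """Attach an estimated dollar cost to each department's usage."""
    for dept, usage in report.items():
        usage["estimated_cost_usd"] = round(
            usage["tokens"] / 1000 * COST_PER_1K_TOKENS, 2)
    return report

report = add_costs(generate_billing_report(
    gateway, datetime.now() - timedelta(days=30)))
for dept, usage in report.items():
    print(f"{dept}: {usage['requests']} requests, {usage['tokens']} tokens, "
          f"~${usage['estimated_cost_usd']}")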

A well-designed gateway enables enterprise-wide AI adoption while maintaining security, compliance, and cost control.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.