
Monitoring AI Applications with Azure Monitor and Application Insights

Effective monitoring is essential for production AI applications. Azure Monitor and Application Insights provide comprehensive observability for tracking performance, detecting anomalies, and debugging issues in AI workloads.

Implementing Custom AI Metrics

Track AI-specific metrics in your application:

import time

from opentelemetry import metrics
from azure.monitor.opentelemetry import configure_azure_monitor

class AIMetricsTracker:
    def __init__(self, connection_string: str):
        configure_azure_monitor(connection_string=connection_string)

        self.meter = metrics.get_meter(__name__)

        # Define metrics
        self.inference_latency = self.meter.create_histogram(
            name="ai.inference.latency",
            description="Time taken for model inference",
            unit="ms"
        )

        self.token_usage = self.meter.create_counter(
            name="ai.token.usage",
            description="Total tokens consumed"
        )

        self.inference_count = self.meter.create_counter(
            name="ai.inference.count",
            description="Number of inference requests"
        )

        self.error_count = self.meter.create_counter(
            name="ai.error.count",
            description="Number of inference errors"
        )

    def record_inference(
        self,
        latency_ms: float,
        tokens_used: int,
        model_name: str,
        success: bool
    ):
        """Record metrics for an inference call."""

        attributes = {"model": model_name}

        self.inference_latency.record(latency_ms, attributes)
        self.token_usage.add(tokens_used, attributes)
        self.inference_count.add(1, attributes)

        if not success:
            self.error_count.add(1, attributes)
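
A minimal usage sketch follows; the connection string is a placeholder and would normally come from the APPLICATIONINSIGHTS_CONNECTION_STRING environment variable or app configuration:

tracker = AIMetricsTracker(connection_string="<application-insights-connection-string>")

# Record a single (hypothetical) inference call
tracker.record_inference(
    latency_ms=845.0,
    tokens_used=512,
    model_name="gpt-4",
    success=True
)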

Creating a Monitored AI Service

Wrap AI calls with comprehensive tracking:

import logging
import time

from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode

tracer = trace.get_tracer(__name__)
logger = logging.getLogger(__name__)

class MonitoredAIService:
    def __init__(self, openai_client, metrics_tracker: AIMetricsTracker):
        self.client = openai_client
        self.metrics = metrics_tracker

    def generate_completion(self, prompt: str, model: str = "gpt-4") -> dict:
        """Generate completion with full observability."""

        with tracer.start_as_current_span("ai_completion") as span:
            span.set_attribute("ai.model", model)
            span.set_attribute("ai.prompt_length", len(prompt))

            start_time = time.time()
            success = True

            try:
                response = self.client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": prompt}]
                )

                span.set_attribute("ai.completion_tokens", response.usage.completion_tokens)
                span.set_attribute("ai.prompt_tokens", response.usage.prompt_tokens)
                span.set_status(Status(StatusCode.OK))

                result = {
                    "content": response.choices[0].message.content,
                    "tokens": response.usage.total_tokens
                }

            except Exception as e:
                success = False
                span.set_status(Status(StatusCode.ERROR, str(e)))
                logger.error(f"AI completion failed: {e}")
                raise

            finally:
                latency_ms = (time.time() - start_time) * 1000
                tokens = response.usage.total_tokens if success else 0

                self.metrics.record_inference(
                    latency_ms=latency_ms,
                    tokens_used=tokens,
                    model_name=model,
                    success=success
                )

            return result
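
Wiring the two pieces together might look like the sketch below; the OpenAI() client construction assumes the openai v1 SDK with an API key in the environment, and tracker is the AIMetricsTracker instance created earlier:

from openai import OpenAI

service = MonitoredAIService(
    openai_client=OpenAI(),
    metrics_tracker=tracker
)

result = service.generate_completion("Summarise the key risks in this incident report.")
print(result["content"])
print(f"Tokens used: {result['tokens']}")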

Setting Up Alerts

Configure alerts for latency spikes, error rate increases, and token usage anomalies to catch issues before they impact users.
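
Alert rules can be backed either by metric thresholds or by scheduled log queries over the custom metrics emitted above. As a sketch, assuming a workspace-based Application Insights resource (where OpenTelemetry metrics surface in the AppMetrics table) and the azure-monitor-query and azure-identity packages, this is the kind of query a scheduled alert rule could evaluate; running it ad hoc first helps validate the threshold:

from datetime import timedelta

from azure.identity import DefaultAzureCredential
from azure.monitor.query import LogsQueryClient

client = LogsQueryClient(DefaultAzureCredential())

# Errors per 5-minute bin for the custom counter emitted above --
# a scheduled query alert can fire whenever any bin crosses the threshold.
query = """
AppMetrics
| where Name == "ai.error.count"
| summarize errors = sum(Sum) by bin(TimeGenerated, 5m)
| where errors > 10
"""

# "<log-analytics-workspace-id>" is a placeholder for your workspace GUID
result = client.query_workspace(
    "<log-analytics-workspace-id>",
    query,
    timespan=timedelta(hours=1),
)

for table in result.tables:
    for row in table.rows:
        print(row)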

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.