Implementing Observability for AI Applications with OpenTelemetry
Traditional observability tools weren’t designed for AI workloads. In 2025, we developed new patterns for monitoring LLM applications effectively. Here’s how to implement comprehensive AI observability.
Beyond the Three Pillars
Beyond standard metrics, traces, and logs, AI applications need:
- Token usage tracking
- Latency by model/operation
- Response quality metrics
- Cost attribution
OpenTelemetry Setup
from opentelemetry import trace, metrics
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter

# Initialize providers
trace.set_tracer_provider(TracerProvider())
trace.get_tracer_provider().add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint="your-collector:4317"))
)

meter_provider = MeterProvider(
    metric_readers=[PeriodicExportingMetricReader(
        OTLPMetricExporter(endpoint="your-collector:4317"),
        export_interval_millis=60000
    )]
)
metrics.set_meter_provider(meter_provider)

tracer = trace.get_tracer("ai-service")
meter = metrics.get_meter("ai-service")
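One thing this snippet skips: attaching a Resource, so every span and metric carries a service name your backend can group by. If I were wiring this up, I'd fold the resource into the initialization above rather than running both; here's that variant as a minimal sketch, with placeholder attribute values.

from opentelemetry.sdk.resources import Resource

# Placeholder attributes; set these to match your deployment
resource = Resource.create({
    "service.name": "ai-service",
    "deployment.environment": "production",
})

# Same providers as above, now tagged with the resource
trace.set_tracer_provider(TracerProvider(resource=resource))
trace.get_tracer_provider().add_span_processor(
    BatchSpanProcessor(OTLPSpanExporter(endpoint="your-collector:4317"))
)
metrics.set_meter_provider(MeterProvider(
    resource=resource,
    metric_readers=[PeriodicExportingMetricReader(
        OTLPMetricExporter(endpoint="your-collector:4317"),
        export_interval_millis=60000
    )]
))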
Custom AI Metrics
# Define AI-specific metrics
token_counter = meter.create_counter(
    "ai.tokens.total",
    description="Total tokens used",
    unit="tokens"
)

latency_histogram = meter.create_histogram(
    "ai.completion.latency",
    description="LLM completion latency",
    unit="ms"
)

cost_counter = meter.create_counter(
    "ai.cost.total",
    description="Estimated AI cost",
    unit="usd"
)
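The checklist at the top also called out response quality, which none of the instruments above capture. A minimal sketch, assuming you already produce a 0–1 score per response (from an LLM-as-judge, a heuristic check, or user feedback); the instrument name ai.response.quality and the helper below are illustrative, not a standard convention.

quality_histogram = meter.create_histogram(
    "ai.response.quality",
    description="Scored response quality (0 = poor, 1 = ideal)",
    unit="1"
)

def record_quality(model: str, score: float, user_id: str):
    # The score itself comes from your own evaluation step
    quality_histogram.record(score, {"model": model, "user_id": user_id})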
class AIMetricsCollector:
    def __init__(self):
        # USD per 1K tokens (example rates; update as provider pricing changes)
        self.pricing = {
            "gpt-4": {"input": 0.03, "output": 0.06},
            "gpt-4o": {"input": 0.005, "output": 0.015},
            "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015}
        }

    def record_completion(
        self,
        model: str,
        input_tokens: int,
        output_tokens: int,
        latency_ms: float,
        user_id: str
    ):
        attributes = {"model": model, "user_id": user_id}

        # Record tokens
        token_counter.add(input_tokens, {**attributes, "direction": "input"})
        token_counter.add(output_tokens, {**attributes, "direction": "output"})

        # Record latency
        latency_histogram.record(latency_ms, attributes)

        # Calculate and record cost (prices above are per 1K tokens)
        pricing = self.pricing.get(model, {"input": 0.01, "output": 0.03})
        cost = (input_tokens * pricing["input"] + output_tokens * pricing["output"]) / 1000
        cost_counter.add(cost, attributes)
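A quick usage sketch: one collector instance shared across the service, fed with made-up numbers here. The traced_completion helper in the next section assumes this metrics_collector name exists at module level.

metrics_collector = AIMetricsCollector()

# Example: a gpt-4o call that used 1,200 input and 350 output tokens and took 840 ms
metrics_collector.record_completion(
    model="gpt-4o",
    input_tokens=1200,
    output_tokens=350,
    latency_ms=840.0,
    user_id="user-123",
)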
Tracing AI Operations
import time

from opentelemetry.trace import Status, StatusCode

# Assumes module-level `openai_client` (an AsyncOpenAI instance) and
# `metrics_collector` (the AIMetricsCollector defined above)
async def traced_completion(prompt: str, user_id: str, model: str = "gpt-4"):
    with tracer.start_as_current_span("llm.completion") as span:
        span.set_attribute("ai.model", model)
        span.set_attribute("ai.prompt.length", len(prompt))
        start_time = time.time()
        try:
            response = await openai_client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}]
            )

            # Record completion attributes
            span.set_attribute("ai.response.length", len(response.choices[0].message.content))
            span.set_attribute("ai.tokens.input", response.usage.prompt_tokens)
            span.set_attribute("ai.tokens.output", response.usage.completion_tokens)
            span.set_attribute("ai.finish_reason", response.choices[0].finish_reason)

            latency = (time.time() - start_time) * 1000
            metrics_collector.record_completion(
                model, response.usage.prompt_tokens,
                response.usage.completion_tokens, latency, user_id
            )
            return response
        except Exception as e:
            span.set_status(Status(StatusCode.ERROR, str(e)))
            span.record_exception(e)
            raise
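And to exercise it end to end, a sketch assuming the openai Python SDK v1+; the prompt and user id are placeholders.

import asyncio

from openai import AsyncOpenAI

openai_client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment

async def main():
    response = await traced_completion(
        "Summarize our observability setup in one sentence.",
        user_id="user-123",
    )
    print(response.choices[0].message.content)

asyncio.run(main())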
Dashboard Queries (KQL)
// Token usage by model over time
customMetrics
| where name == "ai.tokens.total"
| summarize TotalTokens = sum(value) by bin(timestamp, 1h), tostring(customDimensions.model)
| render timechart
// P95 latency by model
customMetrics
| where name == "ai.completion.latency"
| summarize P95 = percentile(value, 95) by tostring(customDimensions.model)
Observability is essential for AI applications. Without it, you’re flying blind on costs, performance, and quality. Implement these patterns early and iterate based on production insights.