
LLM Observability Tools: Comparing the Landscape

The LLM observability space has exploded with specialized tools. Let’s compare the major players and understand when to use each.

Tool Categories

from dataclasses import dataclass
from typing import List, Dict
from enum import Enum

class ObservabilityCategory(Enum):
    TRACING = "tracing"
    EVALUATION = "evaluation"
    MONITORING = "monitoring"
    DEBUGGING = "debugging"
    ANALYTICS = "analytics"

@dataclass
class ObservabilityTool:
    name: str
    categories: List[ObservabilityCategory]
    pricing: str  # "free", "freemium", "paid"
    self_hosted: bool
    cloud_hosted: bool
    key_features: List[str]
    integrations: List[str]
    best_for: str

TOOLS = [
    ObservabilityTool(
        name="LangSmith",
        categories=[ObservabilityCategory.TRACING, ObservabilityCategory.DEBUGGING, ObservabilityCategory.EVALUATION],
        pricing="freemium",
        self_hosted=False,
        cloud_hosted=True,
        key_features=[
            "LangChain native integration",
            "Prompt playground",
            "Dataset management",
            "Automated testing"
        ],
        integrations=["LangChain", "LangGraph", "OpenAI", "Anthropic"],
        best_for="LangChain-based applications"
    ),
    ObservabilityTool(
        name="Weights & Biases",
        categories=[ObservabilityCategory.MONITORING, ObservabilityCategory.ANALYTICS, ObservabilityCategory.EVALUATION],
        pricing="freemium",
        self_hosted=True,
        cloud_hosted=True,
        key_features=[
            "Experiment tracking",
            "Model versioning",
            "Prompt management",
            "Team collaboration"
        ],
        integrations=["OpenAI", "Anthropic", "HuggingFace", "LangChain"],
        best_for="ML teams with existing W&B usage"
    ),
    ObservabilityTool(
        name="Phoenix (Arize)",
        categories=[ObservabilityCategory.TRACING, ObservabilityCategory.DEBUGGING],
        pricing="free",
        self_hosted=True,
        cloud_hosted=False,
        key_features=[
            "Local-first development",
            "OpenTelemetry compatible",
            "Embedding visualization",
            "No data leaves your system"
        ],
        integrations=["OpenAI", "LangChain", "LlamaIndex", "OpenTelemetry"],
        best_for="Privacy-conscious teams"
    ),
    ObservabilityTool(
        name="Arize AI",
        categories=[ObservabilityCategory.MONITORING, ObservabilityCategory.EVALUATION, ObservabilityCategory.ANALYTICS],
        pricing="paid",
        self_hosted=False,
        cloud_hosted=True,
        key_features=[
            "Production monitoring",
            "Drift detection",
            "Performance analytics",
            "Alerting"
        ],
        integrations=["OpenAI", "Anthropic", "AWS Bedrock", "Azure OpenAI"],
        best_for="Production ML monitoring"
    ),
    ObservabilityTool(
        name="Helicone",
        categories=[ObservabilityCategory.MONITORING, ObservabilityCategory.ANALYTICS],
        pricing="freemium",
        self_hosted=True,
        cloud_hosted=True,
        key_features=[
            "Proxy-based (no code changes)",
            "Cost tracking",
            "Rate limiting",
            "Caching"
        ],
        integrations=["OpenAI", "Anthropic", "Azure OpenAI", "Any LLM API"],
        best_for="Quick setup, cost management"
    )
]

def recommend_tool(requirements: Dict) -> List[ObservabilityTool]:
    """Recommend tools based on requirements"""
    recommendations = []

    for tool in TOOLS:
        score = 0

        # Check categories
        if requirements.get("categories"):
            matching = set(tool.categories) & set(requirements["categories"])
            score += len(matching) * 2

        # Check pricing
        if requirements.get("free_only") and tool.pricing == "free":
            score += 3
        elif requirements.get("budget_friendly") and tool.pricing in ["free", "freemium"]:
            score += 2

        # Check hosting
        if requirements.get("self_hosted") and tool.self_hosted:
            score += 2
        if requirements.get("cloud_hosted") and tool.cloud_hosted:
            score += 1

        # Check integrations
        if requirements.get("integrations"):
            matching = set(tool.integrations) & set(requirements["integrations"])
            score += len(matching)

        if score > 0:
            recommendations.append((tool, score))

    return [t[0] for t in sorted(recommendations, key=lambda x: x[1], reverse=True)]
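
For example, a privacy-conscious team that wants free, self-hosted tracing and debugging with LangChain support could query the helper like this (the requirement keys are the ones recommend_tool checks above):

# Example requirements profile
requirements = {
    "categories": [ObservabilityCategory.TRACING, ObservabilityCategory.DEBUGGING],
    "free_only": True,
    "self_hosted": True,
    "integrations": ["LangChain", "OpenTelemetry"],
}

for tool in recommend_tool(requirements):
    print(tool.name, "-", tool.best_for)

# With the catalogue above, Phoenix (Arize) ranks first for this profile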

Integration Examples

LangSmith Integration

# pip install langsmith langchain langchain-openai

from langchain_openai import ChatOpenAI

# Set environment variables before running:
# LANGCHAIN_TRACING_V2=true
# LANGCHAIN_API_KEY=your_key
# LANGCHAIN_PROJECT=my-project

llm = ChatOpenAI(model="gpt-4o")

# With tracing enabled via the environment variables, all LangChain calls
# are sent to LangSmith automatically; no callback wiring is required
response = llm.invoke("Hello, world!")
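
LangSmith can also trace code that doesn't go through LangChain. A minimal sketch using the traceable decorator from the langsmith SDK, assuming the same environment variables are set:

from langsmith import traceable
from openai import OpenAI

client = OpenAI()

@traceable  # logs inputs, outputs, latency, and errors to the configured project
def summarise(text: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": f"Summarise: {text}"}]
    )
    return response.choices[0].message.content

summarise("LLM observability tools compared.")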

Weights & Biases Integration

# pip install wandb weave

import weave

# Initialize Weave tracing (logs to the "llm-app" project in your W&B account)
weave.init("llm-app")

# Decorating a function with weave.op traces its inputs, outputs, and latency
@weave.op()
def call_llm(prompt: str) -> str:
    from openai import OpenAI
    client = OpenAI()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

# Calls are tracked in W&B
result = call_llm("What is machine learning?")

Phoenix/Arize Integration

# pip install arize-phoenix openinference-instrumentation-openai opentelemetry-exporter-otlp

import phoenix as px
from openinference.instrumentation.openai import OpenAIInstrumentor
from opentelemetry import trace
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

# Start Phoenix locally (UI and collector on port 6006)
session = px.launch_app()

# Set up OpenTelemetry and export spans to the local Phoenix collector
tracer_provider = TracerProvider()
tracer_provider.add_span_processor(
    SimpleSpanProcessor(OTLPSpanExporter(endpoint="http://localhost:6006/v1/traces"))
)
trace.set_tracer_provider(tracer_provider)

# Instrument OpenAI so every client call emits a trace
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)

# Now all OpenAI calls are traced
from openai import OpenAI
client = OpenAI()

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hello!"}]
)

# View traces at http://localhost:6006

Helicone Integration

import os

from openai import OpenAI

# Route requests through the Helicone proxy: change the base URL and add an auth header
client = OpenAI(
    base_url="https://oai.helicone.ai/v1",
    default_headers={
        "Helicone-Auth": f"Bearer {os.environ['HELICONE_API_KEY']}"
    }
)

# All calls are now logged through Helicone
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hello!"}]
)
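
Helicone also accepts per-request metadata as extra headers, which makes it easy to slice cost and usage by user or feature in the dashboard. A sketch using the OpenAI SDK's extra_headers parameter; the Helicone-User-Id and Helicone-Property-* names follow Helicone's custom-properties convention, and the property name below is purely illustrative:

# Tag an individual request so it can be filtered in the Helicone dashboard
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hello!"}],
    extra_headers={
        "Helicone-User-Id": "user-123",                 # attribute cost to a user
        "Helicone-Property-Feature": "onboarding-chat"  # custom property (assumed name)
    }
)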

Building Your Own Observability

import json
import os
from datetime import datetime
from typing import Dict, List

class LightweightLLMObserver:
    """Simple file-based observability for small teams"""

    def __init__(self, storage_path: str = "./llm_logs"):
        self.storage_path = storage_path
        os.makedirs(storage_path, exist_ok=True)

    def log_call(self, model: str, messages: List[Dict],
                 response: str, usage: Dict, latency_ms: float):
        """Log a single LLM call to a daily JSONL file"""

        entry = {
            "timestamp": datetime.utcnow().isoformat(),
            "model": model,
            "messages": messages,
            "response": response,
            "usage": usage,
            "latency_ms": latency_ms,
            "cost_usd": self._calculate_cost(model, usage)
        }

        # Append to daily log file
        date_str = datetime.utcnow().strftime("%Y-%m-%d")
        log_file = f"{self.storage_path}/calls_{date_str}.jsonl"

        with open(log_file, "a") as f:
            f.write(json.dumps(entry) + "\n")

    def get_daily_summary(self, date_str: str = None) -> Dict:
        """Get summary for a day"""

        date_str = date_str or datetime.utcnow().strftime("%Y-%m-%d")
        log_file = f"{self.storage_path}/calls_{date_str}.jsonl"

        if not os.path.exists(log_file):
            return {"error": "No logs for this date"}

        entries = []
        with open(log_file, "r") as f:
            for line in f:
                entries.append(json.loads(line.strip()))

        return {
            "date": date_str,
            "total_calls": len(entries),
            "total_tokens": sum(e["usage"].get("total_tokens", 0) for e in entries),
            "total_cost_usd": sum(e.get("cost_usd", 0) for e in entries),
            "avg_latency_ms": sum(e["latency_ms"] for e in entries) / len(entries) if entries else 0,
            "by_model": self._group_by_model(entries)
        }

    def _calculate_cost(self, model: str, usage: Dict) -> float:
        # USD per 1M tokens: (input rate, output rate); unknown models fall back to gpt-4o rates
        pricing = {
            "gpt-4o": (2.50, 10.00),
            "gpt-4o-mini": (0.15, 0.60),
        }
        input_rate, output_rate = pricing.get(model, (2.50, 10.00))
        input_tokens = usage.get("prompt_tokens", 0)
        output_tokens = usage.get("completion_tokens", 0)
        return (input_tokens * input_rate + output_tokens * output_rate) / 1_000_000

    def _group_by_model(self, entries: List[Dict]) -> Dict:
        by_model = {}
        for e in entries:
            model = e["model"]
            if model not in by_model:
                by_model[model] = {"calls": 0, "tokens": 0, "cost": 0}
            by_model[model]["calls"] += 1
            by_model[model]["tokens"] += e["usage"].get("total_tokens", 0)
            by_model[model]["cost"] += e.get("cost_usd", 0)
        return by_model
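
A hypothetical end-to-end usage, timing an OpenAI call and logging it through the observer (the observer and client names are illustrative):

import time
from openai import OpenAI

observer = LightweightLLMObserver()
client = OpenAI()

messages = [{"role": "user", "content": "Explain vector databases in one sentence."}]

start = time.perf_counter()
response = client.chat.completions.create(model="gpt-4o", messages=messages)
latency_ms = (time.perf_counter() - start) * 1000

observer.log_call(
    model="gpt-4o",
    messages=messages,
    response=response.choices[0].message.content,
    usage=response.usage.model_dump(),
    latency_ms=latency_ms
)

print(observer.get_daily_summary())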

Choose your observability tools based on your team size, budget, privacy requirements, and existing tooling. Start simple and add more sophisticated tools as your needs grow.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.