Skip to content
Back to Blog
1 min read

LLM Observability Tools: Comparing the Landscape

I wrote “LLM Observability Tools: Comparing the Landscape” to share practical, production-minded guidance on this topic.

Tool Categories

from dataclasses import dataclass
from typing import List, Dict
from enum import Enum

class ObservabilityCategory(Enum):
    """Capability areas that an observability tool can be tagged with."""
    TRACING = "tracing"
    EVALUATION = "evaluation"
    MONITORING = "monitoring"
    DEBUGGING = "debugging"
    ANALYTICS = "analytics"

@dataclass
class ObservabilityTool:
    """Descriptive record for one LLM observability product."""
    # Display name of the product.
    name: str
    # Capability areas the tool covers.
    categories: List[ObservabilityCategory]
    # Pricing tier; the catalog uses "free", "freemium", or "paid".
    pricing: str  # "free", "freemium", "paid"
    # True when the tool is listed as available for self-hosting.
    self_hosted: bool
    # True when the tool is listed as available as a hosted cloud service.
    cloud_hosted: bool
    # Short human-readable feature highlights.
    key_features: List[str]
    # Names of frameworks/providers the tool is listed as integrating with.
    integrations: List[str]
    # One-line summary of the use case the tool suits best.
    best_for: str

# Catalog of tools compared in this article. The order matters: when two tools
# receive the same score, a stable sort keeps them in the order listed here.
TOOLS = [
    ObservabilityTool(
        name="LangSmith",
        categories=[ObservabilityCategory.TRACING, ObservabilityCategory.DEBUGGING, ObservabilityCategory.EVALUATION],
        pricing="freemium",
        self_hosted=False,
        cloud_hosted=True,
        key_features=[
            "LangChain native integration",
            "Prompt playground",
            "Dataset management",
            "Automated testing"
        ],
        integrations=["LangChain", "LangGraph", "OpenAI", "Anthropic"],
        best_for="LangChain-based applications"
    ),
    ObservabilityTool(
        name="Weights & Biases",
        categories=[ObservabilityCategory.MONITORING, ObservabilityCategory.ANALYTICS, ObservabilityCategory.EVALUATION],
        pricing="freemium",
        self_hosted=True,
        cloud_hosted=True,
        key_features=[
            "Experiment tracking",
            "Model versioning",
            "Prompt management",
            "Team collaboration"
        ],
        integrations=["OpenAI", "Anthropic", "HuggingFace", "LangChain"],
        best_for="ML teams with existing W&B usage"
    ),
    ObservabilityTool(
        name="Phoenix (Arize)",
        categories=[ObservabilityCategory.TRACING, ObservabilityCategory.DEBUGGING],
        pricing="free",
        self_hosted=True,
        cloud_hosted=False,
        key_features=[
            "Local-first development",
            "OpenTelemetry compatible",
            "Embedding visualization",
            "No data leaves your system"
        ],
        integrations=["OpenAI", "LangChain", "LlamaIndex", "OpenTelemetry"],
        best_for="Privacy-conscious teams"
    ),
    ObservabilityTool(
        name="Arize AI",
        categories=[ObservabilityCategory.MONITORING, ObservabilityCategory.EVALUATION, ObservabilityCategory.ANALYTICS],
        pricing="paid",
        self_hosted=False,
        cloud_hosted=True,
        key_features=[
            "Production monitoring",
            "Drift detection",
            "Performance analytics",
            "Alerting"
        ],
        integrations=["OpenAI", "Anthropic", "AWS Bedrock", "Azure OpenAI"],
        best_for="Production ML monitoring"
    ),
    ObservabilityTool(
        name="Helicone",
        categories=[ObservabilityCategory.MONITORING, ObservabilityCategory.ANALYTICS],
        pricing="freemium",
        self_hosted=True,
        cloud_hosted=True,
        key_features=[
            "Proxy-based (no code changes)",
            "Cost tracking",
            "Rate limiting",
            "Caching"
        ],
        integrations=["OpenAI", "Anthropic", "Azure OpenAI", "Any LLM API"],
        best_for="Quick setup, cost management"
    )
]

def recommend_tool(
    requirements: Dict,
    tools: "List[ObservabilityTool] | None" = None,
) -> "List[ObservabilityTool]":
    """Rank observability tools against a dict of requirements.

    Every requirement is a soft preference that adds points to a tool's score;
    none of them acts as a hard filter. In particular ``free_only`` only gives
    free tools a bonus -- paid tools that score on other criteria are still
    returned.

    Recognised ``requirements`` keys (all optional):
        categories: iterable of ObservabilityCategory; +2 per matching category.
        free_only: truthy -> +3 for tools whose pricing is "free".
        budget_friendly: truthy -> +2 for "free"/"freemium" tools. Only
            considered when the ``free_only`` bonus did not apply.
        self_hosted: truthy -> +2 for self-hostable tools.
        cloud_hosted: truthy -> +1 for cloud-hosted tools.
        integrations: iterable of integration names; +1 per match.

    Args:
        requirements: Preferences as described above.
        tools: Candidate tools to rank. Defaults to the module-level ``TOOLS``
            catalog when omitted.

    Returns:
        Tools with a score above zero, highest score first. Ties keep the
        order in which the candidates were supplied.
    """
    candidates = TOOLS if tools is None else tools

    # The requirement sets do not change per tool, so build them once.
    wanted_categories = set(requirements.get("categories") or ())
    wanted_integrations = set(requirements.get("integrations") or ())
    free_only = requirements.get("free_only")
    budget_friendly = requirements.get("budget_friendly")
    wants_self_hosted = requirements.get("self_hosted")
    wants_cloud_hosted = requirements.get("cloud_hosted")

    scored = []
    for tool in candidates:
        score = 2 * len(wanted_categories.intersection(tool.categories))

        # Pricing: the larger free-only bonus takes precedence over the
        # budget-friendly bonus; a tool never receives both.
        if free_only and tool.pricing == "free":
            score += 3
        elif budget_friendly and tool.pricing in ("free", "freemium"):
            score += 2

        # Hosting
        if wants_self_hosted and tool.self_hosted:
            score += 2
        if wants_cloud_hosted and tool.cloud_hosted:
            score += 1

        score += len(wanted_integrations.intersection(tool.integrations))

        if score > 0:
            scored.append((score, tool))

    # list.sort is stable, also with reverse=True, so equal scores keep
    # their original relative order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [tool for _, tool in scored]

Integration Examples

LangSmith Integration

# pip install langsmith langchain langchain-openai

from langchain_openai import ChatOpenAI

# LangSmith tracing is enabled purely through environment variables, which
# must be set before the model is invoked:
# LANGCHAIN_TRACING_V2=true
# LANGCHAIN_API_KEY=your_key
# LANGCHAIN_PROJECT=my-project

# No callback handler is passed: `langchain.callbacks` does not export a
# `LangSmithCallbackHandler`, and with tracing enabled LangChain attaches its
# tracer to every run on its own.
llm = ChatOpenAI(model="gpt-4o")

# All calls are automatically traced
response = llm.invoke("Hello, world!")

Weights & Biases Integration

# pip install wandb weave openai

import wandb
import weave
from openai import OpenAI

# Initialize W&B
wandb.init(project="llm-app")

# Weave only records @weave.op() calls after weave.init() has been called;
# wandb.init() on its own does not start Weave tracing.
weave.init("llm-app")


# Use Weave for LLM tracing
@weave.op()
def call_llm(prompt: str) -> str:
    """Send ``prompt`` as a single user message to gpt-4o and return the reply text."""
    client = OpenAI()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content


# Calls are tracked in W&B
result = call_llm("What is machine learning?")

Phoenix/Arize Integration

# pip install arize-phoenix openinference-instrumentation-openai openai

import phoenix as px
from openai import OpenAI
from openinference.instrumentation.openai import OpenAIInstrumentor
from phoenix.otel import register

# Start Phoenix
session = px.launch_app()

# Set up OpenTelemetry. A bare TracerProvider() has no span processor or
# exporter, so its spans would never reach Phoenix. register() returns a
# provider that exports to the local Phoenix collector and installs it as the
# global tracer provider.
tracer_provider = register()

# Instrument OpenAI
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)

# Now all OpenAI calls are traced
client = OpenAI()

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hello!"}]
)

# View traces at http://localhost:6006

Helicone Integration

import os

from openai import OpenAI

# Read the Helicone key from the environment so the snippet runs as written
# and no secret is hard-coded. Raises KeyError if the variable is not set.
HELICONE_API_KEY = os.environ["HELICONE_API_KEY"]

# Simply change the base URL and add a header
client = OpenAI(
    base_url="https://oai.helicone.ai/v1",
    default_headers={
        "Helicone-Auth": f"Bearer {HELICONE_API_KEY}"
    }
)

# All calls are now logged through Helicone
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hello!"}]
)

Building Your Own Observability

class LightweightLLMObserver:
    """Simple observability for small teams.

    Each logged call is appended as one JSON object per line to a per-day
    file named ``calls_<YYYY-MM-DD>.jsonl`` (UTC date) inside ``storage_path``.
    """

    # USD per one million tokens, as (input_rate, output_rate).
    _PRICING_PER_MILLION = {
        "gpt-4o": (2.50, 10.00),
        "gpt-4o-mini": (0.15, 0.60),
    }
    # Rates applied to any model missing from the table above.
    _DEFAULT_PRICING = (2.50, 10.00)

    def __init__(self, storage_path: str = "./llm_logs"):
        """Remember ``storage_path`` and create the directory if it is missing."""
        import os

        self.storage_path = storage_path
        os.makedirs(storage_path, exist_ok=True)

    def _log_path(self, date_str: str) -> str:
        """Return the path of the JSONL log file for ``date_str`` (YYYY-MM-DD)."""
        import os

        return os.path.join(self.storage_path, f"calls_{date_str}.jsonl")

    def log_call(self, model: str, messages: List[Dict],
                response: str, usage: Dict, latency_ms: float):
        """Append one LLM call to today's (UTC) log file.

        Args:
            model: Model name; also selects the pricing used for ``cost_usd``.
            messages: Messages sent to the model; must be JSON-serializable.
            response: Text returned by the model.
            usage: Token counts; ``prompt_tokens`` and ``completion_tokens``
                are read for cost, and the whole dict is stored as given.
            latency_ms: Call latency in milliseconds.
        """
        import json
        from datetime import datetime, timezone

        # Take the clock reading once so the timestamp and the file's date can
        # never disagree around midnight. The timestamp is timezone-aware, so
        # its ISO form carries an explicit +00:00 offset.
        now = datetime.now(timezone.utc)

        entry = {
            "timestamp": now.isoformat(),
            "model": model,
            "messages": messages,
            "response": response,
            "usage": usage,
            "latency_ms": latency_ms,
            "cost_usd": self._calculate_cost(model, usage)
        }

        # Append to daily log file
        log_file = self._log_path(now.strftime("%Y-%m-%d"))
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry) + "\n")

    def get_daily_summary(self, date_str: "str | None" = None) -> Dict:
        """Summarize the calls logged on one day.

        Args:
            date_str: Day to summarize as ``YYYY-MM-DD``; defaults to the
                current UTC date.

        Returns:
            ``{"error": "No logs for this date"}`` when no log file exists for
            that day; otherwise a dict with ``date``, ``total_calls``,
            ``total_tokens``, ``total_cost_usd``, ``avg_latency_ms`` and a
            per-model breakdown under ``by_model``.
        """
        import json
        from datetime import datetime, timezone

        date_str = date_str or datetime.now(timezone.utc).strftime("%Y-%m-%d")

        try:
            with open(self._log_path(date_str), "r", encoding="utf-8") as f:
                # Skip blank lines (e.g. a stray trailing newline) instead of
                # failing in json.loads.
                entries = [json.loads(line) for line in f if line.strip()]
        except FileNotFoundError:
            return {"error": "No logs for this date"}

        total_latency = sum(e["latency_ms"] for e in entries)
        return {
            "date": date_str,
            "total_calls": len(entries),
            "total_tokens": sum(e["usage"].get("total_tokens", 0) for e in entries),
            "total_cost_usd": sum(e.get("cost_usd", 0) for e in entries),
            "avg_latency_ms": total_latency / len(entries) if entries else 0,
            "by_model": self._group_by_model(entries)
        }

    def _calculate_cost(self, model: str, usage: Dict) -> float:
        """Return the USD cost of one call from its token usage.

        Models missing from the pricing table are billed at the default rates.
        Missing token counts are treated as zero.
        """
        input_rate, output_rate = self._PRICING_PER_MILLION.get(
            model, self._DEFAULT_PRICING
        )
        input_tokens = usage.get("prompt_tokens", 0)
        output_tokens = usage.get("completion_tokens", 0)
        return (input_tokens * input_rate + output_tokens * output_rate) / 1_000_000

    def _group_by_model(self, entries: List[Dict]) -> Dict:
        """Aggregate call count, total tokens and cost per model name."""
        by_model = {}
        for e in entries:
            stats = by_model.setdefault(
                e["model"], {"calls": 0, "tokens": 0, "cost": 0}
            )
            stats["calls"] += 1
            stats["tokens"] += e["usage"].get("total_tokens", 0)
            stats["cost"] += e.get("cost_usd", 0)
        return by_model

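Choose your observability tools based on your team size, budget, privacy requirements, and existing tooling. Start simple and add more sophisticated tools as your needs grow.

Takeaways

Match the tool to the constraint that matters most to you: LangSmith for LangChain-based applications, Phoenix when data must not leave your systems, Helicone for quick setup and cost tracking, Weights & Biases if your team already uses it, and Arize AI for production monitoring. If none of these fit yet, a small JSONL logger like the one above is enough to start measuring calls, tokens, cost, and latency.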

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.