
Phoenix Observability: Local-First LLM Tracing

Phoenix from Arize provides local-first, privacy-preserving LLM observability. Let’s explore how to use it for development and production monitoring.

Getting Started with Phoenix

# pip install arize-phoenix openinference-instrumentation-openai

import phoenix as px
from openinference.instrumentation.openai import OpenAIInstrumentor
from phoenix.otel import register

# Launch the Phoenix app locally (the UI runs in the background)
session = px.launch_app()
print(f"Phoenix UI available at: {session.url}")

# Set up OpenTelemetry with Phoenix; register() returns a tracer provider
# already pointed at the local Phoenix collector
tracer_provider = register()

# Instrument OpenAI
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)

Automatic Instrumentation

from openai import OpenAI

# All OpenAI calls are automatically traced
client = OpenAI()

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is machine learning?"}
    ]
)

# View in Phoenix UI - shows:
# - Full request/response
# - Token counts
# - Latency
# - Model used

Multi-Framework Instrumentation

# Instrument multiple frameworks
# pip install openinference-instrumentation-langchain \
#   openinference-instrumentation-llama-index openinference-instrumentation-anthropic
from openinference.instrumentation.langchain import LangChainInstrumentor
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from openinference.instrumentation.anthropic import AnthropicInstrumentor

# LangChain
LangChainInstrumentor().instrument(tracer_provider=tracer_provider)

# LlamaIndex
LlamaIndexInstrumentor().instrument(tracer_provider=tracer_provider)

# Anthropic
AnthropicInstrumentor().instrument(tracer_provider=tracer_provider)

# Now all calls to these libraries are traced
from langchain_openai import ChatOpenAI
from llama_index.llms.openai import OpenAI as LlamaOpenAI
import anthropic

# Each framework's calls appear in Phoenix
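For example, once the LangChain instrumentor is active, an ordinary model call is traced with no extra code. A minimal sketch (assuming langchain-openai is installed and OPENAI_API_KEY is set):

from langchain_openai import ChatOpenAI

# A plain LangChain call; the instrumentor records it as a trace in Phoenix
llm = ChatOpenAI(model="gpt-4o")
result = llm.invoke("Summarise the benefits of LLM tracing in one sentence.")
print(result.content)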

Custom Spans and Attributes

import json

from opentelemetry import trace

tracer = trace.get_tracer(__name__)

def process_document(document: str) -> dict:
    """Process a document with custom tracing"""

    with tracer.start_as_current_span("process_document") as span:
        span.set_attribute("document_length", len(document))

        # Extract entities
        with tracer.start_as_current_span("extract_entities") as extract_span:
            entities = extract_entities_with_llm(document)
            extract_span.set_attribute("entity_count", len(entities))

        # Summarize
        with tracer.start_as_current_span("summarize") as summary_span:
            summary = summarize_with_llm(document)
            summary_span.set_attribute("summary_length", len(summary))

        # Generate insights
        with tracer.start_as_current_span("generate_insights"):
            insights = generate_insights(document, entities, summary)

        return {
            "entities": entities,
            "summary": summary,
            "insights": insights
        }

def extract_entities_with_llm(text: str) -> list:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Extract named entities as JSON array."},
            {"role": "user", "content": text}
        ]
    )
    return json.loads(response.choices[0].message.content)
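The helpers summarize_with_llm and generate_insights follow the same pattern as extract_entities_with_llm; a rough sketch (the prompts are illustrative):

def summarize_with_llm(text: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Summarize the document in 2-3 sentences."},
            {"role": "user", "content": text}
        ]
    )
    return response.choices[0].message.content

def generate_insights(text: str, entities: list, summary: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "List key insights from the document, its entities, and its summary."},
            {"role": "user", "content": f"Document: {text}\nEntities: {entities}\nSummary: {summary}"}
        ]
    )
    return response.choices[0].message.content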

Embedding Visualization

import phoenix as px
import pandas as pd
import numpy as np

# Create embeddings for analysis
def create_embedding(text: str) -> list:
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text
    )
    return response.data[0].embedding

# Sample data
documents = [
    "Machine learning is a subset of AI",
    "Deep learning uses neural networks",
    "Natural language processing handles text",
    "Computer vision processes images",
    "Reinforcement learning learns from rewards"
]

embeddings = [create_embedding(doc) for doc in documents]

# Create DataFrame for Phoenix
df = pd.DataFrame({
    "text": documents,
    "embedding": embeddings,
    "category": ["ML", "DL", "NLP", "CV", "RL"]
})

# View embeddings in Phoenix: describe the DataFrame with a schema,
# then launch (or relaunch) the app with it
schema = px.Schema(
    embedding_feature_column_names={
        "document_embedding": px.EmbeddingColumnNames(
            vector_column_name="embedding",
            raw_data_column_name="text",
        )
    }
)
inferences = px.Inferences(dataframe=df, schema=schema, name="document_embeddings")
session = px.launch_app(primary=inferences)

# Phoenix provides:
# - 2D/3D embedding visualization
# - Cluster analysis
# - Similarity search
# - Drift detection
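Drift detection works by comparing two sets of inferences, for example production traffic against a reference corpus. A minimal sketch, assuming a second DataFrame ref_df with the same columns as df:

# ref_df is a hypothetical reference corpus with the same columns as df
reference = px.Inferences(dataframe=ref_df, schema=schema, name="reference_documents")
production = px.Inferences(dataframe=df, schema=schema, name="production_documents")

# Phoenix overlays both sets in the embedding view and surfaces drift between them
px.launch_app(primary=production, reference=reference)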

Evaluation in Phoenix

from phoenix.evals import (
    HallucinationEvaluator,
    RelevanceEvaluator,
    OpenAIModel
)

# Create evaluators backed by an LLM judge
eval_model = OpenAIModel(model="gpt-4o")

hallucination_eval = HallucinationEvaluator(model=eval_model)
relevance_eval = RelevanceEvaluator(model=eval_model)

# Evaluate responses
def evaluate_response(question: str, context: str, response: str) -> dict:
    """Evaluate an LLM response for hallucination and relevance"""

    # Each evaluator returns a (label, score, explanation) tuple
    # Hallucination: is the answer grounded in the provided context?
    hallucination_label, hallucination_score, _ = hallucination_eval.evaluate(
        record={"input": question, "reference": context, "output": response}
    )

    # Relevance: does the retrieved context actually address the question?
    relevance_label, relevance_score, _ = relevance_eval.evaluate(
        record={"input": question, "reference": context}
    )

    return {
        "hallucination_label": hallucination_label,  # e.g. "factual" vs "hallucinated"
        "hallucination_score": hallucination_score,
        "relevance_label": relevance_label,          # e.g. "relevant" vs "unrelated"
        "relevance_score": relevance_score
    }

# Bulk evaluation
eval_data = [
    {
        "question": "What is Python?",
        "context": "Python is a programming language.",
        "response": "Python is a high-level programming language."
    },
    # ... more examples
]

for item in eval_data:
    result = evaluate_response(**item)
    print(f"Question: {item['question']}")
    print(f"Evaluation: {result}")

Exporting Data

import phoenix as px

# Get client
client = px.Client()

# Export traces
traces_df = client.get_spans_dataframe()

# Export to various formats
traces_df.to_csv("traces.csv")
traces_df.to_parquet("traces.parquet")

# Query specific traces
filtered_traces = client.get_spans_dataframe(
    filter_condition="span_kind == 'LLM'"
)

# Get evaluation results that have been logged to Phoenix
evaluations = client.get_evaluations()

# Analysis (attribute columns follow the OpenInference naming conventions)
latency_ms = (
    filtered_traces["end_time"] - filtered_traces["start_time"]
).dt.total_seconds() * 1000

print(f"Total LLM calls: {len(filtered_traces)}")
print(f"Average latency: {latency_ms.mean():.2f}ms")
print(f"Total tokens: {filtered_traces['attributes.llm.token_count.total'].sum()}")

Production Deployment

# Phoenix can run as a standalone server for production

# Option 1: Docker
# docker run -p 6006:6006 arizephoenix/phoenix:latest

# Option 2: run the bundled server directly
# python -m phoenix.server.main serve

# Configure client to connect
import os
os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = "http://localhost:6006"

# Instrument the application and export to the Phoenix server over OTLP/HTTP
from openinference.instrumentation.openai import OpenAIInstrumentor
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter

tracer_provider = TracerProvider()
exporter = OTLPSpanExporter(endpoint="http://localhost:6006/v1/traces")
tracer_provider.add_span_processor(BatchSpanProcessor(exporter))

OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)

# Now traces are sent to Phoenix server
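Alternatively, phoenix.otel.register() wires up the same exporter for you; it reads PHOENIX_COLLECTOR_ENDPOINT, or the endpoint can be passed explicitly (a short sketch):

from phoenix.otel import register
from openinference.instrumentation.openai import OpenAIInstrumentor

# batch=True uses a BatchSpanProcessor, the better choice in production
tracer_provider = register(
    endpoint="http://localhost:6006/v1/traces",
    batch=True
)
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)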

Phoenix provides powerful local-first observability that keeps your data private while offering the visualization and analysis capabilities needed for effective LLM development.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.