
MLflow Tracing for LLM Applications: Open Source Observability

MLflow has expanded beyond classic experiment tracking to support LLM tracing, providing an open-source alternative to commercial LLM observability platforms. Let's explore how to use MLflow to trace and evaluate LLM applications.

Setting Up MLflow for LLMs

# pip install "mlflow>=2.14"  # the tracing APIs (mlflow.trace, trace autologging) require 2.14+

import mlflow
from mlflow.tracking import MlflowClient

# Start MLflow tracking server (or use managed service)
# mlflow server --host 127.0.0.1 --port 5000

# Set tracking URI
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Create or set experiment
mlflow.set_experiment("llm-application")

Automatic LLM Tracing

import mlflow
from openai import OpenAI

# Enable autologging for OpenAI
mlflow.openai.autolog()

client = OpenAI()

# All OpenAI calls are automatically traced
with mlflow.start_run(run_name="chat-session"):
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": "Hello!"}]
    )

    # Metrics are automatically logged:
    # - Input/output tokens
    # - Latency
    # - Model used
    # - Full request/response

# View in MLflow UI at http://127.0.0.1:5000
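
Traces can also be pulled back programmatically instead of through the UI. A minimal sketch, assuming MLflow 2.14+ where mlflow.search_traces is available (it returns a pandas DataFrame; column names can vary slightly between versions):

import mlflow

# Fetch recent traces for the active experiment as a DataFrame
traces = mlflow.search_traces(max_results=10)

# Inspect request/response previews and timing
print(traces[["request", "response", "execution_time_ms"]].head())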

Manual Tracing with Spans

import mlflow
from mlflow import trace
from openai import OpenAI

client = OpenAI()  # used by the traced helpers below

@trace
def process_request(user_input: str) -> str:
    """Traced request processing"""

    # Nested traces
    intent = classify_intent(user_input)
    response = generate_response(user_input, intent)

    return response

@trace
def classify_intent(text: str) -> str:
    """Classify user intent"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Classify as: question, command, or chat"},
            {"role": "user", "content": text}
        ]
    )
    return response.choices[0].message.content.strip()

@trace
def generate_response(text: str, intent: str) -> str:
    """Generate response based on intent"""
    system_prompts = {
        "question": "Answer the question accurately.",
        "command": "Execute the command and report results.",
        "chat": "Engage in friendly conversation."
    }

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompts.get(intent, "Be helpful.")},
            {"role": "user", "content": text}
        ]
    )
    return response.choices[0].message.content

# Run with automatic trace collection
with mlflow.start_run():
    result = process_request("What is machine learning?")

Custom Metrics and Artifacts

import json
import time
from datetime import datetime, timezone

import mlflow

def log_llm_metrics(response, latency_ms: float, cost_usd: float):
    """Log custom LLM metrics"""

    mlflow.log_metrics({
        "latency_ms": latency_ms,
        "cost_usd": cost_usd,
        "input_tokens": response.usage.prompt_tokens,
        "output_tokens": response.usage.completion_tokens,
        "total_tokens": response.usage.total_tokens
    })

def log_conversation(messages: list, response: str):
    """Log conversation as artifact"""

    conversation = {
        "messages": messages,
        "response": response,
        "timestamp": datetime.utcnow().isoformat()
    }

    # Save as JSON artifact
    with open("conversation.json", "w") as f:
        json.dump(conversation, f, indent=2)

    mlflow.log_artifact("conversation.json", "conversations")

# Usage
with mlflow.start_run():
    start = time.time()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": "Hello!"}]
    )

    latency = (time.time() - start) * 1000
    cost = calculate_cost("gpt-4o", response.usage)  # cost helper sketched below

    log_llm_metrics(response, latency, cost)
    log_conversation(
        [{"role": "user", "content": "Hello!"}],
        response.choices[0].message.content
    )
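
The calculate_cost helper is not part of MLflow or the OpenAI SDK; you supply it yourself. A minimal sketch with illustrative per-million-token prices (verify against current provider pricing before trusting the numbers):

# Illustrative USD prices per 1M tokens; check current pricing
PRICING = {
    "gpt-4o": {"input": 2.50, "output": 10.00},
    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
}

def calculate_cost(model: str, usage) -> float:
    """Estimate request cost in USD from OpenAI token usage"""
    prices = PRICING.get(model, {"input": 0.0, "output": 0.0})
    return (
        usage.prompt_tokens * prices["input"]
        + usage.completion_tokens * prices["output"]
    ) / 1_000_000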

LLM Evaluation with MLflow

import mlflow
import pandas as pd
from mlflow.metrics.genai import answer_correctness, answer_relevance

# Define evaluation dataset
eval_data = pd.DataFrame({
    "inputs": [
        "What is the capital of France?",
        "Who wrote Hamlet?",
        "What is 2+2?"
    ],
    "ground_truth": [
        "Paris",
        "William Shakespeare",
        "4"
    ]
})

# Define model function
def model(inputs):
    responses = []
    for input_text in inputs["inputs"]:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": input_text}]
        )
        responses.append(response.choices[0].message.content)
    return responses

# Run evaluation
with mlflow.start_run():
    results = mlflow.evaluate(
        model=model,
        data=eval_data,
        targets="ground_truth",
        model_type="text",
        evaluators=["default"],
        extra_metrics=[
            answer_correctness(model="openai:/gpt-4o"),
            answer_relevance(model="openai:/gpt-4o")
        ]
    )

    print(f"Metrics: {results.metrics}")
    print(f"Results table: {results.tables}")

Prompt Registry

import json

import mlflow

class PromptTemplate:
    """MLflow-compatible prompt template"""

    def __init__(self, system_prompt: str, user_template: str):
        self.system_prompt = system_prompt
        self.user_template = user_template

    def format(self, **kwargs) -> list:
        return [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": self.user_template.format(**kwargs)}
        ]

    def save(self, path: str):
        """Save prompt to file"""
        config = {
            "system_prompt": self.system_prompt,
            "user_template": self.user_template
        }
        with open(f"{path}/prompt_config.json", "w") as f:
            json.dump(config, f)

    @classmethod
    def load(cls, path: str):
        """Load prompt from file"""
        with open(f"{path}/prompt_config.json", "r") as f:
            config = json.load(f)
        return cls(**config)

# Register prompt as MLflow model
def log_prompt(prompt: PromptTemplate, name: str):
    """Log prompt to MLflow"""
    with mlflow.start_run():
        # Log prompt parameters
        mlflow.log_params({
            "system_prompt_length": len(prompt.system_prompt),
            "user_template_length": len(prompt.user_template)
        })

        # Log prompt as artifact
        mlflow.pyfunc.log_model(
            artifact_path="prompt",
            python_model=PromptWrapper(prompt),
            registered_model_name=name
        )

class PromptWrapper(mlflow.pyfunc.PythonModel):
    def __init__(self, prompt: PromptTemplate):
        self.prompt = prompt

    def predict(self, context, model_input):
        # Format prompt with input
        messages = self.prompt.format(**model_input.to_dict('records')[0])
        return messages

# Usage
qa_prompt = PromptTemplate(
    system_prompt="Answer questions accurately.",
    user_template="Question: {question}"
)

log_prompt(qa_prompt, "qa-prompt")

# Load and use registered prompt
loaded_prompt = mlflow.pyfunc.load_model("models:/qa-prompt/latest")
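
The loaded prompt behaves like any other pyfunc model: predict takes a DataFrame whose columns match the template variables and returns the formatted messages. A quick usage example, assuming the qa-prompt model registered above:

import pandas as pd

# Render the registered prompt for a specific question
messages = loaded_prompt.predict(pd.DataFrame([{"question": "What is MLflow?"}]))
# [{'role': 'system', 'content': 'Answer questions accurately.'},
#  {'role': 'user', 'content': 'Question: What is MLflow?'}]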

Model Comparison

import time

import mlflow
import pandas as pd

def compare_models(models: list, eval_data: pd.DataFrame):
    """Compare multiple model configurations"""

    results = {}

    for model_config in models:
        with mlflow.start_run(run_name=model_config["name"]):
            mlflow.log_params(model_config)

            # Run evaluation
            metrics = evaluate_model(model_config, eval_data)

            mlflow.log_metrics(metrics)
            results[model_config["name"]] = metrics

    return results

def evaluate_model(config: dict, data: pd.DataFrame) -> dict:
    """Evaluate a single model configuration"""

    correct = 0
    total_latency = 0

    for _, row in data.iterrows():
        start = time.time()

        response = client.chat.completions.create(
            model=config["model"],
            messages=[{"role": "user", "content": row["input"]}],
            temperature=config.get("temperature", 0.7)
        )

        latency = time.time() - start
        total_latency += latency

        if row["expected"].lower() in response.choices[0].message.content.lower():
            correct += 1

    return {
        "accuracy": correct / len(data),
        "avg_latency_ms": (total_latency / len(data)) * 1000
    }

# Compare configurations
models = [
    {"name": "gpt-4o-default", "model": "gpt-4o", "temperature": 0.7},
    {"name": "gpt-4o-precise", "model": "gpt-4o", "temperature": 0.1},
    {"name": "gpt-4o-mini", "model": "gpt-4o-mini", "temperature": 0.7}
]

results = compare_models(models, eval_data)
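
Because each configuration is logged as its own run, the comparison also lives in the tracking server. You can rank the runs without opening the UI; a small sketch using mlflow.search_runs against the active experiment:

import mlflow

# Pull the comparison runs and sort by logged accuracy
runs = mlflow.search_runs(order_by=["metrics.accuracy DESC"])
print(runs[["tags.mlflow.runName", "metrics.accuracy", "metrics.avg_latency_ms"]])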

MLflow provides a solid open-source foundation for LLM observability. Its strength lies in the familiar MLOps workflow and integration with the broader ML ecosystem.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.