
MLflow Tracing for LLM Applications: Open Source Observability

MLflow has expanded beyond classic experiment tracking to support LLM tracing, providing an open-source alternative to commercial LLM observability platforms. Let's explore how to use MLflow to trace and evaluate LLM applications.

Setting Up MLflow for LLMs

# pip install "mlflow>=2.14"  # the tracing APIs (mlflow.trace, trace autologging) require 2.14+

import mlflow
from mlflow.tracking import MlflowClient

# Start MLflow tracking server (or use managed service)
# mlflow server --host 127.0.0.1 --port 5000

# Set tracking URI
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Create or set experiment
mlflow.set_experiment("llm-application")

Automatic LLM Tracing

import mlflow
from openai import OpenAI

# Enable autologging for OpenAI
mlflow.openai.autolog()

client = OpenAI()

# All OpenAI calls are automatically traced
with mlflow.start_run(run_name="chat-session"):
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": "Hello!"}]
    )

    # Metrics are automatically logged:
    # - Input/output tokens
    # - Latency
    # - Model used
    # - Full request/response

# View in MLflow UI at http://127.0.0.1:5000
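
Traces can also be pulled back programmatically instead of through the UI. A minimal sketch, assuming MLflow 2.14+ where mlflow.search_traces is available (it returns a pandas DataFrame; column names can vary slightly between versions):

import mlflow

# Fetch recent traces for the active experiment as a DataFrame
traces = mlflow.search_traces(max_results=10)

# Inspect request/response previews and timing
print(traces[["request", "response", "execution_time_ms"]].head())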

Manual Tracing with Spans

import mlflow
from mlflow import trace
from openai import OpenAI

client = OpenAI()  # used by the traced helpers below

@trace
def process_request(user_input: str) -> str:
    """Traced request processing"""

    # Nested traces
    intent = classify_intent(user_input)
    response = generate_response(user_input, intent)

    return response

@trace
def classify_intent(text: str) -> str:
    """Classify user intent"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Classify as: question, command, or chat"},
            {"role": "user", "content": text}
        ]
    )
    return response.choices[0].message.content.strip()

@trace
def generate_response(text: str, intent: str) -> str:
    """Generate response based on intent"""
    system_prompts = {
        "question": "Answer the question accurately.",
        "command": "Execute the command and report results.",
        "chat": "Engage in friendly conversation."
    }

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompts.get(intent, "Be helpful.")},
            {"role": "user", "content": text}
        ]
    )
    return response.choices[0].message.content

# Run with automatic trace collection
with mlflow.start_run():
    result = process_request("What is machine learning?")

Custom Metrics and Artifacts

import json
import time
from datetime import datetime, timezone

import mlflow

def log_llm_metrics(response, latency_ms: float, cost_usd: float):
    """Log custom LLM metrics"""

    mlflow.log_metrics({
        "latency_ms": latency_ms,
        "cost_usd": cost_usd,
        "input_tokens": response.usage.prompt_tokens,
        "output_tokens": response.usage.completion_tokens,
        "total_tokens": response.usage.total_tokens
    })

def log_conversation(messages: list, response: str):
    """Log conversation as artifact"""

    conversation = {
        "messages": messages,
        "response": response,
        "timestamp": datetime.utcnow().isoformat()
    }

    # Save as JSON artifact
    with open("conversation.json", "w") as f:
        json.dump(conversation, f, indent=2)

    mlflow.log_artifact("conversation.json", "conversations")

# Usage
with mlflow.start_run():
    start = time.time()

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": "Hello!"}]
    )

    latency = (time.time() - start) * 1000
    cost = calculate_cost("gpt-4o", response.usage)  # cost helper sketched below

    log_llm_metrics(response, latency, cost)
    log_conversation(
        [{"role": "user", "content": "Hello!"}],
        response.choices[0].message.content
    )
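
The calculate_cost helper is not part of MLflow or the OpenAI SDK; you supply it yourself. A minimal sketch with illustrative per-million-token prices (verify against current provider pricing before trusting the numbers):

# Illustrative USD prices per 1M tokens; check current pricing
PRICING = {
    "gpt-4o": {"input": 2.50, "output": 10.00},
    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
}

def calculate_cost(model: str, usage) -> float:
    """Estimate request cost in USD from OpenAI token usage"""
    prices = PRICING.get(model, {"input": 0.0, "output": 0.0})
    return (
        usage.prompt_tokens * prices["input"]
        + usage.completion_tokens * prices["output"]
    ) / 1_000_000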

LLM Evaluation with MLflow

import mlflow
import pandas as pd
from mlflow.metrics.genai import answer_correctness, answer_relevance

# Define evaluation dataset
eval_data = pd.DataFrame({
    "inputs": [
        "What is the capital of France?",
        "Who wrote Hamlet?",
        "What is 2+2?"
    ],
    "ground_truth": [
        "Paris",
        "William Shakespeare",
        "4"
    ]
})

# Define model function
def model(inputs):
    responses = []
    for input_text in inputs["inputs"]:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": input_text}]
        )
        responses.append(response.choices[0].message.content)
    return responses

# Run evaluation
with mlflow.start_run():
    results = mlflow.evaluate(
        model=model,
        data=eval_data,
        targets="ground_truth",
        model_type="text",
        evaluators=["default"],
        extra_metrics=[
            answer_correctness(model="openai:/gpt-4o"),
            answer_relevance(model="openai:/gpt-4o")
        ]
    )

    print(f"Metrics: {results.metrics}")
    print(f"Results table: {results.tables}")

Prompt Registry

import json

import mlflow

class PromptTemplate:
    """MLflow-compatible prompt template"""

    def __init__(self, system_prompt: str, user_template: str):
        self.system_prompt = system_prompt
        self.user_template = user_template

    def format(self, **kwargs) -> list:
        return [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": self.user_template.format(**kwargs)}
        ]

    def save(self, path: str):
        """Save prompt to file"""
        config = {
            "system_prompt": self.system_prompt,
            "user_template": self.user_template
        }
        with open(f"{path}/prompt_config.json", "w") as f:
            json.dump(config, f)

    @classmethod
    def load(cls, path: str):
        """Load prompt from file"""
        with open(f"{path}/prompt_config.json", "r") as f:
            config = json.load(f)
        return cls(**config)

# Register prompt as MLflow model
def log_prompt(prompt: PromptTemplate, name: str):
    """Log prompt to MLflow"""
    with mlflow.start_run():
        # Log prompt parameters
        mlflow.log_params({
            "system_prompt_length": len(prompt.system_prompt),
            "user_template_length": len(prompt.user_template)
        })

        # Log prompt as artifact
        mlflow.pyfunc.log_model(
            artifact_path="prompt",
            python_model=PromptWrapper(prompt),
            registered_model_name=name
        )

class PromptWrapper(mlflow.pyfunc.PythonModel):
    def __init__(self, prompt: PromptTemplate):
        self.prompt = prompt

    def predict(self, context, model_input):
        # Format prompt with input
        messages = self.prompt.format(**model_input.to_dict('records')[0])
        return messages

# Usage
qa_prompt = PromptTemplate(
    system_prompt="Answer questions accurately.",
    user_template="Question: {question}"
)

log_prompt(qa_prompt, "qa-prompt")

# Load and use registered prompt
loaded_prompt = mlflow.pyfunc.load_model("models:/qa-prompt/latest")
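
The loaded prompt behaves like any other pyfunc model: predict takes a DataFrame whose columns match the template variables and returns the formatted messages. A quick usage example, assuming the qa-prompt model registered above:

import pandas as pd

# Render the registered prompt for a specific question
messages = loaded_prompt.predict(pd.DataFrame([{"question": "What is MLflow?"}]))
# [{'role': 'system', 'content': 'Answer questions accurately.'},
#  {'role': 'user', 'content': 'Question: What is MLflow?'}]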

Model Comparison

import time

import mlflow
import pandas as pd

def compare_models(models: list, eval_data: pd.DataFrame):
    """Compare multiple model configurations"""

    results = {}

    for model_config in models:
        with mlflow.start_run(run_name=model_config["name"]):
            mlflow.log_params(model_config)

            # Run evaluation
            metrics = evaluate_model(model_config, eval_data)

            mlflow.log_metrics(metrics)
            results[model_config["name"]] = metrics

    return results

def evaluate_model(config: dict, data: pd.DataFrame) -> dict:
    """Evaluate a single model configuration"""

    correct = 0
    total_latency = 0

    for _, row in data.iterrows():
        start = time.time()

        response = client.chat.completions.create(
            model=config["model"],
            messages=[{"role": "user", "content": row["input"]}],
            temperature=config.get("temperature", 0.7)
        )

        latency = time.time() - start
        total_latency += latency

        if row["expected"].lower() in response.choices[0].message.content.lower():
            correct += 1

    return {
        "accuracy": correct / len(data),
        "avg_latency_ms": (total_latency / len(data)) * 1000
    }

# Compare configurations
models = [
    {"name": "gpt-4o-default", "model": "gpt-4o", "temperature": 0.7},
    {"name": "gpt-4o-precise", "model": "gpt-4o", "temperature": 0.1},
    {"name": "gpt-4o-mini", "model": "gpt-4o-mini", "temperature": 0.7}
]

results = compare_models(models, eval_data)
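
Because each configuration is logged as its own run, the comparison also lives in the tracking server. You can rank the runs without opening the UI; a small sketch using mlflow.search_runs against the active experiment:

import mlflow

# Pull the comparison runs and sort by logged accuracy
runs = mlflow.search_runs(order_by=["metrics.accuracy DESC"])
print(runs[["tags.mlflow.runName", "metrics.accuracy", "metrics.avg_latency_ms"]])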

MLflow provides a solid open-source foundation for LLM observability. Its strength lies in the familiar MLOps workflow and integration with the broader ML ecosystem.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.