1 min read
Evaluation Improvements in Azure AI Studio
This post shares practical, production-minded guidance on evaluating generative AI applications with the evaluation tooling in Azure AI Studio.
Why Evaluation Matters
Without proper evaluation:
- Models may hallucinate without detection
- Quality degrades silently over time
- Compliance violations go unnoticed
- User experience suffers
Evaluation Framework Overview
              Input Data
                   ↓
┌─────────────────────────────────────┐
│         Evaluation Pipeline         │
├─────────────────────────────────────┤
│  ┌─────────┐  ┌─────────┐  ┌─────┐  │
│  │Relevance│  │Grounded-│  │Coher│  │
│  │         │  │  ness   │  │ence │  │
│  └────┬────┘  └────┬────┘  └──┬──┘  │
│       │            │          │     │
│  ┌────┴────────────┴──────────┴──┐  │
│  │          Aggregation          │  │
│  └───────────────────────────────┘  │
└─────────────────────────────────────┘
                   ↓
       Quality Metrics & Alerts
Built-in Evaluators
Relevance Evaluator
Measures if the answer addresses the question:
import os

from azure.ai.evaluation import RelevanceEvaluator

# Azure OpenAI settings for the model that grades the answers. The dict is
# bound to `model_config` so the same settings can be handed to the other
# model-graded evaluators, which are constructed from a `model_config` name.
model_config = {
    "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
    "api_key": os.environ["AZURE_OPENAI_KEY"],
    "azure_deployment": "gpt-4o",
    "api_version": "2024-05-01-preview",
}

# Configure with model
relevance_eval = RelevanceEvaluator(model_config=model_config)

# Single evaluation
result = relevance_eval(
    question="What are the benefits of microservices?",
    answer="Microservices enable independent deployment, scaling, and technology choices.",
)

print(f"Relevance score: {result['relevance']}")  # 1-5 scale
print(f"Reason: {result['relevance_reason']}")
Groundedness Evaluator
Checks if the answer is based on provided context:
from azure.ai.evaluation import GroundednessEvaluator

# Build the evaluator from the model settings held in `model_config`.
groundedness_eval = GroundednessEvaluator(model_config)

# Score a single answer against the context passage supplied with it.
result = groundedness_eval(
    question="What is the company's revenue?",
    answer="The company reported $5 billion in revenue.",
    context="Annual Report 2023: Total revenue was $5 billion, up 15% from 2022.",
)

groundedness_score = result['groundedness']
print(f"Groundedness score: {groundedness_score}")

# Read with a default, so a result without this key prints an empty list.
ungrounded = result.get('ungrounded_statements', [])
print(f"Ungrounded statements: {ungrounded}")
Coherence Evaluator
Assesses logical flow and clarity:
from azure.ai.evaluation import CoherenceEvaluator

# Build the evaluator from the model settings held in `model_config`.
coherence_eval = CoherenceEvaluator(model_config)

# The answer is a three-sentence paragraph, one sentence per line.
result = coherence_eval(
    question="Explain cloud computing",
    answer=(
        "Cloud computing delivers computing services over the internet.\n"
        "These services include servers, storage, databases, and networking.\n"
        "Benefits include scalability, cost efficiency, and global reach."
    ),
)

coherence_score = result['coherence']
print(f"Coherence score: {coherence_score}")
Fluency Evaluator
Measures language quality:
from azure.ai.evaluation import FluencyEvaluator

# Build the evaluator from the model settings held in `model_config`.
fluency_eval = FluencyEvaluator(model_config)

# Score the language quality of one question/answer pair.
result = fluency_eval(
    question="What is AI?",
    answer="Artificial intelligence is the simulation of human intelligence by machines.",
)

fluency_score = result['fluency']
print(f"Fluency score: {fluency_score}")
Content Safety Evaluation
import os

from azure.ai.evaluation import ContentSafetyEvaluator

# Identifies the Azure AI project used for the safety evaluation. Bound to
# `project_config` so the same dict can be passed wherever a project
# configuration is expected (e.g. RAGEvaluator(model_config, project_config)).
project_config = {
    "subscription_id": os.environ["SUBSCRIPTION_ID"],
    "resource_group_name": os.environ["RESOURCE_GROUP"],
    "project_name": os.environ["PROJECT_NAME"],
}

safety_eval = ContentSafetyEvaluator(azure_ai_project=project_config)

result = safety_eval(
    question="User input here",
    answer="Model response here",
)

# One line per harm category, in the order: violence, sexual, self-harm,
# hate/unfairness.
for label, key in (
    ("Violence", "violence"),
    ("Sexual", "sexual"),
    ("Self-harm", "self_harm"),
    ("Hate/Unfairness", "hate_unfairness"),
):
    print(f"{label}: {result[key]}")
Batch Evaluation
from azure.ai.evaluation import evaluate

# Prepare evaluation data: one record per question, carrying the reference
# context and the answer to be scored.
eval_data = [
    {
        "question": "What is Azure?",
        "context": "Azure is Microsoft's cloud computing platform.",
        "answer": "Azure is Microsoft's cloud platform offering compute, storage, and more.",
    },
    {
        "question": "How does Azure pricing work?",
        "context": "Azure uses pay-as-you-go pricing based on resource consumption.",
        "answer": "You pay for what you use with Azure's consumption-based model.",
    },
]

# Column mappings handed to `evaluator_config`. Presumably "${data.<field>}"
# binds an evaluator input to the matching field of each record — confirm
# against the installed SDK version.
qa_columns = {
    "question": "${data.question}",
    "answer": "${data.answer}",
}
grounded_columns = {**qa_columns, "context": "${data.context}"}

# Run batch evaluation
# NOTE(review): some SDK versions expect `data` to be a path to a .jsonl file
# and return a plain dict — verify before relying on `results.metrics` below.
results = evaluate(
    data=eval_data,
    evaluators={
        "relevance": RelevanceEvaluator(model_config),
        "groundedness": GroundednessEvaluator(model_config),
        "coherence": CoherenceEvaluator(model_config),
    },
    evaluator_config={
        "relevance": dict(qa_columns),
        "groundedness": grounded_columns,
        "coherence": dict(qa_columns),
    },
    output_path="./eval_results",
)

# Summary metrics: each mean is stored under "<name>.<name>.mean".
for metric in ("relevance", "groundedness", "coherence"):
    print(f"Average {metric}: {results.metrics[f'{metric}.{metric}.mean']:.2f}")
Custom Evaluators
from azure.ai.evaluation import Evaluator
class DomainAccuracyEvaluator(Evaluator):
    """Custom evaluator for domain-specific accuracy.

    Splits an answer into sentence-like fragments and checks each fragment
    against a knowledge base that maps a topic phrase to the text expected
    alongside it. Matching is a case-insensitive substring test.
    """

    def __init__(self, knowledge_base: dict):
        """Store the knowledge base.

        Args:
            knowledge_base: Maps a topic phrase to the phrase that must also
                appear in any fragment mentioning that topic.
        """
        self.knowledge_base = knowledge_base

    def __call__(
        self,
        question: str,
        answer: str,
        **kwargs
    ) -> dict:
        """Score the factual accuracy of ``answer``.

        Args:
            question: The question that was asked (not used in scoring).
            answer: The answer text to check.
            **kwargs: Accepted and ignored.

        Returns:
            A dict with ``domain_accuracy`` (fraction of fragments that
            verified, or 1.0 when there is nothing to check),
            ``facts_checked`` and ``facts_correct``.
        """
        # Check for factual accuracy against knowledge base
        facts_mentioned = self._extract_facts(answer)
        total_facts = len(facts_mentioned)
        correct_facts = sum(1 for fact in facts_mentioned if self._verify_fact(fact))

        # With nothing to check there is nothing wrong, so report full accuracy.
        accuracy = correct_facts / total_facts if total_facts > 0 else 1.0

        return {
            "domain_accuracy": accuracy,
            "facts_checked": total_facts,
            "facts_correct": correct_facts,
        }

    def _extract_facts(self, text: str) -> list:
        """Split ``text`` on ". " into fragments, dropping blank ones.

        This is a simplified example of claim extraction. Blank fragments
        (from an empty answer or a trailing ". ") are skipped so they are not
        counted as facts that trivially verify.
        """
        return [piece for piece in text.split(". ") if piece.strip()]

    def _verify_fact(self, fact: str) -> bool:
        """Return whether ``fact`` agrees with the knowledge base.

        The first knowledge-base key found in the fragment decides the
        outcome: the fragment is correct only if it also contains that key's
        value. Fragments that mention no known key are assumed true.
        """
        fact_lower = fact.lower()
        for key, value in self.knowledge_base.items():
            if key.lower() in fact_lower:
                return value.lower() in fact_lower
        return True  # Assume true if not in KB
# Use custom evaluator
# NOTE(review): the key "azure regions" is not a contiguous substring of the
# sample answer below, so this entry is never consulted for it; the answer
# passes through the evaluator's "not in KB" fallback. Confirm that is intended.
kb = {"azure regions": "60+ regions worldwide"}
domain_eval = DomainAccuracyEvaluator(kb)

sample = {
    "question": "How many Azure regions are there?",
    "answer": "Azure has 60+ regions worldwide.",
}
result = domain_eval(**sample)

accuracy = result['domain_accuracy']
print(f"Domain accuracy: {accuracy}")
Composite Evaluation
from azure.ai.evaluation import CompositeEvaluator
class RAGEvaluator(CompositeEvaluator):
    """Comprehensive evaluator for RAG applications.

    Runs relevance, groundedness, coherence, fluency and content-safety
    evaluators together, then adds three derived fields to the combined
    result: ``composite_score``, ``safety_passed`` and ``overall_passed``.
    """

    # Result keys averaged into the composite quality score.
    QUALITY_METRICS = ("relevance", "groundedness", "coherence", "fluency")
    # Result keys that must all read "safe" for the safety check to pass.
    SAFETY_METRICS = ("violence", "sexual", "self_harm", "hate_unfairness")
    # Minimum composite score required for an overall pass.
    PASS_THRESHOLD = 3.5

    def __init__(self, model_config: dict, project_config: dict):
        """Build the child evaluators.

        Args:
            model_config: Model settings passed to the model-graded evaluators.
            project_config: Azure AI project settings for the safety evaluator.
        """
        super().__init__({
            "relevance": RelevanceEvaluator(model_config),
            "groundedness": GroundednessEvaluator(model_config),
            "coherence": CoherenceEvaluator(model_config),
            "fluency": FluencyEvaluator(model_config),
            # Passed by keyword, matching how ContentSafetyEvaluator is
            # constructed elsewhere in this article (azure_ai_project=...).
            "safety": ContentSafetyEvaluator(azure_ai_project=project_config),
        })

    def __call__(self, **kwargs) -> dict:
        """Run all child evaluators and append the derived pass/fail fields.

        Args:
            **kwargs: Forwarded unchanged to the parent evaluator.

        Returns:
            The parent's result dict with ``composite_score`` (mean of the
            quality metrics, a missing metric counting as 0),
            ``safety_passed`` and ``overall_passed`` added.
        """
        results = super().__call__(**kwargs)

        # Calculate composite score
        quality_scores = [results.get(metric, 0) for metric in self.QUALITY_METRICS]

        # A safety metric that is absent from the results counts as "safe".
        # NOTE(review): assumes the safety evaluator reports the literal label
        # "safe"; confirm against the evaluator's actual output format.
        safety_passed = all(
            results.get(metric, "safe") == "safe"
            for metric in self.SAFETY_METRICS
        )

        results["composite_score"] = sum(quality_scores) / len(quality_scores)
        results["safety_passed"] = safety_passed
        results["overall_passed"] = (
            results["composite_score"] >= self.PASS_THRESHOLD and safety_passed
        )
        return results
# Usage: build the composite evaluator once, then score a question/answer
# pair together with the context it was retrieved from.
rag_eval = RAGEvaluator(model_config, project_config)

rag_inputs = {
    "question": "What is Azure?",
    "answer": "Azure is Microsoft's cloud platform.",
    "context": "Azure provides cloud services...",
}
result = rag_eval(**rag_inputs)

overall = result['overall_passed']
print(f"Overall passed: {overall}")
CI/CD Integration
# azure-pipelines.yml
# Runs the AI evaluation on every push to main, publishes the results as a
# pipeline artifact, then checks them against the quality thresholds.
trigger:
  branches:
    include:
      - main

stages:
  - stage: Evaluate
    jobs:
      - job: AIEvaluation
        pool:
          vmImage: 'ubuntu-latest'
        steps:
          - task: UsePythonVersion@0
            inputs:
              versionSpec: '3.11'

          - script: |
              pip install azure-ai-evaluation azure-identity
            displayName: 'Install dependencies'

          # Pipeline variables are mapped into the script's environment.
          - script: |
              python run_evaluation.py \
              --data ./test_data.jsonl \
              --output ./results \
              --threshold 3.5
            displayName: 'Run AI Evaluation'
            env:
              AZURE_OPENAI_ENDPOINT: $(AZURE_OPENAI_ENDPOINT)
              AZURE_OPENAI_KEY: $(AZURE_OPENAI_KEY)

          - task: PublishPipelineArtifact@1
            inputs:
              targetPath: './results'
              artifact: 'evaluation-results'

          # Runs after the results are published.
          - script: |
              python check_thresholds.py ./results
            displayName: 'Check Quality Thresholds'
# check_thresholds.py
import json
import sys
def check_thresholds(results_path: str, min_score: float = 3.5):
    """Exit the process according to whether quality thresholds are met.

    Reads ``<results_path>/metrics.json`` and compares the mean relevance and
    groundedness scores with ``min_score``.

    Args:
        results_path: Directory containing ``metrics.json``.
        min_score: Lowest acceptable mean score for each checked metric.

    Raises:
        SystemExit: Always. Code 0 when every checked metric is present and
            at least ``min_score``; code 1 when any is missing or too low.
    """
    with open(f"{results_path}/metrics.json", encoding="utf-8") as metrics_file:
        metrics = json.load(metrics_file)

    # Display label -> key in metrics.json.
    checked_metrics = {
        "Relevance": "relevance.relevance.mean",
        "Groundedness": "groundedness.groundedness.mean",
    }

    failed = []
    for label, key in checked_metrics.items():
        score = metrics.get(key)
        if score is None:
            # A metric that was never produced is a failure, reported as such
            # rather than crashing with KeyError while formatting the message.
            failed.append(f"{label}: missing from metrics")
        elif score < min_score:
            failed.append(f"{label}: {score:.2f}")

    if failed:
        print("Quality thresholds not met:")
        for failure in failed:
            print(f" - {failure}")
        sys.exit(1)

    print("All quality thresholds passed!")
    sys.exit(0)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python check_thresholds.py <results_dir> [min_score]")
        sys.exit(2)
    # An optional second argument overrides the default threshold.
    if len(sys.argv) > 2:
        check_thresholds(sys.argv[1], float(sys.argv[2]))
    else:
        check_thresholds(sys.argv[1])
Monitoring Over Time
from azure.monitor.ingestion import LogsIngestionClient
from azure.identity import DefaultAzureCredential
from datetime import datetime
class EvaluationMonitor:
    """Uploads evaluation results to Azure Monitor via a LogsIngestionClient."""

    def __init__(self, dcr_endpoint: str, dcr_id: str, stream_name: str):
        """Create the ingestion client.

        Args:
            dcr_endpoint: Endpoint passed to ``LogsIngestionClient``.
            dcr_id: Rule id used as ``rule_id`` on every upload.
            stream_name: Stream name used on every upload.
        """
        self.client = LogsIngestionClient(
            endpoint=dcr_endpoint,
            credential=DefaultAzureCredential(),
        )
        self.dcr_id = dcr_id
        self.stream_name = stream_name

    def log_evaluation(self, results: dict, metadata: dict = None):
        """Log evaluation results to Azure Monitor.

        Args:
            results: Evaluation output. Missing scores are logged as 0 and a
                missing ``safety_passed`` as True.
            metadata: Optional dict with ``model_version`` and/or
                ``flow_version``; absent values are logged as "unknown".
        """
        # Imported locally so the method does not depend on `timezone` being
        # imported at file level.
        from datetime import timezone

        # The default is None; treat it as "no metadata" instead of failing
        # on metadata.get(...).
        metadata = metadata or {}

        log_entry = {
            # Timezone-aware UTC timestamp; datetime.utcnow() returns a naive
            # value and is deprecated since Python 3.12.
            "TimeGenerated": datetime.now(timezone.utc).isoformat(),
            "RelevanceScore": results.get("relevance", 0),
            "GroundednessScore": results.get("groundedness", 0),
            "CoherenceScore": results.get("coherence", 0),
            "SafetyPassed": results.get("safety_passed", True),
            "CompositeScore": results.get("composite_score", 0),
            "ModelVersion": metadata.get("model_version", "unknown"),
            "FlowVersion": metadata.get("flow_version", "unknown"),
        }

        self.client.upload(
            rule_id=self.dcr_id,
            stream_name=self.stream_name,
            logs=[log_entry],
        )
# Usage: create the monitor once, then log each evaluation run together with
# the model version that produced it.
monitor = EvaluationMonitor(dcr_endpoint, dcr_id, stream_name)
run_metadata = {"model_version": "gpt-4o-2024-05-13"}
monitor.log_evaluation(eval_results, run_metadata)
Best Practices
- Evaluate continuously - Not just at deployment
- Use multiple metrics - No single metric tells the whole story
- Set appropriate thresholds - Based on your use case
- Monitor trends - Detect gradual degradation
- Include safety - Always evaluate for harmful content
What’s Next
Tomorrow I’ll cover tracing and debugging for AI applications.