AI Agent Best Practices: Lessons from Production
After a month of exploring AI agents, here are the key lessons I've learned for building production-ready agent systems.
Design Principles
1. Start Simple, Add Complexity Gradually
# Level 1: Simple function calling
def simple_agent(user_input: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": user_input}],
        tools=tools
    )
    return handle_response(response)

# Level 2: Add memory when needed (sketched below)
# Level 3: Add multiple tools
# Level 4: Add planning
# Level 5: Add multi-agent coordination
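As a rough illustration of Level 2, the same function can carry conversation memory by threading prior messages back into each call. This is only a minimal sketch under the same assumptions as Level 1 (an existing client, tools list, and handle_response helper); the history handling shown here is illustrative, not a specific library feature.

# Level 2 sketch: keep a running message history and pass it on every call
conversation: list[dict] = [{"role": "system", "content": "You are a helpful assistant."}]

def agent_with_memory(user_input: str) -> str:
    conversation.append({"role": "user", "content": user_input})
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        tools=tools
    )
    reply = handle_response(response)
    conversation.append({"role": "assistant", "content": reply})
    return reply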
2. Explicit > Implicit
# Bad: Implicit behavior
def process(task):
    # What does this do? Unclear
    return agent.run(task)

# Good: Explicit behavior
def process_customer_query(
    query: str,
    customer_id: str,
    allowed_actions: list[str],
    require_approval: bool = False
) -> AgentResponse:
    """
    Process a customer query with defined constraints.

    Args:
        query: The customer's question
        customer_id: For context retrieval
        allowed_actions: What the agent can do
        require_approval: If True, actions need human approval
    """
    context = retrieve_customer_context(customer_id)
    response = agent.run(
        query=query,
        context=context,
        tools=filter_tools(allowed_actions),
        human_in_loop=require_approval
    )
    return response
3. Fail Gracefully
import asyncio

class ResilientAgent:
    def __init__(self, primary_model: str, fallback_model: str):
        self.primary_model = primary_model
        self.fallback_model = fallback_model
        self.max_retries = 3

    async def run(self, task: str) -> AgentResult:
        # Try the primary model with exponential backoff on rate limits
        for attempt in range(self.max_retries):
            try:
                return await self._execute(task, self.primary_model)
            except RateLimitError:
                await asyncio.sleep(2 ** attempt)
            except ModelUnavailableError:
                break

        # Fall back to the secondary model
        try:
            return await self._execute(task, self.fallback_model)
        except Exception as e:
            # Return a graceful failure instead of raising
            return AgentResult(
                success=False,
                error=str(e),
                fallback_response="I'm unable to process this request right now. Please try again later."
            )
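A quick usage sketch, assuming AgentResult, the exception types, and the _execute helper are defined elsewhere in your codebase; the model names are purely illustrative:

async def main():
    agent = ResilientAgent(primary_model="gpt-4o", fallback_model="gpt-4o-mini")
    result = await agent.run("Summarize yesterday's support tickets")
    if not result.success:
        print(result.fallback_response)

asyncio.run(main())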
Security Checklist
class SecureAgentConfig:
    """Security configuration for production agents."""

    def __init__(self):
        # Input validation
        self.max_input_length = 10000
        self.sanitize_inputs = True

        # Output validation
        self.filter_pii = True
        self.max_output_length = 50000

        # Tool restrictions
        self.allowed_tools = ["search", "calculate", "summarize"]
        self.blocked_operations = ["delete", "modify", "execute_code"]

        # Rate limiting
        self.max_requests_per_minute = 60
        self.max_tokens_per_request = 8000

        # Audit
        self.log_all_requests = True
        self.log_tool_usage = True

    def validate_input(self, user_input: str) -> tuple[bool, str]:
        if len(user_input) > self.max_input_length:
            return False, "Input too long"
        if self._contains_injection(user_input):
            return False, "Invalid input pattern"
        return True, ""

    def _contains_injection(self, text: str) -> bool:
        patterns = [
            "ignore previous instructions",
            "disregard your training",
            "pretend you are",
            "forget everything"
        ]
        return any(p in text.lower() for p in patterns)
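How this config gets wired into a request path is up to you. A minimal sketch, assuming a handle_agent_request entry point of your own (a hypothetical name) and reusing the agent and filter_tools helpers assumed earlier:

config = SecureAgentConfig()

def handle_agent_request(user_input: str) -> str:
    # Reject invalid input before it ever reaches the model
    ok, reason = config.validate_input(user_input)
    if not ok:
        return f"Request rejected: {reason}"
    # Only expose the allow-listed tools to the agent
    return agent.run(user_input, tools=filter_tools(config.allowed_tools))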
Testing Strategy
import pytest
from unittest.mock import ANY, AsyncMock, patch

class TestAgentBehavior:
    """Test agent behavior, not implementation details."""

    @pytest.fixture
    def agent(self):
        return DataAnalystAgent(client=mock_client)

    @pytest.mark.asyncio
    async def test_handles_simple_query(self, agent):
        result = await agent.run("What is 2+2?")
        assert result.success
        assert "4" in result.response

    @pytest.mark.asyncio
    async def test_refuses_dangerous_operation(self, agent):
        result = await agent.run("Delete all files in the database")
        assert not result.success or "cannot" in result.response.lower()

    @pytest.mark.asyncio
    async def test_uses_correct_tool(self, agent):
        with patch.object(agent, 'execute_tool') as mock_tool:
            await agent.run("Search for recent sales data")
            mock_tool.assert_called_with("search_database", ANY)

    @pytest.mark.asyncio
    async def test_handles_tool_failure(self, agent):
        agent.tools["search"].execute = AsyncMock(side_effect=Exception("DB unavailable"))
        result = await agent.run("Search for data")
        assert "unable" in result.response.lower() or "try again" in result.response.lower()

    @pytest.mark.asyncio
    async def test_respects_context_limits(self, agent):
        long_context = "x" * 100000
        result = await agent.run(f"Summarize this: {long_context}")
        # Should handle gracefully, not crash
        assert result is not None
Monitoring and Observability
import uuid
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

@dataclass
class AgentMetrics:
    request_id: str
    timestamp: datetime
    user_id: str
    agent_name: str
    # Performance
    total_duration_ms: float
    llm_duration_ms: float
    tool_duration_ms: float
    tokens_used: int
    # Quality
    success: bool
    error_type: Optional[str]
    tools_used: list[str]
    retry_count: int
    # Cost
    estimated_cost_usd: float

class AgentMonitor:
    def __init__(self, exporter):
        self.exporter = exporter

    def track_request(self, func):
        """Decorator to track agent requests."""
        async def wrapper(*args, **kwargs):
            start_time = datetime.utcnow()
            metrics = AgentMetrics(
                request_id=str(uuid.uuid4()),
                timestamp=start_time,
                user_id=kwargs.get("user_id", "unknown"),
                agent_name=kwargs.get("agent_name", "unknown"),
                total_duration_ms=0,
                llm_duration_ms=0,
                tool_duration_ms=0,
                tokens_used=0,
                success=False,
                error_type=None,
                tools_used=[],
                retry_count=0,
                estimated_cost_usd=0
            )
            try:
                result = await func(*args, **kwargs)
                metrics.success = True
                return result
            except Exception as e:
                metrics.error_type = type(e).__name__
                raise
            finally:
                metrics.total_duration_ms = (datetime.utcnow() - start_time).total_seconds() * 1000
                await self.exporter.export(metrics)
        return wrapper

    def create_dashboard_query(self) -> str:
        """KQL query for the Azure Monitor dashboard."""
        return """
        customMetrics
        | where name startswith "agent."
        | summarize
            AvgDuration = avg(value),
            P95Duration = percentile(value, 95),
            SuccessRate = countif(customDimensions.success == "true") * 100.0 / count(),
            TotalRequests = count()
          by bin(timestamp, 1h), tostring(customDimensions.agent_name)
        | order by timestamp desc
        """
Cost Management
class CostAwareAgent:
    """Agent that tracks and manages costs."""

    # Approximate USD per 1M tokens (input / output); update as pricing changes
    PRICING = {
        "gpt-4o": {"input": 5.0, "output": 15.0},
        "gpt-4o-mini": {"input": 0.15, "output": 0.60},
        "gpt-3.5-turbo": {"input": 0.50, "output": 1.50}
    }

    def __init__(self, client, budget_limit_usd: float = 100.0):
        self.client = client
        self.budget_limit = budget_limit_usd
        self.spent_today = 0.0
        self.model_preference = ["gpt-4o-mini", "gpt-4o"]

    async def run(self, task: str, complexity: str = "auto") -> AgentResult:
        # Select model based on task and budget
        model = self._select_model(task, complexity)
        if self.spent_today >= self.budget_limit * 0.9:
            # Near budget, use cheapest model
            model = "gpt-3.5-turbo"

        response = await self._execute(task, model)

        # Track cost
        cost = self._calculate_cost(response, model)
        self.spent_today += cost

        return AgentResult(
            response=response.content,
            model_used=model,
            cost_usd=cost
        )

    def _select_model(self, task: str, complexity: str) -> str:
        if complexity == "simple":
            return "gpt-4o-mini"
        elif complexity == "complex":
            return "gpt-4o"
        else:
            # Auto-detect complexity
            if len(task) > 2000 or any(kw in task.lower() for kw in ["analyze", "compare", "evaluate"]):
                return "gpt-4o"
            return "gpt-4o-mini"

    def _calculate_cost(self, response, model: str) -> float:
        pricing = self.PRICING[model]
        input_cost = (response.usage.prompt_tokens / 1_000_000) * pricing["input"]
        output_cost = (response.usage.completion_tokens / 1_000_000) * pricing["output"]
        return input_cost + output_cost
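A short usage sketch, assuming the _execute helper wraps your model client (it isn't shown above) and an existing client object:

async def demo():
    agent = CostAwareAgent(client=client, budget_limit_usd=50.0)
    # An explicit complexity hint picks the model directly
    quick = await agent.run("What were total sales last week?", complexity="simple")
    # "auto" falls back to the keyword/length heuristic in _select_model
    deep = await agent.run("Analyze churn drivers across the last four quarters")
    print(f"Spent so far today: ${agent.spent_today:.4f}")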
Deployment Patterns
# Kubernetes deployment for agent service
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ai-agent-service
spec:
  replicas: 3
  selector:
    matchLabels:
      app: ai-agent
  template:
    metadata:
      labels:
        app: ai-agent
    spec:
      containers:
      - name: agent
        image: myregistry.azurecr.io/ai-agent:latest
        resources:
          requests:
            memory: "1Gi"
            cpu: "500m"
          limits:
            memory: "2Gi"
            cpu: "1000m"
        env:
        - name: AZURE_OPENAI_ENDPOINT
          valueFrom:
            secretKeyRef:
              name: ai-secrets
              key: openai-endpoint
        - name: MAX_CONCURRENT_REQUESTS
          value: "10"
        livenessProbe:
          httpGet:
            path: /health
            port: 8080
          initialDelaySeconds: 10
          periodSeconds: 30
        readinessProbe:
          httpGet:
            path: /ready
            port: 8080
          initialDelaySeconds: 5
          periodSeconds: 10
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: ai-agent-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: ai-agent-service
  minReplicas: 2
  maxReplicas: 10
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
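The liveness and readiness probes above assume the agent service exposes /health and /ready on port 8080. A minimal sketch of those endpoints, using FastAPI purely as an illustrative choice (the deployment doesn't prescribe a web framework):

from fastapi import FastAPI, Response

app = FastAPI()
ready = False  # flip to True once models, tools, and secrets are loaded

@app.get("/health")
def health() -> dict:
    # Liveness: the process is up and serving requests
    return {"status": "ok"}

@app.get("/ready")
def readiness(response: Response) -> dict:
    # Readiness: only accept traffic once dependencies are initialized
    if not ready:
        response.status_code = 503
        return {"status": "not ready"}
    return {"status": "ready"}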
Summary Checklist
Before Production:
- Input validation and sanitization
- Output filtering (PII, sensitive data)
- Rate limiting configured
- Error handling for all failure modes
- Fallback models configured
- Logging and monitoring in place
- Cost tracking enabled
- Security review completed
- Load testing performed
- Rollback plan documented
In Production:
- Monitor error rates
- Track latency (P50, P95, P99)
- Review costs daily
- Audit tool usage
- Check for prompt injection attempts
- Validate output quality
- Update models as new versions release
What’s Next
Tomorrow starts June with a deep dive into Microsoft Fabric updates.