1 min read
Guardrails for LLMs: Building Safe AI Applications
I wrote “Guardrails for LLMs: Building Safe AI Applications” to share practical, production-minded guidance on this topic.
LLM Guardrails Implementation
import json
import re
from typing import Optional

from azure.ai.contentsafety import ContentSafetyClient
from azure.ai.openai import AzureOpenAI
class GuardrailPipeline:
    """Run safety checks on user input and on LLM output.

    Every individual check returns a dict containing at least a boolean
    ``"passed"`` key. ``check_input`` and ``check_output`` aggregate those
    dicts into ``{"safe": bool, "checks": {...}}``.

    NOTE: ``check_input`` calls ``self.check_topic`` and ``check_output``
    calls ``self.check_relevance``. Neither method is defined in this class
    as shown, so they must be supplied (for example by a subclass) before
    those two entry points can be used.
    """

    # A category fails the content-safety check when its severity is at or
    # above this value.
    SEVERITY_THRESHOLD = 2

    # Model (deployment) name used for the grounding check; override on a
    # subclass or instance when the deployment is named differently.
    FACTUALITY_MODEL = "gpt-4o"

    # Matched against the lower-cased input, so patterns are written in
    # lower case.
    _INJECTION_PATTERNS = tuple(
        re.compile(source)
        for source in (
            r"ignore (?:previous|above|all) instructions",
            r"you are now",
            r"new instructions:",
            r"system prompt:",
            r"<\|.*\|>",  # Special tokens
        )
    )

    # Insertion order determines the order of the "detected" list.
    _PII_PATTERNS = {
        "ssn": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
        "credit_card": re.compile(
            r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b"
        ),
        # The TLD class is [A-Za-z]; a "|" inside a character class is a
        # literal pipe, not alternation.
        "email": re.compile(
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
        ),
    }

    # Annotations are quoted so they are not evaluated at definition time.
    def __init__(
        self,
        openai_client: "AzureOpenAI",
        safety_client: "ContentSafetyClient",
    ):
        """Store the clients used by the checks.

        Args:
            openai_client: Client whose ``chat.completions.create`` is
                awaited by ``check_factuality``.
            safety_client: Client whose ``analyze_text`` is awaited by
                ``check_content_safety``.
        """
        self.openai = openai_client
        self.safety = safety_client

    async def check_input(self, user_input: str) -> dict:
        """Validate user input before processing.

        Returns:
            ``{"safe": bool, "checks": {...}}`` where ``safe`` is True only
            if every individual check reports ``"passed"`` as truthy.
        """
        checks = {
            "content_safety": await self.check_content_safety(user_input),
            "prompt_injection": self.check_prompt_injection(user_input),
            "pii_detected": self.check_pii(user_input),
            "allowed_topic": await self.check_topic(user_input),
        }
        is_safe = all(check["passed"] for check in checks.values())
        return {"safe": is_safe, "checks": checks}

    async def check_content_safety(self, text: str) -> dict:
        """Check for harmful content.

        Returns:
            ``{"passed": bool, "categories": {category: severity}}``;
            ``passed`` is True when every category's severity is below
            ``SEVERITY_THRESHOLD``.
        """
        # NOTE(review): confirm that ``analyze_text(text=...)`` and
        # ``result.categories`` match the installed Content Safety SDK.
        result = await self.safety.analyze_text(text=text)
        # Materialise once: the categories are walked twice below.
        categories = list(result.categories)
        return {
            "passed": all(
                cat.severity < self.SEVERITY_THRESHOLD for cat in categories
            ),
            "categories": {cat.category: cat.severity for cat in categories},
        }

    def check_prompt_injection(self, text: str) -> dict:
        """Detect potential prompt injection attempts.

        Returns:
            ``{"passed": True}`` when no pattern matches, otherwise
            ``{"passed": False, "reason": ...}`` naming the first pattern
            that matched.
        """
        lowered = text.lower()  # lower-case once, not once per pattern
        for compiled in self._INJECTION_PATTERNS:
            if compiled.search(lowered):
                return {
                    "passed": False,
                    "reason": f"Matched pattern: {compiled.pattern}",
                }
        return {"passed": True}

    def check_pii(self, text: str) -> dict:
        """Detect PII in text.

        Returns:
            ``{"passed": bool, "detected": [...]}`` where ``detected`` lists
            the PII kinds ("ssn", "credit_card", "email") found in *text*.
        """
        detected = [
            pii_type
            for pii_type, compiled in self._PII_PATTERNS.items()
            if compiled.search(text)
        ]
        return {"passed": not detected, "detected": detected}

    async def check_output(self, response: str, context: str) -> dict:
        """Validate LLM output before returning to user.

        Returns:
            ``{"safe": bool, "checks": {...}}`` where ``safe`` is True only
            if every individual check reports ``"passed"`` as truthy.
        """
        checks = {
            "content_safety": await self.check_content_safety(response),
            "factuality": await self.check_factuality(response, context),
            "relevance": await self.check_relevance(response, context),
        }
        is_safe = all(check["passed"] for check in checks.values())
        return {"safe": is_safe, "checks": checks}

    async def check_factuality(self, response: str, context: str) -> dict:
        """Check if response is grounded in context.

        The model is asked for a JSON verdict. The returned dict always
        contains a boolean ``"passed"`` key, which ``check_output`` relies
        on; any other keys the model returned are kept as-is.

        Returns:
            The model's verdict dict with ``"passed"`` normalised to a bool.
            If the reply is not a JSON object, or ``"passed"`` is not
            literally ``true``, the check fails closed.
        """
        result = await self.openai.chat.completions.create(
            model=self.FACTUALITY_MODEL,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "Check if the response is fully supported by the "
                        "context. Return JSON with a boolean field "
                        '"passed" and a string field "reason".'
                    ),
                },
                {
                    "role": "user",
                    "content": f"Context: {context}\n\nResponse: {response}",
                },
            ],
            response_format={"type": "json_object"},
        )
        raw = result.choices[0].message.content
        try:
            verdict = json.loads(raw)
        except (TypeError, ValueError):
            # TypeError: content was None; ValueError: malformed JSON.
            return {
                "passed": False,
                "reason": "Factuality check returned invalid JSON",
            }
        if not isinstance(verdict, dict):
            return {
                "passed": False,
                "reason": "Factuality check returned a non-object JSON value",
            }
        # Only a literal JSON ``true`` counts as a pass.
        verdict["passed"] = verdict.get("passed") is True
        return verdict
Comprehensive guardrails are essential for responsible AI deployment.

## Takeaways

Add a concise, personal takeaway and recommended next steps here.