Implementing Guardrails for Production LLM Applications
Production LLM applications require robust guardrails to prevent harmful outputs, detect prompt injection, and ensure responses stay within acceptable bounds. Azure AI provides built-in safety features that can be extended with custom rules.
The Need for Guardrails
LLMs can generate harmful content, leak sensitive information, or be manipulated through prompt injection. A layered defense, screening inputs before they reach the model and validating outputs before they reach the user, protects both users and the organization.
Azure AI Content Safety
Integrate Azure’s content filtering with custom rules:
import re
from dataclasses import dataclass
from enum import Enum
from typing import Optional

from azure.ai.contentsafety import ContentSafetyClient
from azure.ai.contentsafety.models import AnalyzeTextOptions
from azure.identity import DefaultAzureCredential

class SafetyLevel(Enum):
    SAFE = "safe"
    WARNING = "warning"
    BLOCKED = "blocked"

@dataclass
class SafetyResult:
    level: SafetyLevel
    categories: dict
    blocked_reason: Optional[str] = None

class ContentGuardrails:
    def __init__(self, endpoint: str):
        self.safety_client = ContentSafetyClient(
            endpoint=endpoint,
            credential=DefaultAzureCredential()
        )
        self.custom_blocklist = set()
        self.pii_patterns = []

    def analyze_input(self, text: str) -> SafetyResult:
        """Analyze user input for safety issues."""
        # Azure Content Safety check
        response = self.safety_client.analyze_text(
            AnalyzeTextOptions(text=text)
        )

        categories = {
            "hate": response.hate_result.severity if response.hate_result else 0,
            "self_harm": response.self_harm_result.severity if response.self_harm_result else 0,
            "sexual": response.sexual_result.severity if response.sexual_result else 0,
            "violence": response.violence_result.severity if response.violence_result else 0
        }

        # Check severity thresholds
        if any(severity >= 4 for severity in categories.values()):
            return SafetyResult(
                level=SafetyLevel.BLOCKED,
                categories=categories,
                blocked_reason="Content safety violation"
            )

        # Check for prompt injection patterns
        if self.detect_prompt_injection(text):
            return SafetyResult(
                level=SafetyLevel.BLOCKED,
                categories=categories,
                blocked_reason="Potential prompt injection detected"
            )

        # Check custom blocklist
        if self.contains_blocked_terms(text):
            return SafetyResult(
                level=SafetyLevel.WARNING,
                categories=categories,
                blocked_reason="Contains restricted terms"
            )

        return SafetyResult(level=SafetyLevel.SAFE, categories=categories)

    def detect_prompt_injection(self, text: str) -> bool:
        """Detect common prompt injection patterns."""
        injection_patterns = [
            r"ignore (?:all )?(?:previous|above|prior) instructions",
            r"disregard (?:all )?(?:previous|above|prior)",
            r"you are now",
            r"new instructions:",
            r"system prompt:",
            r"\[INST\]",
            r"<\|im_start\|>",
            r"### (?:Human|Assistant|System):"
        ]
        for pattern in injection_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True
        return False

    def validate_output(self, output: str, context: dict) -> SafetyResult:
        """Validate LLM output before returning to user."""
        # Check for PII leakage
        if self.contains_pii(output):
            return SafetyResult(
                level=SafetyLevel.WARNING,
                categories={},
                blocked_reason="Output may contain PII"
            )

        # Check for confidential data patterns
        if self.contains_sensitive_data(output, context.get("sensitivity_rules", [])):
            return SafetyResult(
                level=SafetyLevel.BLOCKED,
                categories={},
                blocked_reason="Output contains sensitive data"
            )

        # Run content safety on output
        return self.analyze_input(output)
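
The class above relies on three helpers that are not shown: contains_blocked_terms, contains_pii, and contains_sensitive_data. A minimal sketch of how they might look, sitting inside ContentGuardrails and using the custom_blocklist and pii_patterns attributes from __init__ (the regexes here are illustrative assumptions, not a complete PII detector; a production system would typically back this with a dedicated service such as Azure AI Language PII detection):

    def contains_blocked_terms(self, text: str) -> bool:
        """Case-insensitive substring match against the custom blocklist."""
        text_lower = text.lower()
        return any(term.lower() in text_lower for term in self.custom_blocklist)

    def contains_pii(self, text: str) -> bool:
        """Check for common PII shapes; extend self.pii_patterns with your own regexes."""
        default_patterns = [
            r"\b\d{3}-\d{2}-\d{4}\b",          # US SSN-like pattern (illustrative)
            r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b",   # email address
            r"\b(?:\d[ -]?){13,16}\b",         # long digit runs resembling card numbers
        ]
        return any(
            re.search(pattern, text)
            for pattern in default_patterns + self.pii_patterns
        )

    def contains_sensitive_data(self, text: str, rules: list) -> bool:
        """Apply caller-supplied sensitivity rules (regex strings) to the output."""
        return any(re.search(rule, text, re.IGNORECASE) for rule in rules)

The matching logic is deliberately simple; tune the blocklist, patterns, and sensitivity rules to your own data-handling requirements.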
Implementing Output Constraints
Enforce response boundaries and format requirements:
class OutputConstraints:
    def __init__(self, max_length: int = 2000):
        self.max_length = max_length
        self.required_disclaimers = []

    def enforce_constraints(self, output: str, topic: str) -> str:
        """Apply output constraints and add required elements."""
        # Truncate if too long
        if len(output) > self.max_length:
            output = output[:self.max_length] + "..."

        # Add topic-specific disclaimers
        disclaimers = self.get_disclaimers(topic)
        if disclaimers:
            output += f"\n\n{disclaimers}"

        return output

    def get_disclaimers(self, topic: str) -> str:
        """Get required disclaimers for specific topics."""
        topic_disclaimers = {
            "medical": "This is for informational purposes only and not medical advice.",
            "financial": "This is not financial advice. Consult a professional.",
            "legal": "This is not legal advice. Consult an attorney."
        }
        return topic_disclaimers.get(topic, "")
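
To see how the pieces fit together, here is a rough request flow following the layered approach described above. The generate_response function and the endpoint URL are placeholders for your own model call and Content Safety resource, not part of any SDK:

guardrails = ContentGuardrails(endpoint="https://<your-content-safety-resource>.cognitiveservices.azure.com")
constraints = OutputConstraints(max_length=2000)

def handle_request(user_prompt: str, topic: str) -> str:
    # Layer 1: screen the input before it reaches the model
    input_check = guardrails.analyze_input(user_prompt)
    if input_check.level == SafetyLevel.BLOCKED:
        return "Sorry, I can't help with that request."

    # Layer 2: call the model (generate_response is a placeholder for your LLM call)
    raw_output = generate_response(user_prompt)

    # Layer 3: validate the output before it reaches the user
    output_check = guardrails.validate_output(raw_output, context={"sensitivity_rules": []})
    if output_check.level == SafetyLevel.BLOCKED:
        return "The response was withheld by our safety checks."
    # WARNING-level results might be logged for review rather than blocked

    # Layer 4: enforce length limits and add any required disclaimers
    return constraints.enforce_constraints(raw_output, topic)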
Guardrails are essential for production LLM applications. Combining Azure AI Content Safety with custom rules creates a robust defense against misuse while maintaining a positive user experience.