Guardrails for LLMs: Building Safe AI Applications

Production AI applications need guardrails to ensure safety, accuracy, and appropriate behavior. In practice that means validating user input before it reaches the model and validating the model's output before it reaches the user, which is the pattern the pipeline below implements.

LLM Guardrails Implementation

import json
import re

from openai import AsyncAzureOpenAI
from azure.ai.contentsafety.aio import ContentSafetyClient
from azure.ai.contentsafety.models import AnalyzeTextOptions

class GuardrailPipeline:
    def __init__(self, openai_client: AsyncAzureOpenAI, safety_client: ContentSafetyClient):
        self.openai = openai_client
        self.safety = safety_client

    async def check_input(self, user_input: str) -> dict:
        """Validate user input before processing."""
        checks = {
            "content_safety": await self.check_content_safety(user_input),
            "prompt_injection": self.check_prompt_injection(user_input),
            "pii_detected": self.check_pii(user_input),
            "allowed_topic": await self.check_topic(user_input)
        }

        is_safe = all(c["passed"] for c in checks.values())
        return {"safe": is_safe, "checks": checks}

    async def check_content_safety(self, text: str) -> dict:
        """Check for harmful content."""
        result = await self.safety.analyze_text(AnalyzeTextOptions(text=text))
        return {
            "passed": all(cat.severity < 2 for cat in result.categories),
            "categories": {cat.category: cat.severity for cat in result.categories}
        }

    def check_prompt_injection(self, text: str) -> dict:
        """Detect potential prompt injection attempts."""
        injection_patterns = [
            r"ignore (?:previous|above|all) instructions",
            r"you are now",
            r"new instructions:",
            r"system prompt:",
            r"<\|.*\|>",  # Special tokens
        ]

        for pattern in injection_patterns:
            if re.search(pattern, text.lower()):
                return {"passed": False, "reason": f"Matched pattern: {pattern}"}

        return {"passed": True}

    def check_pii(self, text: str) -> dict:
        """Detect PII in text."""
        pii_patterns = {
            "ssn": r"\b\d{3}-\d{2}-\d{4}\b",
            "credit_card": r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b",
            "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
        }

        detected = []
        for pii_type, pattern in pii_patterns.items():
            if re.search(pattern, text):
                detected.append(pii_type)

        return {"passed": len(detected) == 0, "detected": detected}

    async def check_output(self, response: str, context: str) -> dict:
        """Validate LLM output before returning to user."""
        checks = {
            "content_safety": await self.check_content_safety(response),
            "factuality": await self.check_factuality(response, context),
            "relevance": await self.check_relevance(response, context)
        }

        is_safe = all(c["passed"] for c in checks.values())
        return {"safe": is_safe, "checks": checks}

    async def check_factuality(self, response: str, context: str) -> dict:
        """Check if response is grounded in context."""
        result = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "system",
                "content": "Check if the response is fully supported by the context. Return JSON."
            }, {
                "role": "user",
                "content": f"Context: {context}\n\nResponse: {response}"
            }],
            response_format={"type": "json_object"}
        )
        return json.loads(result.choices[0].message.content)
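
The pipeline also calls two helpers that are not shown above: check_topic and check_relevance. The sketch below shows one way they could be added to GuardrailPipeline, reusing the same LLM-as-judge pattern as check_factuality; the judge prompts, the allowed topics, and the returned JSON shape are illustrative assumptions rather than a fixed API.

    # Illustrative additions to GuardrailPipeline for the two helpers
    # referenced by check_input and check_output.
    async def check_topic(self, user_input: str) -> dict:
        """Ask the model whether the input stays within allowed topics."""
        result = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "system",
                "content": "Decide whether the user message is about data, analytics, or cloud engineering. Return JSON with a boolean 'passed' field and a brief 'reason'."
            }, {
                "role": "user",
                "content": user_input
            }],
            response_format={"type": "json_object"}
        )
        return json.loads(result.choices[0].message.content)

    async def check_relevance(self, response: str, context: str) -> dict:
        """Ask the model whether the response actually addresses the given context."""
        result = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "system",
                "content": "Judge whether the response is relevant to the provided context. Return JSON with a boolean 'passed' field and a brief 'reason'."
            }, {
                "role": "user",
                "content": f"Context: {context}\n\nResponse: {response}"
            }],
            response_format={"type": "json_object"}
        )
        return json.loads(result.choices[0].message.content)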

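Wiring the pipeline into a request flow might look like the sketch below. The endpoints, keys, deployment name, and sample context are placeholders, and a production version would add error handling and close both clients when finished.

import asyncio
from azure.core.credentials import AzureKeyCredential

async def main():
    # Placeholder endpoints and keys; substitute your own resources.
    openai_client = AsyncAzureOpenAI(
        azure_endpoint="https://<your-openai-resource>.openai.azure.com",
        api_key="<openai-key>",
        api_version="2024-06-01",
    )
    safety_client = ContentSafetyClient(
        endpoint="https://<your-content-safety-resource>.cognitiveservices.azure.com",
        credential=AzureKeyCredential("<content-safety-key>"),
    )
    pipeline = GuardrailPipeline(openai_client, safety_client)

    user_input = "Summarise Q3 revenue from the report."
    input_result = await pipeline.check_input(user_input)
    if not input_result["safe"]:
        print("Blocked at input:", input_result["checks"])
        return

    # Placeholder retrieved context; in practice this comes from your retrieval step.
    context = "Q3 revenue was $4.2M, up 12% quarter over quarter."
    completion = await openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": f"Answer using only this context: {context}"},
            {"role": "user", "content": user_input},
        ],
    )
    draft = completion.choices[0].message.content

    output_result = await pipeline.check_output(draft, context)
    print(draft if output_result["safe"] else "Response withheld by guardrails.")

asyncio.run(main())

Because the individual checks are independent, a natural refinement is to run them concurrently with asyncio.gather instead of awaiting each one in sequence.
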
Comprehensive guardrails are essential for responsible AI deployment.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.