JSON Schema Enforcement in LLM Applications
Enforcing JSON schemas in LLM outputs is crucial for building reliable applications. Let’s explore how to define, validate, and enforce schemas effectively.
Understanding JSON Schema Basics
# A comprehensive JSON Schema example
entity_schema = {
"$schema": "https://json-schema.org/draft/2020-12/schema",
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "Entity name",
"minLength": 1,
"maxLength": 200
},
"type": {
"type": "string",
"enum": ["person", "organization", "location", "event"]
},
"confidence": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"attributes": {
"type": "object",
"additionalProperties": {"type": "string"}
},
"relationships": {
"type": "array",
"items": {
"type": "object",
"properties": {
"target": {"type": "string"},
"relation": {"type": "string"}
},
"required": ["target", "relation"]
}
}
},
"required": ["name", "type", "confidence"],
"additionalProperties": False
}
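For orientation, here is a made-up document that satisfies the schema above (example_entity is purely illustrative):
# An example instance that conforms to entity_schema
example_entity = {
    "name": "Ada Lovelace",
    "type": "person",
    "confidence": 0.97,
    "attributes": {"era": "19th century"},
    "relationships": [
        {"target": "Charles Babbage", "relation": "collaborated_with"}
    ]
}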
OpenAI Strict Mode
import json

from openai import OpenAI

client = OpenAI()

def extract_with_strict_schema(text: str, schema: dict) -> dict:
    """
    Use OpenAI's strict mode for guaranteed schema compliance.

    Note that strict mode accepts only a subset of JSON Schema: every property
    must appear in "required" and every object must set "additionalProperties"
    to false.
    """
    response = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[
            {
                "role": "system",
                "content": "Extract entities from the text according to the schema."
            },
            {
                "role": "user",
                "content": text
            }
        ],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "entity_extraction",
                "strict": True,  # Enforces schema compliance
                "schema": schema
            }
        }
    )
    return json.loads(response.choices[0].message.content)
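To make that subset concrete, here is a pared-down variant of entity_schema that satisfies the strict-mode rules (every property required, additionalProperties false), followed by a hypothetical call; both the schema name and the sample sentence are made up:
strict_person_schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "type": {"type": "string", "enum": ["person", "organization", "location", "event"]},
        "confidence": {"type": "number"}
    },
    "required": ["name", "type", "confidence"],
    "additionalProperties": False
}

result = extract_with_strict_schema("Ada Lovelace met Charles Babbage in London.", strict_person_schema)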
Schema Design Patterns
Pattern 1: Discriminated Unions
# Handle different types with a discriminator field
action_schema = {
"type": "object",
"properties": {
"action_type": {
"type": "string",
"enum": ["create", "update", "delete"]
},
"target": {"type": "string"},
"payload": {"type": "object"}
},
"required": ["action_type", "target"],
"allOf": [
{
"if": {
"properties": {"action_type": {"const": "create"}}
},
"then": {
"properties": {
"payload": {
"type": "object",
"required": ["name", "data"]
}
},
"required": ["payload"]
}
},
{
"if": {
"properties": {"action_type": {"const": "delete"}}
},
"then": {
"properties": {
"confirm": {"type": "boolean"}
},
"required": ["confirm"]
}
}
],
"additionalProperties": False
}
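A quick check with the jsonschema package (used again in the Validation Strategies section below) shows the conditional requirement firing; the sample action is made up:
from jsonschema import Draft202012Validator

action_validator = Draft202012Validator(action_schema)
# A delete without "confirm" trips the if/then branch
errors = list(action_validator.iter_errors({"action_type": "delete", "target": "user-42"}))
print([e.message for e in errors])  # one error: 'confirm' is a required property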
Pattern 2: Recursive Structures
# Schema for tree-like structures
tree_node_schema = {
"type": "object",
"properties": {
"id": {"type": "string"},
"value": {"type": "string"},
"children": {
"type": "array",
"items": {"$ref": "#"} # Recursive reference
}
},
"required": ["id", "value"],
"additionalProperties": False
}
# Not every provider accepts recursive $ref; flattening to a fixed depth is a portable fallback
def flatten_tree_schema(max_depth: int = 3) -> dict:
    """Generate a non-recursive schema with fixed depth"""
    def build_level(depth: int) -> dict:
        if depth == 0:
            return {
                "type": "object",
                "properties": {
                    "id": {"type": "string"},
                    "value": {"type": "string"}
                },
                "required": ["id", "value"],
                "additionalProperties": False
            }
        return {
            "type": "object",
            "properties": {
                "id": {"type": "string"},
                "value": {"type": "string"},
                "children": {
                    "type": "array",
                    "items": build_level(depth - 1)
                }
            },
            "required": ["id", "value"],
            "additionalProperties": False
        }
    return build_level(max_depth)
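For example, a tree built under flatten_tree_schema(2) may nest children two levels below the root but no deeper (illustrative data):
# Conforms to flatten_tree_schema(2): root -> level 1 -> level 2 (leaves only)
tree = {
    "id": "root",
    "value": "animals",
    "children": [
        {
            "id": "n1",
            "value": "mammals",
            "children": [{"id": "n2", "value": "cats"}]
        }
    ]
}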
Pattern 3: Optional Fields with Defaults
from typing import Optional, List
from pydantic import BaseModel, Field
class ConfiguredExtraction(BaseModel):
    """Schema with sensible defaults"""
    primary_entity: str = Field(
        description="The main entity being discussed"
    )
    secondary_entities: List[str] = Field(
        default_factory=list,
        description="Other entities mentioned"
    )
    sentiment: str = Field(
        default="neutral",
        description="Overall sentiment"
    )
    confidence: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Confidence in extraction"
    )
    metadata: Optional[dict] = Field(
        default=None,
        description="Additional context if available"
    )
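Assuming Pydantic v2, parsing a minimal, hypothetical model response shows the defaults filling in whatever the LLM omitted:
raw_output = '{"primary_entity": "Ada Lovelace"}'  # only the required field is present

extraction = ConfiguredExtraction.model_validate_json(raw_output)
print(extraction.sentiment)           # "neutral" (default)
print(extraction.secondary_entities)  # [] (default_factory)
print(extraction.confidence)          # 0.5 (default)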
Validation Strategies
import copy

from jsonschema import Draft202012Validator

class SchemaValidator:
    """Validate LLM outputs against JSON schemas"""

    def __init__(self, schema: dict):
        self.schema = schema
        self.validator = Draft202012Validator(schema)

    def validate(self, data: dict) -> tuple[bool, list]:
        """
        Validate data and return (is_valid, errors)
        """
        errors = list(self.validator.iter_errors(data))
        return len(errors) == 0, [str(e) for e in errors]

    def validate_and_fix(self, data: dict) -> dict:
        """
        Attempt to fix common validation issues
        """
        is_valid, errors = self.validate(data)
        if is_valid:
            return data
        fixed_data = copy.deepcopy(data)  # deep copy so the caller's data is untouched
        for error in self.validator.iter_errors(data):
            path = list(error.path)
            if not path:
                continue  # errors reported at the document root can't be coerced
            # Handle type coercion
            if error.validator == "type":
                expected_type = error.schema.get("type")
                current_value = self._get_nested(fixed_data, path)
                if expected_type == "string":
                    self._set_nested(fixed_data, path, str(current_value))
                elif expected_type == "integer":
                    self._set_nested(fixed_data, path, int(float(current_value)))
                elif expected_type == "number":
                    self._set_nested(fixed_data, path, float(current_value))
        return fixed_data

    def _get_nested(self, data: dict, path: list):
        for key in path:
            data = data[key]
        return data

    def _set_nested(self, data: dict, path: list, value):
        for key in path[:-1]:
            data = data[key]
        data[path[-1]] = value
# Usage: check a (hypothetical) extraction result against entity_schema
extracted_data = {"name": "Acme Corp", "type": "organization", "confidence": 0.92}
validator = SchemaValidator(entity_schema)
is_valid, errors = validator.validate(extracted_data)
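validate_and_fix earns its keep when the model returns numbers as strings, a common failure mode; a small sketch with made-up data:
messy = {"name": "Acme Corp", "type": "organization", "confidence": "0.92"}
repaired = validator.validate_and_fix(messy)
print(repaired["confidence"])  # 0.92, coerced from string to float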
Handling Schema Evolution
from typing import Any, Callable, Dict

class SchemaVersionManager:
    """Manage schema versions for backward compatibility"""

    def __init__(self):
        self.schemas: Dict[str, Dict[str, Any]] = {}
        self.migrations: Dict[str, Callable[[dict], dict]] = {}

    def register_schema(self, version: str, schema: dict):
        self.schemas[version] = schema

    def register_migration(self, from_version: str, to_version: str,
                           migration_fn: Callable[[dict], dict]):
        self.migrations[f"{from_version}->{to_version}"] = migration_fn

    def migrate(self, data: dict, from_version: str, to_version: str) -> dict:
        """Migrate data between schema versions"""
        migration_key = f"{from_version}->{to_version}"
        if migration_key in self.migrations:
            return self.migrations[migration_key](data)
        raise ValueError(f"No migration path from {from_version} to {to_version}")
# Example usage
manager = SchemaVersionManager()
# Register schemas
manager.register_schema("v1", {"properties": {"name": {"type": "string"}}})
manager.register_schema("v2", {
"properties": {
"full_name": {"type": "string"},
"created_at": {"type": "string"}
}
})
# Register migration
def migrate_v1_to_v2(data):
    return {
        "full_name": data.get("name", ""),
        "created_at": "2024-09-14T00:00:00Z"
    }
manager.register_migration("v1", "v2", migrate_v1_to_v2)
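With the migration registered, upgrading a toy v1 record is a single call:
legacy_record = {"name": "Ada Lovelace"}  # v1-shaped data
upgraded = manager.migrate(legacy_record, "v1", "v2")
print(upgraded)  # {'full_name': 'Ada Lovelace', 'created_at': '2024-09-14T00:00:00Z'}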
Best Practices
- Always use strict mode when available for guaranteed compliance
- Design schemas defensively with clear constraints
- Include descriptions for every field to guide the LLM
- Handle missing fields gracefully with defaults
- Version your schemas for long-term maintenance
- Validate both inputs and outputs rather than trusting either side of the pipeline (see the end-to-end sketch below)
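Putting the pieces together, a minimal end-to-end flow (reusing extract_with_strict_schema and SchemaValidator from above, with a schema of your choosing) might look like this:
def extract_entities(text: str, schema: dict) -> dict:
    """Extract, validate, and repair in a single pass."""
    raw = extract_with_strict_schema(text, schema)
    validator = SchemaValidator(schema)
    is_valid, errors = validator.validate(raw)
    if not is_valid:
        # Best-effort coercion before giving up
        raw = validator.validate_and_fix(raw)
        is_valid, errors = validator.validate(raw)
    if not is_valid:
        raise ValueError(f"LLM output failed schema validation: {errors}")
    return raw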
JSON schema enforcement transforms unpredictable LLM outputs into reliable, type-safe data that your applications can trust.