Back to Blog
2 min read

Structured Outputs: Getting Reliable Data from LLMs

Getting consistent, parseable output from LLMs is essential for production applications. Here’s how to achieve it.

Structured Output Patterns

from azure.ai.openai import AzureOpenAI
from pydantic import BaseModel, Field
from typing import List, Optional
import json

class ExtractedEntity(BaseModel):
    """A single named entity pulled out of the source text."""

    # Surface form of the entity as the model reported it.
    name: str = Field(description="Entity name")
    # Free-form category label (the prompt suggests person/org/location/etc).
    type: str = Field(description="Entity type (person, org, location, etc)")
    # Bounded to [0, 1] so downstream consumers can threshold on it.
    confidence: float = Field(description="Confidence score", ge=0, le=1)

class ExtractionResult(BaseModel):
    """Top-level payload produced by one extraction call."""

    # All entities found in the input text.
    entities: List[ExtractedEntity]
    # Short abstract of the input.
    summary: str
    # Overall tone; the description constrains the model's vocabulary.
    sentiment: str = Field(description="positive, negative, or neutral")

class StructuredOutputAgent:
    """Extracts schema-validated structured data from an LLM.

    Offers three strategies: JSON mode (``extract_with_schema``), forced
    function calling (``extract_with_function``), and JSON mode with
    validation-driven retries (``extract_with_retry``).

    NOTE(review): every method here ``await``s the client's calls, but the
    constructor annotation names the sync ``AzureOpenAI`` client — the
    injected client presumably must be the async variant; confirm against
    the caller.
    """

    def __init__(self, openai_client: AzureOpenAI):
        # Client used for all chat-completion calls; never created here.
        self.openai = openai_client

    async def extract_with_schema(self, text: str, schema: type[BaseModel]) -> BaseModel:
        """Extract structured data using JSON mode.

        Args:
            text: Source text to extract from.
            schema: Pydantic model describing the expected output shape.

        Returns:
            A validated instance of ``schema``.

        Raises:
            ValueError: If the model returns no content or invalid JSON
                (``json.JSONDecodeError`` subclasses ``ValueError``), or if
                the JSON fails validation (``pydantic.ValidationError``
                also subclasses ``ValueError``).
        """
        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "system",
                # Embedding the JSON Schema in the prompt steers JSON mode,
                # which only guarantees *syntactically* valid JSON.
                "content": f"""Extract information according to this schema:
                {schema.model_json_schema()}
                Return valid JSON only."""
            }, {
                "role": "user",
                "content": text
            }],
            response_format={"type": "json_object"}
        )

        content = response.choices[0].message.content
        if content is None:
            # The API can return a message with no content (e.g. a refusal);
            # json.loads(None) would raise an opaque TypeError otherwise.
            raise ValueError("Model returned no content to parse")
        data = json.loads(content)
        return schema.model_validate(data)

    async def extract_with_function(self, text: str, schema: type[BaseModel]) -> BaseModel:
        """Extract using function calling for better structure adherence.

        Args:
            text: Source text to extract from.
            schema: Pydantic model; its JSON Schema becomes the tool's
                parameter spec.

        Returns:
            A validated instance of ``schema``.

        Raises:
            ValueError: If no tool call comes back, the arguments are not
                valid JSON, or validation against ``schema`` fails.
        """
        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"Extract information from: {text}"
            }],
            tools=[{
                "type": "function",
                "function": {
                    "name": "extract_data",
                    "description": "Extract structured data",
                    "parameters": schema.model_json_schema()
                }
            }],
            # Force the model to call our tool rather than reply in prose.
            tool_choice={"type": "function", "function": {"name": "extract_data"}}
        )

        tool_calls = response.choices[0].message.tool_calls
        if not tool_calls:
            # Forced tool_choice should guarantee a call, but guard against
            # a None/empty list instead of raising a bare TypeError.
            raise ValueError("Model did not return the expected tool call")
        args = json.loads(tool_calls[0].function.arguments)
        return schema.model_validate(args)

    async def extract_with_retry(self, text: str, schema: type[BaseModel], max_retries: int = 3) -> BaseModel:
        """Extract with validation and retry.

        Retries only on parse/validation failures (``ValueError`` covers
        both ``json.JSONDecodeError`` and ``pydantic.ValidationError``);
        transport or programming errors propagate immediately instead of
        being swallowed by a blanket ``except Exception``.

        Args:
            text: Source text to extract from.
            schema: Pydantic model describing the expected output.
            max_retries: Total number of attempts (must be >= 1).

        Returns:
            A validated instance of ``schema``.

        Raises:
            ValueError: The last attempt's failure, or immediately if
                ``max_retries`` is less than 1.
        """
        for attempt in range(max_retries):
            try:
                return await self.extract_with_schema(text, schema)
            except ValueError:
                # Last attempt: surface the real failure to the caller.
                if attempt == max_retries - 1:
                    raise
        # Reached only when max_retries < 1 and the loop never ran.
        raise ValueError("Failed to extract valid structure")

# Usage
# NOTE: ``await`` is only valid inside an async function (or a REPL with
# top-level await enabled); run this from an async entry point, e.g. via
# asyncio.run(). ``agent`` and ``article_text`` are assumed to be a
# StructuredOutputAgent instance and an input string built earlier.
result = await agent.extract_with_schema(article_text, ExtractionResult)
print(f"Found {len(result.entities)} entities, sentiment: {result.sentiment}")

Structured outputs enable reliable integration of LLM capabilities into data pipelines.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.