Skip to content
Back to Blog
1 min read

Structured Outputs: Getting Reliable Data from LLMs

I wrote “Structured Outputs: Getting Reliable Data from LLMs” to share practical, production-minded guidance on this topic.

Structured Output Patterns

from azure.ai.openai import AzureOpenAI
from pydantic import BaseModel, Field
from typing import List, Optional
import json

class ExtractedEntity(BaseModel):
    """One entity pulled out of the input text.

    Attributes:
        name: The entity's name.
        type: Free-form category label (the field description suggests
            person, org, location, ...); any string is accepted.
        confidence: Score constrained to the closed interval [0, 1].
    """

    name: str = Field(..., description="Entity name")
    type: str = Field(..., description="Entity type (person, org, location, etc)")
    confidence: float = Field(..., description="Confidence score", ge=0, le=1)

class ExtractionResult(BaseModel):
    """Top-level payload produced by a single extraction pass.

    Attributes:
        entities: The entities found in the text.
        summary: Summary string.
        sentiment: Sentiment label. The field description asks for positive,
            negative, or neutral, but only ``str`` is enforced.
    """

    entities: list[ExtractedEntity]
    summary: str
    sentiment: str = Field(..., description="positive, negative, or neutral")

class StructuredOutputAgent:
    """Extract schema-validated data from text through a chat-completions client.

    Three strategies are offered: JSON mode with the schema in the prompt,
    forced function calling, and JSON mode with validation-driven retries.

    NOTE(review): every request is awaited, so the client must expose an
    awaitable ``chat.completions.create`` (e.g. ``openai.AsyncAzureOpenAI``).
    The annotated ``AzureOpenAI`` appears to be the synchronous client --
    confirm which client callers actually pass.
    """

    def __init__(self, openai_client: AzureOpenAI, model: str = "gpt-4o"):
        """Store the client and the model name sent with every request.

        Args:
            openai_client: Client used for all chat-completion calls.
            model: Model (or deployment) name; defaults to ``"gpt-4o"``.
        """
        self.openai = openai_client
        self.model = model

    async def extract_with_schema(
        self,
        text: str,
        schema: type[BaseModel],
        error_context: Optional[str] = None,
    ) -> BaseModel:
        """Extract structured data using JSON mode.

        Args:
            text: The text to extract information from.
            schema: Pydantic model class describing the expected structure.
            error_context: Optional description of why a previous attempt was
                rejected; when given it is passed to the model as feedback.

        Returns:
            An instance of ``schema`` built from the model's JSON reply.

        Raises:
            ValueError: If the reply has no content, is not valid JSON
                (``json.JSONDecodeError``), or fails schema validation.
        """
        # Serialize with json.dumps so the prompt carries real JSON Schema
        # rather than a Python dict repr (single quotes, True/None).
        schema_json = json.dumps(schema.model_json_schema())
        messages = [{
            "role": "system",
            "content": f"""Extract information according to this schema:
                {schema_json}
                Return valid JSON only."""
        }]
        if error_context:
            messages.append({
                "role": "system",
                "content": (
                    "A previous attempt was rejected with this error; "
                    f"correct it in your answer: {error_context}"
                ),
            })
        messages.append({"role": "user", "content": text})

        response = await self.openai.chat.completions.create(
            model=self.model,
            messages=messages,
            response_format={"type": "json_object"},
        )

        content = response.choices[0].message.content
        if content is None:
            # json.loads(None) would raise an opaque TypeError instead.
            raise ValueError("Model returned no content to parse")
        return schema.model_validate(json.loads(content))

    async def extract_with_function(self, text: str, schema: type[BaseModel]) -> BaseModel:
        """Extract using function calling for better structure adherence.

        The schema is exposed as the parameters of a single ``extract_data``
        tool, and ``tool_choice`` forces the model to call it.

        Args:
            text: The text to extract information from.
            schema: Pydantic model class describing the expected structure.

        Returns:
            An instance of ``schema`` built from the tool-call arguments.

        Raises:
            ValueError: If no tool call is returned, the arguments are not
                valid JSON, or they fail schema validation.
        """
        response = await self.openai.chat.completions.create(
            model=self.model,
            messages=[{
                "role": "user",
                "content": f"Extract information from: {text}"
            }],
            tools=[{
                "type": "function",
                "function": {
                    "name": "extract_data",
                    "description": "Extract structured data",
                    "parameters": schema.model_json_schema()
                }
            }],
            tool_choice={"type": "function", "function": {"name": "extract_data"}}
        )

        tool_calls = response.choices[0].message.tool_calls
        if not tool_calls:
            raise ValueError("Model did not return a tool call")
        args = json.loads(tool_calls[0].function.arguments)
        return schema.model_validate(args)

    async def extract_with_retry(self, text: str, schema: type[BaseModel], max_retries: int = 3) -> BaseModel:
        """Extract with validation and retry.

        Each failed attempt's error message is fed into the next attempt so
        the model can correct its output.

        Args:
            text: The text to extract information from.
            schema: Pydantic model class describing the expected structure.
            max_retries: Total number of attempts; must be at least 1.

        Returns:
            An instance of ``schema`` from the first attempt that validates.

        Raises:
            ValueError: If ``max_retries`` is below 1, or the last attempt's
                parse/validation error once all attempts are used up.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be at least 1")

        last_error = None
        for _ in range(max_retries):
            feedback = None if last_error is None else str(last_error)
            try:
                return await self.extract_with_schema(text, schema, error_context=feedback)
            except ValueError as exc:
                # Only parse/validation failures are retried:
                # json.JSONDecodeError and pydantic's ValidationError both
                # derive from ValueError. Anything else propagates at once.
                last_error = exc
        raise last_error

# Usage
# NOTE: `await` at module level only works in an async-aware context (for
# example a notebook or the asyncio REPL); in a plain script, place these lines
# inside an `async def` and run it with `asyncio.run(...)`.
# `agent` (presumably a StructuredOutputAgent) and `article_text` are not
# defined in this snippet and must be created beforehand.
result = await agent.extract_with_schema(article_text, ExtractionResult)
print(f"Found {len(result.entities)} entities, sentiment: {result.sentiment}")

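Structured outputs enable reliable integration of LLM capabilities into data pipelines.

## Takeaways

- Describe the target shape once as a Pydantic model, then reuse it both to prompt the model and to validate its reply.
- JSON mode guarantees syntactically valid JSON, not conformance to your schema; function calling with a forced tool choice adheres to the structure more closely.
- Always validate the response and retry on failure instead of trusting the raw output.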

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.