Back to Blog
2 min read

Data Quality Automation with AI: Beyond Rule-Based Validation

AI enables data quality checks that go beyond static rules to detect anomalies and semantic issues.

AI-Powered Data Quality

from azure.ai.openai import AzureOpenAI
import pandas as pd
from sklearn.ensemble import IsolationForest

class AIDataQuality:
    """Data-quality checks that combine an LLM client with an IsolationForest.

    The LLM-backed methods ``await`` ``openai_client.chat.completions.create``,
    so the client passed in must return an awaitable from that call.
    NOTE(review): the annotation names ``AzureOpenAI``; confirm that the
    client actually supplied is an async-capable one.
    """

    def __init__(self, openai_client: "AzureOpenAI"):
        """Store the chat client and create the anomaly detector.

        Args:
            openai_client: Client exposing ``chat.completions.create``.
        """
        self.openai = openai_client
        self.anomaly_detector = IsolationForest(contamination=0.01)

    async def validate_semantic(self, df: pd.DataFrame, column: str, expected_type: str) -> dict:
        """Ask the model whether a sample of ``column`` looks like ``expected_type``.

        Args:
            df: Frame holding the column to inspect.
            column: Name of the column to sample.
            expected_type: Free-text description of what the values should be.

        Returns:
            The model's JSON reply decoded into a dict. The prompt asks for
            ``valid_percentage``, ``invalid_examples`` and ``issues``.

        Raises:
            json.JSONDecodeError: If the reply is not valid JSON.
        """
        # Local import: the file's import block does not bring in ``json``.
        import json

        # Size the sample from the non-null values. Using ``len(df)`` asks for
        # more rows than exist whenever the column contains nulls, which makes
        # ``Series.sample`` raise ValueError.
        non_null = df[column].dropna()
        sample = non_null.sample(min(100, len(non_null))).tolist()

        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "system",
                "content": f"""Analyze if these values are valid {expected_type}.
                Return JSON with:
                - valid_percentage: float
                - invalid_examples: list of invalid values
                - issues: list of detected problems"""
            }, {
                "role": "user",
                "content": str(sample)
            }],
            response_format={"type": "json_object"}
        )
        return json.loads(response.choices[0].message.content)

    def detect_anomalies(self, df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
        """Flag rows the detector labels as outliers (prediction ``-1``).

        Missing values in ``columns`` are replaced with 0 before fitting.

        Side effect: a boolean ``is_anomaly`` column is written onto ``df``.

        Args:
            df: Frame to score; gains an ``is_anomaly`` column.
            columns: Columns fed to the detector.

        Returns:
            The rows of ``df`` whose ``is_anomaly`` value is True.
        """
        numeric_data = df[columns].fillna(0)

        # Nothing to fit on: mark every row as normal instead of handing an
        # empty matrix to the detector.
        if numeric_data.empty:
            df['is_anomaly'] = pd.Series(False, index=df.index, dtype=bool)
            return df[df['is_anomaly']]

        predictions = self.anomaly_detector.fit_predict(numeric_data)
        df['is_anomaly'] = predictions == -1
        return df[df['is_anomaly']]

    async def check_consistency(self, df: pd.DataFrame, rules: list[str]) -> list[dict]:
        """Ask the model to check the first 100 rows of ``df`` against ``rules``.

        Args:
            df: Frame to check; only ``df.head(100)`` is sent.
            rules: Rule descriptions interpolated into the system prompt.

        Returns:
            Violation records as produced by ``parse_violations``.
        """
        sample = df.head(100).to_dict('records')

        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "system",
                "content": f"""Check data consistency against these rules: {rules}
                Identify violations and explain why they're problematic."""
            }, {
                "role": "user",
                "content": str(sample)
            }]
        )
        return self.parse_violations(response)

    async def suggest_fixes(self, invalid_records: list[dict]) -> list[dict]:
        """Ask the model to propose corrections for ``invalid_records``.

        Args:
            invalid_records: Records interpolated into the user prompt.

        Returns:
            Suggestion records as produced by ``parse_suggestions``.
        """
        response = await self.openai.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": f"Suggest corrections for these records:\n{invalid_records}"
            }]
        )
        return self.parse_suggestions(response)

    def parse_violations(self, response) -> list[dict]:
        """Turn a consistency-check reply into a list of violation dicts."""
        return self._parse_records(response, "violations")

    def parse_suggestions(self, response) -> list[dict]:
        """Turn a fix-suggestion reply into a list of suggestion dicts."""
        return self._parse_records(response, "suggestions")

    @staticmethod
    def _parse_records(response, key: str) -> list[dict]:
        """Best-effort conversion of a chat reply into ``list[dict]``.

        The prompts that feed this helper do not pin an output schema, so the
        reply may be JSON or free text. Handling:

        - empty reply -> ``[]``
        - JSON list -> its items (non-dict items wrapped as ``{"value": item}``)
        - JSON object with ``key`` -> the value under ``key``, as above
        - any other JSON value -> wrapped in a one-element list
        - non-JSON text -> ``[{"raw_response": text}]``
        """
        # Local import: the file's import block does not bring in ``json``.
        import json

        text = (response.choices[0].message.content or "").strip()
        if not text:
            return []

        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            # Free-text answer: keep it rather than dropping the information.
            return [{"raw_response": text}]

        if isinstance(parsed, dict) and key in parsed:
            parsed = parsed[key]
        if not isinstance(parsed, list):
            parsed = [parsed]
        return [item if isinstance(item, dict) else {"value": item} for item in parsed]

AI-powered data quality checks catch issues that traditional rule-based validation misses.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.