October 22, 2023 · 1 min read

PII Detection and Protection in AI Applications

Tags: PII Detection, Privacy, Data Protection, Security, AI

Introduction

Personally Identifiable Information (PII) detection is critical for privacy-preserving AI applications. This post covers techniques for detecting, masking, and protecting PII in text data processed by AI systems.

PII Types and Patterns

from dataclasses import dataclass
from typing import List, Dict, Pattern
from enum import Enum
import re

class PIIType(Enum):
    """Categories of personally identifiable information.

    Each member's value is a snake_case label string; it is what the
    detectors and reporters in this file emit as the type name
    (``pii_type.value``).
    """
    # Types that have a regex in PIIPatternLibrary.get_patterns():
    # SSN, CREDIT_CARD, EMAIL, PHONE, DOB, PASSPORT, IP_ADDRESS.
    # The remaining members are declared but have no pattern there.
    SSN = "social_security_number"
    CREDIT_CARD = "credit_card"
    EMAIL = "email"
    PHONE = "phone_number"
    ADDRESS = "address"
    NAME = "person_name"
    DOB = "date_of_birth"
    PASSPORT = "passport_number"
    DRIVER_LICENSE = "driver_license"
    BANK_ACCOUNT = "bank_account"
    IP_ADDRESS = "ip_address"
    MEDICAL_RECORD = "medical_record"

@dataclass
class PIIPattern:
    """One detection rule: a regex plus the metadata describing it."""
    pii_type: PIIType  # category of PII this rule detects
    pattern: str  # regex source text (compiled later by the detector)
    description: str  # human-readable label for the rule
    confidence: float  # score attached to every match of this rule
    region: str = "US"  # region label for the rule; defaults to "US"

class PIIPatternLibrary:
    """Static catalogue of the regex rules used for PII detection."""

    @staticmethod
    def get_patterns() -> List[PIIPattern]:
        """Return a newly built list of the built-in detection rules.

        Returns:
            One ``PIIPattern`` per supported PII type (SSN, credit card,
            email, phone, date of birth, IPv4 address, passport). A new list
            is created on every call, so callers may mutate it freely.
        """
        return [
            PIIPattern(
                pii_type=PIIType.SSN,
                # 3-2-4 digits with optional "-" or whitespace separators.
                pattern=r"\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b",
                description="US Social Security Number",
                confidence=0.9,
                region="US"
            ),
            PIIPattern(
                pii_type=PIIType.CREDIT_CARD,
                # Four groups of four digits, optionally separated.
                pattern=r"\b(?:\d{4}[-\s]?){3}\d{4}\b",
                description="Credit Card Number (16 digits)",
                confidence=0.85
            ),
            PIIPattern(
                pii_type=PIIType.EMAIL,
                # The TLD class is [A-Za-z]; a "|" inside a character class is
                # a literal pipe, not alternation, so it must not appear here.
                pattern=r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
                description="Email Address",
                confidence=0.95
            ),
            PIIPattern(
                pii_type=PIIType.PHONE,
                # Starts with (?<!\w) rather than \b: a \b cannot sit between
                # whitespace and "(" or "+", which would drop the opening
                # parenthesis / "+1" prefix from the matched span.
                pattern=r"(?<!\w)(?:\+1[-\s]?)?\(?[2-9]\d{2}\)?[-\s]?\d{3}[-\s]?\d{4}\b",
                description="US Phone Number",
                confidence=0.8,
                region="US"
            ),
            PIIPattern(
                pii_type=PIIType.DOB,
                # MM/DD/YYYY or MM-DD-YYYY, years 1900-2099.
                pattern=r"\b(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12]\d|3[01])[/-](?:19|20)\d{2}\b",
                description="Date of Birth (MM/DD/YYYY)",
                confidence=0.7
            ),
            PIIPattern(
                pii_type=PIIType.IP_ADDRESS,
                # Four dot-separated octets, each limited to 0-255.
                pattern=r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
                description="IPv4 Address",
                confidence=0.9
            ),
            PIIPattern(
                pii_type=PIIType.PASSPORT,
                # One or two letters followed by 6-9 digits; deliberately
                # loose, hence the low confidence.
                pattern=r"\b[A-Z]{1,2}\d{6,9}\b",
                description="Passport Number",
                confidence=0.6
            )
        ]

PII Detector Implementation

@dataclass
class PIIMatch:
    """A single piece of PII found in a text."""
    pii_type: PIIType  # category of the detected value
    value: str  # the exact matched substring
    start: int  # index of the first matched character (regex match.start())
    end: int  # index just past the match (regex match.end())
    confidence: float  # confidence of the pattern that produced the match
    masked_value: str  # display-safe replacement for ``value``

class PIIDetector:
    """Regex-based PII detector with per-type validation and masking.

    Patterns come from ``PIIPatternLibrary.get_patterns()`` and are compiled
    once, case-insensitively, when the detector is constructed.
    """

    def __init__(self):
        self.patterns = PIIPatternLibrary.get_patterns()
        # Both lookups are keyed by PIIType, so only one pattern per type is
        # retained: if the library returned two rules for the same type, the
        # later one would win.
        self.compiled_patterns = {
            p.pii_type: re.compile(p.pattern, re.IGNORECASE)
            for p in self.patterns
        }
        self.pattern_confidence = {
            p.pii_type: p.confidence
            for p in self.patterns
        }

    def detect(self, text: str) -> List[PIIMatch]:
        """Find all PII in *text*.

        Args:
            text: Text to scan. Empty or ``None`` input yields no matches.

        Returns:
            Matches sorted by start offset, with overlapping spans resolved
            in favour of the higher-confidence match.
        """
        if not text:
            return []

        matches = []

        for pii_type, pattern in self.compiled_patterns.items():
            for match in pattern.finditer(text):
                value = match.group()

                # A regex hit is only a candidate; type-specific checks
                # (SSN ranges, Luhn checksum, ...) weed out false positives.
                if self._validate_match(pii_type, value):
                    matches.append(PIIMatch(
                        pii_type=pii_type,
                        value=value,
                        start=match.start(),
                        end=match.end(),
                        confidence=self.pattern_confidence[pii_type],
                        masked_value=self._mask_value(pii_type, value)
                    ))

        # _remove_overlaps compares each match with its predecessor, so the
        # list must be ordered by position first.
        matches.sort(key=lambda m: m.start)

        return self._remove_overlaps(matches)

    def _validate_match(self, pii_type: PIIType, value: str) -> bool:
        """Run the type-specific validator; types without one always pass."""
        if pii_type == PIIType.SSN:
            return self._validate_ssn(value)
        elif pii_type == PIIType.CREDIT_CARD:
            return self._validate_credit_card(value)
        elif pii_type == PIIType.EMAIL:
            return self._validate_email(value)
        return True

    def _validate_ssn(self, value: str) -> bool:
        """Return True if *value* could be an issued US SSN.

        Rejects anything that is not exactly nine digits, the never-assigned
        area numbers (000, 666, 900-999), group number 00 and serial
        number 0000.
        """
        digits = re.sub(r'\D', '', value)
        if len(digits) != 9:
            return False

        area = int(digits[:3])
        group = int(digits[3:5])
        serial = int(digits[5:])

        if area == 0 or area == 666 or area >= 900:
            return False
        # An all-zero group or serial is never issued either.
        if group == 0 or serial == 0:
            return False
        return True

    def _validate_credit_card(self, value: str) -> bool:
        """Return True if *value* has 16 digits and a valid Luhn checksum."""
        digits = re.sub(r'\D', '', value)
        if len(digits) != 16:
            return False

        # Luhn: walking from the rightmost digit, double every second digit
        # (subtracting 9 when the result exceeds 9) and sum everything.
        total = 0
        for i, digit in enumerate(reversed(digits)):
            d = int(digit)
            if i % 2 == 1:
                d *= 2
                if d > 9:
                    d -= 9
            total += d

        return total % 10 == 0

    def _validate_email(self, value: str) -> bool:
        """Reject addresses with consecutive dots or a leading/trailing dot."""
        if '..' in value:
            return False
        if value.startswith('.') or value.endswith('.'):
            return False
        return True

    def _mask_value(self, pii_type: PIIType, value: str) -> str:
        """Return a display-safe version of *value*.

        SSN, credit card and phone keep only their last four characters or
        digits; email keeps its first character and the domain; date of birth
        and IP address are fully starred. Any other type becomes ``"***"``.
        """
        masking_strategies = {
            # The conditional applies to the whole concatenation:
            # ("***-**-" + last four) if long enough, else "***".
            PIIType.SSN: lambda v: "***-**-" + v[-4:] if len(v) >= 4 else "***",
            PIIType.CREDIT_CARD: lambda v: "**** **** **** " + re.sub(r'\D', '', v)[-4:],
            PIIType.EMAIL: lambda v: v[0] + "***@" + v.split('@')[-1] if '@' in v else "***",
            PIIType.PHONE: lambda v: "***-***-" + re.sub(r'\D', '', v)[-4:],
            PIIType.DOB: lambda v: "**/**/****",
            PIIType.IP_ADDRESS: lambda v: ".".join(["***"] * 4),
        }

        strategy = masking_strategies.get(pii_type, lambda v: "***")
        return strategy(value)

    def _remove_overlaps(self, matches: List[PIIMatch]) -> List[PIIMatch]:
        """Drop overlapping matches, keeping the higher-confidence one.

        Args:
            matches: Matches sorted by ``start``.

        Returns:
            A list of non-overlapping matches. On equal confidence the
            earlier match is kept.
        """
        if not matches:
            return matches

        result = [matches[0]]

        for match in matches[1:]:
            prev = result[-1]
            if match.start < prev.end:
                # Overlap: the later match only displaces the kept one when
                # it is strictly more confident.
                if match.confidence > prev.confidence:
                    result[-1] = match
            else:
                result.append(match)

        return result

# Usage: scan a sample text and print every match next to its masked form.
detector = PIIDetector()

# The card number is the common Luhn-valid Visa test number, so it passes the
# detector's checksum validation and actually appears in the output.
text = """
Contact John Doe at john.doe@email.com or call 555-123-4567.
SSN: 123-45-6789, Credit Card: 4111-1111-1111-1111
"""

matches = detector.detect(text)
for match in matches:
    print(f"Type: {match.pii_type.value}")
    print(f"Value: {match.value} -> {match.masked_value}")
    print(f"Confidence: {match.confidence}")
    print()

PII Redaction and Masking

class PIIRedactor:
    """Rewrite text so that detected PII is redacted, masked or tokenized.

    The three operations differ only in what replaces each detected span;
    the splicing itself is shared.
    """

    def __init__(self, detector: PIIDetector = None):
        """Use *detector* if given, otherwise create a default ``PIIDetector``."""
        self.detector = detector if detector is not None else PIIDetector()

    @staticmethod
    def _splice(text, matches, replacement_for):
        """Return *text* with each match span replaced by ``replacement_for(match)``.

        Spans are handled in ascending ``start`` order and the output is
        assembled with a single join, so the cost is linear in the text
        length. Spans are assumed not to overlap.
        """
        pieces = []
        cursor = 0
        for match in sorted(matches, key=lambda m: m.start):
            pieces.append(text[cursor:match.start])
            pieces.append(replacement_for(match))
            cursor = match.end
        pieces.append(text[cursor:])
        return "".join(pieces)

    def redact(self, text: str, replacement: str = "[REDACTED]") -> Dict:
        """Replace every detected PII span with *replacement*.

        Returns:
            Dict with ``original``, ``redacted``, ``pii_count`` and
            ``pii_found`` (type, masked value and position per match).
        """
        matches = self.detector.detect(text)

        if not matches:
            return {
                "original": text,
                "redacted": text,
                "pii_count": 0,
                "pii_found": []
            }

        redacted = self._splice(text, matches, lambda m: replacement)

        return {
            "original": text,
            "redacted": redacted,
            "pii_count": len(matches),
            "pii_found": [
                {
                    "type": m.pii_type.value,
                    "masked": m.masked_value,
                    "position": (m.start, m.end)
                }
                for m in matches
            ]
        }

    def mask(self, text: str) -> Dict:
        """Replace every detected PII span with its ``masked_value``.

        Returns:
            Dict with ``original``, ``masked`` and ``pii_count``.
        """
        matches = self.detector.detect(text)

        if not matches:
            return {
                "original": text,
                "masked": text,
                "pii_count": 0
            }

        masked = self._splice(text, matches, lambda m: m.masked_value)

        return {
            "original": text,
            "masked": masked,
            "pii_count": len(matches)
        }

    def tokenize(self, text: str) -> Dict:
        """Replace every detected PII span with a unique placeholder token.

        Returns:
            Dict with ``tokenized`` (the rewritten text), ``tokens`` (mapping
            token -> original value, usable with ``restore``) and
            ``pii_count``.
        """
        matches = self.detector.detect(text)

        if not matches:
            return {
                "tokenized": text,
                "tokens": {},
                "pii_count": 0
            }

        import uuid
        tokens = {}

        def issue_token(match):
            # Only 8 hex characters of the UUID are used, so retry in the
            # unlikely event of a clash rather than overwrite an earlier
            # entry and lose its value.
            while True:
                token = f"[PII_{match.pii_type.name}_{uuid.uuid4().hex[:8]}]"
                if token not in tokens:
                    break
            tokens[token] = match.value
            return token

        tokenized = self._splice(text, matches, issue_token)

        return {
            "tokenized": tokenized,
            "tokens": tokens,
            "pii_count": len(matches)
        }

    def restore(self, tokenized_text: str, tokens: Dict) -> str:
        """Substitute each token in *tokenized_text* with its original value.

        The substitution is a single pass over the text, so a restored value
        that happens to look like another token is left untouched.
        """
        if not tokens:
            return tokenized_text

        # Longest first so that a token which is a prefix of another cannot
        # shadow it in the alternation.
        ordered = sorted(tokens, key=len, reverse=True)
        token_re = "|".join(re.escape(token) for token in ordered)
        return re.sub(token_re, lambda m: tokens[m.group()], tokenized_text)

# Usage: apply each of the three protection styles to the same sentence.
redactor = PIIRedactor()
text = "Email me at john@example.com or call 555-123-4567"

# 1) Redaction: every hit becomes the fixed "[REDACTED]" marker.
result = redactor.redact(text)
print(f"Redacted: {result['redacted']}")

# 2) Masking: every hit becomes its format-preserving mask.
result = redactor.mask(text)
print(f"Masked: {result['masked']}")

# 3) Tokenization: hits become placeholder tokens that can be mapped back.
result = redactor.tokenize(text)
print(f"Tokenized: {result['tokenized']}")
restored = redactor.restore(
    tokenized_text=result['tokenized'],
    tokens=result['tokens'],
)
print(f"Restored: {restored}")

Context-Aware PII Detection

class ContextAwarePIIDetector:
    """Wrap ``PIIDetector`` and raise confidence when cue words precede a hit."""

    def __init__(self):
        self.base_detector = PIIDetector()
        # Lower-case cue words per PII type; they are looked up as plain
        # substrings of the lower-cased text preceding a match.
        self.context_indicators = {
            PIIType.SSN: ["ssn", "social security", "social sec"],
            PIIType.CREDIT_CARD: ["card", "credit", "visa", "mastercard", "payment"],
            PIIType.PHONE: ["phone", "call", "tel", "mobile", "cell"],
            PIIType.EMAIL: ["email", "e-mail", "contact", "mail"],
            PIIType.DOB: ["born", "birthday", "dob", "birth date", "date of birth"],
            PIIType.ADDRESS: ["address", "street", "city", "zip", "postal"]
        }

    def detect_with_context(self, text: str) -> List[Dict]:
        """Run the base detector and attach a context-adjusted confidence.

        Returns:
            One dict per base match with the type label, raw and masked
            value, base confidence, the boost that was applied, the final
            confidence (never above 1.0) and the ``(start, end)`` position.
        """
        found = self.base_detector.detect(text)
        lowered = text.lower()

        def describe(hit):
            bonus = self._calculate_context_boost(lowered, hit.start, hit.pii_type)
            return {
                "pii_type": hit.pii_type.value,
                "value": hit.value,
                "masked_value": hit.masked_value,
                "base_confidence": hit.confidence,
                "context_boost": bonus,
                "final_confidence": min(1.0, hit.confidence + bonus),
                "position": (hit.start, hit.end)
            }

        return [describe(hit) for hit in found]

    def _calculate_context_boost(
        self,
        text: str,
        position: int,
        pii_type: PIIType
    ) -> float:
        """Return the confidence bonus earned by cue words before *position*.

        Only the 50 characters immediately preceding the match are examined.
        Each cue word of *pii_type* found there adds 0.1; the total is capped
        at 0.2. Types without cue words get 0.0.
        """
        window = text[max(0, position - 50):position]
        cue_words = self.context_indicators.get(pii_type, [])

        # Explicit 0.0 start keeps the result a float even with no hits.
        earned = sum((0.1 for word in cue_words if word in window), 0.0)

        return min(0.2, earned)

class NamedEntityPIIDetector:
    """Heuristic, regex-based detection of person names.

    This is a stand-in for a real NER model: it flags capitalised
    "First Last" and "First M. Last" sequences and gives a higher confidence
    when an honorific (Mr, Mrs, Ms, Dr, Prof) directly precedes the name.
    """

    def __init__(self):
        # In production, use spaCy or similar NER model
        self.name_patterns = [
            r"\b[A-Z][a-z]+\s+[A-Z][a-z]+\b",  # First Last
            r"\b[A-Z][a-z]+\s+[A-Z]\.\s+[A-Z][a-z]+\b",  # First M. Last
        ]
        self.title_prefixes = ["mr", "mrs", "ms", "dr", "prof"]

    def _title_regex(self):
        """Build a regex matching an honorific at the end of a string.

        Returns ``None`` when no title prefixes are configured, because an
        empty alternation would match every string.
        """
        if not self.title_prefixes:
            return None
        # Longest first so "mrs" is tried before "mr".
        ordered = sorted(self.title_prefixes, key=len, reverse=True)
        alternatives = "|".join(re.escape(title) for title in ordered)
        # \b keeps word endings such as "alarms" from counting as "ms"; the
        # optional dot and whitespace cover "Dr. " as well as "Dr ".
        return re.compile(rf"\b(?:{alternatives})\.?\s*$", re.IGNORECASE)

    def detect_names(self, text: str) -> List[Dict]:
        """Detect potential person names in *text*.

        Returns:
            One dict per regex hit with ``type`` ("person_name"), ``value``,
            ``position`` as ``(start, end)`` and ``confidence`` (0.7 when an
            honorific directly precedes the hit, otherwise 0.5).
        """
        # Built per call because title_prefixes is a mutable public attribute.
        title_re = self._title_regex()
        matches = []

        for pattern in self.name_patterns:
            for match in re.finditer(pattern, text):
                start = match.start()

                # A name always begins at a word boundary, so the text before
                # it ends in punctuation or whitespace; the honorific has to
                # be looked for *before* that separator. Twelve characters is
                # enough for the longest title plus ". " and some padding.
                preceding = text[max(0, start - 12):start]
                has_title = bool(title_re and title_re.search(preceding))

                matches.append({
                    "type": "person_name",
                    "value": match.group(),
                    "position": (start, match.end()),
                    "confidence": 0.7 if has_title else 0.5
                })

        return matches

PII Protection Pipeline

class PIIProtectionPipeline:
    """End-to-end pipeline: context-aware detection plus one protection mode.

    Supported modes are ``"mask"``, ``"redact"`` and ``"tokenize"``.
    """

    def __init__(self, mode: str = "mask"):
        self.detector = PIIDetector()
        self.context_detector = ContextAwarePIIDetector()
        self.redactor = PIIRedactor(self.detector)
        self.mode = mode  # mask, redact, or tokenize

    def process(self, text: str) -> Dict:
        """Detect PII in *text* and protect it according to ``self.mode``.

        Returns:
            Dict with ``original_length``, ``detections`` (context-enhanced),
            ``pii_count`` and ``protection_mode``, merged with the output of
            the selected redactor operation. Keys from the redactor output
            take precedence when both provide the same key.

        Warns:
            UserWarning: if ``self.mode`` is not a supported mode. The text
            is then returned unprotected under the ``processed`` key, as
            before, but no longer silently.
        """
        detections = self.context_detector.detect_with_context(text)

        handlers = {
            "mask": self.redactor.mask,
            "redact": self.redactor.redact,
            "tokenize": self.redactor.tokenize,
        }
        handler = handlers.get(self.mode)

        if handler is not None:
            protected = handler(text)
        else:
            import warnings
            warnings.warn(
                f"Unknown PII protection mode {self.mode!r}; "
                "text is returned without protection",
                stacklevel=2,
            )
            protected = {"processed": text}

        return {
            "original_length": len(text),
            "detections": detections,
            "pii_count": len(detections),
            "protection_mode": self.mode,
            **protected
        }

    def process_for_llm(self, text: str) -> Dict:
        """Tokenize PII so the text can be sent to an LLM.

        Always tokenizes, regardless of ``self.mode``.

        Returns:
            Dict with ``llm_safe_text``, ``pii_tokens`` (token -> original
            value, needed for ``restore_from_llm``), ``pii_count`` and
            ``can_restore``.
        """
        result = self.redactor.tokenize(text)

        return {
            "llm_safe_text": result["tokenized"],
            "pii_tokens": result["tokens"],
            "pii_count": result["pii_count"],
            "can_restore": True
        }

    def restore_from_llm(self, llm_output: str, tokens: Dict) -> str:
        """Put the original PII values back into *llm_output* using *tokens*."""
        return self.redactor.restore(llm_output, tokens)

# Usage: run a small customer record through the pipeline in "mask" mode.
pipeline = PIIProtectionPipeline("mask")

text = """
Customer: John Smith
Email: john.smith@company.com
Phone: (555) 123-4567
SSN: 123-45-6789
"""

# Full processing: detections plus the masked text.
result = pipeline.process(text)
print(f"PII found: {result['pii_count']}")
print(f"Protected text:\n{result['masked']}")

# Tokenized variant intended as LLM input.
llm_prep = pipeline.process_for_llm(text)
print(f"\nLLM-safe text:\n{llm_prep['llm_safe_text']}")

Compliance and Reporting

from datetime import datetime

class PIIComplianceReporter:
    """Run texts through the protection pipeline and keep an audit trail."""

    def __init__(self):
        self.pipeline = PIIProtectionPipeline()
        self.processing_log = []

    def process_and_log(self, text: str, source: str = "unknown") -> Dict:
        """Process *text* and append a summary entry to ``processing_log``.

        The entry records a timestamp, *source*, the PII count, the detected
        type labels and the protection mode; it does not store the text.

        Returns:
            The unmodified result of ``pipeline.process(text)``.
        """
        outcome = self.pipeline.process(text)

        self.processing_log.append({
            "timestamp": datetime.now().isoformat(),
            "source": source,
            "pii_count": outcome["pii_count"],
            "pii_types": [found["pii_type"] for found in outcome["detections"]],
            "protection_applied": outcome["protection_mode"]
        })

        return outcome

    def generate_report(self) -> Dict:
        """Summarise everything logged so far.

        Returns:
            ``{"error": "No processing history"}`` when nothing has been
            logged; otherwise a dict with the generation timestamp, number of
            texts processed, total PII count, a per-type tally, and fixed
            ``protection_rate`` / ``compliance_status`` values.
        """
        history = self.processing_log
        if not history:
            return {"error": "No processing history"}

        texts_seen = len(history)
        pii_seen = sum(entry["pii_count"] for entry in history)

        # Tally occurrences of each type label across all entries.
        tally = {}
        for label in (t for entry in history for t in entry["pii_types"]):
            tally.setdefault(label, 0)
            tally[label] += 1

        return {
            "report_generated": datetime.now().isoformat(),
            "total_texts_processed": texts_seen,
            "total_pii_detected": pii_seen,
            "pii_by_type": tally,
            "protection_rate": 1.0,  # All detected PII was protected
            "compliance_status": "compliant"
        }

# Usage: log a few inputs, then print the aggregated report.
reporter = PIIComplianceReporter()

texts = [
    "Contact me at user@email.com",
    "My SSN is 123-45-6789",
    # NOTE(review): a 7-digit number; the built-in phone pattern expects
    # 10 digits, so this sample presumably yields no detection - confirm.
    "Call 555-0123 for support"
]

for text in texts:
    reporter.process_and_log(text, "user_input")

report = reporter.generate_report()
print(f"Total PII detected: {report['total_pii_detected']}")
print(f"PII by type: {report['pii_by_type']}")

Conclusion

PII detection and protection are essential for privacy-preserving AI applications. A comprehensive approach includes pattern-based detection, validation, context-aware confidence boosting, and flexible protection modes. Compliance reporting and audit logging ensure regulatory requirements are met while maintaining user privacy.