AI Content Detection: Tools and Techniques
Detecting AI-generated content is increasingly important for authenticity verification. Here’s the current state of detection techniques.
Detection Approaches
Text Detection
from transformers import pipeline

# RoBERTa classifier fine-tuned to flag GPT-2 output (labels: "Real" / "Fake")
detector = pipeline("text-classification", model="roberta-base-openai-detector")

def detect_ai_text(text: str) -> dict:
    """Detect if text is AI-generated."""
    # Truncate at the tokenizer level; slicing characters does not
    # guarantee the 512-token model limit is respected
    result = detector(text, truncation=True, max_length=512)
    label = result[0]["label"]
    score = result[0]["score"]
    return {
        "is_ai_generated": label == "Fake",
        # Express confidence as P(AI-generated) so scores aggregate cleanly
        "confidence": score if label == "Fake" else 1.0 - score,
        "method": "classifier",
    }
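A quick usage sketch, assuming the detector above has loaded (the model weights download from the Hugging Face Hub on first run, and the sample string is illustrative):

sample = "The rapid advancement of technology has transformed modern society."
result = detect_ai_text(sample)
print(f"AI-generated: {result['is_ai_generated']} "
      f"(confidence: {result['confidence']:.2f})")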
# Statistical methods
import nltk

def statistical_analysis(text: str) -> dict:
    """Analyze text statistics for AI indicators."""
    # Requires the punkt tokenizer data: nltk.download("punkt")
    words = nltk.word_tokenize(text.lower())
    sentences = nltk.sent_tokenize(text)
    if not words or not sentences:
        return {"error": "text too short to analyze"}
    # AI text tends to have:
    # - More consistent sentence length
    # - Less vocabulary diversity
    # - Fewer typos
    avg_sentence_len = sum(len(s.split()) for s in sentences) / len(sentences)
    vocab_diversity = len(set(words)) / len(words)
    return {
        "avg_sentence_length": avg_sentence_len,
        "vocabulary_diversity": vocab_diversity,
        "indicators": {
            "uniform_sentences": 15 < avg_sentence_len < 25,
            "low_diversity": vocab_diversity < 0.4,
        },
    }
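A minimal usage sketch (the sample text is illustrative; newer NLTK releases may also require the punkt_tab resource):

nltk.download("punkt", quiet=True)  # one-time tokenizer data download
stats = statistical_analysis("Short sample text. It has two sentences.")
print(stats["indicators"])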
Image Detection
import torch
from torchvision import transforms
from PIL import Image

def detect_ai_image(image_path: str) -> dict:
    """Detect if image is AI-generated."""
    # Load detection model (placeholder: supply your own trained detector)
    model = load_deepfake_detector()
    model.eval()
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    image = Image.open(image_path).convert("RGB")
    tensor = transform(image).unsqueeze(0)
    with torch.no_grad():
        output = model(tensor)
    probability = torch.sigmoid(output).item()
    return {
        "is_ai_generated": probability > 0.5,
        "confidence": probability,
        "method": "cnn_classifier",
    }
# Artifact analysis
import cv2

def analyze_artifacts(image_path: str) -> dict:
    """Analyze image for AI generation artifacts."""
    img = cv2.imread(image_path)
    # Check for common AI artifacts (the helpers below are placeholders
    # for your own heuristics or models)
    artifacts = {
        "repeating_patterns": detect_patterns(img),
        "unnatural_textures": analyze_textures(img),
        "inconsistent_lighting": check_lighting(img),
        "anatomical_errors": check_anatomy(img) if has_people(img) else None,
    }
    return artifacts
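To make one of those placeholders concrete: generative upsampling often leaves periodic high-frequency traces in the Fourier spectrum. Below is a minimal, runnable sketch of that idea, assuming OpenCV and NumPy are installed; the 0.35 radius cutoff and the ratio interpretation are illustrative, not calibrated thresholds:

import cv2
import numpy as np

def spectral_artifact_score(image_path: str) -> float:
    """Toy spectral check for periodic high-frequency artifacts."""
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        raise ValueError(f"could not read {image_path}")
    # Log-magnitude spectrum, centered so low frequencies sit in the middle
    log_spec = np.log1p(np.abs(np.fft.fftshift(np.fft.fft2(img))))
    h, w = log_spec.shape
    yy, xx = np.ogrid[:h, :w]
    dist = np.sqrt((yy - h / 2) ** 2 + (xx - w / 2) ** 2)
    cutoff = 0.35 * min(h, w)  # illustrative band boundary
    high = log_spec[dist > cutoff].mean()
    low = log_spec[dist <= cutoff].mean()
    return float(high / low)  # unusually high ratios can hint at artifacts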
Combined Detection
class ContentDetector:
    def __init__(self):
        # Thin wrapper classes (assumed) around the detect functions above
        self.text_detector = TextAIDetector()
        self.image_detector = ImageAIDetector()

    def analyze(self, content: dict) -> dict:
        """Comprehensive AI content detection."""
        results = {
            "content_type": content["type"],
            "detections": [],
        }
        if content["type"] == "text":
            results["detections"].append(self.text_detector.detect(content["text"]))
            results["detections"].append(statistical_analysis(content["text"]))
        elif content["type"] == "image":
            results["detections"].append(self.image_detector.detect(content["path"]))
            results["detections"].append(analyze_artifacts(content["path"]))
        # Aggregate confidence; methods without a score count as neutral (0.5)
        confidences = [d.get("confidence", 0.5) for d in results["detections"]]
        results["overall_confidence"] = (
            sum(confidences) / len(confidences) if confidences else 0.5
        )
        results["likely_ai_generated"] = results["overall_confidence"] > 0.6
        return results
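Usage then looks like this (the input dict and printed fields follow the shapes defined above; the sample text is illustrative):

detector = ContentDetector()
report = detector.analyze({"type": "text", "text": "Some text to check."})
print(report["overall_confidence"], report["likely_ai_generated"])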
Limitations
detection_limitations = {
    "text": [
        "Short texts are hard to classify",
        "Edited AI text reduces accuracy",
        "Models become outdated quickly",
        "False positives on formal writing",
    ],
    "images": [
        "Post-processing reduces detectability",
        "Newer models harder to detect",
        "False positives on edited photos",
        "Low resolution impacts accuracy",
    ],
    "general": [
        "Arms race with generators",
        "No method is 100% accurate",
        "Context matters",
        "Requires regular model updates",
    ],
}
Best Practices
- Use multiple methods - No single detector is reliable
- Consider context - Detection is probabilistic
- Update regularly - Detection models age quickly
- Human review - Final decisions need human judgment
- Be transparent - Communicate uncertainty rather than a binary verdict (see the sketch below)
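One way to act on the last two points is to report banded verdicts instead of a hard yes/no. A minimal sketch; the band boundaries are illustrative, not calibrated:

def verdict(confidence: float) -> str:
    """Map a detector's P(AI) score to a hedged, human-readable verdict."""
    if confidence >= 0.8:
        return "likely AI-generated"
    if confidence >= 0.6:
        return "possibly AI-generated - recommend human review"
    if confidence >= 0.4:
        return "inconclusive"
    return "likely human-written"

print(verdict(0.72))  # "possibly AI-generated - recommend human review"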
Conclusion
AI detection is an evolving challenge. Use multiple approaches, acknowledge limitations, and combine automated detection with human review for critical decisions.