6 min read
Image Analysis with Azure Computer Vision
Azure Computer Vision is a powerful AI service that extracts information from images. It can analyze visual content, detect objects, read text, and generate descriptions, enabling applications to understand and process visual data at scale.
Setting Up Computer Vision
# Create Computer Vision resource
# S1 is a paid standard tier; an F0 free tier also exists for
# experimentation — check current SKU availability in your region.
az cognitiveservices account create \
--name mycomputervision \
--resource-group myResourceGroup \
--kind ComputerVision \
--sku S1 \
--location eastus
# Get keys and endpoint
# Returns key1/key2; either works for CognitiveServicesCredentials below.
az cognitiveservices account keys list \
--name mycomputervision \
--resource-group myResourceGroup
Image Analysis
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials
import io
class ImageAnalyzer:
    """Thin wrapper around the Azure Computer Vision image-analysis API.

    Converts SDK result objects into plain, JSON-serializable dicts so
    callers never have to touch the SDK's model classes directly.
    """

    def __init__(self, endpoint, key):
        """Create a client bound to the given endpoint and subscription key."""
        credentials = CognitiveServicesCredentials(key)
        self.client = ComputerVisionClient(endpoint, credentials)

    def analyze_image(self, image_url, features=None):
        """Analyze an image from URL.

        When *features* is omitted, the full feature set is requested.
        Returns a plain dict (see _format_analysis).
        """
        if features is None:
            # Built lazily (not at class level) so importing this module
            # does not require the SDK enums to be resolvable.
            features = [
                VisualFeatureTypes.categories,
                VisualFeatureTypes.description,
                VisualFeatureTypes.tags,
                VisualFeatureTypes.objects,
                VisualFeatureTypes.faces,
                VisualFeatureTypes.adult,
                VisualFeatureTypes.color,
                VisualFeatureTypes.image_type,
                VisualFeatureTypes.brands,
            ]
        raw = self.client.analyze_image(image_url, visual_features=features)
        return self._format_analysis(raw)

    def analyze_image_stream(self, image_stream, features=None):
        """Analyze an image from stream (defaults to a lighter feature set)."""
        if features is None:
            features = [
                VisualFeatureTypes.categories,
                VisualFeatureTypes.description,
                VisualFeatureTypes.tags,
                VisualFeatureTypes.objects,
            ]
        raw = self.client.analyze_image_in_stream(
            image_stream,
            visual_features=features,
        )
        return self._format_analysis(raw)

    def _format_analysis(self, raw):
        """Format analysis results.

        Each section is emitted only when the corresponding attribute on
        the SDK result is truthy, so absent features produce no key.
        """
        out = {}
        if raw.description:
            out["description"] = {
                "captions": [
                    {"text": cap.text, "confidence": cap.confidence}
                    for cap in raw.description.captions
                ],
                "tags": raw.description.tags,
            }
        if raw.tags:
            out["tags"] = [
                {"name": tag.name, "confidence": tag.confidence}
                for tag in raw.tags
            ]
        if raw.objects:
            out["objects"] = [
                {
                    "object": obj.object_property,
                    "confidence": obj.confidence,
                    "bounding_box": {
                        "x": obj.rectangle.x,
                        "y": obj.rectangle.y,
                        "width": obj.rectangle.w,
                        "height": obj.rectangle.h,
                    },
                }
                for obj in raw.objects
            ]
        if raw.faces:
            out["faces"] = [
                {
                    "age": face.age,
                    "gender": face.gender,
                    "bounding_box": {
                        "left": face.face_rectangle.left,
                        "top": face.face_rectangle.top,
                        "width": face.face_rectangle.width,
                        "height": face.face_rectangle.height,
                    },
                }
                for face in raw.faces
            ]
        if raw.color:
            out["colors"] = {
                "dominant_foreground": raw.color.dominant_color_foreground,
                "dominant_background": raw.color.dominant_color_background,
                "dominant_colors": raw.color.dominant_colors,
                "accent_color": raw.color.accent_color,
                "is_bw": raw.color.is_bw_img,
            }
        if raw.adult:
            out["adult"] = {
                "is_adult_content": raw.adult.is_adult_content,
                "adult_score": raw.adult.adult_score,
                "is_racy_content": raw.adult.is_racy_content,
                "racy_score": raw.adult.racy_score,
                "is_gory_content": raw.adult.is_gory_content,
                "gore_score": raw.adult.gore_score,
            }
        if raw.brands:
            out["brands"] = [
                {
                    "name": brand.name,
                    "confidence": brand.confidence,
                    "bounding_box": {
                        "x": brand.rectangle.x,
                        "y": brand.rectangle.y,
                        "width": brand.rectangle.w,
                        "height": brand.rectangle.h,
                    },
                }
                for brand in raw.brands
            ]
        if raw.categories:
            out["categories"] = [
                {"name": cat.name, "score": cat.score}
                for cat in raw.categories
            ]
        return out
# Usage
# NOTE(review): endpoint URL and key are placeholders — substitute real
# resource values before running.
analyzer = ImageAnalyzer(
    "https://your-resource.cognitiveservices.azure.com",
    "your-key"
)
# Analyze from URL
result = analyzer.analyze_image(
    "https://example.com/image.jpg"
)
# First caption from the description feature.
print(f"Description: {result['description']['captions'][0]['text']}")
# First five tags in the order the service returned them.
print(f"Tags: {[t['name'] for t in result['tags'][:5]]}")
# .get() because "objects" is only present when objects were detected.
print(f"Objects detected: {len(result.get('objects', []))}")
Optical Character Recognition (OCR)
import time
class OCRReader:
    """Extracts text from images via the asynchronous Azure Read API.

    A submit call returns an Operation-Location header identifying the
    job, which is then polled until the analysis completes.
    """

    def __init__(self, endpoint, key):
        """Create a Computer Vision client for the given endpoint and key."""
        self.client = ComputerVisionClient(
            endpoint,
            CognitiveServicesCredentials(key)
        )

    def read_text(self, image_url):
        """Read text from image using Read API.

        Args:
            image_url: Publicly reachable URL of the image or document.

        Returns:
            dict with per-page structure under "pages" and the joined
            text under "full_text".
        """
        # raw=True exposes the HTTP response so the Operation-Location
        # header (which identifies the async job) can be read.
        read_response = self.client.read(image_url, raw=True)
        return self._complete_read(read_response)

    def read_text_stream(self, image_stream):
        """Read text from image stream (e.g. an open binary file)."""
        read_response = self.client.read_in_stream(image_stream, raw=True)
        return self._complete_read(read_response)

    def _complete_read(self, read_response, poll_interval=1, timeout=60):
        """Poll the Read operation to completion and extract its text.

        Shared by the URL and stream entry points (previously duplicated
        verbatim in both). Adds a deadline so a stuck operation cannot
        spin forever.

        Raises:
            TimeoutError: operation still running after `timeout` seconds
                (TimeoutError is an Exception subclass, so existing
                callers catching Exception are unaffected).
            Exception: operation finished in a non-succeeded state.
        """
        operation_location = read_response.headers["Operation-Location"]
        operation_id = operation_location.split("/")[-1]
        deadline = time.monotonic() + timeout
        while True:
            result = self.client.get_read_result(operation_id)
            if result.status.lower() not in ["notstarted", "running"]:
                break
            if time.monotonic() >= deadline:
                raise TimeoutError(f"OCR operation {operation_id} timed out")
            time.sleep(poll_interval)
        if result.status.lower() == "succeeded":
            return self._extract_text(result)
        raise Exception(f"OCR failed: {result.status}")

    def _extract_text(self, result):
        """Extract text and positions from OCR result."""
        pages = []
        for page in result.analyze_result.read_results:
            page_data = {
                "page": page.page,
                "width": page.width,
                "height": page.height,
                "angle": page.angle,
                "lines": []
            }
            for line in page.lines:
                line_data = {
                    "text": line.text,
                    "bounding_box": line.bounding_box,
                    "words": [
                        {
                            "text": word.text,
                            "bounding_box": word.bounding_box,
                            "confidence": word.confidence
                        }
                        for word in line.words
                    ]
                }
                page_data["lines"].append(line_data)
            pages.append(page_data)
        # Convenience: every line across every page, newline-joined.
        full_text = "\n".join(
            line["text"]
            for page in pages
            for line in page["lines"]
        )
        return {
            "pages": pages,
            "full_text": full_text
        }
# Usage
# NOTE(review): endpoint URL and key are placeholders — substitute real
# resource values before running.
ocr = OCRReader(
    "https://your-resource.cognitiveservices.azure.com",
    "your-key"
)
# Read text from document image
result = ocr.read_text("https://example.com/document.jpg")
print("Extracted text:")
print(result["full_text"])
# Read from local file
with open("document.jpg", "rb") as f:
    result = ocr.read_text_stream(f)
    print(result["full_text"])
Object Detection
def detect_objects(analyzer, image_url):
    """Detect and locate objects in image.

    Saves an annotated copy as detected_objects.jpg and returns the
    detections as a list of plain dicts.
    """
    # Local imports: visualization dependencies are only needed here.
    from PIL import Image, ImageDraw
    import requests
    from io import BytesIO

    # Run object detection on the remote image.
    detection = analyzer.client.detect_objects(image_url)

    # Fetch the original image so bounding boxes can be drawn on it.
    response = requests.get(image_url)
    annotated = Image.open(BytesIO(response.content))
    canvas = ImageDraw.Draw(annotated)

    found = []
    for item in detection.objects:
        box = item.rectangle
        canvas.rectangle(
            [box.x, box.y, box.x + box.w, box.y + box.h],
            outline="red",
            width=3,
        )
        # Label sits just above the box's top-left corner.
        canvas.text(
            (box.x, box.y - 20),
            f"{item.object_property} ({item.confidence:.2f})",
            fill="red",
        )
        found.append({
            "object": item.object_property,
            "confidence": item.confidence,
            "rectangle": {
                "x": box.x,
                "y": box.y,
                "width": box.w,
                "height": box.h,
            },
        })

    # Persist the annotated image alongside the returned data.
    annotated.save("detected_objects.jpg")
    return found
Thumbnail Generation
def generate_thumbnail(analyzer, image_url, width, height, smart_cropping=True):
    """Generate smart-cropped thumbnail.

    Writes the result to thumbnail.jpg and returns that path.
    """
    stream = analyzer.client.generate_thumbnail(
        width,
        height,
        image_url,
        smart_cropping=smart_cropping,
    )
    # The SDK yields the thumbnail as byte chunks; persist them to disk.
    with open("thumbnail.jpg", "wb") as out:
        for piece in stream:
            out.write(piece)
    return "thumbnail.jpg"
# Generate thumbnail from stream
def generate_thumbnail_from_file(analyzer, file_path, width, height):
    """Generate thumbnail from local file.

    The thumbnail is written to thumbnail_{width}x{height}.jpg and that
    path is returned.
    """
    with open(file_path, "rb") as source:
        chunks = analyzer.client.generate_thumbnail_in_stream(
            width,
            height,
            source,
            smart_cropping=True,
        )
        output_path = f"thumbnail_{width}x{height}.jpg"
        # Consume the chunk iterator while the source file is still open.
        with open(output_path, "wb") as destination:
            for piece in chunks:
                destination.write(piece)
    return output_path
Spatial Analysis (Video Analytics)
# Spatial analysis for video streams (requires edge deployment)
# NOTE(review): this configures the person-crossing-line operation; the
# nested "line"/"line" keys mirror the operation's JSON schema — confirm
# against the spatialanalysis-personcrossingline documentation.
spatial_analysis_config = {
    "ai_insights": [
        {
            "type": "cognitiveservices.vision.spatialanalysis-personcrossingline",
            "operation_id": "personcounting",
            "config": {
                # Values are strings/JSON strings as the module expects.
                "enable_face_mask_classifier": "true",
                "detector_node_config": '{"gpu_index": 0}',
                "lines": [
                    {
                        "line": {
                            "line": {
                                # Normalized coordinates (0..1): a vertical
                                # line down the middle of the frame.
                                "start": {"x": 0.5, "y": 0.1},
                                "end": {"x": 0.5, "y": 0.9}
                            },
                            "name": "entrance"
                        },
                        "events": [
                            {
                                "type": "count",
                                "config": {
                                    "output_frequency": 1,
                                    "trigger": "event"
                                }
                            }
                        ]
                    }
                ]
            }
        }
    ]
}
# This would be deployed as part of Azure IoT Edge module
REST API Integration
from flask import Flask, request, jsonify
import base64

app = Flask(__name__)
# Shared service clients reused across requests.
# NOTE(review): "endpoint"/"key" are placeholders — inject real values
# from configuration or environment variables, never hard-code keys.
analyzer = ImageAnalyzer("endpoint", "key")
ocr = OCRReader("endpoint", "key")
@app.route("/api/analyze", methods=["POST"])
def analyze_endpoint():
    """Analyze an uploaded image file or a JSON-supplied image URL.

    Accepts either a multipart "image" file or a JSON body with "url".
    Returns the analysis dict, or a 400 error when neither is provided.
    """
    if "image" in request.files:
        # Multipart upload: analyze the raw bytes as a stream.
        image = request.files["image"]
        stream = io.BytesIO(image.read())
        result = analyzer.analyze_image_stream(stream)
    else:
        # get_json(silent=True) returns None instead of raising when the
        # body is missing or not JSON, so a clean 400 can be returned
        # (bare `request.json` errors out on non-JSON requests).
        payload = request.get_json(silent=True) or {}
        if "url" not in payload:
            return jsonify({"error": "No image provided"}), 400
        result = analyzer.analyze_image(payload["url"])
    return jsonify(result)
@app.route("/api/ocr", methods=["POST"])
def ocr_endpoint():
    """Extract text from an uploaded image file or a JSON-supplied URL.

    Accepts either a multipart "image" file or a JSON body with "url".
    Returns the OCR result dict, or a 400 error when neither is provided.
    """
    if "image" in request.files:
        # Multipart upload: run OCR on the raw bytes as a stream.
        image = request.files["image"]
        stream = io.BytesIO(image.read())
        result = ocr.read_text_stream(stream)
    else:
        # get_json(silent=True) avoids the exception `request.json`
        # raises for missing/non-JSON bodies, preserving the 400 path.
        payload = request.get_json(silent=True) or {}
        if "url" not in payload:
            return jsonify({"error": "No image provided"}), 400
        result = ocr.read_text(payload["url"])
    return jsonify(result)
@app.route("/api/describe", methods=["POST"])
def describe_endpoint():
    """Get image description.

    Expects a JSON body with "url"; returns the top caption, its
    confidence, and the description tags, or 400 when "url" is missing.
    """
    # get_json(silent=True) returns None instead of raising when the
    # body is missing or not JSON, so bad requests get a clean 400
    # (bare `request.json.get` would error on non-JSON requests).
    payload = request.get_json(silent=True) or {}
    url = payload.get("url")
    if not url:
        return jsonify({"error": "URL required"}), 400
    result = analyzer.client.describe_image(url)
    return jsonify({
        "description": result.captions[0].text if result.captions else None,
        "confidence": result.captions[0].confidence if result.captions else 0,
        "tags": result.tags
    })
if __name__ == "__main__":
    # Development server only; debug=True enables the interactive
    # debugger and must not be used in production deployments.
    app.run(debug=True)
Conclusion
Azure Computer Vision provides comprehensive image analysis capabilities:
- Image analysis for understanding visual content
- OCR for extracting text from images and documents
- Object detection for locating items in images
- Smart thumbnails for content-aware cropping
- Spatial analysis for video stream processing
These capabilities enable applications from content moderation to document processing to retail analytics.