Back to Blog
6 min read

Image Analysis with Azure Computer Vision

Azure Computer Vision is a powerful AI service that extracts information from images. It can analyze visual content, detect objects, read text, and generate descriptions, enabling applications to understand and process visual data at scale.

Setting Up Computer Vision

# Create a Computer Vision resource.
# NOTE: S1 is a paid standard tier — use "--sku F0" for the free tier while testing.
az cognitiveservices account create \
    --name mycomputervision \
    --resource-group myResourceGroup \
    --kind ComputerVision \
    --sku S1 \
    --location eastus

# Get keys and endpoint.
# Pair one of these keys with the resource endpoint when constructing the SDK client.
az cognitiveservices account keys list \
    --name mycomputervision \
    --resource-group myResourceGroup

Image Analysis

from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials
import io

class ImageAnalyzer:
    """Analyzes images with Azure Computer Vision and returns plain dicts.

    Wraps ComputerVisionClient; SDK result objects are flattened into
    JSON-serializable dictionaries by _format_analysis.
    """

    def __init__(self, endpoint, key):
        # One client per analyzer; the SDK client is reusable across calls.
        self.client = ComputerVisionClient(
            endpoint,
            CognitiveServicesCredentials(key)
        )

    def analyze_image(self, image_url, features=None):
        """Analyze an image from a URL.

        Args:
            image_url: Publicly reachable URL of the image.
            features: Optional list of VisualFeatureTypes; defaults to the
                full feature set below.

        Returns:
            dict keyed by feature name (see _format_analysis).
        """
        if features is None:
            # Defaults are built at call time (not at class-definition time)
            # so VisualFeatureTypes is only needed when a call is made.
            features = [
                VisualFeatureTypes.categories,
                VisualFeatureTypes.description,
                VisualFeatureTypes.tags,
                VisualFeatureTypes.objects,
                VisualFeatureTypes.faces,
                VisualFeatureTypes.adult,
                VisualFeatureTypes.color,
                VisualFeatureTypes.image_type,
                VisualFeatureTypes.brands
            ]

        result = self.client.analyze_image(image_url, visual_features=features)
        return self._format_analysis(result)

    def analyze_image_stream(self, image_stream, features=None):
        """Analyze an image supplied as a binary stream.

        Uses a smaller default feature set than analyze_image (no faces,
        adult, color, image_type or brands).
        """
        if features is None:
            features = [
                VisualFeatureTypes.categories,
                VisualFeatureTypes.description,
                VisualFeatureTypes.tags,
                VisualFeatureTypes.objects
            ]

        result = self.client.analyze_image_in_stream(
            image_stream,
            visual_features=features
        )
        return self._format_analysis(result)

    @staticmethod
    def _xywh_box(rectangle):
        """Flatten an SDK x/y/w/h rectangle (objects, brands) into a dict."""
        return {
            "x": rectangle.x,
            "y": rectangle.y,
            "width": rectangle.w,
            "height": rectangle.h
        }

    def _format_analysis(self, result):
        """Convert an SDK analysis result into a JSON-serializable dict.

        Only sections that are present (truthy) on the result are included,
        so callers should use .get() for optional sections.
        """
        analysis = {}

        # Description: captions with confidences, plus free-form tags.
        if result.description:
            analysis["description"] = {
                "captions": [
                    {"text": c.text, "confidence": c.confidence}
                    for c in result.description.captions
                ],
                "tags": result.description.tags
            }

        # Tags
        if result.tags:
            analysis["tags"] = [
                {"name": t.name, "confidence": t.confidence}
                for t in result.tags
            ]

        # Objects (localized with x/y/w/h rectangles).
        if result.objects:
            analysis["objects"] = [
                {
                    "object": obj.object_property,
                    "confidence": obj.confidence,
                    "bounding_box": self._xywh_box(obj.rectangle)
                }
                for obj in result.objects
            ]

        # Faces — note the SDK uses a left/top rectangle here, unlike
        # objects/brands which use x/y/w/h.
        if result.faces:
            analysis["faces"] = [
                {
                    "age": face.age,
                    "gender": face.gender,
                    "bounding_box": {
                        "left": face.face_rectangle.left,
                        "top": face.face_rectangle.top,
                        "width": face.face_rectangle.width,
                        "height": face.face_rectangle.height
                    }
                }
                for face in result.faces
            ]

        # Colors
        if result.color:
            analysis["colors"] = {
                "dominant_foreground": result.color.dominant_color_foreground,
                "dominant_background": result.color.dominant_color_background,
                "dominant_colors": result.color.dominant_colors,
                "accent_color": result.color.accent_color,
                "is_bw": result.color.is_bw_img
            }

        # Adult-content moderation scores.
        if result.adult:
            analysis["adult"] = {
                "is_adult_content": result.adult.is_adult_content,
                "adult_score": result.adult.adult_score,
                "is_racy_content": result.adult.is_racy_content,
                "racy_score": result.adult.racy_score,
                "is_gory_content": result.adult.is_gory_content,
                "gore_score": result.adult.gore_score
            }

        # Brands (same x/y/w/h rectangle shape as objects).
        if result.brands:
            analysis["brands"] = [
                {
                    "name": b.name,
                    "confidence": b.confidence,
                    "bounding_box": self._xywh_box(b.rectangle)
                }
                for b in result.brands
            ]

        # Categories
        if result.categories:
            analysis["categories"] = [
                {"name": c.name, "score": c.score}
                for c in result.categories
            ]

        return analysis


# Usage — replace endpoint and key with your Computer Vision resource values.
analyzer = ImageAnalyzer(
    "https://your-resource.cognitiveservices.azure.com",
    "your-key"
)

# Analyze from URL (performs a network call to the Computer Vision service).
result = analyzer.analyze_image(
    "https://example.com/image.jpg"
)

# NOTE(review): 'description' and 'tags' are only present when the service
# returned those sections — unguarded access can raise KeyError.
print(f"Description: {result['description']['captions'][0]['text']}")
print(f"Tags: {[t['name'] for t in result['tags'][:5]]}")
print(f"Objects detected: {len(result.get('objects', []))}")

Optical Character Recognition (OCR)

import time

class OCRReader:
    """Extracts printed/handwritten text from images via the async Read API."""

    def __init__(self, endpoint, key):
        self.client = ComputerVisionClient(
            endpoint,
            CognitiveServicesCredentials(key)
        )

    def read_text(self, image_url):
        """Read text from an image URL.

        Returns:
            dict with "pages" and "full_text" (see _extract_text).

        Raises:
            TimeoutError: if the Read operation does not finish in time.
            Exception: if the operation ends in a failed state.
        """
        # raw=True exposes the HTTP response so we can read the
        # Operation-Location header of the async operation.
        read_response = self.client.read(image_url, raw=True)
        return self._finish_read(read_response)

    def read_text_stream(self, image_stream):
        """Read text from a binary image stream (same contract as read_text)."""
        read_response = self.client.read_in_stream(image_stream, raw=True)
        return self._finish_read(read_response)

    def _finish_read(self, read_response, poll_interval=1, timeout=60):
        """Poll the async Read operation until it completes.

        Shared by read_text and read_text_stream (previously duplicated).
        A deadline prevents the poll loop from spinning forever if the
        service never leaves the running state.
        """
        operation_location = read_response.headers["Operation-Location"]
        # The operation ID is the last path segment of the URL.
        operation_id = operation_location.split("/")[-1]

        deadline = time.monotonic() + timeout
        while True:
            result = self.client.get_read_result(operation_id)

            if result.status.lower() not in ["notstarted", "running"]:
                break
            if time.monotonic() >= deadline:
                raise TimeoutError(
                    f"OCR operation {operation_id} timed out after {timeout}s"
                )

            time.sleep(poll_interval)

        if result.status.lower() == "succeeded":
            return self._extract_text(result)
        raise Exception(f"OCR failed: {result.status}")

    def _extract_text(self, result):
        """Flatten a succeeded Read result into pages plus concatenated text.

        Returns:
            dict with "pages" (per-page lines/words including bounding boxes
            and word confidences) and "full_text" (all lines joined by "\\n").
        """
        pages = []

        for page in result.analyze_result.read_results:
            page_data = {
                "page": page.page,
                "width": page.width,
                "height": page.height,
                "angle": page.angle,
                "lines": []
            }

            for line in page.lines:
                page_data["lines"].append({
                    "text": line.text,
                    "bounding_box": line.bounding_box,
                    "words": [
                        {
                            "text": word.text,
                            "bounding_box": word.bounding_box,
                            "confidence": word.confidence
                        }
                        for word in line.words
                    ]
                })

            pages.append(page_data)

        # Full text in reading order: lines of page 1, then page 2, ...
        full_text = "\n".join(
            line["text"]
            for page in pages
            for line in page["lines"]
        )

        return {
            "pages": pages,
            "full_text": full_text
        }


# Usage — construct an OCR reader against your Computer Vision resource.
ocr = OCRReader(
    "https://your-resource.cognitiveservices.azure.com",
    "your-key"
)

# Read text from a document image (async Read API; polls until complete).
result = ocr.read_text("https://example.com/document.jpg")

print("Extracted text:")
print(result["full_text"])

# Read from a local file — binary mode ("rb") is required by the stream API.
with open("document.jpg", "rb") as f:
    result = ocr.read_text_stream(f)
    print(result["full_text"])

Object Detection

def detect_objects(analyzer, image_url, output_path="detected_objects.jpg"):
    """Detect objects in an image and save an annotated copy.

    Args:
        analyzer: An ImageAnalyzer whose `.client` is a ComputerVisionClient.
        image_url: URL of the image to analyze (must be downloadable).
        output_path: Where to save the annotated image. New backward-
            compatible parameter; defaults to the previously hard-coded name.

    Returns:
        List of dicts with object name, confidence and bounding rectangle.
    """
    from PIL import Image, ImageDraw
    import requests
    from io import BytesIO

    # Detect objects via the service.
    result = analyzer.client.detect_objects(image_url)

    # Download the original image so the detections can be drawn on it.
    response = requests.get(image_url)
    img = Image.open(BytesIO(response.content))
    draw = ImageDraw.Draw(img)

    # Draw bounding boxes and confidence labels.
    for obj in result.objects:
        rect = obj.rectangle
        draw.rectangle(
            [rect.x, rect.y, rect.x + rect.w, rect.y + rect.h],
            outline="red",
            width=3
        )
        # Clamp the label's y so it is not drawn off-canvas for objects
        # touching the top edge of the image.
        draw.text(
            (rect.x, max(rect.y - 20, 0)),
            f"{obj.object_property} ({obj.confidence:.2f})",
            fill="red"
        )

    # Save annotated image.
    img.save(output_path)

    return [
        {
            "object": obj.object_property,
            "confidence": obj.confidence,
            "rectangle": {
                "x": obj.rectangle.x,
                "y": obj.rectangle.y,
                "width": obj.rectangle.w,
                "height": obj.rectangle.h
            }
        }
        for obj in result.objects
    ]

Thumbnail Generation

def generate_thumbnail(analyzer, image_url, width, height, smart_cropping=True,
                       output_path="thumbnail.jpg"):
    """Generate a (optionally smart-cropped) thumbnail for an image URL.

    Args:
        analyzer: An ImageAnalyzer whose `.client` is a ComputerVisionClient.
        image_url: URL of the source image.
        width: Thumbnail width in pixels.
        height: Thumbnail height in pixels.
        smart_cropping: Let the service pick the most interesting crop region.
        output_path: Destination file. New backward-compatible parameter;
            defaults to the previously hard-coded "thumbnail.jpg".

    Returns:
        The path of the saved thumbnail.
    """
    thumbnail = analyzer.client.generate_thumbnail(
        width,
        height,
        image_url,
        smart_cropping=smart_cropping
    )

    # The SDK returns the thumbnail as an iterable of byte chunks.
    with open(output_path, "wb") as f:
        for chunk in thumbnail:
            f.write(chunk)

    return output_path


# Generate thumbnail from stream
def generate_thumbnail_from_file(analyzer, file_path, width, height):
    """Generate a smart-cropped thumbnail from a local image file.

    Args:
        analyzer: An ImageAnalyzer whose `.client` is a ComputerVisionClient.
        file_path: Path to the source image.
        width: Thumbnail width in pixels.
        height: Thumbnail height in pixels.

    Returns:
        The path of the saved thumbnail ("thumbnail_{width}x{height}.jpg").
    """
    output_path = f"thumbnail_{width}x{height}.jpg"

    with open(file_path, "rb") as f:
        thumbnail = analyzer.client.generate_thumbnail_in_stream(
            width,
            height,
            f,
            smart_cropping=True
        )
        # Consume the response while the source file is still open: the
        # previous version iterated `thumbnail` after the `with` block, which
        # fails if the SDK reads the input stream lazily.
        with open(output_path, "wb") as out:
            for chunk in thumbnail:
                out.write(chunk)

    return output_path

Spatial Analysis (Video Analytics)

# Spatial analysis for video streams (requires edge deployment)
spatial_analysis_config = {
    "ai_insights": [
        {
            # Person-crossing-line operation: counts people crossing a
            # virtual line drawn over the video frame.
            "type": "cognitiveservices.vision.spatialanalysis-personcrossingline",
            "operation_id": "personcounting",
            "config": {
                # NOTE: boolean-like values are passed as strings here.
                "enable_face_mask_classifier": "true",
                # Embedded JSON string selecting which GPU runs the detector.
                "detector_node_config": '{"gpu_index": 0}',
                "lines": [
                    {
                        "line": {
                            "line": {
                                # Coordinates are normalized to [0, 1]:
                                # this is a vertical line down the middle
                                # of the frame.
                                "start": {"x": 0.5, "y": 0.1},
                                "end": {"x": 0.5, "y": 0.9}
                            },
                            "name": "entrance"
                        },
                        "events": [
                            {
                                # Emit a count event each time the line is
                                # crossed (trigger "event", frequency 1).
                                "type": "count",
                                "config": {
                                    "output_frequency": 1,
                                    "trigger": "event"
                                }
                            }
                        ]
                    }
                ]
            }
        }
    ]
}

# This configuration would be deployed as part of an Azure IoT Edge module

REST API Integration

from flask import Flask, request, jsonify
import base64

app = Flask(__name__)
# NOTE(review): placeholder credentials — load the real endpoint/key from
# configuration or environment variables rather than hard-coding them.
analyzer = ImageAnalyzer("endpoint", "key")
ocr = OCRReader("endpoint", "key")

@app.route("/api/analyze", methods=["POST"])
def analyze_endpoint():
    """Analyze an uploaded image (multipart field "image") or a JSON "url".

    Returns the analysis dict as JSON, or a 400 error when neither an
    image file nor a URL is supplied.
    """
    if "image" in request.files:
        image = request.files["image"]
        stream = io.BytesIO(image.read())
        result = analyzer.analyze_image_stream(stream)
    else:
        # request.json is None (or raises) for non-JSON bodies, so the old
        # `"url" in request.json` check produced a 500 instead of a 400.
        # get_json(silent=True) lets us fall through to a clean 400.
        payload = request.get_json(silent=True) or {}
        if "url" not in payload:
            return jsonify({"error": "No image provided"}), 400
        result = analyzer.analyze_image(payload["url"])

    return jsonify(result)


@app.route("/api/ocr", methods=["POST"])
def ocr_endpoint():
    """Extract text from an uploaded image (field "image") or a JSON "url".

    Returns the OCR result (pages + full_text) as JSON, or a 400 error
    when neither an image file nor a URL is supplied.
    """
    if "image" in request.files:
        image = request.files["image"]
        stream = io.BytesIO(image.read())
        result = ocr.read_text_stream(stream)
    else:
        # Guard against non-JSON bodies: request.json would be None and the
        # old `"url" in request.json` membership test raised a TypeError.
        payload = request.get_json(silent=True) or {}
        if "url" not in payload:
            return jsonify({"error": "No image provided"}), 400
        result = ocr.read_text(payload["url"])

    return jsonify(result)


@app.route("/api/describe", methods=["POST"])
def describe_endpoint():
    """Return the top caption, its confidence, and tags for an image URL.

    Expects a JSON body with a "url" key; responds 400 when it is missing
    or the body is not JSON.
    """
    # get_json(silent=True) returns None instead of erroring on non-JSON
    # bodies, so a missing/invalid body yields the intended 400 response.
    payload = request.get_json(silent=True) or {}
    url = payload.get("url")
    if not url:
        return jsonify({"error": "URL required"}), 400

    result = analyzer.client.describe_image(url)

    # The service may return no captions for some images.
    top = result.captions[0] if result.captions else None
    return jsonify({
        "description": top.text if top else None,
        "confidence": top.confidence if top else 0,
        "tags": result.tags
    })


if __name__ == "__main__":
    # debug=True enables the interactive debugger and auto-reload —
    # development only; never run with debug enabled in production.
    app.run(debug=True)

Conclusion

Azure Computer Vision provides comprehensive image analysis capabilities:

  • Image analysis for understanding visual content
  • OCR for extracting text from images and documents
  • Object detection for locating items in images
  • Smart thumbnails for content-aware cropping
  • Spatial analysis for video stream processing

These capabilities enable applications from content moderation to document processing to retail analytics.

Michael John Pena

Michael John Pena

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.