
Building Custom Image Classifiers with Custom Vision

Azure Custom Vision enables you to build, deploy, and improve your own image classifiers and object detectors without deep machine learning expertise. It’s perfect for domain-specific visual recognition tasks that pre-trained models cannot handle.

Setting Up Custom Vision

# Create Custom Vision training resource
az cognitiveservices account create \
    --name mycustomvision-training \
    --resource-group myResourceGroup \
    --kind CustomVision.Training \
    --sku S0 \
    --location eastus

# Create Custom Vision prediction resource
az cognitiveservices account create \
    --name mycustomvision-prediction \
    --resource-group myResourceGroup \
    --kind CustomVision.Prediction \
    --sku S0 \
    --location eastus
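
Once the resources exist, the Python SDK needs each resource's key, endpoint, and (for publishing) the prediction resource ID. One way to retrieve them with the Azure CLI, using the resource names from the commands above, is:

# Get the training key
az cognitiveservices account keys list \
    --name mycustomvision-training \
    --resource-group myResourceGroup

# Get the training endpoint
az cognitiveservices account show \
    --name mycustomvision-training \
    --resource-group myResourceGroup \
    --query properties.endpoint

# Get the prediction resource ID (used when publishing iterations)
az cognitiveservices account show \
    --name mycustomvision-prediction \
    --resource-group myResourceGroup \
    --query id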

Building an Image Classifier
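
The code below uses the Custom Vision SDK for Python. If it isn't installed yet, it's available on PyPI (the package also pulls in msrest, which provides ApiKeyCredentials):

pip install azure-cognitiveservices-vision-customvision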

from azure.cognitiveservices.vision.customvision.training import CustomVisionTrainingClient
from azure.cognitiveservices.vision.customvision.prediction import CustomVisionPredictionClient
from azure.cognitiveservices.vision.customvision.training.models import ImageFileCreateBatch, ImageFileCreateEntry
from msrest.authentication import ApiKeyCredentials
import os
import time

class CustomVisionClassifier:
    def __init__(self, training_key, training_endpoint, prediction_key, prediction_endpoint, prediction_resource_id):
        training_credentials = ApiKeyCredentials(in_headers={"Training-key": training_key})
        self.trainer = CustomVisionTrainingClient(training_endpoint, training_credentials)

        prediction_credentials = ApiKeyCredentials(in_headers={"Prediction-key": prediction_key})
        self.predictor = CustomVisionPredictionClient(prediction_endpoint, prediction_credentials)

        self.prediction_resource_id = prediction_resource_id
        self.project = None

    def create_project(self, name, classification_type="Multiclass", domain="General"):
        """Create a new Custom Vision project."""
        # Get domain ID
        domains = self.trainer.get_domains()
        domain_id = next(d.id for d in domains if d.name == domain and d.type == "Classification")

        self.project = self.trainer.create_project(
            name,
            domain_id=domain_id,
            classification_type=classification_type
        )

        print(f"Created project: {self.project.name} ({self.project.id})")
        return self.project

    def create_tags(self, tag_names):
        """Create tags for classification."""
        tags = {}
        for name in tag_names:
            tag = self.trainer.create_tag(self.project.id, name)
            tags[name] = tag
            print(f"Created tag: {name} ({tag.id})")
        return tags

    def upload_images(self, tag_id, image_folder):
        """Upload images for a specific tag."""
        image_list = []

        for filename in os.listdir(image_folder):
            if filename.lower().endswith(('.jpg', '.jpeg', '.png')):
                file_path = os.path.join(image_folder, filename)
                with open(file_path, "rb") as f:
                    image_list.append(ImageFileCreateEntry(
                        name=filename,
                        contents=f.read(),
                        tag_ids=[tag_id]
                    ))

                # Upload in batches of 64
                if len(image_list) >= 64:
                    self._upload_batch(image_list)
                    image_list = []

        # Upload remaining images
        if image_list:
            self._upload_batch(image_list)

    def _upload_batch(self, image_list):
        """Upload a batch of images."""
        batch = ImageFileCreateBatch(images=image_list)
        result = self.trainer.create_images_from_files(self.project.id, batch)

        if not result.is_batch_successful:
            print(f"Batch upload failed. {len([img for img in result.images if img.status != 'OK'])} images failed.")
        else:
            print(f"Uploaded {len(image_list)} images")

    def train(self, training_type="Regular"):
        """Train the model."""
        print("Training...")
        iteration = self.trainer.train_project(
            self.project.id,
            training_type=training_type
        )

        # Wait for training to finish (also stop if training fails)
        while iteration.status not in ("Completed", "Failed"):
            iteration = self.trainer.get_iteration(self.project.id, iteration.id)
            print(f"Training status: {iteration.status}")
            time.sleep(10)

        print(f"Training finished: {iteration.status}")
        return iteration

    def publish(self, iteration_id, publish_name):
        """Publish the trained model."""
        self.trainer.publish_iteration(
            self.project.id,
            iteration_id,
            publish_name,
            self.prediction_resource_id
        )
        print(f"Published iteration: {publish_name}")

    def predict(self, image_path, publish_name):
        """Predict image classification."""
        with open(image_path, "rb") as f:
            result = self.predictor.classify_image(
                self.project.id,
                publish_name,
                f
            )

        predictions = []
        for prediction in result.predictions:
            predictions.append({
                "tag": prediction.tag_name,
                "probability": prediction.probability
            })

        return sorted(predictions, key=lambda x: x["probability"], reverse=True)

    def predict_url(self, image_url, publish_name):
        """Predict from image URL."""
        result = self.predictor.classify_image_url(
            self.project.id,
            publish_name,
            url=image_url
        )

        return [
            {"tag": p.tag_name, "probability": p.probability}
            for p in result.predictions
        ]


# Example: Product quality classification
classifier = CustomVisionClassifier(
    training_key="your-training-key",
    training_endpoint="https://eastus.api.cognitive.microsoft.com",
    prediction_key="your-prediction-key",
    prediction_endpoint="https://eastus.api.cognitive.microsoft.com",
    prediction_resource_id="/subscriptions/.../resourceGroups/.../providers/Microsoft.CognitiveServices/accounts/mycustomvision-prediction"
)

# Create project
project = classifier.create_project("ProductQuality", "Multiclass")

# Create tags
tags = classifier.create_tags(["Good", "Defective", "Damaged"])

# Upload training images
classifier.upload_images(tags["Good"].id, "./training_data/good")
classifier.upload_images(tags["Defective"].id, "./training_data/defective")
classifier.upload_images(tags["Damaged"].id, "./training_data/damaged")

# Train model
iteration = classifier.train()

# Publish
classifier.publish(iteration.id, "quality-v1")

# Predict
result = classifier.predict("./test_image.jpg", "quality-v1")
print(f"Prediction: {result[0]['tag']} ({result[0]['probability']:.2%})")

Object Detection

class CustomVisionObjectDetector:
    def __init__(self, training_key, training_endpoint, prediction_key, prediction_endpoint, prediction_resource_id):
        training_credentials = ApiKeyCredentials(in_headers={"Training-key": training_key})
        self.trainer = CustomVisionTrainingClient(training_endpoint, training_credentials)

        prediction_credentials = ApiKeyCredentials(in_headers={"Prediction-key": prediction_key})
        self.predictor = CustomVisionPredictionClient(prediction_endpoint, prediction_credentials)

        self.prediction_resource_id = prediction_resource_id
        self.project = None

    def create_project(self, name, domain="General"):
        """Create object detection project."""
        domains = self.trainer.get_domains()
        domain_id = next(d.id for d in domains if d.name == domain and d.type == "ObjectDetection")

        self.project = self.trainer.create_project(name, domain_id=domain_id)
        return self.project

    def create_tags(self, tag_names):
        """Create tags for detection."""
        tags = {}
        for name in tag_names:
            tag = self.trainer.create_tag(self.project.id, name)
            tags[name] = tag
        return tags

    def upload_image_with_regions(self, image_path, regions):
        """Upload image with bounding box regions.

        regions format: [{"tag_id": "...", "left": 0.1, "top": 0.1, "width": 0.3, "height": 0.3}]
        Coordinates are normalized (0-1)
        """
        from azure.cognitiveservices.vision.customvision.training.models import (
            ImageFileCreateEntry, Region
        )

        with open(image_path, "rb") as f:
            image_data = f.read()

        region_objects = [
            Region(
                tag_id=r["tag_id"],
                left=r["left"],
                top=r["top"],
                width=r["width"],
                height=r["height"]
            )
            for r in regions
        ]

        entry = ImageFileCreateEntry(
            name=os.path.basename(image_path),
            contents=image_data,
            regions=region_objects
        )

        result = self.trainer.create_images_from_files(
            self.project.id,
            ImageFileCreateBatch(images=[entry])
        )

        return result.is_batch_successful

    def train(self):
        """Train the detection model."""
        iteration = self.trainer.train_project(self.project.id)

        # Poll until training finishes (also stop if training fails)
        while iteration.status not in ("Completed", "Failed"):
            iteration = self.trainer.get_iteration(self.project.id, iteration.id)
            time.sleep(10)

        return iteration

    def detect_objects(self, image_path, publish_name):
        """Detect objects in image."""
        with open(image_path, "rb") as f:
            result = self.predictor.detect_image(
                self.project.id,
                publish_name,
                f
            )

        detections = []
        for prediction in result.predictions:
            if prediction.probability > 0.5:  # Confidence threshold
                detections.append({
                    "tag": prediction.tag_name,
                    "probability": prediction.probability,
                    "bounding_box": {
                        "left": prediction.bounding_box.left,
                        "top": prediction.bounding_box.top,
                        "width": prediction.bounding_box.width,
                        "height": prediction.bounding_box.height
                    }
                })

        return detections


# Example: Retail shelf detection
detector = CustomVisionObjectDetector(
    training_key="your-key",
    training_endpoint="https://eastus.api.cognitive.microsoft.com",
    prediction_key="your-key",
    prediction_endpoint="https://eastus.api.cognitive.microsoft.com",
    prediction_resource_id="your-resource-id"
)

# Create project
project = detector.create_project("ShelfProducts", "General")

# Create tags for products
tags = detector.create_tags(["Cola", "Water", "Juice", "Empty"])

# Upload images with annotations
detector.upload_image_with_regions("shelf1.jpg", [
    {"tag_id": tags["Cola"].id, "left": 0.1, "top": 0.2, "width": 0.15, "height": 0.3},
    {"tag_id": tags["Water"].id, "left": 0.3, "top": 0.2, "width": 0.15, "height": 0.3},
    {"tag_id": tags["Empty"].id, "left": 0.5, "top": 0.2, "width": 0.15, "height": 0.3}
])

# Train and publish
iteration = detector.train()
detector.trainer.publish_iteration(
    detector.project.id,
    iteration.id,
    "shelf-v1",
    detector.prediction_resource_id
)

# Detect
detections = detector.detect_objects("test_shelf.jpg", "shelf-v1")
for d in detections:
    print(f"Found {d['tag']} at ({d['bounding_box']['left']:.2f}, {d['bounding_box']['top']:.2f})")

Exporting Models for Edge Deployment

def export_model(trainer, project_id, iteration_id, platform="TensorFlow"):
    """Export model for offline use.

    Note: only iterations trained with a compact domain can be exported.
    Available platforms: TensorFlow, CoreML, ONNX, DockerFile, OpenVino
    """
    # Request export
    export = trainer.export_iteration(
        project_id,
        iteration_id,
        platform
    )

    # Poll until the export finishes
    while export.status == "Exporting":
        time.sleep(5)
        exports = trainer.get_exports(project_id, iteration_id)
        export = next((e for e in exports if e.platform.lower() == platform.lower()), None)
        if export is None:
            raise Exception(f"No export found for platform {platform}")

    if export.status == "Done":
        print(f"Export URL: {export.download_uri}")
        return export.download_uri
    else:
        raise Exception(f"Export failed: {export.status}")


# Note: export requires the project to use a compact domain (e.g., "General (compact)")

# Export to TensorFlow for mobile and edge
export_url = export_model(
    classifier.trainer,
    classifier.project.id,
    iteration.id,
    "TensorFlow"
)

# Export to ONNX for cross-platform
export_url = export_model(
    classifier.trainer,
    classifier.project.id,
    iteration.id,
    "ONNX"
)

# Export to CoreML for iOS
export_url = export_model(
    classifier.trainer,
    classifier.project.id,
    iteration.id,
    "CoreML"
)
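
The returned download URI typically points at a ZIP archive containing the model file and its labels (exact contents vary by platform). A small helper to fetch and unpack it, assuming the requests package is available:

import io
import zipfile
import requests

def download_export(download_uri, target_dir="exported_model"):
    """Download the exported model archive and extract it locally."""
    response = requests.get(download_uri)
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(target_dir)
    return target_dir

model_dir = download_export(export_url)
print(f"Model extracted to: {model_dir}")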

Using Exported Model (ONNX)

import onnxruntime as ort
import numpy as np
from PIL import Image

class ONNXPredictor:
    def __init__(self, model_path, labels_path):
        self.session = ort.InferenceSession(model_path)
        self.input_name = self.session.get_inputs()[0].name
        self.output_name = self.session.get_outputs()[0].name

        with open(labels_path) as f:
            self.labels = [line.strip() for line in f]

    def preprocess(self, image_path, size=(224, 224)):
        """Preprocess image for model."""
        img = Image.open(image_path).convert("RGB")
        img = img.resize(size)
        img_array = np.array(img).astype(np.float32)

        # Scale pixel values to [0, 1]; adjust this step if your exported
        # model expects different preprocessing (check the export's metadata)
        img_array = img_array / 255.0

        # Add batch dimension and transpose to NCHW
        img_array = np.transpose(img_array, (2, 0, 1))
        img_array = np.expand_dims(img_array, axis=0)

        return img_array

    def predict(self, image_path):
        """Run prediction."""
        input_data = self.preprocess(image_path)
        outputs = self.session.run([self.output_name], {self.input_name: input_data})

        probabilities = outputs[0][0]
        top_idx = np.argmax(probabilities)

        return {
            "label": self.labels[top_idx],
            "confidence": float(probabilities[top_idx]),
            "all_predictions": [
                {"label": self.labels[i], "confidence": float(p)}
                for i, p in enumerate(probabilities)
            ]
        }


# Use exported model locally
predictor = ONNXPredictor("model.onnx", "labels.txt")
result = predictor.predict("test_image.jpg")
print(f"Prediction: {result['label']} ({result['confidence']:.2%})")

Iterative Improvement

def add_misclassified_images(classifier, publish_name, image_folder, correct_tag):
    """Add misclassified images to the training set to improve the model."""
    tags = {t.name: t for t in classifier.trainer.get_tags(classifier.project.id)}
    misclassified = []

    for filename in os.listdir(image_folder):
        if filename.lower().endswith(('.jpg', '.jpeg', '.png')):
            image_path = os.path.join(image_folder, filename)

            # Get the current prediction
            predictions = classifier.predict(image_path, publish_name)

            # If misclassified, queue just this image with the correct tag
            if predictions[0]["tag"] != correct_tag:
                print(f"Adding misclassified image: {filename}")
                print(f"  Was: {predictions[0]['tag']}, Should be: {correct_tag}")

                with open(image_path, "rb") as f:
                    misclassified.append(ImageFileCreateEntry(
                        name=filename,
                        contents=f.read(),
                        tag_ids=[tags[correct_tag].id]
                    ))

    # Upload the corrected images (batches are limited to 64 images)
    for i in range(0, len(misclassified), 64):
        batch = ImageFileCreateBatch(images=misclassified[i:i + 64])
        classifier.trainer.create_images_from_files(classifier.project.id, batch)

    # Retrain
    iteration = classifier.train()
    return iteration
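
Tying it together: assuming a folder of images that are known to be "Defective" but that the published model sometimes gets wrong (the folder path is illustrative), the model can be corrected, retrained, and republished like this:

# Feed back misclassified "Defective" images and retrain
new_iteration = add_misclassified_images(
    classifier,
    publish_name="quality-v1",
    image_folder="./feedback/defective",
    correct_tag="Defective"
)

# Publish the improved iteration under a new name
classifier.publish(new_iteration.id, "quality-v2")
result = classifier.predict("./test_image.jpg", "quality-v2")
print(f"Prediction: {result[0]['tag']} ({result[0]['probability']:.2%})")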

Conclusion

Custom Vision makes it easy to build domain-specific image recognition:

  • No ML expertise required: Simple web interface and SDK
  • Classification and detection: Both scenarios supported
  • Export for edge: Deploy to mobile and IoT devices
  • Iterative improvement: Continuously enhance accuracy
  • Quick training: Results in minutes, not hours

It’s perfect for quality inspection, inventory management, and specialized recognition tasks.

Michael John Pena

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.