
GPU Availability in 2024: Navigating the AI Compute Crunch

GPU availability has been a defining constraint of the AI era. Let’s examine the 2024 landscape and strategies for securing compute resources.

The GPU Landscape

Current State (Late 2024)

NVIDIA GPU Hierarchy:
┌─────────────────────────────────────────────────────────┐
│  H100/H200 (Data Center)                                │
│  - Highest performance                                   │
│  - Scarce availability                                   │
│  - 6-12 month lead times for large orders               │
├─────────────────────────────────────────────────────────┤
│  A100 (Data Center)                                      │
│  - Previous generation, still powerful                   │
│  - Better availability                                   │
│  - Good price-performance for many workloads            │
├─────────────────────────────────────────────────────────┤
│  A10G/L4 (Cloud Inference)                              │
│  - Optimized for inference                              │
│  - Generally available                                   │
│  - Cost-effective for serving                           │
├─────────────────────────────────────────────────────────┤
│  T4 (Entry Level)                                       │
│  - Widely available                                      │
│  - Good for small models                                 │
│  - Lowest cost                                          │
└─────────────────────────────────────────────────────────┘
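
To make the tiers concrete, here's a quick back-of-the-envelope sketch of how many cards in each tier you'd need just to hold a model's FP16 weights. The per-card memory figures are the common configurations (H200 141GB, H100/A100 80GB, A10G/L4 24GB, T4 16GB); activations and KV cache are ignored.

import math

# Minimum GPUs per tier to hold FP16 weights only (~2 bytes per parameter);
# ignores activations, KV cache, and framework overhead
GPU_MEMORY_GB = {"H200": 141, "H100": 80, "A100": 80, "A10G": 24, "L4": 24, "T4": 16}

def min_gpus_for_model(model_size_b: float) -> dict:
    weights_gb = model_size_b * 2  # FP16: ~2 GB per billion parameters
    return {gpu: math.ceil(weights_gb / mem) for gpu, mem in GPU_MEMORY_GB.items()}

print(min_gpus_for_model(70))
# {'H200': 1, 'H100': 2, 'A100': 2, 'A10G': 6, 'L4': 6, 'T4': 9}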

Cloud GPU Availability

Azure GPU Options

azure_gpu_instances = {
    # Training/Large Model Inference
    "ND_A100_v4": {
        "gpus": 8,
        "gpu_type": "A100 80GB",
        "memory_per_gpu": "80GB",
        "interconnect": "NVLink",
        "availability": "Limited - reserve in advance",
        "use_case": "Large model training, 70B+ inference"
    },
    "NC_A100_v4": {
        "gpus": 4,
        "gpu_type": "A100 80GB",
        "memory_per_gpu": "80GB",
        "availability": "Moderate",
        "use_case": "Medium training, 70B inference"
    },

    # Inference Focused
    "NC_T4_v3": {
        "gpus": "1-4",
        "gpu_type": "T4 16GB",
        "memory_per_gpu": "16GB",
        "availability": "Good",
        "use_case": "Small model inference, development"
    },
    "NC_A10_v4": {
        "gpus": "1-2",
        "gpu_type": "A10 24GB",
        "memory_per_gpu": "24GB",
        "availability": "Good",
        "use_case": "Medium model inference"
    }
}

def select_gpu_for_workload(model_size_b: float, batch_size: int, training: bool) -> str:
    """Select an appropriate GPU instance based on estimated memory."""

    memory_needed = estimate_memory(model_size_b, batch_size, training)

    if training:
        if memory_needed > 320:    # More than a 4x A100 80GB node can hold
            return "ND_A100_v4"    # 8x A100 for large-model training
        elif memory_needed > 64:   # Beyond 4x T4 16GB
            return "NC_A100_v4"
        else:
            return "NC_T4_v3"
    else:  # Inference
        if memory_needed > 48:     # Beyond 2x A10 24GB; 70B is ~140GB in FP16
            return "NC_A100_v4"
        elif memory_needed > 16:   # Beyond a single T4
            return "NC_A10_v4"
        else:
            return "NC_T4_v3"

def estimate_memory(model_size_b: float, batch_size: int, training: bool) -> float:
    """Roughly estimate GPU memory needed, in GB."""
    # Model weights in FP16: ~2 bytes per parameter, so ~2 GB per billion params
    param_memory = model_size_b * 2

    if training:
        # Gradients plus AdamW optimizer states (FP32 master weights, momentum,
        # variance) add several times the weight memory again; 4x is a rough
        # lower bound that ignores activations
        training_overhead = param_memory * 4
        return param_memory + training_overhead
    else:
        # Inference: weights plus KV cache; this term is very rough and ignores
        # sequence length and model dimensions
        kv_cache = batch_size * 2
        return param_memory + kv_cache
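
A quick sanity check of the two helpers above (a sketch, using the shorthand instance labels from the table rather than exact Azure SKU names):

# Example: pick an instance for a few common scenarios
for size_b, training in [(7, False), (13, True), (70, False)]:
    instance = select_gpu_for_workload(size_b, batch_size=1, training=training)
    print(f"{size_b}B, training={training}: {instance}")
# 7B inference  -> NC_T4_v3
# 13B training  -> NC_A100_v4
# 70B inference -> NC_A100_v4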

Strategies for GPU Access

Strategy 1: Reserved Capacity

# Azure Reserved Instances for GPUs
reservation_strategy = {
    "when_to_reserve": [
        "Predictable, consistent workloads",
        "Long-term projects (1+ year)",
        "Cost optimization priority"
    ],

    "reservation_options": {
        "1_year": {"discount": "~30%", "flexibility": "Limited"},
        "3_year": {"discount": "~50%", "flexibility": "Very Limited"},
        "spot": {"discount": "~60-90%", "flexibility": "Can be evicted"}
    },

    "best_practice": """
    - Reserve base capacity (consistent load)
    - Use on-demand for peaks
    - Use spot for non-critical batch jobs
    """
}
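
To make the base-plus-burst idea concrete, here's a minimal sketch that blends reserved, on-demand, and spot pricing for a monthly GPU-hour budget. The hourly rate and discount levels are placeholders, not actual Azure quotes.

def blended_monthly_cost(
    base_hours: float,       # steady, predictable GPU-hours (reserved)
    peak_hours: float,       # bursty GPU-hours on top of the base (on-demand)
    batch_hours: float,      # interruptible batch GPU-hours (spot)
    on_demand_rate: float,   # $/GPU-hour, placeholder assumption
    reserved_discount: float = 0.30,
    spot_discount: float = 0.70,
) -> float:
    reserved = base_hours * on_demand_rate * (1 - reserved_discount)
    on_demand = peak_hours * on_demand_rate
    spot = batch_hours * on_demand_rate * (1 - spot_discount)
    return reserved + on_demand + spot

# e.g. 600 steady + 100 peak + 200 batch GPU-hours at a hypothetical $3/hr
print(f"${blended_monthly_cost(600, 100, 200, on_demand_rate=3.0):,.0f}")  # $1,740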

Strategy 2: Multi-Cloud Approach

class MultiCloudGPUManager:
    """Manage GPU resources across clouds."""

    def __init__(self):
        self.providers = {
            "azure": AzureGPUProvider(),
            "aws": AWSGPUProvider(),
            "gcp": GCPGPUProvider()
        }

    async def find_available_capacity(
        self,
        gpu_type: str,
        count: int,
        region_preference: list
    ) -> list:
        """Find available GPU capacity across clouds."""

        options = []

        for provider_name, provider in self.providers.items():
            availability = await provider.check_availability(
                gpu_type=gpu_type,
                count=count,
                regions=region_preference
            )

            for region, status in availability.items():
                if status["available"]:
                    options.append({
                        "provider": provider_name,
                        "region": region,
                        "price_per_hour": status["price"],
                        "estimated_wait": status["wait_time"]
                    })

        return sorted(options, key=lambda x: x["price_per_hour"])

    async def provision_best_option(self, requirements: dict):
        """Provision from best available option."""

        options = await self.find_available_capacity(
            gpu_type=requirements["gpu_type"],
            count=requirements["count"],
            region_preference=requirements["regions"]
        )

        if not options:
            raise NoCapacityException("No GPU capacity available")

        best = options[0]
        return await self.providers[best["provider"]].provision(
            gpu_type=requirements["gpu_type"],
            count=requirements["count"],
            region=best["region"]
        )
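
The provider classes above are assumed rather than shown. All the manager really needs is for each of them to implement the same small interface; here is a minimal sketch of that assumed contract (cloud-specific details deliberately left out):

from abc import ABC, abstractmethod

class GPUProvider(ABC):
    """Assumed interface each cloud-specific provider implements."""

    @abstractmethod
    async def check_availability(self, gpu_type: str, count: int, regions: list) -> dict:
        """Return {region: {"available": bool, "price": float, "wait_time": str}}."""

    @abstractmethod
    async def provision(self, gpu_type: str, count: int, region: str):
        """Provision the requested GPUs and return a handle to the instances."""

class NoCapacityException(Exception):
    """Raised when no provider has capacity for the request."""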

Strategy 3: Optimize Utilization

class GPUOptimizer:
    """Maximize GPU utilization."""

    def optimize_batch_size(self, model, gpu_memory_gb: int) -> int:
        """Find optimal batch size for GPU memory."""

        model_memory = model.estimate_memory()
        available = gpu_memory_gb * 0.9  # Leave 10% buffer

        # Binary search for optimal batch size
        low, high = 1, 128
        optimal = 1

        while low <= high:
            mid = (low + high) // 2
            batch_memory = model.estimate_batch_memory(mid)

            if model_memory + batch_memory <= available:
                optimal = mid
                low = mid + 1
            else:
                high = mid - 1

        return optimal

    def schedule_jobs_efficiently(self, jobs: list, gpus: list) -> dict:
        """Pack jobs onto GPUs efficiently."""

        # Sort jobs by memory requirement (descending)
        jobs = sorted(jobs, key=lambda j: j.memory_required, reverse=True)

        assignments = {gpu.id: [] for gpu in gpus}
        gpu_usage = {gpu.id: 0 for gpu in gpus}

        for job in jobs:
            # Worst-fit: pick the GPU with the most free memory that still fits
            # the job, which spreads load across cards
            best_gpu = None
            best_remaining = -1

            for gpu in gpus:
                remaining = gpu.memory - gpu_usage[gpu.id]
                if remaining >= job.memory_required:
                    if remaining > best_remaining:
                        best_gpu = gpu
                        best_remaining = remaining

            if best_gpu:
                assignments[best_gpu.id].append(job)
                gpu_usage[best_gpu.id] += job.memory_required
            else:
                raise InsufficientCapacityException(f"Cannot fit job {job.id}")

        return assignments
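
A small usage example of the scheduler, with throwaway Job and GPU dataclasses standing in for whatever your stack actually uses (InsufficientCapacityException is assumed to be defined elsewhere):

from dataclasses import dataclass

@dataclass
class Job:
    id: str
    memory_required: int  # GB

@dataclass
class GPU:
    id: str
    memory: int  # GB

jobs = [Job("train-a", 60), Job("infer-b", 20), Job("infer-c", 15)]
gpus = [GPU("gpu-0", 80), GPU("gpu-1", 80)]

optimizer = GPUOptimizer()
print(optimizer.schedule_jobs_efficiently(jobs, gpus))
# train-a (60 GB) lands on one 80 GB card; the two inference jobs share the other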

Strategy 4: Use Managed Services

# Often more cost-effective than raw GPUs

managed_services_comparison = {
    "azure_openai": {
        "gpu_management": "None required",
        "scaling": "Automatic",
        "availability": "High",
        "cost_model": "Per token",
        "when_to_use": "Standard LLM workloads"
    },
    "azure_ml_endpoints": {
        "gpu_management": "Managed",
        "scaling": "Configurable",
        "availability": "Depends on GPU type",
        "cost_model": "Per hour + inference",
        "when_to_use": "Custom models"
    },
    "raw_gpu_instances": {
        "gpu_management": "Full control, full responsibility",
        "scaling": "Manual or custom",
        "availability": "Constrained for high-end",
        "cost_model": "Per hour",
        "when_to_use": "Training, specialized workloads"
    }
}

# Key insight: Managed services abstract away GPU scarcity
# for most inference workloads
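
One way to reason about the trade-off is a rough break-even between per-token managed pricing and a self-hosted GPU billed per hour. The rates below are placeholders for illustration, not published prices.

# Sketch: tokens/hour at which a self-hosted GPU matches managed per-token pricing.
# Both rates are placeholder assumptions.
def breakeven_tokens_per_hour(
    managed_price_per_1k_tokens: float,  # $ per 1K tokens (placeholder)
    gpu_price_per_hour: float,           # $ per GPU-hour (placeholder)
) -> float:
    return gpu_price_per_hour / managed_price_per_1k_tokens * 1_000

# With hypothetical rates of $0.002 per 1K tokens vs $3 per GPU-hour:
tokens = breakeven_tokens_per_hour(0.002, 3.0)
print(f"Self-hosting breaks even above ~{tokens:,.0f} tokens/hour, sustained")
# ~1,500,000 tokens/hour - and that still ignores the ops overhead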

Looking Ahead

2025 GPU Landscape Predictions:
├── Supply improves as new fabs come online
├── H200 becomes more available
├── Alternative accelerators gain share (AMD, Intel, custom)
├── Managed services continue to abstract GPU access
└── Edge AI reduces cloud GPU demand

The GPU crunch will ease, but strategic capacity planning remains important. Use managed services where possible, and reserve capacity for predictable workloads.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.