DirectML: Hardware-Accelerated AI on Windows
DirectML is Microsoft’s hardware-accelerated machine learning API that works across all DirectX 12 GPUs. Today I’m exploring how to leverage it for high-performance AI inference on Windows.
What is DirectML?
DirectML provides:
- Vendor-agnostic GPU acceleration: Works with NVIDIA, AMD, Intel, Qualcomm
- NPU support: Leverages neural processing units when available
- Consistent API: Same code works across hardware
- Windows integration: Part of the Windows AI platform
DirectML Architecture
Application
↓
ONNX Runtime / DirectML API
↓
DirectML Runtime
↓
DirectX 12
↓
GPU / NPU Driver
↓
Hardware
Using DirectML with ONNX Runtime
import onnxruntime as ort
import numpy as np

# Check if DirectML is available
providers = ort.get_available_providers()
print(f"Available providers: {providers}")

if 'DmlExecutionProvider' in providers:
    print("DirectML is available!")

# Create session with DirectML
session_options = ort.SessionOptions()
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

session = ort.InferenceSession(
    "model.onnx",
    sess_options=session_options,
    providers=['DmlExecutionProvider', 'CPUExecutionProvider']
)

# Run inference
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
outputs = session.run(None, {"input": input_data})
print(f"Output shape: {outputs[0].shape}")
Device Selection
# List available DirectML devices
import onnxruntime as ort

def list_dml_devices():
    """List all DirectML-capable devices by probing device IDs."""
    devices = []
    for device_id in range(10):  # Check up to 10 devices
        try:
            session = ort.InferenceSession(
                "model.onnx",
                providers=[
                    ('DmlExecutionProvider', {'device_id': device_id}),
                    'CPUExecutionProvider'
                ]
            )
            # An invalid device ID may silently fall back to CPU,
            # so confirm DirectML is actually the active provider
            if session.get_providers()[0] != 'DmlExecutionProvider':
                break
            devices.append({
                'device_id': device_id,
                'provider': session.get_providers()[0]
            })
        except Exception:
            break
    return devices
# Select specific GPU
dml_options = {
    'device_id': 0,  # First GPU
    'performance_preference': 'high_performance',
    'disable_metacommands': False
}

session = ort.InferenceSession(
    "model.onnx",
    providers=[
        ('DmlExecutionProvider', dml_options),
        'CPUExecutionProvider'
    ]
)
Low-Level DirectML API
For maximum control, use DirectML directly:
#include <DirectML.h>
#include <d3d12.h>
#include <wrl/client.h>  // Microsoft::WRL::ComPtr

using Microsoft::WRL::ComPtr;

class DirectMLInference {
public:
    DirectMLInference() {
        // Create D3D12 device
        D3D12CreateDevice(
            nullptr,
            D3D_FEATURE_LEVEL_12_0,
            IID_PPV_ARGS(&d3d12Device)
        );

        // Create DirectML device
        DML_CREATE_DEVICE_FLAGS dmlFlags = DML_CREATE_DEVICE_FLAG_NONE;
        DMLCreateDevice(
            d3d12Device.Get(),
            dmlFlags,
            IID_PPV_ARGS(&dmlDevice)
        );

        // Create command queue
        D3D12_COMMAND_QUEUE_DESC queueDesc = {};
        queueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
        d3d12Device->CreateCommandQueue(
            &queueDesc,
            IID_PPV_ARGS(&commandQueue)
        );
    }

    void CreateConvolutionOperator(
        const DML_TENSOR_DESC& inputTensor,
        const DML_TENSOR_DESC& filterTensor,
        const DML_TENSOR_DESC& outputTensor
    ) {
        DML_CONVOLUTION_OPERATOR_DESC convDesc = {};
        convDesc.InputTensor = &inputTensor;
        convDesc.FilterTensor = &filterTensor;
        convDesc.OutputTensor = &outputTensor;
        convDesc.Mode = DML_CONVOLUTION_MODE_CROSS_CORRELATION;
        convDesc.Direction = DML_CONVOLUTION_DIRECTION_FORWARD;
        convDesc.DimensionCount = 2;
        convDesc.GroupCount = 1;  // standard (non-grouped) convolution

        UINT strides[] = { 1, 1 };
        UINT dilations[] = { 1, 1 };
        UINT paddings[] = { 0, 0, 0, 0 };  // { startH, startW, endH, endW }
        UINT outputPadding[] = { 0, 0 };
        convDesc.Strides = strides;
        convDesc.Dilations = dilations;
        convDesc.StartPadding = paddings;
        convDesc.EndPadding = paddings + 2;
        convDesc.OutputPadding = outputPadding;

        DML_OPERATOR_DESC opDesc = {
            DML_OPERATOR_CONVOLUTION,
            &convDesc
        };
        dmlDevice->CreateOperator(&opDesc, IID_PPV_ARGS(&convOperator));
    }

private:
    ComPtr<ID3D12Device> d3d12Device;
    ComPtr<IDMLDevice> dmlDevice;
    ComPtr<ID3D12CommandQueue> commandQueue;
    ComPtr<IDMLOperator> convOperator;
};
Python with DirectML Extension
import torch
import torch_directml

# Get DirectML device
dml_device = torch_directml.device()
print(f"Using device: {dml_device}")

# Move model to DirectML (MyModel is a placeholder for your own nn.Module)
model = MyModel()
model = model.to(dml_device)

# Inference
input_data = torch.randn(1, 3, 224, 224).to(dml_device)
with torch.no_grad():
    output = model(input_data)
print(f"Output shape: {output.shape}")
Optimizing for DirectML
Memory Management
import onnxruntime as ort
import numpy as np

class DirectMLOptimizedInference:
    def __init__(self, model_path: str, device_id: int = 0):
        # Configure for optimal DirectML performance
        sess_options = ort.SessionOptions()

        # Enable memory pattern optimization
        sess_options.enable_mem_pattern = True

        # Disable the CPU memory arena; the heavy allocations live on the GPU
        sess_options.enable_cpu_mem_arena = False

        # Set graph optimization level
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL

        self.session = ort.InferenceSession(
            model_path,
            sess_options=sess_options,
            providers=[
                ('DmlExecutionProvider', {
                    'device_id': device_id,
                    'performance_preference': 'high_performance'
                })
            ]
        )

        # Pre-allocate IO binding
        self.io_binding = self.session.io_binding()

    def run_optimized(self, input_data: np.ndarray) -> np.ndarray:
        """Run inference with optimized memory handling.

        Assumes the model's tensors are named 'input' and 'output'.
        """
        # Bind input (the numpy-backed buffer lives in CPU memory)
        input_tensor = ort.OrtValue.ortvalue_from_numpy(input_data)
        self.io_binding.bind_input(
            name='input',
            device_type='cpu',
            device_id=0,
            element_type=np.float32,
            shape=input_data.shape,
            buffer_ptr=input_tensor.data_ptr()
        )

        # Keep the output on the DirectML device until it is copied back
        self.io_binding.bind_output('output', 'dml')

        # Run
        self.session.run_with_iobinding(self.io_binding)

        # Get output
        return self.io_binding.copy_outputs_to_cpu()[0]
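A quick usage sketch for the wrapper above (the model path and the 'input'/'output' tensor names are placeholders for your own model):

# Hypothetical usage of the wrapper defined above
engine = DirectMLOptimizedInference("model.onnx", device_id=0)

# Dummy image-shaped input; replace with real preprocessed data
dummy = np.random.randn(1, 3, 224, 224).astype(np.float32)
result = engine.run_optimized(dummy)
print(f"Output shape: {result.shape}")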
Batch Processing
import time
import numpy as np

def optimal_batch_size(session, input_shape, target_memory_gb=4):
    """Find optimal batch size for DirectML."""
    # Estimate memory per sample (rough approximation:
    # input + intermediate activations + output, 4 bytes per float)
    sample_memory = np.prod(input_shape[1:]) * 4 * 3
    max_batch = int((target_memory_gb * 1e9) / sample_memory)

    # Test increasing batch sizes
    best_batch = 1
    best_throughput = 0

    for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]:
        if batch_size > max_batch:
            break
        try:
            input_data = np.random.randn(batch_size, *input_shape[1:]).astype(np.float32)

            # Warmup
            for _ in range(3):
                session.run(None, {'input': input_data})

            # Benchmark
            start = time.perf_counter()
            iterations = 10
            for _ in range(iterations):
                session.run(None, {'input': input_data})
            elapsed = time.perf_counter() - start

            throughput = (batch_size * iterations) / elapsed
            if throughput > best_throughput:
                best_throughput = throughput
                best_batch = batch_size
        except Exception as e:
            print(f"Batch size {batch_size} failed: {e}")
            break

    return best_batch, best_throughput
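Used against one of the DirectML sessions created earlier, for example (again assuming a model that takes a 1x3x224x224 input named 'input'):

# Hypothetical usage; "model.onnx" is a placeholder
session = ort.InferenceSession(
    "model.onnx",
    providers=['DmlExecutionProvider', 'CPUExecutionProvider']
)
batch, throughput = optimal_batch_size(session, (1, 3, 224, 224), target_memory_gb=4)
print(f"Best batch size: {batch} ({throughput:.1f} samples/s)")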
DirectML vs CUDA Comparison
import time
import numpy as np
import onnxruntime as ort

def compare_providers(model_path, input_shape, iterations=100):
    results = {}

    for provider in ['DmlExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']:
        try:
            session = ort.InferenceSession(
                model_path,
                providers=[provider]
            )
            input_data = np.random.randn(*input_shape).astype(np.float32)

            # Warmup
            for _ in range(10):
                session.run(None, {'input': input_data})

            # Benchmark
            start = time.perf_counter()
            for _ in range(iterations):
                session.run(None, {'input': input_data})
            elapsed = time.perf_counter() - start

            results[provider] = {
                'total_time': elapsed,
                'avg_latency_ms': (elapsed / iterations) * 1000,
                'throughput': iterations / elapsed
            }
        except Exception as e:
            results[provider] = {'error': str(e)}

    return results

# Run comparison
results = compare_providers("model.onnx", (1, 3, 224, 224))
for provider, metrics in results.items():
    if 'error' not in metrics:
        print(f"{provider}: {metrics['avg_latency_ms']:.2f}ms, {metrics['throughput']:.1f}/s")
Supported Operations
DirectML supports most common ML operations:
| Category | Operations |
|---|---|
| Convolution | Conv, ConvTranspose, DepthwiseConv |
| Pooling | MaxPool, AveragePool, GlobalPool |
| Activation | ReLU, GELU, Sigmoid, Tanh, Softmax |
| Normalization | BatchNorm, LayerNorm, InstanceNorm |
| Linear | MatMul, Gemm, FullyConnected |
| Attention | MultiHeadAttention (fused) |
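The table is only a summary, not the full operator list. Before committing to DirectML, it can help to walk a model's graph and see which operator types it actually uses; here is a small sketch using the onnx package (the model path is a placeholder):

import onnx
from collections import Counter

# Count the operator types a model uses so they can be checked
# against the DirectML / ONNX Runtime support tables
model = onnx.load("model.onnx")
op_counts = Counter(node.op_type for node in model.graph.node)
for op, count in op_counts.most_common():
    print(f"{op}: {count}")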
Troubleshooting
def diagnose_directml():
    """Diagnose DirectML setup."""
    import onnxruntime as ort

    print("ONNX Runtime version:", ort.__version__)
    print("Available providers:", ort.get_available_providers())

    if 'DmlExecutionProvider' not in ort.get_available_providers():
        print("\nDirectML not available. Check:")
        print("1. Windows 10 version 1903+ or Windows 11")
        print("2. DirectX 12 compatible GPU")
        print("3. onnxruntime-directml package installed")
        return

    # Try to create session
    try:
        session = ort.InferenceSession(
            "model.onnx",
            providers=['DmlExecutionProvider']
        )
        print("\nDirectML session created successfully!")
        print("Active provider:", session.get_providers()[0])
    except Exception as e:
        print(f"\nFailed to create session: {e}")
Best Practices
- Use ONNX Runtime - Easiest way to leverage DirectML
- Optimize models - Graph optimization and quantization help
- Batch appropriately - Find optimal batch size for your GPU
- Use IO binding - Reduce CPU-GPU transfers
- Profile performance - Use PIX, GPU profilers, or ONNX Runtime's built-in profiler (see the snippet below)
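For a quick first pass before reaching for PIX, ONNX Runtime's built-in profiler also works with the DirectML provider. A minimal sketch (model path and input name are placeholders):

import onnxruntime as ort
import numpy as np

# Enable ONNX Runtime's built-in profiler; it writes a JSON trace
# that can be opened in chrome://tracing or Perfetto
sess_options = ort.SessionOptions()
sess_options.enable_profiling = True

session = ort.InferenceSession(
    "model.onnx",
    sess_options=sess_options,
    providers=['DmlExecutionProvider', 'CPUExecutionProvider']
)

input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
session.run(None, {'input': input_data})

# end_profiling() stops profiling and returns the trace file path
profile_path = session.end_profiling()
print(f"Profile written to: {profile_path}")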
What’s Next
Tomorrow I’ll cover the Windows AI Recall feature and its implications.