ONNX Runtime: Optimizing Model Inference
ONNX Runtime provides high-performance inference for machine learning models. Today we’ll explore how to convert models to ONNX and optimize them with ONNX Runtime.
Understanding ONNX
# ONNX (Open Neural Network Exchange)
onnx_overview = {
    "what": "Open format for ML models",
    "benefits": [
        "Framework interoperability",
        "Optimized inference runtime",
        "Hardware acceleration",
        "Cross-platform deployment"
    ],
    "supported_frameworks": [
        "PyTorch",
        "TensorFlow",
        "scikit-learn",
        "Keras"
    ]
}
# Installation
# pip install onnx onnxruntime
# pip install onnxruntime-gpu # For GPU support
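Framework interoperability is easiest to see with a non-PyTorch example. Here is a minimal sketch converting a scikit-learn classifier to ONNX (it assumes skl2onnx is installed via pip install skl2onnx; the dataset and file name are just for illustration):
# Sketch: exporting a scikit-learn classifier to ONNX (assumes skl2onnx is installed)
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Train a small scikit-learn model
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=200).fit(X, y)

# Declare the input signature: float tensor, dynamic batch size, 4 features
onnx_clf = convert_sklearn(clf, initial_types=[("input", FloatTensorType([None, 4]))])
with open("sklearn_model.onnx", "wb") as f:
    f.write(onnx_clf.SerializeToString())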
Converting PyTorch to ONNX
import torch
import onnx
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Load PyTorch model
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Prepare dummy input
dummy_input = tokenizer(
    "Hello, this is a test",
    return_tensors="pt",
    padding="max_length",
    max_length=128,
    truncation=True
)
# Export to ONNX
torch.onnx.export(
    model,
    (dummy_input["input_ids"], dummy_input["attention_mask"]),
    "model.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch_size", 1: "sequence"},
        "attention_mask": {0: "batch_size", 1: "sequence"},
        "logits": {0: "batch_size"}
    },
    opset_version=14
)
# Verify the model
onnx_model = onnx.load("model.onnx")
onnx.checker.check_model(onnx_model)
print("ONNX model is valid!")
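Beyond the checker, it’s worth confirming the input/output names and opset that actually landed in the graph, since the ONNX Runtime session later refers to inputs by name:
# Inspect the exported graph: opset version and input/output names
print("Opset:", onnx_model.opset_import[0].version)
print("Inputs:", [inp.name for inp in onnx_model.graph.input])
print("Outputs:", [out.name for out in onnx_model.graph.output])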
Using Optimum for Transformers
# pip install optimum[onnxruntime]
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer
# Direct export and load
model = ORTModelForSequenceClassification.from_pretrained(
    model_name,
    export=True  # Automatically exports to ONNX
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Save ONNX model
model.save_pretrained("./onnx_model")
# Load saved ONNX model
model = ORTModelForSequenceClassification.from_pretrained("./onnx_model")
# Inference (same API as Transformers)
inputs = tokenizer("This is great!", return_tensors="pt")
outputs = model(**inputs)
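Because the ORT model keeps the Transformers API, it also drops into the pipeline helper. A short sketch (the exact label/score output depends on the checkpoint’s config):
from transformers import pipeline

# The ONNX-backed model plugs into a standard Transformers pipeline
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("This is great!"))  # e.g. a list with a label and score dict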
Running Inference with ONNX Runtime
import onnxruntime as ort
import numpy as np
# Create inference session
session = ort.InferenceSession(
    "model.onnx",
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)
# Check available providers
print(ort.get_available_providers())
# Prepare input
text = "This movie is amazing!"
inputs = tokenizer(text, return_tensors="np", padding="max_length", max_length=128)
# Run inference
outputs = session.run(
    None,  # Get all outputs
    {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"]
    }
)
logits = outputs[0]
predicted_class = np.argmax(logits, axis=-1)
print(f"Predicted class: {predicted_class}")
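To report something more interpretable than a class index, the logits can be turned into probabilities with a softmax. The label order below assumes the SST-2 checkpoint’s convention (0 = negative, 1 = positive):
# Convert logits to probabilities (assumes SST-2 label order: 0=NEGATIVE, 1=POSITIVE)
def softmax(x, axis=-1):
    x = x - np.max(x, axis=axis, keepdims=True)
    e = np.exp(x)
    return e / np.sum(e, axis=axis, keepdims=True)

probs = softmax(logits)
labels = ["NEGATIVE", "POSITIVE"]
idx = int(predicted_class[0])
print(f"{labels[idx]} ({probs[0, idx]:.3f})")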
Optimizing ONNX Models
from onnxruntime.transformers import optimizer
from onnxruntime.transformers.fusion_options import FusionOptions
# Optimize for BERT-like models
optimized_model = optimizer.optimize_model(
    "model.onnx",
    model_type="bert",
    num_heads=12,
    hidden_size=768,
    optimization_options=FusionOptions("bert")
)
# optimize_model returns a model object, not a path
optimized_model.save_model_to_file("model_optimized.onnx")
# Or use optimum
from optimum.onnxruntime import ORTOptimizer
from optimum.onnxruntime.configuration import OptimizationConfig
# Distinct name so we don't shadow the onnxruntime.transformers optimizer module above
ort_optimizer = ORTOptimizer.from_pretrained(model)
optimization_config = OptimizationConfig(
    optimization_level=99,
    optimize_for_gpu=True
)
ort_optimizer.optimize(
    save_dir="./optimized_model",
    optimization_config=optimization_config
)
Quantization
from onnxruntime.quantization import quantize_dynamic, QuantType
# Dynamic quantization (INT8)
quantize_dynamic(
    model_input="model.onnx",
    model_output="model_quantized.onnx",
    weight_type=QuantType.QInt8
)
# Using Optimum for quantization
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
quantizer = ORTQuantizer.from_pretrained(model)
quantization_config = AutoQuantizationConfig.avx512_vnni(
    is_static=False,
    per_channel=False
)
quantizer.quantize(
    save_dir="./quantized_model",
    quantization_config=quantization_config
)
Performance Comparison
import time
import numpy as np
def benchmark_inference(model_path, inputs, num_runs=100):
    # CPU provider keeps the comparison consistent; swap in CUDAExecutionProvider for GPU timing
    session = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])
    # Warmup
    for _ in range(10):
        session.run(None, inputs)
    # Benchmark
    times = []
    for _ in range(num_runs):
        start = time.perf_counter()
        session.run(None, inputs)
        times.append(time.perf_counter() - start)
    return {
        "mean_ms": np.mean(times) * 1000,
        "std_ms": np.std(times) * 1000,
        "min_ms": np.min(times) * 1000,
        "max_ms": np.max(times) * 1000
    }
# Compare models
inputs = {
    "input_ids": np.random.randint(0, 30000, (1, 128)).astype(np.int64),
    "attention_mask": np.ones((1, 128), dtype=np.int64)
}
original = benchmark_inference("model.onnx", inputs)
optimized = benchmark_inference("model_optimized.onnx", inputs)
quantized = benchmark_inference("model_quantized.onnx", inputs)
print(f"Original: {original['mean_ms']:.2f}ms")
print(f"Optimized: {optimized['mean_ms']:.2f}ms")
print(f"Quantized: {quantized['mean_ms']:.2f}ms")
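Latency is only half the story; it is also worth checking that the quantized model still produces predictions close to the original. A minimal check on the dummy inputs above:
# Sanity-check that quantization did not change predictions on the dummy inputs
ref_session = ort.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
quant_session = ort.InferenceSession("model_quantized.onnx", providers=["CPUExecutionProvider"])
ref = ref_session.run(None, inputs)[0]
quant = quant_session.run(None, inputs)[0]
print(f"Max abs logit difference: {np.max(np.abs(ref - quant)):.4f}")
print(f"Same argmax prediction: {bool(np.all(ref.argmax(-1) == quant.argmax(-1)))}")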
Session Options
# Configure session for performance
session_options = ort.SessionOptions()
# Enable optimizations
session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
# Set number of threads
session_options.intra_op_num_threads = 4
session_options.inter_op_num_threads = 4
# Enable memory pattern optimization
session_options.enable_mem_pattern = True
session_options.enable_cpu_mem_arena = True
# Create session with options
session = ort.InferenceSession(
    "model.onnx",
    sess_options=session_options,
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)
Deployment Considerations
deployment_tips = {
    "model_selection": {
        "cpu": "Use quantized INT8 models",
        "gpu": "Use FP16 models with CUDA provider"
    },
    "batching": {
        "tip": "Use dynamic batching for throughput",
        "implementation": "Configure dynamic axes during export"
    },
    "memory": {
        "tip": "Use session options to control memory",
        "arena": "Enable memory arena for faster allocation"
    },
    "providers": {
        "priority": ["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"]
    }
}
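The provider priority above maps directly onto the providers argument of InferenceSession: ONNX Runtime tries them in order and falls back past any provider that isn’t available in the installed build (TensorRT requires a TensorRT-enabled build):
# Providers are tried in order; unavailable ones are skipped
session = ort.InferenceSession(
    "model_optimized.onnx",
    providers=[
        "TensorrtExecutionProvider",
        "CUDAExecutionProvider",
        "CPUExecutionProvider",
    ],
)
print(session.get_providers())  # providers actually in use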
Tomorrow we’ll explore model optimization techniques in detail.