Monitoring Azure Resource Health for Proactive Operations
Introduction
Azure Resource Health provides personalized information about the health of your individual Azure resources. Unlike Service Health, which reports on Azure-wide issues, Resource Health focuses on your specific resources, helping you understand whether a problem is caused by an Azure platform issue or by something within your application.
In this post, we will explore how to leverage Resource Health for proactive operations.
Understanding Resource Health States
Resource Health reports four possible states:
- Available: Resource is healthy and operating normally
- Unavailable: Resource is not healthy due to Azure or customer action
- Degraded: Resource has reduced performance but is still operational
- Unknown: Health signal not received (typically transient)
Querying Resource Health
Access Resource Health using the Azure CLI:
# Get the current Resource Health availability status for a specific resource.
# Resource Health is exposed as a Microsoft.ResourceHealth child of the target
# resource. "az rest" prefixes a URL that has no host with the ARM endpoint of
# the current cloud. NOTE(review): any api-version supported by
# Microsoft.ResourceHealth works here; confirm the one you want to pin.
az rest --method get \
  --url "/subscriptions/$SUBSCRIPTION_ID/resourceGroups/rg-compute/providers/Microsoft.Compute/virtualMachines/vm-web-01/providers/Microsoft.ResourceHealth/availabilityStatuses/current?api-version=2022-10-01" \
  --query "properties.{state:availabilityState, reason:reasonType, summary:summary}"

# List all unhealthy resources using Resource Graph
# (needs the CLI extension: az extension add --name resource-graph)
az graph query -q "
healthresources
| where type == 'microsoft.resourcehealth/availabilitystatuses'
| where properties.availabilityState != 'Available'
| project resourceId=id, state=properties.availabilityState, reason=properties.reasonType, summary=properties.summary
"
Python SDK for Resource Health
Monitor resource health programmatically:
from azure.mgmt.resourcehealth import ResourceHealthMgmtClient
from azure.identity import DefaultAzureCredential
from datetime import datetime, timedelta
import os

# Every snippet in this post refers to ``subscription_id``. Read it from the
# environment so the examples run as written; a missing variable fails here
# with a clear KeyError instead of a NameError further down.
subscription_id = os.environ["AZURE_SUBSCRIPTION_ID"]

# Shared credential and Resource Health client used by the functions that follow.
credential = DefaultAzureCredential()
health_client = ResourceHealthMgmtClient(credential, subscription_id)
def get_availability_status(resource_id):
    """Return the current availability status of one resource as a plain dict.

    Args:
        resource_id: Full ARM resource ID of the resource to check.

    Returns:
        A dict with the keys ``resource_id``, ``availability_state``,
        ``summary``, ``reason_type``, ``reason_chronicity``,
        ``occurred_time`` and ``reported_time``.
    """
    status = health_client.availability_statuses.get_by_resource(resource_id)
    props = status.properties

    # The Resource Health REST API spells this field "occuredTime", so the
    # generated SDK model exposes it as ``occured_time``. Read that spelling
    # first and fall back to ``occurred_time`` in case a model version uses
    # the corrected name. The returned dict keeps the conventional spelling.
    occurred_time = getattr(props, "occured_time", None)
    if occurred_time is None:
        occurred_time = getattr(props, "occurred_time", None)

    return {
        "resource_id": resource_id,
        "availability_state": props.availability_state,
        "summary": props.summary,
        "reason_type": props.reason_type,
        "reason_chronicity": props.reason_chronicity,
        "occurred_time": occurred_time,
        "reported_time": props.reported_time,
    }
def list_all_availability_statuses():
    """Collect the availability status of every resource in the subscription.

    Returns:
        A list of dicts with the keys ``resource_id``, ``availability_state``,
        ``summary`` and ``reason_type``. ``resource_id`` is the status ID with
        the trailing Resource Health segment removed.
    """
    # Status IDs are the target resource ID followed by this suffix.
    health_suffix = "/providers/Microsoft.ResourceHealth/availabilityStatuses/current"
    return [
        {
            "resource_id": entry.id.replace(health_suffix, ""),
            "availability_state": entry.properties.availability_state,
            "summary": entry.properties.summary,
            "reason_type": entry.properties.reason_type,
        }
        for entry in health_client.availability_statuses.list_by_subscription_id()
    ]
def get_unhealthy_resources():
    """Print and return every resource whose state is not ``Available``.

    Returns:
        The subset of ``list_all_availability_statuses()`` entries whose
        ``availability_state`` differs from ``"Available"``.
    """
    unhealthy = list(
        filter(
            lambda entry: entry["availability_state"] != "Available",
            list_all_availability_statuses(),
        )
    )

    print(f"Found {len(unhealthy)} unhealthy resources:")
    for resource in unhealthy:
        # One short report per resource, separated by a blank line.
        print(f" {resource['resource_id']}")
        print(f" State: {resource['availability_state']}")
        print(f" Reason: {resource['reason_type']}")
        print(f" Summary: {resource['summary']}")
        print()

    return unhealthy
# Check a specific resource: build the VM's full ARM resource ID, fetch its
# status dict, and print only the availability state.
vm_id = f"/subscriptions/{subscription_id}/resourceGroups/rg-compute/providers/Microsoft.Compute/virtualMachines/vm-web-01"
status = get_availability_status(vm_id)
print(f"VM Status: {status['availability_state']}")
# Get all unhealthy resources in the subscription (the call also prints them).
unhealthy = get_unhealthy_resources()
Historical Health Events
Query historical health events for root cause analysis:
def get_health_history(resource_id, days=30):
    """Return the health events recorded for a resource in the last ``days`` days.

    Args:
        resource_id: Full ARM resource ID of the resource.
        days: Size of the look-back window in days.

    Returns:
        A list of dicts, newest first, with the keys ``occurred_time``,
        ``reported_time``, ``state``, ``summary``, ``reason_type`` and
        ``root_cause_attribution_time``. Events without an occurrence
        timestamp are skipped.
    """
    # Local import: the file-level import only brings in datetime and timedelta.
    from datetime import timezone

    def _as_utc(ts):
        # Treat a naive timestamp as UTC so it can be compared with aware ones.
        return ts if ts.tzinfo is not None else ts.replace(tzinfo=timezone.utc)

    # Compute the cutoff once, as an aware UTC datetime. SDK timestamps are
    # expected to be timezone-aware, and comparing them with a naive
    # datetime.utcnow() raises TypeError.
    cutoff = datetime.now(timezone.utc) - timedelta(days=days)

    history = []
    for event in health_client.availability_statuses.list(resource_id):
        props = event.properties
        # The REST field is spelled "occuredTime", so the SDK attribute is
        # ``occured_time``; fall back to ``occurred_time`` if a model version
        # uses the corrected spelling.
        occurred = getattr(props, "occured_time", None)
        if occurred is None:
            occurred = getattr(props, "occurred_time", None)
        if occurred is None or _as_utc(occurred) <= cutoff:
            continue
        history.append({
            "occurred_time": occurred,
            "reported_time": props.reported_time,
            "state": props.availability_state,
            "summary": props.summary,
            "reason_type": props.reason_type,
            "root_cause_attribution_time": props.root_cause_attribution_time,
        })

    # Sort by occurred time, newest first.
    history.sort(key=lambda entry: _as_utc(entry["occurred_time"]), reverse=True)
    return history
def analyze_outage_pattern(resource_id, days=90):
    """Print an availability report for a resource and return its history.

    Downtime is measured per outage period: from the first ``Unavailable``
    event to the next ``Available`` event, or to now if the resource has not
    recovered. Repeated ``Unavailable`` events inside one outage are not
    counted twice.

    Args:
        resource_id: Full ARM resource ID of the resource.
        days: Size of the analysis window in days.

    Returns:
        The event list produced by ``get_health_history(resource_id, days)``.
    """
    # Local imports: only datetime and timedelta are imported at file level.
    from collections import Counter
    from datetime import timezone

    def _as_utc(ts):
        # Treat a naive timestamp as UTC so arithmetic with aware ones works.
        return ts if ts.tzinfo is not None else ts.replace(tzinfo=timezone.utc)

    history = get_health_history(resource_id, days)
    now = datetime.now(timezone.utc)

    # Calculate availability by walking the events oldest-first and tracking
    # when the current outage (if any) began.
    total_time = timedelta(days=days)
    unavailable_time = timedelta()
    outage_start = None
    for event in sorted(history, key=lambda e: _as_utc(e["occurred_time"])):
        occurred = _as_utc(event["occurred_time"])
        state = event["state"]
        if state == "Unavailable":
            if outage_start is None:
                outage_start = occurred
        elif state == "Available" and outage_start is not None:
            unavailable_time += occurred - outage_start
            outage_start = None
    if outage_start is not None:
        # Still down: the outage runs up to the present moment.
        unavailable_time += now - outage_start

    # Downtime can never exceed the analysed window.
    unavailable_time = min(unavailable_time, total_time)
    if total_time:
        availability_percentage = ((total_time - unavailable_time) / total_time) * 100
    else:
        # A zero-length window has no downtime to measure.
        availability_percentage = 100.0

    # These counts are per Unavailable event, as is the breakdown by reason.
    unavailable_events = [e for e in history if e["state"] == "Unavailable"]

    print(f"\nAvailability Analysis for {days} days:")
    print(f" Total Outages: {len(unavailable_events)}")
    print(f" Total Downtime: {unavailable_time}")
    print(f" Availability: {availability_percentage:.3f}%")

    # Group by reason
    reasons = Counter(e["reason_type"] or "Unknown" for e in unavailable_events)
    print("\n Outages by Reason:")
    for reason, count in reasons.most_common():
        print(f" {reason}: {count}")

    return history
# Analyze SQL database availability over the last 90 days; the call prints
# the report and its return value (the event history) is not used here.
sql_id = f"/subscriptions/{subscription_id}/resourceGroups/rg-sql/providers/Microsoft.Sql/servers/sqlserver/databases/mydb"
analyze_outage_pattern(sql_id, days=90)
Setting Up Resource Health Alerts
Create alerts for resource health changes:
from azure.mgmt.monitor import MonitorManagementClient
# Monitor client used by create_resource_health_alert to create or update
# activity log alert rules.
monitor_client = MonitorManagementClient(credential, subscription_id)
def create_resource_health_alert(name, resource_type, severity=2):
    """Create or update an activity log alert for unhealthy resources of one type.

    The rule is scoped to the whole subscription and matches ResourceHealth
    events for ``resource_type`` whose current health status is
    ``Unavailable`` or ``Degraded``. Matching events notify the ``ops-team``
    action group in the ``rg-monitoring`` resource group.

    Args:
        name: Name of the activity log alert rule.
        resource_type: Resource type to watch, e.g.
            ``"Microsoft.Compute/virtualMachines"``.
        severity: Accepted for backward compatibility but not used; the rule
            body built here has no severity field.

    Returns:
        The value returned by ``activity_log_alerts.create_or_update``.
    """
    subscription_scope = f"/subscriptions/{subscription_id}"
    action_group_id = f"/subscriptions/{subscription_id}/resourceGroups/rg-monitoring/providers/Microsoft.Insights/actionGroups/ops-team"

    alert_rule = {
        "location": "global",
        "scopes": [subscription_scope],
        "enabled": True,
        "condition": {
            "allOf": [
                {"field": "category", "equals": "ResourceHealth"},
                {"field": "resourceType", "equals": resource_type},
                {
                    # Fire on either unhealthy state.
                    "anyOf": [
                        {
                            "field": "properties.currentHealthStatus",
                            "equals": "Unavailable",
                        },
                        {
                            "field": "properties.currentHealthStatus",
                            "equals": "Degraded",
                        },
                    ]
                },
            ]
        },
        "actions": {
            "actionGroups": [{"actionGroupId": action_group_id}]
        },
        "description": f"Alert when {resource_type} becomes unhealthy",
    }

    # Pass the arguments positionally: the keyword name of the rule-body
    # parameter is not the same in every SDK API version
    # (NOTE(review): ``activity_log_alert`` vs ``activity_log_alert_rule``),
    # while the positional order is stable.
    return monitor_client.activity_log_alerts.create_or_update(
        "rg-monitoring",
        name,
        alert_rule,
    )
# Create alerts for critical resource types.
# Each entry is (alert rule name, resource type to watch).
resource_types = [
("vm-health-alert", "Microsoft.Compute/virtualMachines"),
("sql-health-alert", "Microsoft.Sql/servers/databases"),
("storage-health-alert", "Microsoft.Storage/storageAccounts"),
("appservice-health-alert", "Microsoft.Web/sites")
]
# One alert rule per entry; severity is left at its default.
for name, resource_type in resource_types:
    create_resource_health_alert(name, resource_type)
    print(f"Created alert: {name}")
Resource Health Dashboard
Query Resource Health using Resource Graph:
from azure.mgmt.resourcegraph import ResourceGraphClient
from azure.mgmt.resourcegraph.models import QueryRequest
# Resource Graph client shared by the query functions in this section.
graph_client = ResourceGraphClient(credential)
def query_resource_health():
    """Fetch the availability status of the subscription's resources via Resource Graph.

    Returns:
        ``response.data`` from the Resource Graph call: rows projected to
        ``resourceId``, ``availabilityState``, ``reasonType`` and ``summary``,
        ordered by availability state.
    """
    kql = """
healthresources
| where type == 'microsoft.resourcehealth/availabilitystatuses'
| extend resourceId = tolower(tostring(id))
| extend availabilityState = tostring(properties.availabilityState)
| extend reasonType = tostring(properties.reasonType)
| extend summary = tostring(properties.summary)
| project resourceId, availabilityState, reasonType, summary
| order by availabilityState asc
"""
    # Restrict the query to the configured subscription.
    return graph_client.resources(
        QueryRequest(subscriptions=[subscription_id], query=kql)
    ).data
def get_health_summary():
    """Print and return the number of resources in each availability state.

    Returns:
        A dict mapping each availability state returned by the query to its
        resource count.
    """
    kql = """
healthresources
| where type == 'microsoft.resourcehealth/availabilitystatuses'
| extend availabilityState = tostring(properties.availabilityState)
| summarize count() by availabilityState
"""
    result = graph_client.resources(
        QueryRequest(subscriptions=[subscription_id], query=kql)
    )

    # "count_" is the column name produced by the summarize count() step.
    summary = {row["availabilityState"]: row["count_"] for row in result.data}

    print("\nResource Health Summary:")
    # States missing from the result are reported as zero.
    print(f" Available: {summary.get('Available', 0)}")
    print(f" Unavailable: {summary.get('Unavailable', 0)}")
    print(f" Degraded: {summary.get('Degraded', 0)}")
    print(f" Unknown: {summary.get('Unknown', 0)}")

    return summary
def get_unhealthy_by_type():
    """Print and return counts of unhealthy resources grouped by resource type.

    Returns:
        ``response.data`` from the Resource Graph call: rows with the columns
        ``resourceType``, ``availabilityState`` and ``count_``, largest count
        first.
    """
    # Group on the status record's own target resource type. Splitting the
    # status ID on '/providers/' keeps the resource *name* in the value
    # (e.g. "Microsoft.Compute/virtualMachines/vm-web-01"), which yields one
    # group per resource instead of one per type.
    # NOTE(review): relies on healthresources exposing
    # properties.targetResourceType; confirm against the Resource Graph schema.
    query = """
healthresources
| where type == 'microsoft.resourcehealth/availabilitystatuses'
| extend availabilityState = tostring(properties.availabilityState)
| where availabilityState != 'Available'
| extend resourceType = tolower(tostring(properties.targetResourceType))
| summarize count() by resourceType, availabilityState
| order by count_ desc
"""
    request = QueryRequest(
        subscriptions=[subscription_id],
        query=query,
    )
    response = graph_client.resources(request)

    print("\nUnhealthy Resources by Type:")
    for row in response.data:
        print(f" {row['resourceType']}: {row['count_']} ({row['availabilityState']})")

    return response.data
# Get health overview: print the per-state totals first, then the breakdown
# of unhealthy resources. Both return values are ignored here.
get_health_summary()
get_unhealthy_by_type()
Integrating with Incident Management
Automate incident creation from Resource Health:
def handle_resource_health_event(event):
    """Open or resolve an incident in response to a resource health event.

    An incident is opened, and the on-call team paged, whenever a resource
    transitions *into* ``Unavailable`` from any other state. That includes
    ``Degraded`` and ``Unknown``, not only ``Available``. An incident is
    resolved when a resource goes from ``Unavailable`` back to ``Available``.
    All other transitions are ignored.

    Args:
        event: Mapping with the keys ``resource_id``, ``current_health_status``
            and ``summary``, and optionally ``previous_health_status``.

    Raises:
        KeyError: If ``resource_id``, ``current_health_status`` or ``summary``
            is missing from ``event``.
    """
    # Local import: only datetime and timedelta are imported at file level.
    from datetime import timezone

    resource_id = event["resource_id"]
    current_state = event["current_health_status"]
    # A missing previous status is treated as "not Unavailable".
    previous_state = event.get("previous_health_status")
    summary = event["summary"]

    if current_state == "Unavailable" and previous_state != "Unavailable":
        # Create incident
        incident = {
            "title": f"Resource Unavailable: {resource_id.split('/')[-1]}",
            "severity": "High" if "production" in resource_id.lower() else "Medium",
            "description": summary,
            "affected_resource": resource_id,
            # Timezone-aware UTC timestamp, so the ISO string carries its offset.
            "detected_time": datetime.now(timezone.utc).isoformat(),
            "category": "Availability",
        }
        # Send to incident management system
        create_incident(incident)
        # Notify on-call team
        send_pager_alert(incident)
    elif current_state == "Available" and previous_state == "Unavailable":
        # NOTE(review): a recovery that passes through Degraded or Unknown
        # (Unavailable -> Degraded -> Available) is not resolved here; doing so
        # safely needs knowledge of which incidents are open.
        # Resolve incident
        resolve_incident(resource_id)
        # Send recovery notification
        send_recovery_notification(resource_id, summary)


def create_incident(incident):
    """Create incident in your incident management system.

    Placeholder that only prints the incident title.
    """
    # Integration with ServiceNow, PagerDuty, etc.
    print(f"Creating incident: {incident['title']}")


def send_pager_alert(incident):
    """Send alert to on-call responder.

    Placeholder that only prints the incident title.
    """
    print(f"Paging on-call for: {incident['title']}")


def resolve_incident(resource_id):
    """Resolve existing incident.

    Placeholder that only prints the resource ID.
    """
    print(f"Resolving incident for: {resource_id}")


def send_recovery_notification(resource_id, summary):
    """Send recovery notification.

    Placeholder that only prints the resource ID; ``summary`` is not used.
    """
    print(f"Resource recovered: {resource_id}")
Conclusion
Azure Resource Health provides critical visibility into the health of your individual resources. By monitoring health status changes, analyzing historical patterns, and setting up proactive alerts, you can quickly identify and respond to availability issues.
Combining Resource Health with Service Health gives you a complete picture of both platform-level issues and resource-specific problems. Use Resource Graph for efficient querying across large environments, and integrate with your incident management processes for streamlined operations.