7 min read
Implementing the Azure Well-Architected Framework
Introduction
The Azure Well-Architected Framework provides a set of guiding tenets to improve the quality of workloads. It consists of five pillars: Cost Optimization, Operational Excellence, Performance Efficiency, Reliability, and Security. This guide demonstrates practical implementation patterns for each pillar.
The Five Pillars
Pillar Overview
┌─────────────────────────────────────────────────────────────────┐
│ Well-Architected Framework │
├─────────────┬─────────────┬─────────────┬──────────┬────────────┤
│ Cost │ Operational │ Performance │Reliability│ Security │
│Optimization │ Excellence │ Efficiency │ │ │
├─────────────┼─────────────┼─────────────┼──────────┼────────────┤
│ - Right- │ - IaC │ - Scaling │ - HA │ - Identity │
│ sizing │ - CI/CD │ - Caching │ - DR │ - Network │
│ - Reserved │ - Monitoring│ - CDN │ - Backup │ - Data │
│ instances │ - Automation│ - Async │ - Health │ - App │
│ - Budgets │ - Testing │ - Optimize │ - Retry │ - Govern │
└─────────────┴─────────────┴─────────────┴──────────┴────────────┘
Cost Optimization
Right-Sizing Resources
/// <summary>
/// Scans the virtual machines of the default subscription and reports those whose
/// CPU utilisation suggests they should be scaled up or down.
/// </summary>
public class CostOptimizationService
{
    // Average CPU (percent) above which a VM is reported as undersized.
    private const int UndersizedAvgCpuPercent = 80;

    // A VM is reported as oversized only when BOTH its average and its peak CPU
    // (percent) stay below these limits.
    private const int OversizedAvgCpuPercent = 20;
    private const int OversizedMaxCpuPercent = 50;

    /// <summary>
    /// Enumerates every virtual machine in the default subscription, fetches its
    /// metrics and collects a recommendation for each VM that needs resizing.
    /// </summary>
    /// <returns>A report containing one entry per VM that should be resized.</returns>
    public async Task<RightSizingReport> AnalyzeVmSizingAsync()
    {
        var armClient = new ArmClient(new DefaultAzureCredential());
        var subscription = await armClient.GetDefaultSubscriptionAsync();
        var report = new RightSizingReport();

        await foreach (var vm in subscription.GetVirtualMachinesAsync())
        {
            var metrics = await GetVmMetricsAsync(vm);
            var recommendation = AnalyzeMetrics(vm, metrics);
            if (recommendation != null)
            {
                report.Recommendations.Add(recommendation);
            }
        }

        return report;
    }

    /// <summary>
    /// Classifies a VM as undersized, oversized or correctly sized from its CPU metrics.
    /// </summary>
    /// <param name="vm">The virtual machine being evaluated.</param>
    /// <param name="metrics">CPU metrics collected for <paramref name="vm"/>.</param>
    /// <returns>A recommendation, or <c>null</c> when no change is suggested.</returns>
    private SizingRecommendation? AnalyzeMetrics(
        VirtualMachineResource vm,
        VmMetrics metrics)
    {
        // Undersized: average CPU above the upper threshold.
        if (metrics.AvgCpuPercent > UndersizedAvgCpuPercent)
        {
            return CreateRecommendation(
                vm,
                "Consider scaling up",
                $"Average CPU: {metrics.AvgCpuPercent}%");
        }

        // Oversized: low average AND no significant peaks.
        if (metrics.AvgCpuPercent < OversizedAvgCpuPercent
            && metrics.MaxCpuPercent < OversizedMaxCpuPercent)
        {
            return CreateRecommendation(
                vm,
                "Consider scaling down",
                $"Average CPU: {metrics.AvgCpuPercent}%, Max: {metrics.MaxCpuPercent}%");
        }

        return null;
    }

    /// <summary>Builds a recommendation entry for the given VM.</summary>
    private static SizingRecommendation CreateRecommendation(
        VirtualMachineResource vm,
        string recommendation,
        string reason)
    {
        return new SizingRecommendation
        {
            VmName = vm.Data.Name,
            // Guard against a missing hardware profile or size so one incomplete
            // VM record cannot abort the whole scan with a NullReferenceException.
            CurrentSize = vm.Data.HardwareProfile?.VmSize?.ToString() ?? "Unknown",
            Recommendation = recommendation,
            Reason = reason
        };
    }
}
Reserved Instance Analysis
/// <summary>
/// Looks at the last 30 days of usage and suggests reserved-instance purchases
/// for VM size/region groups that run almost continuously.
/// </summary>
public class ReservedInstanceAnalyzer
{
    /// <summary>
    /// Groups usage by VM size and region, keeps the groups in which every VM ran
    /// at least 80% of a 720-hour month, and proposes a reservation wherever the
    /// reserved price is lower than the current monthly cost.
    /// </summary>
    /// <returns>The reservation suggestions that yield a positive saving.</returns>
    public async Task<ReservationRecommendation> AnalyzeForReservationsAsync()
    {
        // 80% of a 720-hour month.
        const double minimumHoursRunning = 720 * 0.8;

        var usageEntries = await GetUsageDataAsync(TimeSpan.FromDays(30));

        // Group by VM size and region; every member must meet the uptime bar.
        var steadyGroups = usageEntries
            .GroupBy(entry => new { entry.VmSize, entry.Region })
            .Where(bucket => bucket.All(entry => entry.HoursRunning >= minimumHoursRunning))
            .ToList();

        var suggestions = steadyGroups
            .Select(bucket =>
            {
                var quantity = bucket.Count();
                var onDemandCost = bucket.Sum(entry => entry.MonthlyCost);
                var reservedCost = CalculateReservedCost(bucket.Key.VmSize, quantity);

                return new
                {
                    bucket.Key.VmSize,
                    bucket.Key.Region,
                    Quantity = quantity,
                    OnDemandCost = onDemandCost,
                    ReservedCost = reservedCost,
                    Savings = onDemandCost - reservedCost
                };
            })
            // Only groups where reserving is actually cheaper are suggested.
            .Where(candidate => candidate.Savings > 0)
            .Select(candidate => new ReservationSuggestion
            {
                VmSize = candidate.VmSize,
                Region = candidate.Region,
                Quantity = candidate.Quantity,
                CurrentMonthlyCost = candidate.OnDemandCost,
                ReservedMonthlyCost = candidate.ReservedCost,
                MonthlySavings = candidate.Savings,
                AnnualSavings = candidate.Savings * 12
            })
            .ToList();

        return new ReservationRecommendation { Suggestions = suggestions };
    }
}
Operational Excellence
Infrastructure as Code
// infrastructure/main.bicep
// Subscription-scoped entry point that composes the naming, networking,
// application and monitoring modules for a single environment.
targetScope = 'subscription'
// Environment name; forwarded to the naming module.
param environment string
// Azure region; forwarded to the naming module.
param location string = 'australiaeast'
// Resource naming convention module
// No scope is set, so it deploys at this file's target scope (subscription).
module naming 'modules/naming.bicep' = {
name: 'naming'
params: {
environment: environment
location: location
}
}
// Networking module
// NOTE(review): `scope` here (and in the two modules below) is built from a
// module output. Bicep requires scope expressions to be resolvable at the start
// of the deployment, so this is likely rejected with BCP120 — confirm, and
// consider deriving the resource group name from parameters/variables instead.
module networking 'modules/networking.bicep' = {
name: 'networking'
scope: resourceGroup(naming.outputs.resourceGroupName)
params: {
naming: naming.outputs
}
}
// Application module
// Receives the app subnet id from the networking module, so it deploys after it.
module application 'modules/application.bicep' = {
name: 'application'
scope: resourceGroup(naming.outputs.resourceGroupName)
params: {
naming: naming.outputs
subnetId: networking.outputs.appSubnetId
}
}
// Monitoring module
// Receives the Application Insights id from the application module.
module monitoring 'modules/monitoring.bicep' = {
name: 'monitoring'
scope: resourceGroup(naming.outputs.resourceGroupName)
params: {
naming: naming.outputs
applicationInsightsId: application.outputs.appInsightsId
}
}
Automated Deployment Pipeline
# .github/workflows/deploy.yml
# Validates the Bicep templates, then deploys to dev and (after dev succeeds)
# to prod. Runs only when files under infrastructure/ change on main.
name: Deploy Infrastructure

on:
  push:
    branches: [main]
    paths:
      - 'infrastructure/**'

jobs:
  # Compile the template and run a subscription-scope validation against dev.
  validate:
    runs-on: ubuntu-latest
    steps:
      # v4 / v2 are the current major versions; the older majors run on
      # retired Node.js runtimes.
      - uses: actions/checkout@v4
      - uses: azure/login@v2
        with:
          creds: ${{ secrets.AZURE_CREDENTIALS }}
      - name: Validate Bicep
        run: |
          az bicep build --file infrastructure/main.bicep
          az deployment sub validate \
            --location australiaeast \
            --template-file infrastructure/main.bicep \
            --parameters environment=dev

  deploy-dev:
    needs: validate
    runs-on: ubuntu-latest
    # Any protection rules configured on the "development" environment apply here.
    environment: development
    steps:
      - uses: actions/checkout@v4
      - uses: azure/login@v2
        with:
          creds: ${{ secrets.AZURE_CREDENTIALS }}
      - name: Deploy to Dev
        run: |
          az deployment sub create \
            --location australiaeast \
            --template-file infrastructure/main.bicep \
            --parameters environment=dev

  deploy-prod:
    # Production is only attempted after the dev deployment succeeded.
    needs: deploy-dev
    runs-on: ubuntu-latest
    environment: production
    steps:
      - uses: actions/checkout@v4
      # Production uses its own credential secret.
      - uses: azure/login@v2
        with:
          creds: ${{ secrets.AZURE_CREDENTIALS_PROD }}
      - name: Deploy to Prod
        run: |
          az deployment sub create \
            --location australiaeast \
            --template-file infrastructure/main.bicep \
            --parameters environment=prod
Performance Efficiency
Auto-Scaling Configuration
// autoscale.bicep
// Autoscale settings for an App Service plan: scale out by one instance on CPU
// or memory pressure, scale in by one instance when CPU stays low.

// Resource id of the App Service plan to scale; also the source of the metrics.
param appServicePlanId string
// Lower bound for the instance count; also used as the default capacity.
param minInstances int = 2
// Upper bound for the instance count.
param maxInstances int = 10

// Uses the stable 2022-10-01 API version rather than a preview version.
resource autoScale 'Microsoft.Insights/autoscalesettings@2022-10-01' = {
  name: 'autoscale-webapp'
  location: resourceGroup().location
  properties: {
    enabled: true
    targetResourceUri: appServicePlanId
    profiles: [
      {
        name: 'DefaultProfile'
        // The capacity values are passed as strings, hence the conversions.
        capacity: {
          minimum: string(minInstances)
          maximum: string(maxInstances)
          default: string(minInstances)
        }
        rules: [
          // Scale out on CPU: average above 70% over 5 minutes.
          {
            metricTrigger: {
              metricName: 'CpuPercentage'
              metricResourceUri: appServicePlanId
              timeGrain: 'PT1M'
              statistic: 'Average'
              timeWindow: 'PT5M'
              timeAggregation: 'Average'
              operator: 'GreaterThan'
              threshold: 70
            }
            scaleAction: {
              direction: 'Increase'
              type: 'ChangeCount'
              value: '1'
              cooldown: 'PT5M'
            }
          }
          // Scale out on memory: average above 80% over 5 minutes.
          {
            metricTrigger: {
              metricName: 'MemoryPercentage'
              metricResourceUri: appServicePlanId
              timeGrain: 'PT1M'
              statistic: 'Average'
              timeWindow: 'PT5M'
              timeAggregation: 'Average'
              operator: 'GreaterThan'
              threshold: 80
            }
            scaleAction: {
              direction: 'Increase'
              type: 'ChangeCount'
              value: '1'
              cooldown: 'PT5M'
            }
          }
          // Scale in: average CPU below 30% over 10 minutes. The longer window
          // and cooldown make scale-in more conservative than scale-out.
          // NOTE(review): there is no matching scale-in rule for memory, so
          // scale-in is decided on CPU alone — confirm this is intended.
          {
            metricTrigger: {
              metricName: 'CpuPercentage'
              metricResourceUri: appServicePlanId
              timeGrain: 'PT1M'
              statistic: 'Average'
              timeWindow: 'PT10M'
              timeAggregation: 'Average'
              operator: 'LessThan'
              threshold: 30
            }
            scaleAction: {
              direction: 'Decrease'
              type: 'ChangeCount'
              value: '1'
              cooldown: 'PT10M'
            }
          }
        ]
      }
    ]
  }
}
Caching Strategy
/// <summary>
/// Two-level cache: an in-process memory cache (L1) in front of Redis (L2),
/// falling back to a caller-supplied factory when both miss.
/// </summary>
public class CachingService
{
    private readonly IMemoryCache _memoryCache;
    private readonly IDatabase _redis;

    /// <summary>Creates the service from its two cache layers.</summary>
    /// <param name="memoryCache">The in-process (L1) cache.</param>
    /// <param name="redis">The Redis database used as the shared (L2) cache.</param>
    public CachingService(IMemoryCache memoryCache, IDatabase redis)
    {
        _memoryCache = memoryCache ?? throw new ArgumentNullException(nameof(memoryCache));
        _redis = redis ?? throw new ArgumentNullException(nameof(redis));
    }

    /// <summary>
    /// Returns the value stored under <paramref name="key"/>, looking in the memory
    /// cache first, then Redis, and finally invoking <paramref name="factory"/>.
    /// A value found in Redis or produced by the factory is written back to the
    /// faster layer(s).
    /// </summary>
    /// <typeparam name="T">Type of the cached value; it is stored in Redis as JSON.</typeparam>
    /// <param name="key">Cache key used in both layers.</param>
    /// <param name="factory">Produces the value when neither cache holds it.</param>
    /// <param name="options">Expiry settings for the local and distributed entries.</param>
    /// <returns>The cached or freshly produced value.</returns>
    public async Task<T?> GetOrSetAsync<T>(
        string key,
        Func<Task<T>> factory,
        CacheOptions options)
    {
        // Try L1 cache (in-memory). TryGetValue distinguishes "no entry" from a
        // stored default value; comparing Get<T>() with null treats every miss as
        // a hit for non-nullable value types (e.g. int), so the factory never runs.
        if (_memoryCache.TryGetValue(key, out T? cached))
        {
            return cached;
        }

        // Try L2 cache (Redis)
        var redisValue = await _redis.StringGetAsync(key);
        if (redisValue.HasValue)
        {
            var result = JsonSerializer.Deserialize<T>((string)redisValue!);
            // Promote to L1 so the next lookup avoids the network round trip.
            _memoryCache.Set(key, result, options.LocalExpiry);
            return result;
        }

        // Fetch from source
        var value = await factory();

        // Set both caches
        var serialized = JsonSerializer.Serialize(value);
        await _redis.StringSetAsync(key, serialized, options.DistributedExpiry);
        _memoryCache.Set(key, value, options.LocalExpiry);

        return value;
    }
}
Reliability
Health Checks Implementation
// Program.cs
// Tag applied to every check that must pass before the app receives traffic.
// Without it the "/health/ready" predicate below matches no checks and the
// endpoint reports Healthy unconditionally.
var readinessTags = new[] { "ready" };

// Drop the tag from a registration if that dependency should not gate readiness.
builder.Services.AddHealthChecks()
    .AddCheck<DatabaseHealthCheck>("database", tags: readinessTags)
    .AddCheck<StorageHealthCheck>("storage", tags: readinessTags)
    .AddCheck<ExternalApiHealthCheck>("external-api", tags: readinessTags)
    .AddAzureBlobStorage(connectionString, name: "azure-blob", tags: readinessTags)
    .AddRedis(redisConnectionString, name: "redis", tags: readinessTags);

// Full report: runs every registered check and writes the detailed UI payload.
app.MapHealthChecks("/health", new HealthCheckOptions
{
    ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
});

// Readiness: runs only the checks tagged "ready".
app.MapHealthChecks("/health/ready", new HealthCheckOptions
{
    Predicate = check => check.Tags.Contains("ready")
});

// Liveness: the predicate excludes every check, so a response only proves the
// process is up and serving requests.
app.MapHealthChecks("/health/live", new HealthCheckOptions
{
    Predicate = _ => false // Just checks the app is running
});
Retry Policies with Polly
/// <summary>
/// Registration helpers for HTTP clients protected by retry and circuit-breaker policies.
/// </summary>
public static class ResilienceExtensions
{
    /// <summary>
    /// Registers a named <see cref="HttpClient"/> that retries transient failures
    /// three times with exponential back-off (2s, 4s, 8s) and opens a circuit
    /// breaker for 30 seconds after five consecutive handled failures.
    /// </summary>
    /// <param name="services">The service collection to add the client to.</param>
    /// <param name="name">Logical name of the HTTP client.</param>
    /// <param name="baseAddress">Base address assigned to the client.</param>
    /// <returns>The same <paramref name="services"/> instance, for chaining.</returns>
    public static IServiceCollection AddResilientHttpClient(
        this IServiceCollection services,
        string name,
        string baseAddress)
    {
        services.AddHttpClient(name, client =>
            {
                client.BaseAddress = new Uri(baseAddress);
            })
            .AddTransientHttpErrorPolicy(policy =>
                policy.WaitAndRetryAsync(
                    retryCount: 3,
                    sleepDurationProvider: retryAttempt =>
                        TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)),
                    onRetry: (outcome, timespan, retryAttempt, context) =>
                    {
                        // A retry is triggered either by an exception or by a
                        // transient HTTP status; in the latter case Exception is
                        // null, so fall back to the status code to keep the log useful.
                        var reason = outcome.Exception?.Message
                            ?? outcome.Result?.StatusCode.ToString();

                        Log.Warning(
                            "Retry {RetryAttempt} after {Delay}s due to {Exception}",
                            retryAttempt,
                            timespan.TotalSeconds,
                            reason);
                    }))
            // Added after the retry policy, so each retry attempt passes through
            // the breaker and counts towards its failure threshold.
            .AddTransientHttpErrorPolicy(policy =>
                policy.CircuitBreakerAsync(
                    handledEventsAllowedBeforeBreaking: 5,
                    durationOfBreak: TimeSpan.FromSeconds(30),
                    onBreak: (outcome, breakDelay) =>
                    {
                        Log.Error("Circuit breaker opened for {Delay}s", breakDelay.TotalSeconds);
                    },
                    onReset: () =>
                    {
                        Log.Information("Circuit breaker reset");
                    }));

        return services;
    }
}
Security
Defense in Depth
// security.bicep
// Network and secret-store hardening. `location`, `keyVaultName` and
// `appSubnetId` are expected to be declared elsewhere in this template.

// Network Security Group: allow inbound HTTPS from the Internet, deny all
// other inbound traffic.
resource nsg 'Microsoft.Network/networkSecurityGroups@2021-03-01' = {
  name: 'nsg-app'
  location: location
  properties: {
    securityRules: [
      {
        name: 'AllowHTTPS'
        properties: {
          priority: 100
          direction: 'Inbound'
          access: 'Allow'
          protocol: 'Tcp'
          sourceAddressPrefix: 'Internet'
          sourcePortRange: '*'
          destinationAddressPrefix: '*'
          destinationPortRange: '443'
        }
      }
      // Lowest-priority custom rule (4096): catches everything not allowed above.
      // NOTE(review): this is evaluated before the platform default rules, so
      // it presumably also blocks intra-VNet and Azure Load Balancer probe
      // traffic — confirm that is intended.
      {
        name: 'DenyAllInbound'
        properties: {
          priority: 4096
          direction: 'Inbound'
          access: 'Deny'
          protocol: '*'
          sourceAddressPrefix: '*'
          sourcePortRange: '*'
          destinationAddressPrefix: '*'
          destinationPortRange: '*'
        }
      }
    ]
  }
}

// Key Vault restricted to the application subnet through a virtual network
// rule. No private endpoint is created by this template. Uses the stable
// 2023-07-01 API version rather than a preview version.
resource keyVault 'Microsoft.KeyVault/vaults@2023-07-01' = {
  name: keyVaultName
  location: location
  properties: {
    sku: { family: 'A', name: 'standard' }
    tenantId: subscription().tenantId
    // Access is granted through Azure RBAC role assignments.
    enableRbacAuthorization: true
    enableSoftDelete: true
    softDeleteRetentionInDays: 90
    enablePurgeProtection: true
    networkAcls: {
      // Deny by default; only the app subnet and trusted Azure services get through.
      bypass: 'AzureServices'
      defaultAction: 'Deny'
      virtualNetworkRules: [
        {
          id: appSubnetId
        }
      ]
    }
  }
}
Well-Architected Review
Assessment Automation
/// <summary>
/// Runs an automated Well-Architected review and produces a score per pillar
/// plus an overall score.
/// </summary>
public class WellArchitectedAssessment
{
    /// <summary>
    /// Assesses the five pillars one after another, then derives the overall score
    /// from the populated report.
    /// </summary>
    /// <returns>The completed assessment report.</returns>
    public async Task<AssessmentReport> RunAssessmentAsync()
    {
        // Initializer members are evaluated top to bottom, so the pillars are
        // still assessed sequentially in this order.
        var assessment = new AssessmentReport
        {
            CostOptimization = await AssessCostOptimizationAsync(),
            OperationalExcellence = await AssessOperationalExcellenceAsync(),
            PerformanceEfficiency = await AssessPerformanceEfficiencyAsync(),
            Reliability = await AssessReliabilityAsync(),
            Security = await AssessSecurityAsync()
        };

        assessment.OverallScore = CalculateOverallScore(assessment);
        return assessment;
    }

    /// <summary>
    /// Runs the reliability checks and averages their scores.
    /// </summary>
    /// <returns>The reliability pillar score together with the individual check results.</returns>
    private async Task<PillarScore> AssessReliabilityAsync()
    {
        var results = new List<CheckResult>
        {
            await CheckAvailabilityZonesAsync(),    // availability zones
            await CheckBackupConfigurationsAsync(), // backup configurations
            await CheckHealthProbesAsync(),         // health probes
            await CheckDisasterRecoveryAsync()      // disaster recovery
        };

        var averageScore = results.Average(result => result.Score);

        return new PillarScore
        {
            Pillar = "Reliability",
            Score = averageScore,
            Checks = results
        };
    }
}
Conclusion
The Well-Architected Framework provides comprehensive guidance for building excellent cloud solutions. By systematically addressing each pillar, you create workloads that are cost-effective, operationally sound, performant, reliable, and secure. Regular assessments using the Well-Architected Review help identify areas for improvement.