7 min read
Platform Engineering: Building the Foundation for Developer Productivity
Platform engineering emerged as a discipline in 2021, focused on building internal developer platforms (IDPs) that enable self-service infrastructure and improve developer productivity.
What is Platform Engineering?
Platform engineering is about creating golden paths — paved roads that make it easy for developers to do the right thing. It treats infrastructure as a product, with developers as its customers.
Building an Internal Developer Platform
# Platform API definition
# One declarative manifest per application; everything the platform should
# provision for the service is listed under `spec`.
apiVersion: platform.company.com/v1
kind: Application
metadata:
  name: order-service
  team: commerce
spec:
  runtime: kubernetes
  replicas:
    min: 2
    max: 10
  resources:
    cpu: "500m"
    memory: "1Gi"
  scaling:
    metric: cpu
    target: 70
  networking:
    ingress: true
    domain: orders.company.com
  # Backing services: each list entry is a separately provisioned instance.
  databases:
    - name: orders-db
      type: postgresql
      size: small
  caching:
    - name: orders-cache
      type: redis
      size: small
  monitoring:
    enabled: true
    alerts:
      # Conditions are quoted so the comparison operators and `%` are
      # always read as part of one plain string.
      - name: high-error-rate
        condition: "error_rate > 1%"
        severity: warning
      - name: service-down
        condition: "availability < 99%"
        severity: critical
  secrets:
    vault: commerce-vault
    # Only key names are listed here; no secret values appear in the manifest.
    keys:
      - DB_PASSWORD
      - API_KEY
Platform Service Catalog
from dataclasses import dataclass
from typing import Dict, List, Optional
from enum import Enum
class ServiceType(Enum):
    """Category of a platform service offering.

    Every member's value is the lowercase string form of its name, so a
    member can be looked up from that string, e.g. ``ServiceType("compute")``.
    """

    COMPUTE = "compute"
    DATABASE = "database"
    MESSAGING = "messaging"
    STORAGE = "storage"
    MONITORING = "monitoring"
    SECURITY = "security"
@dataclass
class ServiceOffering:
    """A single entry in the platform service catalog."""

    # Human-readable display name, e.g. 'PostgreSQL Database'.
    name: str
    # Category used to filter offerings in the catalog.
    type: ServiceType
    # Short free-text summary of what the offering provides.
    description: str
    # Tier name (e.g. 'small') mapped to that tier's settings.
    tiers: Dict[str, dict]
    # Service-level targets, e.g. {'availability': '99.9%'}.
    sla: dict
    # How usage of the offering is billed, e.g. {'type': 'tier_based'}.
    cost_model: dict
class ServiceCatalog:
    """Platform service catalog.

    Holds the service offerings the platform exposes, keyed by a short
    identifier such as ``'postgresql'``. The standard offerings are
    registered when the catalog is constructed.
    """

    def __init__(self):
        self.services: Dict[str, ServiceOffering] = {}
        self._initialize_catalog()

    def _initialize_catalog(self):
        """Register the standard service offerings."""
        # Kubernetes namespace
        self.services['kubernetes-namespace'] = ServiceOffering(
            name='Kubernetes Namespace',
            type=ServiceType.COMPUTE,
            description='Isolated Kubernetes namespace with RBAC and resource quotas',
            tiers={
                'small': {
                    'cpu_limit': '4',
                    'memory_limit': '8Gi',
                    'storage_limit': '50Gi',
                },
                'medium': {
                    'cpu_limit': '16',
                    'memory_limit': '32Gi',
                    'storage_limit': '200Gi',
                },
                'large': {
                    'cpu_limit': '64',
                    'memory_limit': '128Gi',
                    'storage_limit': '1Ti',
                },
            },
            sla={'availability': '99.9%', 'support': '24x7'},
            cost_model={'type': 'resource_based', 'unit': 'per_hour'},
        )

        # PostgreSQL database
        self.services['postgresql'] = ServiceOffering(
            name='PostgreSQL Database',
            type=ServiceType.DATABASE,
            description='Managed PostgreSQL with automatic backups and monitoring',
            tiers={
                'small': {'vcores': 2, 'memory_gb': 4, 'storage_gb': 100},
                'medium': {'vcores': 4, 'memory_gb': 16, 'storage_gb': 500},
                'large': {'vcores': 16, 'memory_gb': 64, 'storage_gb': 2000},
            },
            sla={'availability': '99.95%', 'rpo': '5min', 'rto': '1hour'},
            cost_model={'type': 'tier_based', 'billing': 'monthly'},
        )

        # Redis cache (filed under DATABASE; the enum has no cache category)
        self.services['redis'] = ServiceOffering(
            name='Redis Cache',
            type=ServiceType.DATABASE,
            description='Managed Redis for caching and session storage',
            tiers={
                'small': {'memory_gb': 1, 'connections': 1000},
                'medium': {'memory_gb': 6, 'connections': 5000},
                'large': {'memory_gb': 26, 'connections': 20000},
            },
            sla={'availability': '99.9%', 'latency_p99': '1ms'},
            cost_model={'type': 'tier_based', 'billing': 'monthly'},
        )

    def get_service(self, name: str) -> Optional[ServiceOffering]:
        """Return the offering registered under *name*, or ``None`` if absent."""
        return self.services.get(name)

    def list_services(
        self, service_type: Optional[ServiceType] = None
    ) -> List[ServiceOffering]:
        """Return offerings, optionally restricted to one service type.

        Args:
            service_type: When given, only offerings whose ``type`` equals
                this value are returned. ``None`` (the default) returns
                every offering.

        Returns:
            A new list, in registration order.
        """
        offerings = list(self.services.values())
        # Compare against None explicitly so the filter never depends on the
        # truthiness of the value passed in.
        if service_type is None:
            return offerings
        return [offering for offering in offerings if offering.type == service_type]
Self-Service Provisioning
from abc import ABC, abstractmethod
import pulumi
from pulumi_azure_native import resources, containerservice, dbforpostgresql
class ResourceProvisioner(ABC):
    """Abstract base for resource provisioners.

    A concrete provisioner must implement all three methods below before
    it can be instantiated.
    """

    @abstractmethod
    def provision(self, spec: dict) -> dict:
        """Create the resource described by *spec* and return a result dict."""

    @abstractmethod
    def deprovision(self, resource_id: str):
        """Tear down the resource identified by *resource_id*."""

    @abstractmethod
    def get_status(self, resource_id: str) -> dict:
        """Return a dict describing the current state of *resource_id*."""
class KubernetesNamespaceProvisioner(ResourceProvisioner):
    """Provision Kubernetes namespaces.

    A provisioned namespace is named ``<team>-<name>`` and receives a
    resource quota, a network policy and a team role binding.
    """

    # Hard limits applied through the namespace ResourceQuota, per tier.
    _TIER_LIMITS = {
        'small': {'cpu_limit': '4', 'memory_limit': '8Gi'},
        'medium': {'cpu_limit': '16', 'memory_limit': '32Gi'},
        'large': {'cpu_limit': '64', 'memory_limit': '128Gi'},
    }
    _DEFAULT_TIER = 'small'
    _MANAGED_BY = 'platform'

    @staticmethod
    def _load_client():
        """Load the local kubeconfig and return the ``kubernetes.client`` module."""
        from kubernetes import client, config

        config.load_kube_config()
        return client

    def provision(self, spec: dict) -> dict:
        """Create the namespace and its guard rails.

        Args:
            spec: Must contain ``'team'`` and ``'name'``. ``'tier'`` is
                optional; a missing or unknown tier is treated as ``'small'``.

        Returns:
            Dict with ``resource_id``, ``status`` and ``connection_info``.
        """
        client = self._load_client()
        v1 = client.CoreV1Api()

        team = spec['team']
        namespace_name = f"{team}-{spec['name']}"

        # Resolve the tier once so the label always matches the quota that
        # is actually applied (an unknown tier falls back to the default).
        requested_tier = spec.get('tier', self._DEFAULT_TIER)
        tier = requested_tier if requested_tier in self._TIER_LIMITS else self._DEFAULT_TIER
        tier_limits = self._get_tier_limits(tier)

        # Create namespace
        namespace = client.V1Namespace(
            metadata=client.V1ObjectMeta(
                name=namespace_name,
                labels={
                    'team': team,
                    'managed-by': self._MANAGED_BY,
                    'tier': tier,
                },
            )
        )
        v1.create_namespace(namespace)

        # Create resource quota
        quota = client.V1ResourceQuota(
            metadata=client.V1ObjectMeta(name='default-quota'),
            spec=client.V1ResourceQuotaSpec(
                hard={
                    'requests.cpu': tier_limits['cpu_limit'],
                    'requests.memory': tier_limits['memory_limit'],
                    'persistentvolumeclaims': '10',
                }
            ),
        )
        v1.create_namespaced_resource_quota(namespace_name, quota)

        # Create network policy
        self._create_network_policy(namespace_name)

        # Create RBAC
        self._create_rbac(namespace_name, team)

        return {
            'resource_id': namespace_name,
            'status': 'provisioned',
            'connection_info': {
                'namespace': namespace_name,
                'cluster': 'production',
            },
        }

    def deprovision(self, resource_id: str):
        """Delete the namespace named *resource_id*.

        Raises:
            ValueError: If the namespace does not carry the
                ``managed-by: platform`` label set by :meth:`provision`,
                so namespaces created elsewhere are never deleted here.
        """
        v1 = self._load_client().CoreV1Api()
        labels = v1.read_namespace(resource_id).metadata.labels or {}
        if labels.get('managed-by') != self._MANAGED_BY:
            raise ValueError(
                f"Namespace {resource_id!r} is not managed by the platform"
            )
        v1.delete_namespace(resource_id)

    def get_status(self, resource_id: str) -> dict:
        """Return the namespace phase reported by the API server.

        Returns:
            ``{'resource_id': ..., 'status': <phase>}``, or status
            ``'not_found'`` when the API answers 404.
        """
        from kubernetes.client.rest import ApiException

        v1 = self._load_client().CoreV1Api()
        try:
            namespace = v1.read_namespace(resource_id)
        except ApiException as exc:
            if exc.status == 404:
                return {'resource_id': resource_id, 'status': 'not_found'}
            raise
        return {'resource_id': resource_id, 'status': namespace.status.phase}

    def _get_tier_limits(self, tier: str) -> dict:
        """Return a copy of the quota limits for *tier* (default tier if unknown)."""
        limits = self._TIER_LIMITS
        return dict(limits.get(tier, limits[self._DEFAULT_TIER]))

    def _create_network_policy(self, namespace_name: str) -> None:
        """Restrict pod ingress to traffic from the same namespace.

        NOTE(review): this baseline also blocks ingress-controller traffic
        from other namespaces; add an allow rule for services that are
        exposed through an ingress.
        """
        client = self._load_client()
        policy = {
            'apiVersion': 'networking.k8s.io/v1',
            'kind': 'NetworkPolicy',
            'metadata': {'name': 'default-same-namespace-only'},
            'spec': {
                'podSelector': {},
                'policyTypes': ['Ingress'],
                'ingress': [{'from': [{'podSelector': {}}]}],
            },
        }
        client.NetworkingV1Api().create_namespaced_network_policy(
            namespace_name, policy
        )

    def _create_rbac(self, namespace_name: str, team: str) -> None:
        """Bind the group named *team* to the built-in ``edit`` ClusterRole.

        NOTE(review): assumes the identity provider exposes each team as a
        Kubernetes group with the same name; confirm before relying on it.
        """
        client = self._load_client()
        binding = {
            'apiVersion': 'rbac.authorization.k8s.io/v1',
            'kind': 'RoleBinding',
            'metadata': {'name': f'{team}-edit'},
            'roleRef': {
                'apiGroup': 'rbac.authorization.k8s.io',
                'kind': 'ClusterRole',
                'name': 'edit',
            },
            'subjects': [
                {
                    'apiGroup': 'rbac.authorization.k8s.io',
                    'kind': 'Group',
                    'name': team,
                }
            ],
        }
        client.RbacAuthorizationV1Api().create_namespaced_role_binding(
            namespace_name, binding
        )
class PostgreSQLProvisioner(ResourceProvisioner):
    """Provision PostgreSQL databases.

    Servers are declared through Pulumi's Azure Native provider, so
    :meth:`provision` must run inside a Pulumi program.
    """

    # Tier name -> Azure SKU and storage size.
    # NOTE(review): SKU names assume Azure Database for PostgreSQL Single
    # Server, Gen5 General Purpose; confirm against the target subscription.
    _TIER_CONFIG = {
        'small': {'sku': 'GP_Gen5_2', 'tier': 'GeneralPurpose', 'storage_mb': 100 * 1024},
        'medium': {'sku': 'GP_Gen5_4', 'tier': 'GeneralPurpose', 'storage_mb': 500 * 1024},
        'large': {'sku': 'GP_Gen5_16', 'tier': 'GeneralPurpose', 'storage_mb': 2000 * 1024},
    }
    _DEFAULT_TIER = 'small'

    def provision(self, spec: dict) -> dict:
        """Declare a PostgreSQL server for the given team and service.

        Args:
            spec: Must contain ``'team'``, ``'name'`` and
                ``'resource_group'``. ``'tier'`` is optional and defaults
                to ``'small'``.

        Returns:
            Dict with ``resource_id``, ``status`` (``'provisioning'``) and
            ``connection_info`` (host, port, database).
        """
        import pulumi
        from pulumi_azure_native import dbforpostgresql

        server_name = f"psql-{spec['team']}-{spec['name']}"
        tier_config = self._get_tier_config(spec.get('tier', self._DEFAULT_TIER))

        # Wrap the password as a Pulumi secret so it is not kept in plain
        # text. NOTE(review): nothing here hands the password to the
        # requesting team; wire it to the team's secret store.
        admin_password = pulumi.Output.secret(self._generate_password())

        # Create PostgreSQL server using Pulumi
        dbforpostgresql.Server(
            server_name,
            resource_group_name=spec['resource_group'],
            server_name=server_name,
            sku=dbforpostgresql.SkuArgs(
                name=tier_config['sku'],
                tier=tier_config['tier'],
            ),
            properties=dbforpostgresql.ServerPropertiesForDefaultCreateArgs(
                administrator_login="platformadmin",
                administrator_login_password=admin_password,
                create_mode="Default",
                storage_profile=dbforpostgresql.StorageProfileArgs(
                    backup_retention_days=7,
                    geo_redundant_backup="Disabled",
                    storage_mb=tier_config['storage_mb'],
                ),
                version="13",
            ),
        )

        return {
            'resource_id': server_name,
            'status': 'provisioning',
            'connection_info': {
                'host': f"{server_name}.postgres.database.azure.com",
                'port': 5432,
                'database': 'postgres',
            },
        }

    def deprovision(self, resource_id: str):
        """Not supported: servers are removed by dropping them from the Pulumi program.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "PostgreSQL servers are managed declaratively through Pulumi; "
            "remove the resource from the program and run an update to delete it"
        )

    def get_status(self, resource_id: str) -> dict:
        """Not supported: this provisioner does not query Azure for server state.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError(
            "Status lookup is not implemented for Pulumi-managed PostgreSQL servers"
        )

    def _get_tier_config(self, tier: str) -> dict:
        """Return a copy of the SKU settings for *tier* (default tier if unknown)."""
        config = self._TIER_CONFIG
        return dict(config.get(tier, config[self._DEFAULT_TIER]))

    def _generate_password(self, length: int = 32) -> str:
        """Return a random password containing lower, upper and digit characters.

        Uses the ``secrets`` module so the value is suitable for credentials.
        """
        import secrets
        import string

        if length < 8:
            raise ValueError("length must be at least 8")

        alphabet = string.ascii_letters + string.digits
        while True:
            candidate = ''.join(secrets.choice(alphabet) for _ in range(length))
            # Retry until every required character class is present.
            if (
                any(ch.islower() for ch in candidate)
                and any(ch.isupper() for ch in candidate)
                and any(ch.isdigit() for ch in candidate)
            ):
                return candidate
Developer Portal
// Developer portal API
import express from 'express';
import { ServiceCatalog } from './catalog';
import { ResourceManager } from './resources';
const app = express();
const catalog = new ServiceCatalog();
const resourceManager = new ResourceManager();

// Parse JSON request bodies; without this middleware `req.body` is not
// populated and the POST handler below cannot read its fields.
app.use(express.json());

// Express 4 does not catch rejected promises from async handlers, so a
// failed await would leave the request hanging. Forward rejections to the
// error-handling middleware instead.
const asyncHandler = (handler) => (req, res, next) =>
  Promise.resolve(handler(req, res, next)).catch(next);

// List available services
app.get('/api/catalog', (req, res) => {
  const services = catalog.listServices();
  res.json(services);
});

// Request a new resource
// NOTE(review): `validateRequest` and `serviceRegistry` are not defined or
// imported in this snippet; they must be provided elsewhere in the module.
app.post('/api/resources', asyncHandler(async (req, res) => {
  const body = req.body ?? {};
  const { serviceType, tier, team, name, config } = body;

  // Validate request
  const validation = await validateRequest(body);
  if (!validation.valid) {
    return res.status(400).json({ error: validation.errors });
  }

  // Check team quota
  const quota = await resourceManager.checkQuota(team, serviceType);
  if (!quota.available) {
    return res.status(403).json({ error: 'Quota exceeded' });
  }

  // Provision resource
  const resource = await resourceManager.provision({
    serviceType,
    tier,
    team,
    name,
    config
  });

  // Record in service registry
  await serviceRegistry.register(resource);

  res.json(resource);
}));

// Get resource status
app.get('/api/resources/:id', asyncHandler(async (req, res) => {
  const resource = await resourceManager.getResource(req.params.id);
  if (!resource) {
    return res.status(404).json({ error: 'Resource not found' });
  }
  res.json(resource);
}));

// List team resources
app.get('/api/teams/:team/resources', asyncHandler(async (req, res) => {
  const resources = await resourceManager.listByTeam(req.params.team);
  res.json(resources);
}));

// Deprovision resource
app.delete('/api/resources/:id', asyncHandler(async (req, res) => {
  await resourceManager.deprovision(req.params.id);
  res.status(204).send();
}));

// Error handler: registered last so it receives errors forwarded by the
// routes above. Details are logged, not returned to the client.
app.use((err, req, res, next) => {
  if (res.headersSent) {
    return next(err);
  }
  console.error(err);
  res.status(500).json({ error: 'Internal server error' });
});
Golden Paths with Templates
# Application template - golden path for microservices
apiVersion: scaffolder.backstage.io/v1beta3
kind: Template
metadata:
  name: microservice-template
  title: Microservice Template
  description: Create a new microservice with all standard configurations
  tags:
    - recommended
    - microservice
spec:
  owner: platform-team
  type: service

  # Each list entry below describes one group of input fields.
  parameters:
    - title: Service Information
      required:
        - name
        - team
        - description
      properties:
        name:
          title: Service Name
          type: string
          # Lowercase letters, digits and dashes only.
          pattern: '^[a-z0-9-]+$'
        team:
          title: Team
          type: string
          enum:
            - commerce
            - customer
            - payments
            - platform
        description:
          title: Description
          type: string
    - title: Technical Configuration
      properties:
        language:
          title: Programming Language
          type: string
          enum:
            - python
            - go
            - java
            - typescript
          default: python
        database:
          title: Database
          type: string
          enum:
            - none
            - postgresql
            - mongodb
          default: none
        caching:
          title: Redis Cache
          type: boolean
          default: false

  # Step ids contain dashes, so their outputs are read with bracket
  # notation (steps['create-repo']); dotted access would be parsed as a
  # subtraction by the template engine.
  steps:
    - id: fetch-template
      name: Fetch Template
      action: fetch:template
      input:
        url: ./templates/${{ parameters.language }}
        values:
          name: ${{ parameters.name }}
          team: ${{ parameters.team }}
          description: ${{ parameters.description }}
    - id: create-repo
      name: Create Repository
      action: publish:github
      input:
        repoUrl: github.com?owner=${{ parameters.team }}&repo=${{ parameters.name }}
        description: ${{ parameters.description }}
    - id: setup-pipeline
      name: Setup CI/CD Pipeline
      action: github:actions:create
      input:
        # publish:github exposes no `repoUrl` output, so the same repoUrl
        # value given to create-repo is passed here.
        # NOTE(review): confirm the format this custom action expects.
        repoUrl: github.com?owner=${{ parameters.team }}&repo=${{ parameters.name }}
        template: standard-pipeline
    - id: register-service
      name: Register in Catalog
      action: catalog:register
      input:
        repoContentsUrl: ${{ steps['create-repo'].output.repoContentsUrl }}
        catalogInfoPath: /catalog-info.yaml
    - id: provision-infrastructure
      name: Provision Infrastructure
      action: platform:provision
      input:
        team: ${{ parameters.team }}
        service: ${{ parameters.name }}
        database: ${{ parameters.database }}
        caching: ${{ parameters.caching }}

  output:
    links:
      - title: Repository
        url: ${{ steps['create-repo'].output.remoteUrl }}
      - title: Pipeline
        url: ${{ steps['setup-pipeline'].output.pipelineUrl }}
      - title: Service Catalog
        url: /catalog/services/${{ parameters.name }}
Platform Metrics
# Platform health and usage metrics
from prometheus_client import Counter, Histogram, Gauge
# --- Provisioning metrics ---------------------------------------------------

# Count of provisioning requests, split by service type, tier and status.
provisions_total = Counter(
    name='platform_provisions_total',
    documentation='Total resource provisions',
    labelnames=('service_type', 'tier', 'status'),
)

# Provisioning latency in seconds; buckets span 10 seconds to 10 minutes.
provision_duration = Histogram(
    name='platform_provision_duration_seconds',
    documentation='Time to provision resources',
    labelnames=('service_type',),
    buckets=(10, 30, 60, 120, 300, 600),
)

# --- Usage metrics ----------------------------------------------------------

# Current number of active resources per service type and team.
active_resources = Gauge(
    name='platform_active_resources',
    documentation='Number of active resources',
    labelnames=('service_type', 'team'),
)

# Utilization per resource and metric name.
# NOTE(review): labelling by resource_id yields one series per resource;
# confirm the expected cardinality is acceptable.
resource_utilization = Gauge(
    name='platform_resource_utilization',
    documentation='Resource utilization percentage',
    labelnames=('resource_id', 'metric'),
)

# --- Developer experience metrics -------------------------------------------

# Onboarding-to-production time in hours; buckets span 1 hour to 168 hours.
time_to_first_deploy = Histogram(
    name='platform_time_to_first_deploy_hours',
    documentation='Time from onboarding to first production deploy',
    labelnames=('team',),
    buckets=(1, 4, 8, 24, 48, 168),
)

# Single unlabelled gauge for the self-service success rate.
self_service_success_rate = Gauge(
    name='platform_self_service_success_rate',
    documentation='Percentage of self-service requests completed without support',
)
Key Platform Engineering Principles
- Product Mindset: Treat the platform as a product with developer customers
- Self-Service: Enable teams to provision without tickets
- Golden Paths: Make the right way the easy way
- Documentation: Comprehensive, up-to-date documentation
- Measure Success: Track developer productivity metrics
Platform engineering emerged in 2021 as an answer to the challenge of scaling DevOps practices. By building platforms that abstract away infrastructure complexity, organizations enable faster, safer software delivery.