December 28, 2021 1 min read

Platform Engineering: Building the Foundation for Developer Productivity

Tags: Platform Engineering, DevOps, Developer Experience, Infrastructure, Cloud

Platform engineering emerged as a discipline in 2021, focused on building internal developer platforms (IDPs) that enable self-service infrastructure and improve developer productivity.

What is Platform Engineering?

Platform engineering is about creating golden paths — paved roads that make it easy for developers to do the right thing. It’s infrastructure as a product, with developers as customers.

Building an Internal Developer Platform

# Platform API definition: a declarative request describing one application
# and the platform resources it asks for. How each field is interpreted is up
# to the platform controller, which is not shown here.
apiVersion: platform.company.com/v1
kind: Application
metadata:
  # Application name and owning team.
  name: order-service
  team: commerce
spec:
  runtime: kubernetes
  # Lower and upper bound on the replica count.
  replicas:
    min: 2
    max: 10
  # CPU and memory quantities, given as quoted strings.
  resources:
    cpu: "500m"
    memory: "1Gi"
  # Scaling signal and its target value.
  # NOTE(review): presumably 70 means 70% CPU utilization - confirm.
  scaling:
    metric: cpu
    target: 70
  # Ingress is requested for the listed domain.
  networking:
    ingress: true
    domain: orders.company.com
  # Backing data stores; each entry names an instance, its type and a size.
  databases:
    - name: orders-db
      type: postgresql
      size: small
  caching:
    - name: orders-cache
      type: redis
      size: small
  # Monitoring switch plus alert rules (name, condition expression, severity).
  monitoring:
    enabled: true
    alerts:
      - name: high-error-rate
        condition: error_rate > 1%
        severity: warning
      - name: service-down
        condition: availability < 99%
        severity: critical
  # Only the vault name and key names are listed here; no secret values
  # appear in the manifest.
  secrets:
    vault: commerce-vault
    keys:
      - DB_PASSWORD
      - API_KEY

Platform Service Catalog

from dataclasses import dataclass
from typing import Dict, List, Optional
from enum import Enum

class ServiceType(Enum):
    """Category a catalog offering belongs to.

    Each member's value is the lowercase string form of its name.
    """

    COMPUTE = "compute"
    DATABASE = "database"
    MESSAGING = "messaging"
    STORAGE = "storage"
    MONITORING = "monitoring"
    SECURITY = "security"

@dataclass
class ServiceOffering:
    """One entry in the platform service catalog."""

    # Human-readable display name, e.g. 'PostgreSQL Database'.
    name: str
    # Category used to filter offerings when listing the catalog.
    type: ServiceType
    # One-line summary of what the offering provides.
    description: str
    # Tier name (e.g. 'small') -> tier settings; the keys inside each tier
    # dict differ from one offering to another.
    tiers: Dict[str, dict]
    # Service-level targets as free-form key/value pairs.
    sla: dict
    # Billing description as free-form key/value pairs.
    cost_model: dict

class ServiceCatalog:
    """In-memory catalog of the service offerings the platform exposes.

    Offerings are stored in ``self.services`` keyed by a short slug such as
    ``'postgresql'``. The catalog is filled once, at construction time, with
    the standard offerings defined in :meth:`_initialize_catalog`.
    """

    def __init__(self):
        # Slug -> offering. Dict insertion order is the order in which
        # list_services() returns offerings.
        self.services: Dict[str, ServiceOffering] = {}
        self._initialize_catalog()

    def _initialize_catalog(self):
        """Register the standard offerings: namespace, PostgreSQL and Redis."""

        # Kubernetes namespace: tier values are quantity strings.
        self.services['kubernetes-namespace'] = ServiceOffering(
            name='Kubernetes Namespace',
            type=ServiceType.COMPUTE,
            description='Isolated Kubernetes namespace with RBAC and resource quotas',
            tiers={
                'small': {
                    'cpu_limit': '4',
                    'memory_limit': '8Gi',
                    'storage_limit': '50Gi'
                },
                'medium': {
                    'cpu_limit': '16',
                    'memory_limit': '32Gi',
                    'storage_limit': '200Gi'
                },
                'large': {
                    'cpu_limit': '64',
                    'memory_limit': '128Gi',
                    'storage_limit': '1Ti'
                }
            },
            sla={'availability': '99.9%', 'support': '24x7'},
            cost_model={'type': 'resource_based', 'unit': 'per_hour'}
        )

        # PostgreSQL database: tier values are plain numbers.
        self.services['postgresql'] = ServiceOffering(
            name='PostgreSQL Database',
            type=ServiceType.DATABASE,
            description='Managed PostgreSQL with automatic backups and monitoring',
            tiers={
                'small': {'vcores': 2, 'memory_gb': 4, 'storage_gb': 100},
                'medium': {'vcores': 4, 'memory_gb': 16, 'storage_gb': 500},
                'large': {'vcores': 16, 'memory_gb': 64, 'storage_gb': 2000}
            },
            sla={'availability': '99.95%', 'rpo': '5min', 'rto': '1hour'},
            cost_model={'type': 'tier_based', 'billing': 'monthly'}
        )

        # Redis cache: registered under the DATABASE category.
        self.services['redis'] = ServiceOffering(
            name='Redis Cache',
            type=ServiceType.DATABASE,
            description='Managed Redis for caching and session storage',
            tiers={
                'small': {'memory_gb': 1, 'connections': 1000},
                'medium': {'memory_gb': 6, 'connections': 5000},
                'large': {'memory_gb': 26, 'connections': 20000}
            },
            sla={'availability': '99.9%', 'latency_p99': '1ms'},
            cost_model={'type': 'tier_based', 'billing': 'monthly'}
        )

    def get_service(self, name: str) -> Optional[ServiceOffering]:
        """Return the offering registered under slug ``name``, or ``None``."""
        return self.services.get(name)

    def list_services(self, service_type: Optional[ServiceType] = None) -> List[ServiceOffering]:
        """Return offerings in registration order, optionally filtered.

        Args:
            service_type: When given, only offerings whose ``type`` equals
                this value are returned. ``None`` (the default) returns
                every offering.

        Returns:
            A new list; mutating it does not affect the catalog.
        """
        # Compare against None explicitly so the filter never depends on the
        # truthiness of the value passed in.
        if service_type is None:
            return list(self.services.values())
        return [s for s in self.services.values() if s.type == service_type]

Self-Service Provisioning

from abc import ABC, abstractmethod
import pulumi
from pulumi_azure_native import resources, containerservice, dbforpostgresql

class ResourceProvisioner(ABC):
    """Interface that every platform resource provisioner implements.

    All three methods are abstract, so a concrete subclass must override
    each of them before it can be instantiated.
    """

    @abstractmethod
    def provision(self, spec: dict) -> dict:
        """Create the resource described by ``spec`` and return a result dict."""

    @abstractmethod
    def deprovision(self, resource_id: str):
        """Remove the resource identified by ``resource_id``."""

    @abstractmethod
    def get_status(self, resource_id: str) -> dict:
        """Return a status dict for the resource identified by ``resource_id``."""

class KubernetesNamespaceProvisioner(ResourceProvisioner):
    """Provision Kubernetes namespaces with a labelled namespace and a quota.

    The namespace name, ``"<team>-<name>"``, doubles as the resource id that
    :meth:`deprovision` and :meth:`get_status` accept.

    NOTE(review): ``provision`` calls ``self._create_network_policy`` and
    ``self._create_rbac``, which are not defined in this class as shown;
    they must be supplied before ``provision`` can run to completion.
    """

    def _core_api(self):
        """Load the local kubeconfig and return a ``CoreV1Api`` client."""
        from kubernetes import client, config

        config.load_kube_config()
        return client.CoreV1Api()

    def provision(self, spec: dict) -> dict:
        """Create the namespace, its resource quota, network policy and RBAC.

        Args:
            spec: Must contain ``'team'`` and ``'name'``; ``'tier'`` is
                optional and defaults to ``'small'``.

        Returns:
            A dict with ``resource_id``, ``status`` (``'provisioned'``) and
            ``connection_info``.

        Raises:
            KeyError: If ``'team'`` or ``'name'`` is missing from ``spec``.
        """
        from kubernetes import client

        v1 = self._core_api()

        team = spec['team']
        # Resolve the tier once so the label and the quota use the same input.
        tier = spec.get('tier', 'small')
        namespace_name = f"{team}-{spec['name']}"

        # Create namespace
        namespace = client.V1Namespace(
            metadata=client.V1ObjectMeta(
                name=namespace_name,
                labels={
                    'team': team,
                    'managed-by': 'platform',
                    # NOTE(review): an unrecognised tier is still recorded
                    # here verbatim, while the quota below falls back to the
                    # 'small' limits.
                    'tier': tier
                }
            )
        )
        v1.create_namespace(namespace)

        # Create resource quota
        tier_limits = self._get_tier_limits(tier)
        quota = client.V1ResourceQuota(
            metadata=client.V1ObjectMeta(name='default-quota'),
            spec=client.V1ResourceQuotaSpec(
                hard={
                    'requests.cpu': tier_limits['cpu_limit'],
                    'requests.memory': tier_limits['memory_limit'],
                    'persistentvolumeclaims': '10'
                }
            )
        )
        v1.create_namespaced_resource_quota(namespace_name, quota)

        # Create network policy
        self._create_network_policy(namespace_name)

        # Create RBAC
        self._create_rbac(namespace_name, team)

        return {
            'resource_id': namespace_name,
            'status': 'provisioned',
            'connection_info': {
                'namespace': namespace_name,
                'cluster': 'production'
            }
        }

    def deprovision(self, resource_id: str):
        """Delete the namespace whose name is ``resource_id``.

        Errors reported by the Kubernetes API (for example, the namespace
        not existing) propagate to the caller unchanged.
        """
        self._core_api().delete_namespace(resource_id)

    def get_status(self, resource_id: str) -> dict:
        """Return the lifecycle phase of the namespace named ``resource_id``.

        Returns:
            A dict with ``resource_id`` and ``status``, where ``status`` is
            the phase string reported on the namespace object.
        """
        namespace = self._core_api().read_namespace(resource_id)
        return {
            'resource_id': resource_id,
            'status': namespace.status.phase
        }

    def _get_tier_limits(self, tier: str) -> dict:
        """Return CPU/memory limits for ``tier``; unknown tiers get 'small'."""
        limits = {
            'small': {'cpu_limit': '4', 'memory_limit': '8Gi'},
            'medium': {'cpu_limit': '16', 'memory_limit': '32Gi'},
            'large': {'cpu_limit': '64', 'memory_limit': '128Gi'}
        }
        return limits.get(tier, limits['small'])


class PostgreSQLProvisioner(ResourceProvisioner):
    """Provision PostgreSQL servers by declaring a Pulumi Azure resource.

    NOTE(review): ``_get_tier_config`` and ``_generate_password`` are called
    below but are not defined in this class as shown. ``deprovision`` and
    ``get_status`` are not overridden here either, so the class cannot be
    instantiated until they are.
    """

    def provision(self, spec: dict) -> dict:
        """Declare a PostgreSQL server for ``spec`` and return how to reach it.

        Reads ``'team'``, ``'name'`` and ``'resource_group'`` from ``spec``;
        ``'tier'`` is optional and defaults to ``'small'``. The returned
        status is ``'provisioning'``.
        """
        import pulumi
        from pulumi_azure_native import dbforpostgresql

        server_name = f"psql-{spec['team']}-{spec['name']}"
        tier_config = self._get_tier_config(spec.get('tier', 'small'))

        # Build the pieces of the server declaration one at a time, in the
        # same order the original keyword arguments were evaluated.
        resource_group = spec['resource_group']
        sku = dbforpostgresql.SkuArgs(
            name=tier_config['sku'],
            tier=tier_config['tier']
        )
        # NOTE(review): the generated password is passed to the server only;
        # it is not returned or stored anywhere in the visible code.
        admin_password = self._generate_password()
        storage_profile = dbforpostgresql.StorageProfileArgs(
            backup_retention_days=7,
            geo_redundant_backup="Disabled",
            storage_mb=tier_config['storage_mb']
        )
        properties = dbforpostgresql.ServerPropertiesForDefaultCreateArgs(
            administrator_login="platformadmin",
            administrator_login_password=admin_password,
            create_mode="Default",
            storage_profile=storage_profile,
            version="13"
        )

        # The resource object itself is not used further in this method.
        dbforpostgresql.Server(
            server_name,
            resource_group_name=resource_group,
            server_name=server_name,
            sku=sku,
            properties=properties
        )

        connection_info = {
            'host': f"{server_name}.postgres.database.azure.com",
            'port': 5432,
            'database': 'postgres'
        }
        return {
            'resource_id': server_name,
            'status': 'provisioning',
            'connection_info': connection_info
        }

Developer Portal

// Developer portal API
import express from 'express';
import { ServiceCatalog } from './catalog';
import { ResourceManager } from './resources';

const app = express();
// Parse JSON request bodies so handlers can read req.body; without a body
// parser Express leaves req.body undefined and the POST handler's
// destructuring of it would throw.
app.use(express.json());

const catalog = new ServiceCatalog();
const resourceManager = new ResourceManager();

// Catalog listing: respond with every service the catalog reports.
app.get('/api/catalog', (_req, res) => {
    res.json(catalog.listServices());
});

// Request a new resource.
// Responds 400 when validation fails, 403 when the team's quota is
// exhausted, and otherwise with the provisioned resource as JSON.
// NOTE(review): validateRequest and serviceRegistry are not imported or
// defined in the visible source - confirm where they come from.
app.post('/api/resources', async (req, res, next) => {
    try {
        const { serviceType, tier, team, name, config } = req.body;

        // Validate request
        const validation = await validateRequest(req.body);
        if (!validation.valid) {
            return res.status(400).json({ error: validation.errors });
        }

        // Check team quota
        const quota = await resourceManager.checkQuota(team, serviceType);
        if (!quota.available) {
            return res.status(403).json({ error: 'Quota exceeded' });
        }

        // Provision resource
        const resource = await resourceManager.provision({
            serviceType,
            tier,
            team,
            name,
            config
        });

        // Record in service registry.
        // NOTE(review): if this fails the resource has already been
        // provisioned but is not registered.
        await serviceRegistry.register(resource);

        res.json(resource);
    } catch (err) {
        // Hand failures from any awaited call to Express's error handling
        // instead of leaving the rejection unhandled and the request open.
        next(err);
    }
});

// Get resource status: 404 when the id is unknown, otherwise the resource.
app.get('/api/resources/:id', async (req, res, next) => {
    try {
        const resource = await resourceManager.getResource(req.params.id);
        if (!resource) {
            return res.status(404).json({ error: 'Resource not found' });
        }
        res.json(resource);
    } catch (err) {
        // Forward lookup failures to Express's error handling.
        next(err);
    }
});

// List team resources: respond with whatever the resource manager returns
// for the team named in the path.
app.get('/api/teams/:team/resources', async (req, res, next) => {
    try {
        const resources = await resourceManager.listByTeam(req.params.team);
        res.json(resources);
    } catch (err) {
        // Forward listing failures to Express's error handling.
        next(err);
    }
});

// Deprovision resource: respond 204 (no body) once deprovisioning resolves.
app.delete('/api/resources/:id', async (req, res, next) => {
    try {
        await resourceManager.deprovision(req.params.id);
        res.status(204).send();
    } catch (err) {
        // Forward deprovisioning failures to Express's error handling so the
        // client receives an error response instead of a hung request.
        next(err);
    }
});

Golden Paths with Templates

# Application template - golden path for microservices.
# Collects service details from the user, then runs the steps below in order:
# fetch the language skeleton, publish a repository, set up the pipeline,
# register the service in the catalog and request infrastructure.
apiVersion: scaffolder.backstage.io/v1beta3
kind: Template
metadata:
  name: microservice-template
  title: Microservice Template
  description: Create a new microservice with all standard configurations
  tags:
    - recommended
    - microservice
spec:
  owner: platform-team
  type: service

  # Form pages shown to the user; the values are available to the steps as
  # parameters.<property>.
  parameters:
    - title: Service Information
      required:
        - name
        - team
        - description
      properties:
        name:
          title: Service Name
          type: string
          # Lowercase letters, digits and hyphens only.
          pattern: '^[a-z0-9-]+$'
        team:
          title: Team
          type: string
          enum:
            - commerce
            - customer
            - payments
            - platform
        description:
          title: Description
          type: string

    - title: Technical Configuration
      properties:
        language:
          title: Programming Language
          type: string
          enum:
            - python
            - go
            - java
            - typescript
          default: python
        database:
          title: Database
          type: string
          enum:
            - none
            - postgresql
            - mongodb
          default: none
        caching:
          title: Redis Cache
          type: boolean
          default: false

  steps:
    # The skeleton directory is chosen by the selected language.
    - id: fetch-template
      name: Fetch Template
      action: fetch:template
      input:
        url: ./templates/${{ parameters.language }}
        values:
          name: ${{ parameters.name }}
          team: ${{ parameters.team }}
          description: ${{ parameters.description }}

    # The team name is used as the GitHub owner of the new repository.
    - id: create-repo
      name: Create Repository
      action: publish:github
      input:
        repoUrl: github.com?owner=${{ parameters.team }}&repo=${{ parameters.name }}
        description: ${{ parameters.description }}

    # Step ids that contain a hyphen are referenced with bracket notation:
    # inside ${{ }} the dotted form steps.create-repo would be read as a
    # subtraction, not as a property lookup.
    # NOTE(review): confirm that the create-repo step exposes a `repoUrl`
    # output; the other references below use remoteUrl / repoContentsUrl.
    - id: setup-pipeline
      name: Setup CI/CD Pipeline
      action: github:actions:create
      input:
        repoUrl: ${{ steps['create-repo'].output.repoUrl }}
        template: standard-pipeline

    - id: register-service
      name: Register in Catalog
      action: catalog:register
      input:
        repoContentsUrl: ${{ steps['create-repo'].output.repoContentsUrl }}
        catalogInfoPath: /catalog-info.yaml

    # Passes the user's database and caching choices to the platform action.
    - id: provision-infrastructure
      name: Provision Infrastructure
      action: platform:provision
      input:
        team: ${{ parameters.team }}
        service: ${{ parameters.name }}
        database: ${{ parameters.database }}
        caching: ${{ parameters.caching }}

  # Links presented to the user once the steps have run.
  output:
    links:
      - title: Repository
        url: ${{ steps['create-repo'].output.remoteUrl }}
      - title: Pipeline
        url: ${{ steps['setup-pipeline'].output.pipelineUrl }}
      - title: Service Catalog
        url: /catalog/services/${{ parameters.name }}

Platform Metrics

# Platform health and usage metrics
from prometheus_client import Counter, Histogram, Gauge

# Provisioning metrics: a counter of provisions and a histogram of how long
# they take, with second-based bucket boundaries from 10 to 600.
provisions_total = Counter(
    name='platform_provisions_total',
    documentation='Total resource provisions',
    labelnames=['service_type', 'tier', 'status'],
)

provision_duration = Histogram(
    name='platform_provision_duration_seconds',
    documentation='Time to provision resources',
    labelnames=['service_type'],
    buckets=[10, 30, 60, 120, 300, 600],
)

# Usage metrics: gauges for the number of active resources (per service type
# and team) and for utilization (per resource id and metric name).
active_resources = Gauge(
    name='platform_active_resources',
    documentation='Number of active resources',
    labelnames=['service_type', 'team'],
)

resource_utilization = Gauge(
    name='platform_resource_utilization',
    documentation='Resource utilization percentage',
    labelnames=['resource_id', 'metric'],
)

# Developer experience metrics: time to first deploy is recorded in hours
# (buckets run from 1 hour to 168 hours); the success rate is a single
# unlabelled gauge.
time_to_first_deploy = Histogram(
    name='platform_time_to_first_deploy_hours',
    documentation='Time from onboarding to first production deploy',
    labelnames=['team'],
    buckets=[1, 4, 8, 24, 48, 168],
)

self_service_success_rate = Gauge(
    name='platform_self_service_success_rate',
    documentation='Percentage of self-service requests completed without support',
)

Key Platform Engineering Principles

- Product Mindset: Treat the platform as a product, with developers as its customers
- Self-Service: Enable teams to provision resources without filing tickets
- Golden Paths: Make the right way the easy way
- Documentation: Keep documentation comprehensive and up to date
- Measure Success: Track developer productivity metrics

In 2021, platform engineering emerged as an answer to the problem of scaling DevOps practices. By building platforms that abstract away complexity, organizations enable faster, safer software delivery.