Back to Blog
3 min read

Azure Purview: Unified Data Governance

Azure Purview provides unified data governance across on-premises, multi-cloud, and SaaS data. It offers automated data discovery, sensitive data classification, and end-to-end data lineage.

Create Purview Account

# Create Purview account
# Creates the account "my-purview" in resource group "myRG", region eastus.
# --managed-group-name sets the name of the account's managed resource group
# (presumably the group Purview provisions for its own backing resources —
# confirm against the CLI reference).
# NOTE(review): `az purview` appears to ship as a CLI extension; verify it is
# installed before running this command.
az purview account create \
    --name my-purview \
    --resource-group myRG \
    --location eastus \
    --managed-group-name purview-managed-rg

Register Data Sources

from azure.identity import DefaultAzureCredential
from azure.purview.scanning import PurviewScanningClient
from azure.purview.catalog import PurviewCatalogClient

# One credential object is reused by every Purview client created below.
credential = DefaultAzureCredential()

# Scanning client for data source registration.
# The scanning data plane is served from the account's "scan" sub-domain
# (https://<account>.scan.purview.azure.com). The bare account endpoint
# (https://<account>.purview.azure.com) belongs to the catalog client.
scanning_client = PurviewScanningClient(
    endpoint="https://my-purview.scan.purview.azure.com",
    credential=credential
)

# Register Azure SQL Database
# "collection" places the source in a Purview collection; here it references
# the collection named after the account itself.
sql_source = {
    "kind": "AzureSqlDatabase",
    "properties": {
        "serverEndpoint": "myserver.database.windows.net",
        "resourceGroup": "myRG",
        "subscriptionId": "subscription-id",
        "resourceName": "myserver",
        "resourceId": "/subscriptions/.../Microsoft.Sql/servers/myserver",
        "collection": {
            "referenceName": "my-purview",
            "type": "CollectionReference"
        }
    }
}

# The first argument is the data source name that later scan calls refer to.
scanning_client.data_sources.create_or_update("azure-sql-source", sql_source)

Configure and Run Scans

import uuid

# Create scan rule set
# The rule set is registered first so the scan definition below can refer to
# it by name; building the dict alone does not send anything to the service.
scan_ruleset = {
    "kind": "AzureSqlDatabase",
    "properties": {
        "scanningRule": {
            "fileExtensions": [],
            "customClassificationRules": [],
            "systemClassificationRuleNames": [
                "MICROSOFT.FINANCIAL.US.SSN",
                "MICROSOFT.FINANCIAL.CREDIT_CARD_NUMBER",
                "MICROSOFT.PERSONAL.EMAIL",
                "MICROSOFT.PERSONAL.PHONE_NUMBER"
            ]
        }
    }
}

scanning_client.scan_rulesets.create_or_update("sql-pii-ruleset", scan_ruleset)

# Create scan
# "scanRulesetName"/"scanRulesetType" bind the scan to the custom rule set
# registered above, so only the listed classification rules are applied.
scan_definition = {
    "kind": "AzureSqlDatabaseCredential",
    "properties": {
        "credential": {
            "referenceName": "sql-credential",
            "credentialType": "SqlAuth"
        },
        "serverEndpoint": "myserver.database.windows.net",
        "databaseName": "SalesDB",
        "scanRulesetName": "sql-pii-ruleset",
        "scanRulesetType": "Custom"
    }
}

scanning_client.scans.create_or_update(
    "azure-sql-source",
    "weekly-scan",
    scan_definition
)

# Run scan
# Every run needs its own identifier; a fresh UUID keeps repeated executions
# of this script from reusing the ID of an earlier run.
run_id = str(uuid.uuid4())

scanning_client.scan_result.run_scan(
    "azure-sql-source",
    "weekly-scan",
    run_id
)

Search Data Catalog

# Catalog client, authenticated with the credential created earlier.
catalog_client = PurviewCatalogClient(
    credential=credential,
    endpoint="https://my-purview.purview.azure.com",
)

# Search for assets: at most 25 hits for the keyword "customer", restricted to
# tables that carry the e-mail classification.
classified_tables = {
    "and": [
        {"objectType": "Tables"},
        {"classification": "MICROSOFT.PERSONAL.EMAIL"},
    ]
}
search_results = catalog_client.discovery.query(
    search_request={
        "keywords": "customer",
        "filter": classified_tables,
        "limit": 25,
    }
)

# Print one short record per hit, followed by a separator line.
for hit in search_results["value"]:
    for label, key in (("Name", "name"), ("Type", "entityType")):
        print(f"{label}: {hit[key]}")
    print(f"Classifications: {hit.get('classification', [])}")
    print("---")

Data Lineage

# Get lineage for an asset
asset_guid = "asset-guid-here"

lineage = catalog_client.lineage.get_lineage_graph(
    guid=asset_guid,
    direction="BOTH",
    depth=3
)

# With direction="BOTH" the entity map holds upstream nodes, downstream nodes
# and the asset itself, so the map alone cannot tell which nodes are sources.
# Index the edges by their target so the graph can be walked backwards.
producers = {}
for relation in lineage.get("relations", []):
    producers.setdefault(relation["toEntityId"], []).append(
        relation["fromEntityId"]
    )

# Collect every node that (transitively) feeds the asset. The visited set
# doubles as cycle protection.
upstream_guids = set()
pending = [asset_guid]
while pending:
    current = pending.pop()
    for parent in producers.get(current, []):
        if parent != asset_guid and parent not in upstream_guids:
            upstream_guids.add(parent)
            pending.append(parent)

print("Upstream sources:")
for guid, node in lineage["guidEntityMap"].items():
    # Only data assets are listed; intermediate nodes of other types that sit
    # between two assets are skipped.
    if guid in upstream_guids and node["typeName"] in ("azure_sql_table", "azure_blob_path"):
        print(f"  {node['attributes']['name']}")

# Lineage is automatically captured from:
# - Azure Data Factory pipelines
# - Synapse pipelines
# - Databricks notebooks (with connector)

Glossary Terms

# Create glossary term
# "anchor" ties the term to the glossary it belongs to.
term = {
    "name": "Customer PII",
    "qualifiedName": "customer-pii@Glossary",
    "anchor": {
        "glossaryGuid": "glossary-guid"
    },
    "status": "Approved",
    "longDescription": "Personal identifiable information related to customers",
    "abbreviation": "PII"
}

# Keep the response: the GUID assigned to the new term is needed to attach
# the term to assets.
created_term = catalog_client.glossary.create_glossary_term(term)

# Assign term to asset
# Glossary terms are linked through the glossary API. Adding a classification
# whose typeName is the term's display name would not attach the term.
catalog_client.glossary.assign_term_to_entities(
    created_term["guid"],
    [{"guid": asset_guid}]
)

Access Policies

{
    "name": "data-reader-policy",
    "decisionRules": [
        {
            "effect": "Permit",
            "dnfCondition": [[
                {
                    "attributeName": "resource.collection",
                    "attributeValueIncludes": "finance-collection"
                },
                {
                    "attributeName": "subject.group",
                    "attributeValueIncludes": "finance-analysts"
                }
            ]]
        }
    ]
}

Insights and Reporting

# Ask the catalog for facet counts over every asset ("*"): up to 10 buckets
# for each of the three facet fields below.
facet_fields = ("classification", "objectType", "assetType")
insights = catalog_client.discovery.query(
    search_request={
        "keywords": "*",
        "facets": [{"facet": field, "count": 10} for field in facet_fields],
    }
)

# Report how many assets fall under each classification.
print("Classification distribution:")
classification_buckets = insights["@search.facets"]["classification"]
for bucket in classification_buckets:
    print(f"  {bucket['value']}: {bucket['count']}")

Azure Purview: know your data, govern your data.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.