5 min read
Microsoft Purview and Fabric: Enterprise Data Governance
Microsoft Purview and Fabric: Enterprise Data Governance
Microsoft Purview provides enterprise-wide data governance that extends beyond Fabric to cover your entire data estate. When combined with Fabric, you get comprehensive visibility, cataloging, and compliance capabilities.
Purview Integration Overview
from dataclasses import dataclass
from typing import List, Dict, Optional
from enum import Enum
class PurviewCapability(Enum):
    """Governance capabilities Purview can provide for a Fabric workspace.

    Values are the human-readable capability names used in configuration.
    """

    DATA_CATALOG = "Data Catalog"
    DATA_MAP = "Data Map"
    DATA_LINEAGE = "Data Lineage"
    DATA_CLASSIFICATION = "Data Classification"
    SENSITIVITY_LABELS = "Sensitivity Labels"
    DATA_QUALITY = "Data Quality"
    BUSINESS_GLOSSARY = "Business Glossary"
    DATA_SHARING = "Data Sharing"
@dataclass
class PurviewFabricIntegration:
    """Binds one Fabric workspace to a Purview account and the capabilities enabled for it."""

    # Fabric workspace display name
    fabric_workspace: str
    # Purview account name (the prefix of the *.purview.azure.com endpoint)
    purview_account: str
    # Which Purview features are enabled for this workspace
    capabilities: List[PurviewCapability]
    # Whether Purview scans the workspace automatically
    auto_scan: bool
    # Free-form cadence string, e.g. "daily" -- NOTE(review): accepted values not validated here
    scan_frequency: str
# Example integration configuration: a production sales workspace wired to the
# company Purview account with cataloging, mapping, lineage, classification,
# and sensitivity labels enabled, scanned automatically every day.
integration = PurviewFabricIntegration(
    fabric_workspace="Sales-Analytics-Prod",
    purview_account="company-purview",
    capabilities=[
        PurviewCapability.DATA_CATALOG,
        PurviewCapability.DATA_MAP,
        PurviewCapability.DATA_LINEAGE,
        PurviewCapability.DATA_CLASSIFICATION,
        PurviewCapability.SENSITIVITY_LABELS
    ],
    auto_scan=True,
    scan_frequency="daily"
)
Automatic Asset Discovery
Purview automatically discovers and catalogs Fabric assets:
import requests
from azure.identity import DefaultAzureCredential
from typing import List
class PurviewClient:
    """Minimal REST client for the Microsoft Purview catalog APIs.

    Authenticates with DefaultAzureCredential and calls the account-scoped
    endpoint https://<account>.purview.azure.com.
    """

    def __init__(self, account_name: str, timeout: float = 30.0):
        """
        Args:
            account_name: Purview account name (host prefix).
            timeout: Per-request timeout in seconds. Without one, a stalled
                request would hang the caller indefinitely.
        """
        self.account_name = account_name
        self.base_url = f"https://{account_name}.purview.azure.com"
        self.credential = DefaultAzureCredential()
        self.timeout = timeout

    def _get_token(self) -> str:
        """Acquire a bearer token for the Purview resource scope."""
        token = self.credential.get_token("https://purview.azure.net/.default")
        return token.token

    def _auth_headers(self) -> Dict[str, str]:
        """Authorization header shared by every catalog request."""
        return {"Authorization": f"Bearer {self._get_token()}"}

    def search_assets(self, query: str, filter_type: Optional[str] = None) -> List[dict]:
        """Search for assets in the catalog.

        Args:
            query: Keyword search string.
            filter_type: Optional Atlas objectType to restrict results to.

        Returns:
            List of matching asset dicts (empty when nothing matches).

        Raises:
            requests.HTTPError: on a non-2xx response.
        """
        headers = {**self._auth_headers(), "Content-Type": "application/json"}
        body = {
            "keywords": query,
            "limit": 100
        }
        if filter_type:
            body["filter"] = {
                "objectType": filter_type
            }
        response = requests.post(
            f"{self.base_url}/catalog/api/search/query",
            headers=headers,
            json=body,
            timeout=self.timeout
        )
        # Fail loudly instead of silently parsing an error payload as results.
        response.raise_for_status()
        return response.json().get("value", [])

    def get_asset_details(self, guid: str) -> dict:
        """Get detailed information about an asset by its Atlas GUID.

        Raises:
            requests.HTTPError: on a non-2xx response.
        """
        response = requests.get(
            f"{self.base_url}/catalog/api/atlas/v2/entity/guid/{guid}",
            headers=self._auth_headers(),
            timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()

    def get_lineage(self, guid: str, direction: str = "BOTH", depth: int = 3) -> dict:
        """Get lineage for an asset.

        Args:
            guid: Atlas GUID of the asset.
            direction: 'INPUT', 'OUTPUT', or 'BOTH'.
            depth: How many hops of lineage to traverse.

        Raises:
            requests.HTTPError: on a non-2xx response.
        """
        response = requests.get(
            f"{self.base_url}/catalog/api/atlas/v2/lineage/{guid}",
            headers=self._auth_headers(),
            params={"direction": direction, "depth": depth},
            timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()
# Usage: connect to the Purview account configured above
client = PurviewClient("company-purview")
# Search for Fabric assets matching "sales", restricted to lakehouse objects,
# and print each hit's display name and fully qualified name
fabric_assets = client.search_assets("sales", filter_type="azure_fabric_lakehouse")
for asset in fabric_assets:
    print(f"Found: {asset['name']} ({asset['qualifiedName']})")
Business Glossary Integration
@dataclass
class GlossaryTerm:
    """A single business-glossary entry and its catalog associations."""

    name: str
    definition: str
    status: str  # 'Draft', 'Approved', 'Expired'
    # Contact ids / emails of the term owners
    owners: List[str]
    # Names of semantically related glossary terms
    related_terms: List[str]
    # Identifiers of catalog assets this term is linked to
    assigned_assets: List[str]
class BusinessGlossary:
    """Maintains business-glossary terms and their links to catalog assets."""

    def __init__(self, purview_client: PurviewClient):
        self.client = purview_client
        # Local registry of terms created through this instance, keyed by name.
        self.terms: Dict[str, GlossaryTerm] = {}

    def create_term(self, term: GlossaryTerm) -> str:
        """Create a new glossary term."""
        owner_contacts = [{"id": owner_id} for owner_id in term.owners]
        payload = {
            "name": term.name,
            "qualifiedName": f"glossary@{term.name}",
            "longDescription": term.definition,
            "status": term.status,
            "contacts": {"Owner": owner_contacts},
        }
        # API call to create term
        # response = self.client.create_glossary_term(payload)
        self.terms[term.name] = term
        return term.name

    def assign_to_asset(self, term_name: str, asset_guid: str):
        """Assign a glossary term to an asset."""
        # This creates a link between the term and asset
        # Enables semantic search and understanding
        pass

    def search_by_term(self, term_name: str) -> List[dict]:
        """Find all assets associated with a glossary term."""
        return self.client.search_assets(f"glossaryTerm:{term_name}")
# Create business glossary terms: two approved finance/analytics terms with
# owners, related terms, and the catalog assets they annotate
glossary = BusinessGlossary(client)
glossary.create_term(GlossaryTerm(
    name="Customer Lifetime Value",
    definition="The total revenue expected from a customer over the entire relationship",
    status="Approved",
    owners=["data.steward@company.com"],
    related_terms=["Customer", "Revenue", "Churn Rate"],
    assigned_assets=["semantic-model-sales", "report-clv-analysis"]
))
glossary.create_term(GlossaryTerm(
    name="Monthly Recurring Revenue",
    definition="Predictable revenue normalized to a monthly amount",
    status="Approved",
    owners=["finance.analyst@company.com"],
    related_terms=["Revenue", "Subscription", "ARR"],
    assigned_assets=["lh-financial-metrics", "wh-revenue-facts"]
))
Data Classification and Sensitivity
class DataClassificationService:
    """Automatic data classification using Purview.

    Scans table schemas and flags columns whose names suggest sensitive
    content (PII, financial, health, credentials).
    """

    # Purview classification groups -> individual classification rule names.
    CLASSIFICATION_TYPES = {
        "PII": ["PERSON_NAME", "EMAIL", "PHONE_NUMBER", "ADDRESS", "SSN"],
        "Financial": ["CREDIT_CARD", "BANK_ACCOUNT", "FINANCIAL_AMOUNT"],
        "Health": ["MEDICAL_RECORD", "HEALTH_INSURANCE_ID"],
        "Credentials": ["PASSWORD", "API_KEY", "CONNECTION_STRING"]
    }

    # Column-name keyword -> classification mapping, checked in insertion
    # order. Hoisted to a class constant so the dict is not rebuilt on every
    # _detect_classification call.
    _NAME_PATTERNS = {
        "EMAIL": ["email", "mail"],
        "PHONE_NUMBER": ["phone", "mobile", "cell"],
        "PERSON_NAME": ["name", "first_name", "last_name"],
        "SSN": ["ssn", "social_security"],
        "CREDIT_CARD": ["card_number", "cc_num", "credit_card"],
        "ADDRESS": ["address", "street", "city", "zip", "postal"]
    }

    # Forward reference so this class does not depend on definition order.
    def __init__(self, purview_client: "PurviewClient"):
        self.client = purview_client

    def scan_table(self, table_guid: str) -> dict:
        """Scan a table for sensitive data.

        Args:
            table_guid: Atlas GUID of the table asset.

        Returns:
            Dict with the table guid, how many columns were scanned, and a
            column-name -> classification mapping of the hits.
        """
        # Get table schema from the catalog entity attributes.
        asset = self.client.get_asset_details(table_guid)
        columns = asset.get("entity", {}).get("attributes", {}).get("columns", [])
        classifications = {}
        for column in columns:
            col_name = column.get("name", "")
            col_type = column.get("dataType", "")
            # Check for auto-classifications based on the column name.
            detected = self._detect_classification(col_name, col_type)
            if detected:
                classifications[col_name] = detected
        return {
            "table": table_guid,
            "columns_scanned": len(columns),
            "classifications_found": classifications
        }

    def _detect_classification(self, col_name: str, col_type: str) -> Optional[str]:
        """Detect classification based on column name patterns.

        Returns the first classification whose keyword appears in the
        (case-insensitive) column name, or None. `col_type` is currently
        unused; it is kept for signature stability and future type-based
        rules.
        """
        col_lower = col_name.lower()
        for classification, keywords in self._NAME_PATTERNS.items():
            if any(kw in col_lower for kw in keywords):
                return classification
        return None

    def apply_sensitivity_label(self, asset_guid: str, label: str):
        """Apply a sensitivity label to an asset."""
        # Purview API call to apply label
        # This propagates to Fabric and enforces policies
        pass
# Usage
classifier = DataClassificationService(client)
results = classifier.scan_table("lakehouse-customers-guid")
print(f"Found {len(results['classifications_found'])} sensitive columns")
Data Quality Rules in Purview
@dataclass
class DataQualityRule:
    """Declarative data-quality rule, matched to assets by name pattern."""

    name: str
    description: str
    asset_pattern: str  # Regex for matching assets
    rule_type: str  # 'completeness', 'uniqueness', 'validity', 'timeliness'
    # Minimum passing score (0..1); scores below this fail the rule
    threshold: float
    # Whether a failed evaluation should trigger an alert
    alert_on_failure: bool
class PurviewDataQuality:
    """Registers data-quality rules and evaluates them against asset metrics."""

    def __init__(self):
        # All registered rules; matched to assets by regex at evaluation time.
        # Forward reference keeps this class independent of definition order.
        self.rules: List["DataQualityRule"] = []

    def add_rule(self, rule: "DataQualityRule"):
        """Register a rule for later evaluation."""
        self.rules.append(rule)

    def evaluate_asset(self, asset_name: str, metrics: dict) -> List[dict]:
        """Evaluate an asset against applicable rules.

        Args:
            asset_name: Name matched against each rule's regex pattern.
            metrics: Maps rule_type (e.g. 'completeness') to a score; a
                missing metric scores 0 and fails any positive threshold.

        Returns:
            One result dict per applicable rule with the score, threshold,
            and pass/fail outcome.
        """
        # Hoisted out of the loop: the import previously ran once per rule.
        import re

        results = []
        for rule in self.rules:
            # Skip rules whose asset pattern does not match this asset.
            if not re.match(rule.asset_pattern, asset_name):
                continue
            score = metrics.get(rule.rule_type, 0)
            passed = score >= rule.threshold
            results.append({
                "rule": rule.name,
                "type": rule.rule_type,
                "score": score,
                "threshold": rule.threshold,
                "passed": passed
            })
        return results
# Define enterprise data quality rules: customer emails must be at least 95%
# complete, and order IDs must be fully unique; both alert on failure
dq = PurviewDataQuality()
dq.add_rule(DataQualityRule(
    name="Customer Email Completeness",
    description="Customer email should be 95% complete",
    asset_pattern=".*customer.*",
    rule_type="completeness",
    threshold=0.95,
    alert_on_failure=True
))
dq.add_rule(DataQualityRule(
    name="Order ID Uniqueness",
    description="Order IDs must be 100% unique",
    asset_pattern=".*order.*",
    rule_type="uniqueness",
    threshold=1.0,
    alert_on_failure=True
))
Best Practices
- Enable automatic scanning for all Fabric workspaces
- Define glossary terms before building reports
- Apply sensitivity labels based on classification results
- Monitor data quality regularly and configure alerts on rule failures
- Use lineage for compliance documentation
Tomorrow, we’ll explore Copilot in Fabric and how AI enhances your analytics workflow!