Azure Data Lake Storage Gen2: Hierarchical Namespace for Big Data Analytics
Azure Data Lake Storage Gen2 combines the scalability of blob storage with the hierarchical file system capabilities required for big data analytics. The hierarchical namespace (HNS) enables efficient data organization and atomic directory operations.
Enabling Hierarchical Namespace
# Create storage account with HNS enabled
az storage account create \
--name mydatalakeaccount \
--resource-group myResourceGroup \
--location eastus \
--sku Standard_LRS \
--kind StorageV2 \
--enable-hierarchical-namespace true
# Enable HNS at creation time whenever possible; upgrading an
# existing account to HNS is a one-way migration that cannot be reversed
Using Terraform:
resource "azurerm_storage_account" "datalake" {
name = "mydatalakeaccount"
resource_group_name = azurerm_resource_group.main.name
location = azurerm_resource_group.main.location
account_tier = "Standard"
account_replication_type = "LRS"
account_kind = "StorageV2"
is_hns_enabled = true
blob_properties {
versioning_enabled = true
delete_retention_policy {
days = 30
}
container_delete_retention_policy {
days = 30
}
}
}
resource "azurerm_storage_data_lake_gen2_filesystem" "main" {
name = "analytics"
storage_account_id = azurerm_storage_account.datalake.id
}
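Whichever method you use, it is worth confirming that the account actually has a hierarchical namespace before building on it. A minimal sketch using the azure-mgmt-storage package (the subscription, resource group, and account names are placeholders):
# Python - verify that hierarchical namespace is enabled
from azure.identity import DefaultAzureCredential
from azure.mgmt.storage import StorageManagementClient
def is_hns_enabled(subscription_id, resource_group, account_name):
    """Return True if the storage account has a hierarchical namespace."""
    client = StorageManagementClient(DefaultAzureCredential(), subscription_id)
    account = client.storage_accounts.get_properties(resource_group, account_name)
    return bool(account.is_hns_enabled)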
Working with Data Lake Gen2 SDK
# Python - Azure Data Lake Storage Gen2 operations
from azure.storage.filedatalake import DataLakeServiceClient
from azure.identity import DefaultAzureCredential
class DataLakeManager:
def __init__(self, account_name):
self.account_url = f"https://{account_name}.dfs.core.windows.net"
self.credential = DefaultAzureCredential()
self.service_client = DataLakeServiceClient(
account_url=self.account_url,
credential=self.credential
)
def create_file_system(self, name, metadata=None):
"""Create a file system (container)"""
file_system_client = self.service_client.create_file_system(
file_system=name,
metadata=metadata or {}
)
return file_system_client
def create_directory(self, file_system, path):
"""Create a directory with all parent directories"""
fs_client = self.service_client.get_file_system_client(file_system)
directory_client = fs_client.create_directory(path)
return directory_client
def upload_file(self, file_system, remote_path, local_path):
"""Upload a file to Data Lake"""
fs_client = self.service_client.get_file_system_client(file_system)
file_client = fs_client.get_file_client(remote_path)
with open(local_path, 'rb') as f:
file_client.upload_data(f, overwrite=True)
return file_client.url
def upload_large_file(self, file_system, remote_path, local_path,
chunk_size=100*1024*1024):
"""Upload large file with chunked upload"""
fs_client = self.service_client.get_file_system_client(file_system)
file_client = fs_client.create_file(remote_path)
with open(local_path, 'rb') as f:
offset = 0
while True:
data = f.read(chunk_size)
if not data:
break
file_client.append_data(data, offset=offset, length=len(data))
offset += len(data)
file_client.flush_data(offset)
return file_client.url
def list_paths(self, file_system, path='/', recursive=True):
"""List all paths in a directory"""
fs_client = self.service_client.get_file_system_client(file_system)
paths = fs_client.get_paths(path=path, recursive=recursive)
return [
{
'name': p.name,
'is_directory': p.is_directory,
'size': p.content_length,
'last_modified': p.last_modified
}
for p in paths
]
    def rename_path(self, file_system, old_path, new_path):
        """Rename a file or directory (atomic operation with HNS)"""
        fs_client = self.service_client.get_file_system_client(file_system)
        # Heuristic: treat the path as a file if its last segment has an extension
        if '.' in old_path.split('/')[-1]:
            source_client = fs_client.get_file_client(old_path)
            # Rename is atomic with HNS; the target is "<file-system>/<new-path>"
            return source_client.rename_file(f"{file_system}/{new_path}")
        source_client = fs_client.get_directory_client(old_path)
        return source_client.rename_directory(f"{file_system}/{new_path}")
    def set_access_control(self, file_system, path, acl):
        """Set the ACL on a directory (use get_file_client for individual files)"""
        fs_client = self.service_client.get_file_system_client(file_system)
        directory_client = fs_client.get_directory_client(path)
        directory_client.set_access_control(acl=acl)
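The class above can be exercised end to end with a few calls. A quick usage sketch, where the account name, file system, paths, and local file are illustrative placeholders:
# Python - example usage of DataLakeManager
manager = DataLakeManager("mydatalakeaccount")
# Create a file system and a dated raw-data directory
manager.create_file_system("analytics")
manager.create_directory("analytics", "bronze/sales/orders/2024/01")
# Upload a local extract and list what landed
manager.upload_file("analytics", "bronze/sales/orders/2024/01/orders.csv", "./orders.csv")
for entry in manager.list_paths("analytics", path="bronze/sales"):
    print(entry["name"], entry["size"])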
Organizing Data Lake Structure
# Python - Best practice data lake organization
class DataLakeOrganizer:
def __init__(self, datalake_manager):
self.manager = datalake_manager
def create_medallion_architecture(self, file_system):
"""Create Bronze/Silver/Gold medallion structure"""
structure = [
# Bronze layer - raw data
'bronze/sales/orders',
'bronze/sales/customers',
'bronze/inventory/products',
'bronze/inventory/stock',
# Silver layer - cleansed data
'silver/sales/orders_clean',
'silver/sales/customers_clean',
'silver/inventory/products_clean',
# Gold layer - aggregated data
'gold/sales/daily_summary',
'gold/sales/customer_analytics',
'gold/inventory/stock_alerts',
# Supporting directories
'schemas',
'checkpoints',
'temp'
]
for path in structure:
self.manager.create_directory(file_system, path)
return structure
def create_time_partitioned_path(self, file_system, base_path,
year, month, day=None):
"""Create time-partitioned directory structure"""
if day:
path = f"{base_path}/year={year}/month={month:02d}/day={day:02d}"
else:
path = f"{base_path}/year={year}/month={month:02d}"
self.manager.create_directory(file_system, path)
return path
def organize_by_partition(self, file_system, base_path, partitions):
"""Create partition-based organization"""
for partition in partitions:
partition_path = '/'.join(
f"{k}={v}" for k, v in partition.items()
)
full_path = f"{base_path}/{partition_path}"
self.manager.create_directory(file_system, full_path)
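The organizer can then be paired with the manager above to lay out the medallion skeleton and a few Hive-style partitions; a short sketch, with the file system and partition values as placeholders:
# Python - example usage of DataLakeOrganizer
organizer = DataLakeOrganizer(DataLakeManager("mydatalakeaccount"))
# Bronze/Silver/Gold skeleton plus supporting directories
organizer.create_medallion_architecture("analytics")
# Daily landing path: bronze/sales/orders/year=2024/month=01/day=15
organizer.create_time_partitioned_path("analytics", "bronze/sales/orders", 2024, 1, 15)
# Hive-style partitions, e.g. gold/sales/daily_summary/region=eu/channel=web
organizer.organize_by_partition(
    "analytics",
    "gold/sales/daily_summary",
    [{"region": "eu", "channel": "web"}, {"region": "us", "channel": "store"}],
)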
ACL Management
// C# - Managing Access Control Lists
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Azure.Storage.Files.DataLake;
using Azure.Storage.Files.DataLake.Models;
public class ACLManager
{
    private readonly DataLakeServiceClient _serviceClient;

    public ACLManager(DataLakeServiceClient serviceClient)
    {
        _serviceClient = serviceClient;
    }
public async Task SetDirectoryACLAsync(
string fileSystem,
string path,
string objectId,
RolePermissions permissions)
{
var fsClient = _serviceClient.GetFileSystemClient(fileSystem);
var directoryClient = fsClient.GetDirectoryClient(path);
// Get current ACL
var accessControl = await directoryClient.GetAccessControlAsync();
var acl = accessControl.Value.AccessControlList.ToList();
// Add new ACL entry
var newEntry = new PathAccessControlItem(
AccessControlType.User,
permissions,
false,
objectId);
acl.Add(newEntry);
await directoryClient.SetAccessControlListAsync(acl);
}
public async Task SetRecursiveACLAsync(
string fileSystem,
string path,
IList<PathAccessControlItem> acl)
{
var fsClient = _serviceClient.GetFileSystemClient(fileSystem);
var directoryClient = fsClient.GetDirectoryClient(path);
// Set ACL recursively on all files and subdirectories
await directoryClient.SetAccessControlRecursiveAsync(acl);
}
    public async Task<PathAccessControl> GetEffectivePermissionsAsync(
        string fileSystem,
        string path)
    {
        var fsClient = _serviceClient.GetFileSystemClient(fileSystem);
        var directoryClient = fsClient.GetDirectoryClient(path);
        // PathAccessControl exposes Owner, Group, Permissions, and the ACL entries
        var response = await directoryClient.GetAccessControlAsync();
        return response.Value;
    }
}
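The same recursive ACL operations are exposed by the Python SDK. A minimal sketch that merges a read/execute entry for a principal into a directory tree (the object ID and paths are placeholders, and the helper name is hypothetical):
# Python - recursive ACL update on a directory tree
def grant_read_execute_recursive(service_client, file_system, path, object_id):
    """Merge a read/execute ACL entry for a principal into a directory and its children."""
    fs_client = service_client.get_file_system_client(file_system)
    directory_client = fs_client.get_directory_client(path)
    # POSIX-style ACL entry: type:id:permissions
    acl_entry = f"user:{object_id}:r-x"
    result = directory_client.update_access_control_recursive(acl=acl_entry)
    print(f"Directories updated: {result.counters.directories_successful}, "
          f"files updated: {result.counters.files_successful}")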
Integration with Analytics Services
# Python - Using Data Lake with Spark (Azure Databricks/Synapse)
from pyspark.sql import SparkSession
def configure_spark_for_adls():
"""Configure Spark to access ADLS Gen2"""
spark = SparkSession.builder \
.appName("DataLakeAnalytics") \
.config("spark.hadoop.fs.azure.account.auth.type", "OAuth") \
.config("spark.hadoop.fs.azure.account.oauth.provider.type",
"org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider") \
.config("spark.hadoop.fs.azure.account.oauth2.client.id", "<client-id>") \
.config("spark.hadoop.fs.azure.account.oauth2.client.secret", "<secret>") \
.config("spark.hadoop.fs.azure.account.oauth2.client.endpoint",
"https://login.microsoftonline.com/<tenant-id>/oauth2/token") \
.getOrCreate()
return spark
# Read from Data Lake
def read_parquet_from_datalake(spark, account_name, file_system, path):
"""Read parquet files from ADLS Gen2"""
url = f"abfss://{file_system}@{account_name}.dfs.core.windows.net/{path}"
return spark.read.parquet(url)
# Write to Data Lake with partitioning
def write_partitioned_data(df, account_name, file_system, path,
partition_cols):
"""Write DataFrame with partitioning"""
url = f"abfss://{file_system}@{account_name}.dfs.core.windows.net/{path}"
df.write \
.partitionBy(*partition_cols) \
.mode("overwrite") \
.parquet(url)
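Putting the helpers together, a typical bronze-to-gold aggregation job might look like the following sketch (the account, file system, and column names are illustrative):
# Python - example bronze-to-gold aggregation job
spark = configure_spark_for_adls()
# Read raw orders from the bronze layer
orders = read_parquet_from_datalake(
    spark, "mydatalakeaccount", "analytics", "bronze/sales/orders")
# Aggregate into a daily summary for the gold layer
daily_summary = (
    orders.groupBy("order_date", "region")
          .sum("order_total")
          .withColumnRenamed("sum(order_total)", "revenue")
)
# Write the result partitioned by date and region
write_partitioned_data(
    daily_summary, "mydatalakeaccount", "analytics",
    "gold/sales/daily_summary", ["order_date", "region"])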
Best Practices
- Enable HNS at creation: Upgrading an existing account later is a one-way migration
- Use medallion architecture: Bronze/Silver/Gold layers
- Partition wisely: By date, region, or other common filters
- Implement ACLs: For fine-grained access control
- Monitor storage metrics: Track capacity and transactions (see the sketch after this list)
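For the last point, a minimal sketch of pulling capacity and transaction metrics with the azure-monitor-query package (the storage account resource ID is a placeholder):
# Python - query storage account metrics
from datetime import timedelta
from azure.identity import DefaultAzureCredential
from azure.monitor.query import MetricsQueryClient
def print_storage_metrics(resource_id):
    """Print hourly capacity and transaction metrics for the last day."""
    client = MetricsQueryClient(DefaultAzureCredential())
    response = client.query_resource(
        resource_id,
        metric_names=["UsedCapacity", "Transactions"],
        timespan=timedelta(days=1),
        granularity=timedelta(hours=1),
    )
    for metric in response.metrics:
        for series in metric.timeseries:
            for point in series.data:
                print(metric.name, point.timestamp, point.average, point.total)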
Azure Data Lake Storage Gen2 provides the foundation for modern data analytics, combining the best of blob storage scalability with the directory semantics required for efficient big data processing.