5 min read
External Tables in Azure Data Explorer
External Tables in Azure Data Explorer
External tables allow you to query data stored in Azure Blob Storage or Data Lake without ingesting it into ADX. This enables cost-effective analysis of archived data and integration with data lake architectures.
Types of External Tables
ADX supports external tables pointing to:
- Azure Blob Storage
- Azure Data Lake Storage Gen2
- Azure Data Lake Storage Gen1
- SQL Server (external SQL tables)
Creating External Tables
Azure Blob Storage
// Create external table on Blob Storage
.create external table ArchivedLogs (
TimeGenerated: datetime,
Computer: string,
ContainerID: string,
LogEntry: string,
Namespace: string,
PodName: string
)
kind=storage
dataformat=parquet
(
h@'https://mystorageaccount.blob.core.windows.net/logs;impersonate'
)
With Partitioning
// Create partitioned external table
.create external table PartitionedLogs (
TimeGenerated: datetime,
Computer: string,
ContainerID: string,
LogEntry: string,
Namespace: string
)
kind=storage
partition by (Year: datetime = startofyear(TimeGenerated), Month: datetime = startofmonth(TimeGenerated))
pathformat = ("year=" datetime_pattern("yyyy", Year) "/month=" datetime_pattern("MM", Month))
dataformat=parquet
(
h@'https://mystorageaccount.blob.core.windows.net/logs;impersonate'
)
Azure Data Lake Gen2
// Create external table on ADLS Gen2
.create external table DataLakeLogs (
TimeGenerated: datetime,
EventType: string,
Message: string,
Properties: dynamic
)
kind=storage
dataformat=json
(
h@'abfss://container@storageaccount.dfs.core.windows.net/path;impersonate'
)
Authentication Methods
Managed Identity
// Using managed identity (recommended)
.create external table ManagedIdentityTable (
Column1: string,
Column2: int
)
kind=storage
dataformat=csv
(
h@'https://storage.blob.core.windows.net/container;impersonate'
)
SAS Token
// Using SAS token
.create external table SASTable (
Column1: string,
Column2: int
)
kind=storage
dataformat=csv
(
h@'https://storage.blob.core.windows.net/container?sv=2020-08-04&ss=b&srt=co&sp=rl...'
)
Storage Account Key (via secrets)
// Store secret first
.set stored_credential StorageKey with (type="storage", password="<account-key>")
// Use in external table
.create external table KeyAuthTable (
Column1: string
)
kind=storage
dataformat=csv
(
h@'https://storage.blob.core.windows.net/container;StorageKey'
)
Supported Data Formats
CSV Format
.create external table CsvTable (
Id: int,
Name: string,
Value: real
)
kind=storage
dataformat=csv
with (ignoreFirstRecord=true)
(
h@'https://storage.blob.core.windows.net/data;impersonate'
)
JSON Format
.create external table JsonTable (
TimeGenerated: datetime,
Level: string,
Message: string,
Properties: dynamic
)
kind=storage
dataformat=multijson
(
h@'https://storage.blob.core.windows.net/json-data;impersonate'
)
Parquet Format
.create external table ParquetTable (
Id: long,
Timestamp: datetime,
Value: real,
Tags: dynamic
)
kind=storage
dataformat=parquet
(
h@'https://storage.blob.core.windows.net/parquet-data;impersonate'
)
Querying External Tables
Basic Query
// Query external table
external_table('ArchivedLogs')
| where TimeGenerated > datetime(2021-01-01)
| where LogEntry contains "error"
| take 100
With Partition Pruning
// Efficient query using partitions
external_table('PartitionedLogs')
| where Year == datetime(2021-01-01) and Month == datetime(2021-06-01)
| summarize count() by Namespace
Joining with Internal Tables
// Join archived and current data
let archived = external_table('ArchivedLogs')
| where TimeGenerated between (datetime(2021-01-01) .. datetime(2021-06-30));
let current = ContainerLogs
| where TimeGenerated > datetime(2021-07-01);
union archived, current
| summarize count() by bin(TimeGenerated, 1d)
| render timechart
External SQL Tables
Query SQL databases directly from ADX:
// Create external SQL table
.create external table SqlOrders (
OrderId: int,
CustomerId: int,
OrderDate: datetime,
TotalAmount: real
)
kind=sql
table=Orders
(
h@'Server=myserver.database.windows.net;Database=mydb;Authentication=Active Directory Integrated'
)
Query SQL External Table
// Query SQL data
external_table('SqlOrders')
| where OrderDate > ago(30d)
| summarize TotalRevenue = sum(TotalAmount) by CustomerId
| top 10 by TotalRevenue
Virtual Columns
Add computed columns to external tables:
// Create external table with virtual column
.create external table LogsWithVirtual (
TimeGenerated: datetime,
LogEntry: string,
['$filepath']: string // Virtual column for file path
)
kind=storage
dataformat=csv
(
h@'https://storage.blob.core.windows.net/logs;impersonate'
)
// Use virtual column in queries
external_table('LogsWithVirtual')
| extend FileName = ['$filepath']
| summarize count() by FileName
Schema Mapping
Define custom mappings for complex data:
// Create mapping
.create external table LogsWithMapping ingestion json mapping 'LogMapping'
[
{"column": "TimeGenerated", "path": "$.timestamp", "transform": "DateTimeFromUnixSeconds"},
{"column": "Level", "path": "$.log_level"},
{"column": "Message", "path": "$.message"},
{"column": "Properties", "path": "$", "transform": "SourceLineNumber"}
]
Managing External Tables
Show External Tables
// List all external tables
.show external tables
// Show specific table details
.show external table ArchivedLogs
// Show table schema
.show external table ArchivedLogs schema
Alter External Tables
// Add storage path
.alter external table ArchivedLogs add_storage_path h@'https://newstorage.blob.core.windows.net/logs;impersonate'
// Drop external table
.drop external table ArchivedLogs
Refresh External Table
// Refresh artifacts (discover new files)
.refresh external table ArchivedLogs
Performance Optimization
Use Partitioning
// Well-partitioned query is much faster
external_table('PartitionedLogs')
| where Year == datetime(2021-01-01) // Partition pruning
| where TimeGenerated > ago(7d)
| take 1000
Filter Early
// Push filters as early as possible
external_table('ArchivedLogs')
| where TimeGenerated > datetime(2021-06-01) // Filter first
| where Namespace == "production"
| where LogEntry contains "error"
Project Needed Columns
// Select only needed columns
external_table('ArchivedLogs')
| project TimeGenerated, LogEntry // Don't select unnecessary columns
| take 1000
Hybrid Query Pattern
Combine hot (ADX) and cold (external) data seamlessly:
// Function to query both hot and cold data
.create-or-alter function GetAllLogs(startTime: datetime, endTime: datetime) {
let hotCutoff = ago(30d);
let coldData = external_table('ArchivedLogs')
| where TimeGenerated between (startTime .. min_of(endTime, hotCutoff));
let hotData = ContainerLogs
| where TimeGenerated between (max_of(startTime, hotCutoff) .. endTime);
union coldData, hotData
}
// Use the function
GetAllLogs(datetime(2021-01-01), now())
| summarize count() by bin(TimeGenerated, 1d)
Best Practices
- Use Parquet format - Best performance for external queries
- Partition wisely - Enable partition pruning
- Use managed identity - Avoid key management
- Filter early - Reduce data scanned
- Project columns - Don’t read unnecessary data
- Cache appropriately - Use ADX caching for frequent queries
Conclusion
External tables extend ADX’s reach to data lakes and archives without ingestion costs. By combining internal and external tables, you can build efficient hybrid analytics solutions.
Tomorrow, we’ll shift focus to Azure Synapse serverless SQL pools for big data analytics.