
Knowledge Store in Azure Cognitive Search

Knowledge Store persists AI-enriched content from Cognitive Search skillsets to Azure Storage. This enables downstream analytics, further processing, and building knowledge graphs.

Understanding Knowledge Store

Knowledge Store allows you to:

  • Persist enriched data to blob storage or tables
  • Create projections of your data in different shapes
  • Build knowledge graphs from extracted entities
  • Enable analytics on enriched content
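In the REST API, the knowledge store is simply a property of the skillset definition. A minimal sketch of that shape, expressed here as a Python dict with placeholder connection string, table, and container names:

# REST-style "knowledgeStore" property of a skillset, shown as a Python dict.
# The connection string and all table/container names are placeholders.
knowledge_store_shape = {
    "storageConnectionString": "DefaultEndpointsProtocol=https;AccountName=<account>;AccountKey=<key>;",
    "projections": [
        {
            "tables": [
                {"tableName": "Documents", "generatedKeyName": "DocumentKey", "source": "/document"}
            ],
            "objects": [
                {"storageContainer": "enriched-documents", "source": "/document/enriched"}
            ],
            "files": [
                {"storageContainer": "extracted-images", "source": "/document/normalized_images/*"}
            ]
        }
    ]
}

The Python SDK mirrors this shape with the model classes used in the next section.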

Creating a Knowledge Store

from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataSourceConnection,
    SearchIndexerDataContainer,
    SearchIndexer,
    SearchIndexerSkillset,
    EntityRecognitionSkill,
    KeyPhraseExtractionSkill,
    SentimentSkill,
    OcrSkill,
    ImageAnalysisSkill,
    SearchIndexerKnowledgeStore,
    SearchIndexerKnowledgeStoreProjection,
    SearchIndexerKnowledgeStoreTableProjectionSelector,
    SearchIndexerKnowledgeStoreObjectProjectionSelector,
    SearchIndexerKnowledgeStoreFileProjectionSelector
)
from azure.core.credentials import AzureKeyCredential

endpoint = "https://mysearchservice.search.windows.net"
credential = AzureKeyCredential("your-admin-key")

indexer_client = SearchIndexerClient(endpoint=endpoint, credential=credential)

# Define the knowledge store (SDK model names per azure-search-documents)
knowledge_store = SearchIndexerKnowledgeStore(
    storage_connection_string="DefaultEndpointsProtocol=https;AccountName=...",
    projections=[
        SearchIndexerKnowledgeStoreProjection(
            # Table projections
            tables=[
                SearchIndexerKnowledgeStoreTableProjectionSelector(
                    table_name="Documents",
                    reference_key_name="DocumentId",
                    generated_key_name="DocumentKey",
                    source="/document"
                ),
                SearchIndexerKnowledgeStoreTableProjectionSelector(
                    table_name="Entities",
                    reference_key_name="DocumentId",
                    generated_key_name="EntityKey",
                    # Assumes the skillset writes a combined entity collection to
                    # /document/entities (see the entity skill's output mapping below)
                    source="/document/entities/*"
                ),
                SearchIndexerKnowledgeStoreTableProjectionSelector(
                    table_name="KeyPhrases",
                    reference_key_name="DocumentId",
                    generated_key_name="PhraseKey",
                    source="/document/keyPhrases/*"
                )
            ],
            # Object (blob) projections
            objects=[
                SearchIndexerKnowledgeStoreObjectProjectionSelector(
                    storage_container="enriched-documents",
                    generated_key_name="EnrichedDocKey",
                    source="/document/enriched"
                )
            ],
            # File projections (for extracted images)
            files=[
                SearchIndexerKnowledgeStoreFileProjectionSelector(
                    storage_container="extracted-images",
                    generated_key_name="ImageKey",
                    source="/document/normalized_images/*"
                )
            ]
        )
    ]
)

Skillset with Knowledge Store

from azure.search.documents.indexes.models import (
    SearchIndexerSkillset,
    EntityRecognitionSkill,
    KeyPhraseExtractionSkill,
    SentimentSkill,
    ShaperSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry
)

# Define skills
entity_skill = EntityRecognitionSkill(
    name="entity-recognition",
    description="Extract entities from content",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/content")
    ],
    outputs=[
        OutputFieldMappingEntry(name="persons", target_name="persons"),
        OutputFieldMappingEntry(name="organizations", target_name="organizations"),
        OutputFieldMappingEntry(name="locations", target_name="locations"),
        # Combined entity list consumed by the "Entities" table projection
        # (assumes the V3 entity recognition skill, which exposes namedEntities)
        OutputFieldMappingEntry(name="namedEntities", target_name="entities")
    ],
    categories=["Person", "Organization", "Location"]
)

keyphrase_skill = KeyPhraseExtractionSkill(
    name="keyphrase-extraction",
    description="Extract key phrases",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/content")
    ],
    outputs=[
        OutputFieldMappingEntry(name="keyPhrases", target_name="keyPhrases")
    ]
)

sentiment_skill = SentimentSkill(
    name="sentiment-analysis",
    description="Analyze sentiment",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/content")
    ],
    outputs=[
        OutputFieldMappingEntry(name="sentiment", target_name="sentiment")
    ]
)

# Shaper skill to structure output for knowledge store
shaper_skill = ShaperSkill(
    name="document-shaper",
    description="Shape document for knowledge store",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="content", source="/document/content"),
        InputFieldMappingEntry(name="persons", source="/document/persons"),
        InputFieldMappingEntry(name="organizations", source="/document/organizations"),
        InputFieldMappingEntry(name="locations", source="/document/locations"),
        InputFieldMappingEntry(name="keyPhrases", source="/document/keyPhrases"),
        InputFieldMappingEntry(name="sentiment", source="/document/sentiment")
    ],
    outputs=[
        OutputFieldMappingEntry(name="output", target_name="enriched")
    ]
)

# Create skillset with knowledge store
skillset = SearchIndexerSkillset(
    name="enrichment-skillset",
    description="Skillset with knowledge store",
    skills=[entity_skill, keyphrase_skill, sentiment_skill, shaper_skill],
    knowledge_store=knowledge_store
)

indexer_client.create_or_update_skillset(skillset)
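
The skillset only runs when an indexer executes it against a data source and a target index. A minimal sketch, assuming a blob container named source-documents and an existing index named documents-index (both placeholder names):

from azure.search.documents.indexes.models import (
    SearchIndexerDataSourceConnection,
    SearchIndexerDataContainer,
    SearchIndexer
)

# Data source pointing at the blobs to enrich (names are placeholders)
data_source = SearchIndexerDataSourceConnection(
    name="documents-datasource",
    type="azureblob",
    connection_string="DefaultEndpointsProtocol=https;AccountName=...",
    container=SearchIndexerDataContainer(name="source-documents")
)
indexer_client.create_or_update_data_source_connection(data_source)

# Indexer that runs the skillset, populating both the index and the knowledge store
indexer = SearchIndexer(
    name="enrichment-indexer",
    data_source_name="documents-datasource",
    target_index_name="documents-index",
    skillset_name="enrichment-skillset"
)
indexer_client.create_or_update_indexer(indexer)

Once the indexer run completes, the projections appear in the storage account configured on the knowledge store.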

Querying Knowledge Store Data

import json

from azure.data.tables import TableServiceClient
from azure.storage.blob import BlobServiceClient

# Connection string of the storage account the knowledge store writes to
storage_connection_string = "DefaultEndpointsProtocol=https;AccountName=..."

# Query table projections
table_service = TableServiceClient.from_connection_string(storage_connection_string)
documents_table = table_service.get_table_client("Documents")
entities_table = table_service.get_table_client("Entities")

# Get all documents
documents = list(documents_table.list_entities())
print(f"Total documents: {len(documents)}")

# Get entities for a specific document
doc_id = documents[0]["DocumentId"]
entities = entities_table.query_entities(f"DocumentId eq '{doc_id}'")
for entity in entities:
    print(f"Entity: {entity['text']} ({entity['category']})")

# Query blob projections
blob_service = BlobServiceClient.from_connection_string(storage_connection_string)
container = blob_service.get_container_client("enriched-documents")

for blob in container.list_blobs():
    blob_client = container.get_blob_client(blob)
    content = blob_client.download_blob().readall()
    enriched_doc = json.loads(content)
    print(f"Document: {enriched_doc.get('title', 'Untitled')}")
    print(f"  Entities: {len(enriched_doc.get('entities', []))}")
    print(f"  Key Phrases: {enriched_doc.get('keyPhrases', [])[:5]}")

Building a Knowledge Graph

import networkx as nx
import matplotlib.pyplot as plt

def build_knowledge_graph(table_service):
    """Build a knowledge graph from knowledge store data"""
    G = nx.Graph()

    # Get all documents and entities
    documents_table = table_service.get_table_client("Documents")
    entities_table = table_service.get_table_client("Entities")

    documents = list(documents_table.list_entities())

    for doc in documents:
        doc_id = doc["DocumentId"]

        # Add document node
        G.add_node(doc_id, type="document", title=doc.get("title", ""))

        # Get entities for this document
        entities = list(entities_table.query_entities(f"DocumentId eq '{doc_id}'"))

        for entity in entities:
            entity_name = entity.get("text", "")
            entity_type = entity.get("category", "unknown")

            # Add entity node
            G.add_node(entity_name, type=entity_type)

            # Add edge between document and entity
            G.add_edge(doc_id, entity_name, relationship="mentions")

    return G

# Build and visualize
graph = build_knowledge_graph(table_service)

# Visualize
plt.figure(figsize=(15, 10))
pos = nx.spring_layout(graph)

# Color nodes by type
colors = []
for node in graph.nodes():
    node_type = graph.nodes[node].get("type", "unknown")
    if node_type == "document":
        colors.append("lightblue")
    elif node_type == "Person":
        colors.append("lightgreen")
    elif node_type == "Organization":
        colors.append("lightyellow")
    else:
        colors.append("lightgray")

nx.draw(graph, pos, node_color=colors, with_labels=True, font_size=8)
plt.savefig("knowledge_graph.png")
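
With the graph in place, standard network analysis applies. A quick sketch that ranks entities by how many documents mention them:

# Rank entity nodes by the number of documents they connect to
entity_nodes = [n for n, attrs in graph.nodes(data=True) if attrs.get("type") != "document"]
top_entities = sorted(((graph.degree(n), n) for n in entity_nodes), reverse=True)[:10]

for degree, name in top_entities:
    print(f"{name}: mentioned in {degree} documents")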

Analytics on Enriched Data

import pandas as pd
from collections import Counter

def analyze_knowledge_store(table_service):
    """Analyze enriched content from knowledge store"""

    # Load data
    entities_df = pd.DataFrame(list(
        table_service.get_table_client("Entities").list_entities()
    ))

    keyphrases_df = pd.DataFrame(list(
        table_service.get_table_client("KeyPhrases").list_entities()
    ))

    # Entity analysis
    print("=== Entity Analysis ===")
    entity_counts = entities_df["category"].value_counts()
    print(f"Entity types: {dict(entity_counts)}")

    # Top mentioned people
    people = entities_df[entities_df["category"] == "Person"]["text"].value_counts()
    print(f"\nTop mentioned people: {dict(people.head(10))}")

    # Top organizations
    orgs = entities_df[entities_df["category"] == "Organization"]["text"].value_counts()
    print(f"\nTop organizations: {dict(orgs.head(10))}")

    # Key phrase analysis
    print("\n=== Key Phrase Analysis ===")
    all_phrases = keyphrases_df["text"].value_counts()
    print(f"Top key phrases: {dict(all_phrases.head(20))}")

    return {
        "entities": entities_df,
        "keyphrases": keyphrases_df,
        "entity_counts": entity_counts,
        "top_phrases": all_phrases.head(20)
    }

# Run analysis
analysis = analyze_knowledge_store(table_service)
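
The table projections are also a natural source for Power BI or a lakehouse; a minimal sketch that persists the same DataFrames as CSV for downstream tools:

# Persist the enriched tables for downstream analytics
analysis["entities"].to_csv("entities.csv", index=False)
analysis["keyphrases"].to_csv("keyphrases.csv", index=False)
analysis["entity_counts"].to_csv("entity_counts.csv")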

Knowledge Store enables rich analytics and knowledge graph construction from AI-enriched content.

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.