August 27, 2022 1 min read

Working with Projections in Azure Cognitive Search

Tags: Azure Cognitive Search, Projections, Knowledge Store, Data Shaping

Projections define how enriched data is shaped and stored in a knowledge store. Understanding the projection types and the patterns for using them is key to building effective knowledge mining solutions.

Projection Types

Azure Cognitive Search supports three projection types:

Table Projections: Store data in Azure Table Storage
Object Projections: Store JSON documents in Blob Storage
File Projections: Store binary files (images) in Blob Storage

Table Projections

from azure.search.documents.indexes.models import (
    KnowledgeStore,
    KnowledgeStoreProjection,
    KnowledgeStoreTableProjectionSelector
)

# Related tables: one for the document itself plus one-to-many child tables.
# Every selector uses the same reference key column ("DocumentKey"), which is
# what links the child tables back to Documents; only the table name, the
# generated key column, and the enrichment-tree source differ per table.
table_projections = [
    KnowledgeStoreTableProjectionSelector(
        table_name=table_name,
        reference_key_name="DocumentKey",
        generated_key_name=generated_key,
        source=source_path,
    )
    for table_name, generated_key, source_path in (
        # Main document table
        ("Documents", "DocumentId", "/document/documentRecord"),
        # Entities per document (the trailing "/*" flattens the array)
        ("DocumentEntities", "EntityId", "/document/entities/*"),
        # Sentences per document
        ("DocumentSentences", "SentenceId", "/document/sentences/*"),
        # Key phrases per document
        ("DocumentKeyPhrases", "PhraseId", "/document/keyPhrases/*"),
    )
]

Shaping Data for Projections

from azure.search.documents.indexes.models import (
    ShaperSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry
)

# Shaper skill that gathers document-level fields into a single node,
# emitted under the target name "documentRecord".
document_shaper = ShaperSkill(
    name="document-shaper",
    description="Shape document data for projections",
    context="/document",
    inputs=[
        # Simple fields (id, title, content) followed by enriched fields
        # (language, sentiment); each one is read from /document/<name>.
        *(
            InputFieldMappingEntry(name=field, source=f"/document/{field}")
            for field in ("id", "title", "content", "language", "sentiment")
        ),
        # Nested "metadata" object assembled from three document-level fields
        InputFieldMappingEntry(
            name="metadata",
            source_context="/document",
            inputs=[
                InputFieldMappingEntry(name=field, source=f"/document/{field}")
                for field in ("author", "publishDate", "category")
            ],
        ),
    ],
    outputs=[OutputFieldMappingEntry(name="output", target_name="documentRecord")],
)

# Shaper that runs once per recognized entity and emits a flat record
# (text, category, confidence, offset, length) under the target name "entities".
# NOTE(review): the context here is /document/recognized_entities/*, while the
# table projection and the enriched-document shaper in this article read from
# /document/entities -- confirm both refer to the node this skill actually
# produces in the enrichment tree.
entity_shaper = ShaperSkill(
    name="entity-shaper",
    description="Shape entities for projection",
    context="/document/recognized_entities/*",
    inputs=[
        InputFieldMappingEntry(
            name=column,
            source=f"/document/recognized_entities/*/{attribute}",
        )
        # (output column, attribute on the recognized entity)
        for column, attribute in (
            ("text", "text"),
            ("category", "category"),
            ("confidence", "confidenceScore"),
            ("offset", "offset"),
            ("length", "length"),
        )
    ],
    outputs=[OutputFieldMappingEntry(name="output", target_name="entities")],
)

Object Projections (Blob)

from azure.search.documents.indexes.models import KnowledgeStoreBlobProjectionSelector

# Object projections: each selector writes the node at `source` as JSON into
# its own blob container.
# NOTE(review): only /document/enrichedDocument is produced by a shaper shown
# in this article; /document/summary and /document/entityExtraction are
# presumably created by skills defined elsewhere -- confirm they exist.
object_projections = [
    KnowledgeStoreBlobProjectionSelector(
        storage_container=container,
        generated_key_name=generated_key,
        source=source_path,
    )
    for container, generated_key, source_path in (
        # Full enriched document as JSON
        ("enriched-documents", "EnrichedDocId", "/document/enrichedDocument"),
        # Extracted summaries
        ("document-summaries", "SummaryId", "/document/summary"),
        # Entity extractions as separate files
        ("entity-extractions", "EntityExtractionId", "/document/entityExtraction"),
    )
]

# Shaper that bundles the raw content and every enrichment into one node,
# emitted under the target name "enrichedDocument" (the source of the first
# object projection).
enriched_doc_shaper = ShaperSkill(
    name="enriched-document-shaper",
    context="/document",
    inputs=[
        # Each input keeps the name of the /document/<name> node it reads.
        InputFieldMappingEntry(name=field, source=f"/document/{field}")
        for field in (
            "id",
            "content",
            "entities",
            "keyPhrases",
            "sentiment",
            "language",
            "piiEntities",
            "linkedEntities",
        )
    ],
    outputs=[OutputFieldMappingEntry(name="output", target_name="enrichedDocument")],
)

File Projections (Images)

from azure.search.documents.indexes.models import KnowledgeStoreFileProjectionSelector

# File projections: binary image output, one blob container per selector.
# NOTE(review): the knowledge store documentation describes file projections
# as acting on the normalized_images collection; confirm that the nested
# "/regions/*" source used for "ocr-regions" is accepted by the service.
file_projections = [
    KnowledgeStoreFileProjectionSelector(
        storage_container=container,
        generated_key_name=generated_key,
        source=source_path,
    )
    for container, generated_key, source_path in (
        # Original normalized images
        ("document-images", "ImageId", "/document/normalized_images/*"),
        # OCR-processed image regions
        ("ocr-regions", "OcrRegionId", "/document/normalized_images/*/regions/*"),
    )
]

Complete Knowledge Store Configuration

# A single projection group carries the table, object, and file selectors
# defined above. `storage_connection_string` must already be defined; it is
# not set anywhere in this article.
knowledge_store = KnowledgeStore(
    projections=[
        KnowledgeStoreProjection(
            files=file_projections,
            objects=object_projections,
            tables=table_projections,
        ),
    ],
    storage_connection_string=storage_connection_string,
)

# Assemble the skillset and publish it.
# SearchIndexerSkillset, indexer_client, and the five cognitive skills are not
# defined in this article; they are assumed to come from earlier setup code.
skillset = SearchIndexerSkillset(
    name="comprehensive-skillset",
    description="Skillset with complete knowledge store projections",
    knowledge_store=knowledge_store,
    skills=(
        # Cognitive skills
        [
            entity_recognition_skill,
            keyphrase_skill,
            sentiment_skill,
            language_detection_skill,
            pii_detection_skill,
        ]
        # Shapers that feed the projections
        + [document_shaper, entity_shaper, enriched_doc_shaper]
    ),
)

# Creates the skillset on the service, or replaces an existing one of the same name.
indexer_client.create_or_update_skillset(skillset)

Querying Projected Data

import json

from azure.data.tables import TableServiceClient
from azure.storage.blob import BlobServiceClient

class KnowledgeStoreClient:
    """Read-side helper for the tables and blobs written by the projections.

    The table names ("Documents", "DocumentEntities", "DocumentKeyPhrases")
    and the "enriched-documents" container are the ones used by the
    projection definitions earlier in this article. Errors raised by the
    underlying storage SDK calls are not caught here.
    """

    def __init__(self, storage_connection_string):
        """Create table and blob service clients from one connection string."""
        self.table_service = TableServiceClient.from_connection_string(storage_connection_string)
        self.blob_service = BlobServiceClient.from_connection_string(storage_connection_string)

    def _get_document(self, document_key):
        """Fetch a single row from the "Documents" table."""
        # NOTE(review): assumes rows are stored with an empty PartitionKey and
        # the document key as RowKey -- confirm against the table the indexer
        # actually produces.
        documents = self.table_service.get_table_client("Documents")
        return documents.get_entity(partition_key="", row_key=document_key)

    def _query_by_document_key(self, table_name, document_key):
        """Return every row of `table_name` whose DocumentKey matches."""
        table = self.table_service.get_table_client(table_name)
        # The value is passed as a query parameter rather than interpolated
        # into the filter, so quotes in the key cannot break or alter it.
        return list(table.query_entities(
            "DocumentKey eq @document_key",
            parameters={"document_key": document_key},
        ))

    def get_document_with_entities(self, document_key):
        """Get a document together with its related entities and key phrases.

        Args:
            document_key: Value used both as the row key in "Documents" and as
                the DocumentKey filter in the child tables.

        Returns:
            A dict with keys "document" (the table entity), "entities" and
            "keyPhrases" (lists of table entities).
        """
        document = self._get_document(document_key)
        entities = self._query_by_document_key("DocumentEntities", document_key)
        phrases = self._query_by_document_key("DocumentKeyPhrases", document_key)
        return {
            "document": document,
            "entities": entities,
            "keyPhrases": phrases,
        }

    def get_enriched_document_blob(self, blob_name):
        """Download a blob from "enriched-documents" and parse it as JSON.

        Args:
            blob_name: Name of the blob inside the container.

        Returns:
            The decoded JSON content of the blob.
        """
        container = self.blob_service.get_container_client("enriched-documents")
        content = container.get_blob_client(blob_name).download_blob().readall()
        return json.loads(content)

    def get_documents_by_entity(self, entity_text, entity_type=None):
        """Find all documents mentioning a specific entity.

        Args:
            entity_text: Exact value to match against the "text" column of
                "DocumentEntities".
            entity_type: Optional value to match against the "category"
                column; a falsy value disables the category filter.

        Returns:
            A list of "Documents" entities, one per distinct DocumentKey, in
            the order the keys first appear in the entity query results.
        """
        conditions = ["text eq @entity_text"]
        parameters = {"entity_text": entity_text}
        if entity_type:
            conditions.append("category eq @entity_type")
            parameters["entity_type"] = entity_type

        entities_table = self.table_service.get_table_client("DocumentEntities")
        matches = entities_table.query_entities(
            " and ".join(conditions),
            parameters=parameters,
        )

        # dict.fromkeys removes duplicate keys while keeping first-seen order,
        # so the result order is deterministic.
        document_keys = dict.fromkeys(match["DocumentKey"] for match in matches)
        return [self._get_document(key) for key in document_keys]

# Usage: `storage_connection_string` must already be defined.
client = KnowledgeStoreClient(storage_connection_string)

# Fetch one document along with its entity and key-phrase rows
doc_data = client.get_document_with_entities(document_key="doc-123")
print(f"Document: {doc_data['document']['title']}")
print(f"Entities: {len(doc_data['entities'])}")

# Find documents mentioning an organization
docs = client.get_documents_by_entity(
    entity_text="Microsoft",
    entity_type="Organization",
)
print(f"Documents mentioning Microsoft: {len(docs)}")

Projections enable flexible storage and querying of AI-enriched content for various downstream use cases.