Azure Cognitive Services Updates: OpenAI Integration and More
Azure Cognitive Services continues to evolve with new capabilities announced at Ignite 2022. The biggest news is deeper OpenAI integration, but there are updates across vision, language, and speech services.
Azure OpenAI Integration
Cognitive Services now includes Azure OpenAI Service, bringing GPT models into the Cognitive Services family:
import openai
from azure.identity import DefaultAzureCredential

# Unified authentication with Azure AD instead of a static API key
credential = DefaultAzureCredential()
token = credential.get_token("https://cognitiveservices.azure.com/.default")

openai.api_type = "azure_ad"
openai.api_base = "https://your-resource.openai.azure.com/"
openai.api_version = "2022-12-01"
openai.api_key = token.token

response = openai.Completion.create(
    engine="gpt-35-turbo",  # your deployment name
    prompt="Explain Azure Cognitive Services in one paragraph:",
    max_tokens=200
)
print(response.choices[0].text)
Vision API Updates
Image Analysis 4.0
The new Image Analysis 4.0 API consolidates captioning, dense captions, object and people detection, OCR, and smart cropping into a single call:
from azure.ai.vision.imageanalysis import ImageAnalysisClient
from azure.ai.vision.imageanalysis.models import VisualFeatures
from azure.core.credentials import AzureKeyCredential

client = ImageAnalysisClient(
    endpoint="https://your-vision.cognitiveservices.azure.com/",
    credential=AzureKeyCredential("your-key")
)

# Analyze with all visual features
result = client.analyze_from_url(
    image_url="https://example.com/image.jpg",
    visual_features=[
        VisualFeatures.CAPTION,
        VisualFeatures.DENSE_CAPTIONS,
        VisualFeatures.OBJECTS,
        VisualFeatures.TAGS,
        VisualFeatures.READ,
        VisualFeatures.SMART_CROPS,
        VisualFeatures.PEOPLE
    ],
    gender_neutral_caption=True,
    smart_crops_aspect_ratios=[0.9, 1.33]
)

# Get detailed caption
if result.caption:
    print(f"Caption: {result.caption.text} (confidence: {result.caption.confidence:.2f})")

# Get dense captions for different regions
for caption in result.dense_captions.list:
    print(f"Region [{caption.bounding_box}]: {caption.text}")

# Detect people
for person in result.people.list:
    print(f"Person at {person.bounding_box}, confidence: {person.confidence:.2f}")
Custom Vision Improvements
from azure.cognitiveservices.vision.customvision.training import CustomVisionTrainingClient
from azure.cognitiveservices.vision.customvision.prediction import CustomVisionPredictionClient
from msrest.authentication import ApiKeyCredentials

training_key = "your-training-key"

# Train a custom model
training_client = CustomVisionTrainingClient(
    endpoint="https://your-customvision.cognitiveservices.azure.com/",
    credentials=ApiKeyCredentials(in_headers={"Training-key": training_key})
)

# Create project with the compact S1 domain
# (domain IDs are GUIDs, so look the domain up by name)
domain = next(
    d for d in training_client.get_domains()
    if d.type == "Classification" and d.name == "General (compact) [S1]"
)
project = training_client.create_project(
    "Product Classification",
    domain_id=domain.id,
    classification_type="Multilabel"
)

# Upload images with tags
# training_data: list of (image_path, [tag_id, ...]) pairs
for image_path, tags in training_data:
    with open(image_path, "rb") as image_file:
        training_client.create_images_from_data(
            project.id,
            image_file.read(),
            tag_ids=tags
        )

# Train iteration
iteration = training_client.train_project(
    project.id,
    training_type="Advanced",  # New advanced training
    reserved_budget_in_hours=2
)
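The prediction client imported above can then score new images once the iteration is published. A quick sketch, assuming the iteration was published under the name "production" and that prediction_key holds your prediction resource key:
# Classify a new image against the published iteration
prediction_client = CustomVisionPredictionClient(
    endpoint="https://your-customvision.cognitiveservices.azure.com/",
    credentials=ApiKeyCredentials(in_headers={"Prediction-key": prediction_key})
)

with open("test-image.jpg", "rb") as image_file:
    results = prediction_client.classify_image(
        project.id, "production", image_file.read()
    )

for prediction in results.predictions:
    print(f"{prediction.tag_name}: {prediction.probability:.2%}")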
Language Service Updates
Question Answering with GPT
Combine custom question answering with GPT for better responses:
import openai
from azure.ai.language.questionanswering import QuestionAnsweringClient
from azure.core.credentials import AzureKeyCredential

qa_client = QuestionAnsweringClient(
    endpoint="https://your-language.cognitiveservices.azure.com/",
    credential=AzureKeyCredential("your-key")
)

def enhanced_qa(question: str, project_name: str) -> dict:
    """Get answer from QA and enhance with GPT."""
    # First, get answer from custom knowledge base
    qa_response = qa_client.get_answers(
        question=question,
        project_name=project_name,
        deployment_name="production"
    )

    if qa_response.answers and qa_response.answers[0].confidence > 0.7:
        base_answer = qa_response.answers[0].answer
    else:
        base_answer = "I don't have specific information about that."

    # Enhance with GPT (assumes the Azure OpenAI configuration from earlier)
    enhanced_prompt = f"""Based on this knowledge base answer:
"{base_answer}"

Provide a helpful, conversational response to: {question}

Response:"""

    gpt_response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=enhanced_prompt,
        max_tokens=200
    )

    return {
        "original_answer": base_answer,
        "enhanced_answer": gpt_response.choices[0].text.strip(),
        "confidence": qa_response.answers[0].confidence if qa_response.answers else 0
    }
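A quick usage sketch (the project name and question are placeholders):
result = enhanced_qa("How do I rotate my API keys?", "support-kb")
print(result["enhanced_answer"])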
Conversational Language Understanding (CLU)
from azure.ai.language.conversations import ConversationAnalysisClient
from azure.core.credentials import AzureKeyCredential

clu_client = ConversationAnalysisClient(
    endpoint="https://your-language.cognitiveservices.azure.com/",
    credential=AzureKeyCredential("your-key")
)

def understand_conversation(user_input: str) -> dict:
    """Analyze user intent and entities."""
    result = clu_client.analyze_conversation(
        task={
            "kind": "Conversation",
            "analysisInput": {
                "conversationItem": {
                    "id": "1",
                    "participantId": "user1",
                    "text": user_input
                }
            },
            "parameters": {
                "projectName": "customer-service",
                "deploymentName": "production"
            }
        }
    )

    prediction = result["result"]["prediction"]
    return {
        "top_intent": prediction["topIntent"],
        "confidence": prediction["intents"][0]["confidenceScore"],
        "entities": [
            {
                "category": e["category"],
                "text": e["text"],
                "confidence": e["confidenceScore"]
            }
            for e in prediction.get("entities", [])
        ]
    }
# Example
result = understand_conversation("Book a flight from Seattle to New York next Monday")
print(f"Intent: {result['top_intent']}")
print(f"Entities: {result['entities']}")
Speech Service Updates
Neural Voice Customization
from azure.cognitiveservices.speech import SpeechConfig, SpeechSynthesizer
from azure.cognitiveservices.speech.audio import AudioOutputConfig

speech_config = SpeechConfig(
    subscription="your-key",
    region="eastus"
)

# Use custom neural voice
speech_config.speech_synthesis_voice_name = "your-custom-neural-voice"

synthesizer = SpeechSynthesizer(
    speech_config=speech_config,
    audio_config=AudioOutputConfig(filename="output.wav")
)

# SSML for fine control
ssml = """
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
       xmlns:mstts="http://www.w3.org/2001/mstts" xml:lang="en-US">
  <voice name="your-custom-neural-voice">
    <mstts:express-as style="customerservice" styledegree="2">
      Welcome to Azure support! How can I help you today?
    </mstts:express-as>
  </voice>
</speak>
"""

result = synthesizer.speak_ssml_async(ssml).get()
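It's worth checking the synthesis outcome before using the audio; a small check using the SDK's result reasons:
from azure.cognitiveservices.speech import ResultReason

if result.reason == ResultReason.SynthesizingAudioCompleted:
    print("Audio written to output.wav")
elif result.reason == ResultReason.Canceled:
    details = result.cancellation_details
    print(f"Synthesis canceled: {details.reason} - {details.error_details}")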
Real-time Speech Translation
import azure.cognitiveservices.speech as speechsdk

def setup_translation(source_lang: str, target_langs: list) -> speechsdk.translation.TranslationRecognizer:
    """Set up real-time speech translation."""
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription="your-key",
        region="eastus"
    )
    translation_config.speech_recognition_language = source_lang

    for lang in target_langs:
        translation_config.add_target_language(lang)

    # Enable voice output for the first target language
    translation_config.voice_name = "de-DE-KatjaNeural"

    recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config
    )
    return recognizer

def on_recognized(evt):
    """Handle recognized speech."""
    if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
        print(f"Original: {evt.result.text}")
        for lang, translation in evt.result.translations.items():
            print(f"{lang}: {translation}")

# Usage
recognizer = setup_translation("en-US", ["de", "fr", "es"])
recognizer.recognized.connect(on_recognized)
recognizer.start_continuous_recognition()
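Continuous recognition runs on a background thread, so remember to stop it when you're done; for example, after translating for thirty seconds:
import time

time.sleep(30)  # let the recognizer run for 30 seconds
recognizer.stop_continuous_recognition()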
Form Recognizer Updates
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

form_client = DocumentAnalysisClient(
    endpoint="https://your-formrecognizer.cognitiveservices.azure.com/",
    credential=AzureKeyCredential("your-key")
)

# Analyze with the new enhanced prebuilt invoice model
with open("invoice.pdf", "rb") as f:
    poller = form_client.begin_analyze_document(
        "prebuilt-invoice",
        f
    )
result = poller.result()

for document in result.documents:
    print(f"Document type: {document.doc_type}")

    # Extract structured fields
    vendor = document.fields.get("VendorName")
    if vendor:
        print(f"Vendor: {vendor.value} (confidence: {vendor.confidence:.2f})")

    total = document.fields.get("InvoiceTotal")
    if total:
        print(f"Total: {total.value} (confidence: {total.confidence:.2f})")

    # Extract line items
    items = document.fields.get("Items")
    if items:
        for item in items.value:
            desc = item.value.get("Description")
            amount = item.value.get("Amount")
            if desc and amount:
                print(f"  - {desc.value}: {amount.value}")
Unified Multi-service Deployment
// Deploy multiple Cognitive Services with a unified key
param location string = resourceGroup().location
param subnetId string

resource cognitiveServices 'Microsoft.CognitiveServices/accounts@2022-12-01' = {
  name: 'cognitive-multi-service'
  location: location
  sku: {
    name: 'S0'
  }
  kind: 'CognitiveServices' // Multi-service account
  properties: {
    customSubDomainName: 'my-cognitive-services'
    networkAcls: {
      defaultAction: 'Deny'
      virtualNetworkRules: [
        {
          id: subnetId
        }
      ]
    }
    publicNetworkAccess: 'Disabled'
  }
}

// Private endpoint
resource privateEndpoint 'Microsoft.Network/privateEndpoints@2021-05-01' = {
  name: 'pe-cognitive'
  location: location
  properties: {
    subnet: {
      id: subnetId
    }
    privateLinkServiceConnections: [
      {
        name: 'cognitive-connection'
        properties: {
          privateLinkServiceId: cognitiveServices.id
          groupIds: [
            'account'
          ]
        }
      }
    ]
  }
}
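Once deployed, a single key and the custom-subdomain endpoint work across services. A minimal sketch, assuming the subdomain above and using Text Analytics as the example (any Cognitive Services client works the same way against a multi-service account):
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

# One key, one endpoint for the multi-service account
endpoint = "https://my-cognitive-services.cognitiveservices.azure.com/"
credential = AzureKeyCredential("your-multi-service-key")

text_client = TextAnalyticsClient(endpoint=endpoint, credential=credential)
docs = ["Azure Cognitive Services makes adding AI straightforward."]
print(text_client.analyze_sentiment(docs)[0].sentiment)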
Conclusion
Azure Cognitive Services is becoming more powerful with OpenAI integration while maintaining the familiar, easy-to-use APIs. The improvements across vision, language, and speech services, combined with better enterprise features, make it an excellent choice for adding AI capabilities to applications.