5 min read
Document Translation at Scale with Azure Cognitive Services
Azure Document Translation enables bulk translation of documents while preserving their original structure and formatting. It supports over 90 languages and various document formats including PDF, DOCX, PPTX, and more.
Document Translation Features
- Format Preservation: Maintains layout, tables, and images
- Batch Processing: Translate hundreds of documents at once
- Multiple Formats: PDF, Office documents, HTML, text files
- Custom Models: Use custom translation models for domain terminology
- Glossary Support: Ensure consistent translation of specific terms
Setting Up Document Translation
from datetime import datetime, timedelta, timezone

from azure.ai.translation.document import (
    DocumentStatus,
    DocumentTranslationClient,
    DocumentTranslationLROPoller,
)
from azure.core.credentials import AzureKeyCredential
from azure.core.paging import ItemPaged
from azure.storage.blob import BlobServiceClient, generate_container_sas, ContainerSasPermissions
class DocumentTranslator:
    """Pair a Document Translation client with the blob storage account it works on.

    Attributes:
        translation_client: ``DocumentTranslationClient`` used to start jobs.
        blob_service: ``BlobServiceClient`` built from the connection string;
            used to sign container SAS tokens.
    """

    def __init__(self, translation_key: str, translation_endpoint: str,
                 storage_connection: str):
        """Create the translation and blob storage clients.

        Args:
            translation_key: API key for the Translator resource.
            translation_endpoint: Endpoint URL of the Translator resource.
            storage_connection: Storage account connection string.
        """
        self.translation_client = DocumentTranslationClient(
            translation_endpoint,
            AzureKeyCredential(translation_key),
        )
        self.blob_service = BlobServiceClient.from_connection_string(storage_connection)

    def generate_sas_url(self, container_name: str, permissions: str = "rl") -> str:
        """Generate a SAS URL for a container, valid for one hour.

        Args:
            container_name: Name of the blob container.
            permissions: Any combination of ``"r"`` (read), ``"w"`` (write)
                and ``"l"`` (list). Other characters are ignored.

        Returns:
            The container URL with the SAS token appended as its query string.
        """
        account_name = self.blob_service.account_name
        sas_token = generate_container_sas(
            account_name=account_name,
            container_name=container_name,
            # The token is signed with the account key, so the client's
            # credential must expose one.
            # NOTE(review): presumably fails for SAS-only connection strings - confirm.
            account_key=self.blob_service.credential.account_key,
            permission=ContainerSasPermissions(
                read="r" in permissions,
                write="w" in permissions,
                list="l" in permissions,
            ),
            # Timezone-aware "now": datetime.utcnow() is deprecated and
            # returns a naive value that is easy to misinterpret.
            expiry=datetime.now(timezone.utc) + timedelta(hours=1),
        )
        # NOTE(review): the host suffix is hard-coded to the public Azure cloud.
        return f"https://{account_name}.blob.core.windows.net/{container_name}?{sas_token}"

    def translate_documents(self, source_container: str, target_container: str,
                            source_language: str,
                            target_language: str) -> "DocumentTranslationLROPoller":
        """Start a document translation job between two containers.

        The source container gets a read/list SAS URL and the target container
        a read/write/list SAS URL before the job is submitted.

        Args:
            source_container: Container holding the documents to translate.
            target_container: Container that receives the translated documents.
            source_language: Language code of the source documents.
            target_language: Language code to translate into.

        Returns:
            The poller returned by ``begin_translation``; the job is not
            awaited here, so callers decide when to block on ``result()``.
        """
        source_url = self.generate_sas_url(source_container, "rl")
        target_url = self.generate_sas_url(target_container, "rwl")
        return self.translation_client.begin_translation(
            source_url,
            target_url,
            target_language,
            source_language=source_language,
        )
# Initialize translator
translator = DocumentTranslator(
    "your-translation-key",
    "https://your-translator.cognitiveservices.azure.com/",
    "your-storage-connection-string",
)
# Start translation
poller = translator.translate_documents(
    source_container="documents-english",
    target_container="documents-spanish",
    source_language="en",
    target_language="es",
)
# Wait for completion
result = poller.result()
# Per the SDK reference, result() yields the per-document statuses and has no
# ``status`` attribute of its own; the overall job status is read from the poller.
print(f"Translation completed: {poller.status()}")
Multi-Language Batch Translation
from azure.ai.translation.document import DocumentTranslationInput, TranslationTarget
def translate_to_multiple_languages(client: DocumentTranslationClient,
                                    source_url: str,
                                    target_configs: list,
                                    poll_interval: float = 10) -> "ItemPaged[DocumentStatus]":
    """Translate documents to multiple languages simultaneously.

    A single translation input is submitted with one target per entry in
    ``target_configs``. Progress is printed until the job finishes.

    Args:
        client: Document Translation client used to submit and monitor the job.
        source_url: SAS URL of the source container.
        target_configs: Dicts with the keys ``"target_url"`` and ``"language"``,
            one per target language.
        poll_interval: Seconds to wait between progress reports.

    Returns:
        Whatever ``poller.result()`` returns once the job has finished
        (per the SDK reference, the per-document statuses).
    """
    import time  # imported once here rather than on every loop iteration

    inputs = [
        DocumentTranslationInput(
            source_url=source_url,
            targets=[
                TranslationTarget(
                    target_url=config["target_url"],
                    language=config["language"],
                )
                for config in target_configs
            ],
        )
    ]
    poller = client.begin_translation(inputs)

    # Monitor progress
    while not poller.done():
        print(f"Status: {poller.status()}")
        # Per-document progress is listed through the client, keyed by the
        # job ID; the poller itself does not expose document statuses.
        for doc in client.list_document_statuses(poller.id):
            print(f" Document: {doc.source_document_url}")
            print(f" Status: {doc.status}")
            if doc.status == "Succeeded":
                print(f" Translated to: {doc.translated_document_url}")
        time.sleep(poll_interval)
    return poller.result()
# One entry per target language: the language code plus the SAS URL of the
# container that receives the translated documents.
target_languages = [
    dict(language="es", target_url="https://storage.blob.core.windows.net/spanish?sas=..."),
    dict(language="fr", target_url="https://storage.blob.core.windows.net/french?sas=..."),
    dict(language="de", target_url="https://storage.blob.core.windows.net/german?sas=..."),
    dict(language="ja", target_url="https://storage.blob.core.windows.net/japanese?sas=..."),
]
# Submit every target in one job and wait for it to finish
result = translate_to_multiple_languages(
    client=translator.translation_client,
    source_url="https://storage.blob.core.windows.net/english?sas=...",
    target_configs=target_languages,
)
Using Custom Glossaries
from azure.ai.translation.document import TranslationGlossary
def translate_with_glossary(client: DocumentTranslationClient,
                            source_url: str,
                            target_url: str,
                            glossary_url: str,
                            source_lang: str,
                            target_lang: str,
                            glossary_format: str = "TSV") -> "ItemPaged[DocumentStatus]":
    """Translate with a custom glossary for consistent terminology.

    Blocks until the translation job has finished.

    Args:
        client: Document Translation client used to submit the job.
        source_url: SAS URL of the source container.
        target_url: SAS URL of the target container.
        glossary_url: URL of the glossary file.
        source_lang: Language code of the source documents.
        target_lang: Language code to translate into.
        glossary_format: Format of the glossary file, e.g. ``"TSV"``,
            ``"CSV"`` or ``"XLIFF"``. Defaults to ``"TSV"``.

    Returns:
        Whatever ``poller.result()`` returns for the finished job
        (per the SDK reference, the per-document statuses).
    """
    glossary = TranslationGlossary(
        glossary_url=glossary_url,
        file_format=glossary_format,
    )
    target = TranslationTarget(
        target_url=target_url,
        language=target_lang,
        glossaries=[glossary],
    )
    inputs = [
        DocumentTranslationInput(
            source_url=source_url,
            targets=[target],
            source_language=source_lang,
        )
    ]
    poller = client.begin_translation(inputs)
    return poller.result()
# Glossary file format (TSV: each line holds a source term and its translation, separated by a tab character):
# English Spanish
# machine learning aprendizaje automático
# cloud computing computación en la nube
# artificial intelligence inteligencia artificial
Translation API for Text
import requests
import uuid
class TextTranslator:
    """Small client for the Translator text REST API (version 3.0).

    Attributes:
        key: Subscription key sent in ``Ocp-Apim-Subscription-Key``.
        endpoint: Base URL of the Translator text API.
        region: Resource region sent in ``Ocp-Apim-Subscription-Region``.
    """

    API_VERSION = '3.0'
    # Seconds to wait for the service; without a timeout a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, key: str, endpoint: str, region: str):
        """Store the credentials and endpoint used by every request."""
        self.key = key
        self.endpoint = endpoint
        self.region = region

    def _headers(self) -> dict:
        """Build the request headers, with a fresh client trace ID per call."""
        return {
            'Ocp-Apim-Subscription-Key': self.key,
            'Ocp-Apim-Subscription-Region': self.region,
            'Content-type': 'application/json',
            'X-ClientTraceId': str(uuid.uuid4()),
        }

    def _post(self, path: str, params: dict, texts: list) -> list:
        """POST ``texts`` to ``path`` and return the decoded JSON response."""
        if not texts:
            # Nothing to process, so skip the round trip.
            return []
        query = {'api-version': self.API_VERSION, **params}
        response = requests.post(
            # Tolerate an endpoint configured with a trailing slash.
            self.endpoint.rstrip('/') + path,
            # Passing params lets requests URL-encode the values instead of
            # splicing caller-supplied text into the query string.
            params=query,
            headers=self._headers(),
            json=[{'text': text} for text in texts],
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly on HTTP errors instead of returning the error payload
        # as if it were a list of results.
        response.raise_for_status()
        return response.json()

    def translate(self, texts: list, target_language: str,
                  source_language: str = None) -> list:
        """Translate text using the Translator API.

        Args:
            texts: Strings to translate. An empty list returns ``[]``
                without calling the service.
            target_language: Language code to translate into.
            source_language: Language code of ``texts``. When omitted, no
                ``from`` parameter is sent (the service is then expected to
                detect the language itself).

        Returns:
            The decoded JSON response, one entry per input text.

        Raises:
            requests.HTTPError: If the service answers with an error status.
        """
        params = {'to': target_language}
        if source_language:
            params['from'] = source_language
        return self._post('/translate', params, texts)

    def detect_language(self, texts: list) -> list:
        """Detect language of texts.

        Args:
            texts: Strings to analyse. An empty list returns ``[]``.

        Returns:
            The decoded JSON response, one entry per input text.

        Raises:
            requests.HTTPError: If the service answers with an error status.
        """
        return self._post('/detect', {}, texts)

    def transliterate(self, texts: list, language: str,
                      from_script: str, to_script: str) -> list:
        """Convert text from one script to another.

        Args:
            texts: Strings to convert. An empty list returns ``[]``.
            language: Language code of ``texts``.
            from_script: Script the input is written in.
            to_script: Script to convert to.

        Returns:
            The decoded JSON response, one entry per input text.

        Raises:
            requests.HTTPError: If the service answers with an error status.
        """
        params = {
            'language': language,
            'fromScript': from_script,
            'toScript': to_script,
        }
        return self._post('/transliterate', params, texts)
# Create a text translator for the global Translator endpoint
text_translator = TextTranslator(
    key="your-key",
    endpoint="https://api.cognitive.microsofttranslator.com",
    region="westus",
)
# Sample sentences to translate into Spanish
texts = ["Hello, how are you?", "The weather is beautiful today."]
results = text_translator.translate(texts, target_language="es")
# Results come back in input order, so pair each one with its source text
for original, result in zip(texts, results):
    print(f"Original: {original}")
    print(f"Translated: {result['translations'][0]['text']}")
    print()
Building a Translation Service
from flask import Flask, request, jsonify
from azure.ai.translation.document import DocumentTranslationClient
from azure.core.credentials import AzureKeyCredential
import os
# Flask application that exposes the translation endpoints
app = Flask(import_name=__name__)

# Service clients, configured from environment variables; a missing variable
# raises KeyError at import time.
doc_translator = DocumentTranslationClient(
    endpoint=os.environ["TRANSLATOR_ENDPOINT"],
    credential=AzureKeyCredential(os.environ["TRANSLATOR_KEY"]),
)
text_translator = TextTranslator(
    key=os.environ["TRANSLATOR_KEY"],
    endpoint="https://api.cognitive.microsofttranslator.com",
    region=os.environ["TRANSLATOR_REGION"],
)
@app.route('/translate/text', methods=['POST'])
def translate_text():
    """Translate text content.

    Expects a JSON object with:
        texts: Non-empty list of strings to translate (required).
        target_language: Language code to translate into (default ``'en'``).
        source_language: Optional language code of the input.

    Returns:
        ``{"translations": [...]}`` with one entry per input text, a 400
        response for a malformed request, or a 502 response when the
        translator does not return a list of results.
    """
    data = request.json
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    texts = data.get('texts', [])
    if not isinstance(texts, list) or not texts:
        return jsonify({"error": "'texts' must be a non-empty list"}), 400

    target_language = data.get('target_language', 'en')
    source_language = data.get('source_language')
    results = text_translator.translate(
        texts,
        target_language,
        source_language
    )
    # An error payload from the service is a JSON object, not a list;
    # iterating over it would fail with an unhelpful 500.
    if not isinstance(results, list):
        return jsonify({"error": "Translation request failed"}), 502

    translations = [
        {
            "detected_language": result.get('detectedLanguage', {}).get('language'),
            "translation": result['translations'][0]['text'],
        }
        for result in results
    ]
    return jsonify({"translations": translations})
@app.route('/translate/documents', methods=['POST'])
def translate_documents():
    """Start document translation job.

    Expects a JSON object with:
        source_url: SAS URL of the source container (required).
        target_url: SAS URL of the target container (required).
        target_language: Language code to translate into (required).
        source_language: Optional language code of the source documents.

    Returns:
        ``{"job_id": ..., "status": ...}`` for the started job, or a 400
        response naming the missing fields. The job is not awaited.
    """
    data = request.json
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    required = ('source_url', 'target_url', 'target_language')
    missing = [field for field in required if not data.get(field)]
    if missing:
        # Report bad input as a client error instead of letting a KeyError
        # surface as a 500.
        return jsonify({"error": f"Missing required field(s): {', '.join(missing)}"}), 400

    poller = doc_translator.begin_translation(
        data['source_url'],
        data['target_url'],
        data['target_language'],
        source_language=data.get('source_language')
    )
    return jsonify({
        "job_id": poller.id,
        "status": poller.status()
    })
@app.route('/translate/status/<job_id>', methods=['GET'])
def get_translation_status(job_id):
    """Report the per-document status of a translation job.

    Returns a JSON object whose ``documents`` list holds, for each document,
    its source URL, its status, and the translated URL (``None`` unless the
    status is "Succeeded").
    """
    documents = []
    for doc_status in list(doc_translator.list_document_statuses(job_id)):
        entry = {
            "source": doc_status.source_document_url,
            "status": doc_status.status,
        }
        if doc_status.status == "Succeeded":
            entry["translated"] = doc_status.translated_document_url
        else:
            entry["translated"] = None
        documents.append(entry)
    return jsonify({"documents": documents})
# Start the development server only when this file is run as a script
if __name__ == "__main__":
    app.run(port=5000)
Supported File Formats
| Format | Extensions |
|---|---|
| Word | .docx, .doc |
| PowerPoint | .pptx, .ppt |
| Excel | .xlsx, .xls |
| HTML | .html, .htm |
| Text | .txt |
| Rich Text | .rtf |
| Tab-separated | .tsv, .tab |
| Comma-separated | .csv |
| PDF | .pdf |
Best Practices
- Batch Wisely: Group documents by language pair
- Use Glossaries: Ensure terminology consistency
- Format Matters: PDF translation quality varies by source
- Monitor Jobs: Implement polling for long-running translations
- Handle Errors: Individual documents in a batch can fail; check each document's status and handle failures gracefully
- Cost Management: Large documents incur higher costs
Document Translation enables global content delivery by making document localization efficient and scalable.