Introduction to Text Embeddings with Azure OpenAI
Text embeddings are one of the most powerful tools in modern AI. They transform text into numerical vectors that capture semantic meaning, enabling similarity search, clustering, and retrieval-augmented generation. Let’s explore embeddings with Azure OpenAI.
What Are Embeddings?
Embeddings are dense vector representations of text where:
- Similar meanings are close together in vector space
- Different meanings are far apart
- Relationships are captured (the classic example: king - man + woman ≈ queen)
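To make the first two bullets concrete before calling any API, here is a toy illustration with three invented 3-dimensional vectors (real embeddings have hundreds or thousands of dimensions); the numbers are made up purely to show how cosine similarity separates "close" from "far":

import numpy as np

# Hypothetical, hand-picked vectors -- not real embeddings
cat = np.array([0.9, 0.8, 0.1])
kitten = np.array([0.85, 0.75, 0.2])  # points in nearly the same direction as "cat"
invoice = np.array([0.1, 0.2, 0.95])  # points somewhere else entirely

def toy_cosine(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

print(toy_cosine(cat, kitten))   # high: similar meanings are close together
print(toy_cosine(cat, invoice))  # low: different meanings are far apart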
import openai
import numpy as np
from typing import List

# Assumes the openai<1.0 SDK configured for Azure OpenAI:
# openai.api_type = "azure", plus openai.api_base, openai.api_version and openai.api_key set beforehand.

# Get an embedding
def get_embedding(text: str, deployment: str = "text-embedding-ada-002") -> List[float]:
    """Get embedding vector for text."""
    response = openai.Embedding.create(
        engine=deployment,
        input=text
    )
    return response['data'][0]['embedding']

# Example
embedding = get_embedding("Azure is a cloud computing platform")
print(f"Dimensions: {len(embedding)}")  # 1536 for ada-002
print(f"First 5 values: {embedding[:5]}")
Understanding Embedding Dimensions
Azure OpenAI’s text-embedding-ada-002 produces 1536-dimensional vectors:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from typing import List, Optional

def visualize_embeddings(texts: List[str], labels: Optional[List[str]] = None):
    """Visualize embeddings in 2D using PCA."""
    embeddings = [get_embedding(text) for text in texts]

    # Reduce to 2D
    pca = PCA(n_components=2)
    reduced = pca.fit_transform(embeddings)

    # Plot
    plt.figure(figsize=(10, 8))
    for i, (x, y) in enumerate(reduced):
        plt.scatter(x, y)
        label = labels[i] if labels else texts[i][:20]
        plt.annotate(label, (x, y), fontsize=8)
    plt.title("Text Embeddings (2D PCA)")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.show()

# Example
texts = [
    "Azure is a cloud platform",
    "AWS is Amazon's cloud service",
    "Google Cloud Platform offers cloud computing",
    "Python is a programming language",
    "JavaScript runs in browsers",
    "Machine learning uses data to learn patterns"
]
visualize_embeddings(texts)
# Cloud platforms cluster together, programming languages cluster together
Similarity Calculations
from typing import List, Tuple
import numpy as np

def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Calculate cosine similarity between two vectors."""
    a = np.array(a)
    b = np.array(b)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def euclidean_distance(a: List[float], b: List[float]) -> float:
    """Calculate Euclidean distance between two vectors."""
    return np.linalg.norm(np.array(a) - np.array(b))

def dot_product(a: List[float], b: List[float]) -> float:
    """Calculate dot product (works well with normalized vectors)."""
    return np.dot(a, b)

class SimilarityCalculator:
    """Calculate and compare text similarities."""

    def __init__(self, deployment: str = "text-embedding-ada-002"):
        self.deployment = deployment
        self.cache = {}

    def get_embedding(self, text: str) -> List[float]:
        """Get embedding with caching."""
        if text not in self.cache:
            self.cache[text] = get_embedding(text, self.deployment)
        return self.cache[text]

    def similarity(self, text1: str, text2: str) -> float:
        """Calculate similarity between two texts."""
        emb1 = self.get_embedding(text1)
        emb2 = self.get_embedding(text2)
        return cosine_similarity(emb1, emb2)

    def rank_by_similarity(
        self,
        query: str,
        documents: List[str],
        top_k: int = 5
    ) -> List[Tuple[str, float]]:
        """Rank documents by similarity to query."""
        query_emb = self.get_embedding(query)
        scored = []
        for doc in documents:
            doc_emb = self.get_embedding(doc)
            score = cosine_similarity(query_emb, doc_emb)
            scored.append((doc, score))
        scored.sort(key=lambda x: x[1], reverse=True)
        return scored[:top_k]

# Usage
calc = SimilarityCalculator()

# Compare two sentences
sim = calc.similarity(
    "The quick brown fox jumps over the lazy dog",
    "A fast auburn fox leaps above a sleepy canine"
)
print(f"Similarity: {sim:.4f}")  # High similarity (same meaning)

sim2 = calc.similarity(
    "The quick brown fox jumps over the lazy dog",
    "Azure provides cloud computing services"
)
print(f"Similarity: {sim2:.4f}")  # Low similarity (different topics)
Batch Processing
Efficiently embed many documents:
from typing import List, Dict
import time

class BatchEmbedder:
    """Efficient batch embedding with rate limiting."""

    def __init__(
        self,
        deployment: str = "text-embedding-ada-002",
        batch_size: int = 100,
        requests_per_minute: int = 60
    ):
        self.deployment = deployment
        self.batch_size = batch_size
        self.min_interval = 60.0 / requests_per_minute

    def embed_batch(self, texts: List[str]) -> List[List[float]]:
        """Embed a batch of texts in a single API call."""
        response = openai.Embedding.create(
            engine=self.deployment,
            input=texts
        )
        return [item['embedding'] for item in response['data']]

    def embed_all(
        self,
        texts: List[str],
        show_progress: bool = True
    ) -> List[List[float]]:
        """Embed all texts with batching and rate limiting."""
        all_embeddings = []
        total_batches = (len(texts) + self.batch_size - 1) // self.batch_size

        for i in range(0, len(texts), self.batch_size):
            batch = texts[i:i + self.batch_size]
            batch_num = i // self.batch_size + 1
            if show_progress:
                print(f"Processing batch {batch_num}/{total_batches}")

            start_time = time.time()
            embeddings = self.embed_batch(batch)
            all_embeddings.extend(embeddings)

            # Rate limiting: keep at least min_interval between requests
            elapsed = time.time() - start_time
            if elapsed < self.min_interval:
                time.sleep(self.min_interval - elapsed)

        return all_embeddings

    def embed_with_metadata(
        self,
        documents: List[Dict]
    ) -> List[Dict]:
        """Embed documents and add embeddings to metadata."""
        texts = [doc.get('text', doc.get('content', '')) for doc in documents]
        embeddings = self.embed_all(texts)

        results = []
        for doc, emb in zip(documents, embeddings):
            result = doc.copy()
            result['embedding'] = emb
            results.append(result)
        return results

# Usage
embedder = BatchEmbedder()

documents = [
    {"id": 1, "text": "Azure Virtual Machines provide scalable computing"},
    {"id": 2, "text": "Azure Functions is a serverless compute service"},
    {"id": 3, "text": "Azure Cosmos DB is a globally distributed database"},
    # ... many more documents
]

embedded_docs = embedder.embed_with_metadata(documents)
print(f"Embedded {len(embedded_docs)} documents")
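The fixed sleep in embed_all only spaces requests out; under load the service can still return 429 responses. A minimal retry sketch, assuming the openai<1.0 SDK (which raises openai.error.RateLimitError), might look like this:

import time
import openai

def embed_batch_with_retry(embedder: BatchEmbedder, texts: List[str], max_retries: int = 5) -> List[List[float]]:
    """Call embed_batch, backing off exponentially when rate limited."""
    for attempt in range(max_retries):
        try:
            return embedder.embed_batch(texts)
        except openai.error.RateLimitError:
            wait = 2 ** attempt  # 1s, 2s, 4s, ...
            print(f"Rate limited, retrying in {wait}s...")
            time.sleep(wait)
    raise RuntimeError("Embedding request kept hitting the rate limit")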
Building a Simple Semantic Search
from dataclasses import dataclass
from typing import List, Optional
import json

@dataclass
class SearchResult:
    """A search result."""
    document: dict
    score: float
    rank: int

class SimpleSemanticSearch:
    """Simple in-memory semantic search."""

    def __init__(self, deployment: str = "text-embedding-ada-002"):
        self.deployment = deployment
        self.documents: List[dict] = []
        self.embeddings: List[List[float]] = []

    def add_documents(self, documents: List[dict], text_field: str = "text"):
        """Add documents to the index."""
        embedder = BatchEmbedder(self.deployment)
        texts = [doc.get(text_field, "") for doc in documents]
        embeddings = embedder.embed_all(texts, show_progress=False)
        self.documents.extend(documents)
        self.embeddings.extend(embeddings)

    def search(
        self,
        query: str,
        top_k: int = 5,
        threshold: Optional[float] = None
    ) -> List[SearchResult]:
        """Search for similar documents."""
        query_embedding = get_embedding(query, self.deployment)

        # Calculate similarities
        scores = []
        for i, doc_emb in enumerate(self.embeddings):
            score = cosine_similarity(query_embedding, doc_emb)
            scores.append((i, score))

        # Sort by score
        scores.sort(key=lambda x: x[1], reverse=True)

        # Filter and limit
        results = []
        for rank, (idx, score) in enumerate(scores[:top_k], 1):
            if threshold is not None and score < threshold:
                break  # scores are sorted, so everything after this is below the threshold too
            results.append(SearchResult(
                document=self.documents[idx],
                score=score,
                rank=rank
            ))
        return results

    def save_index(self, filepath: str):
        """Save index to file."""
        data = {
            "documents": self.documents,
            "embeddings": self.embeddings
        }
        with open(filepath, 'w') as f:
            json.dump(data, f)

    def load_index(self, filepath: str):
        """Load index from file."""
        with open(filepath, 'r') as f:
            data = json.load(f)
        self.documents = data["documents"]
        self.embeddings = data["embeddings"]

# Usage
search = SimpleSemanticSearch()

# Add documents
docs = [
    {"id": 1, "title": "VM Guide", "text": "Azure Virtual Machines are IaaS compute resources"},
    {"id": 2, "title": "Functions Guide", "text": "Azure Functions lets you run code without servers"},
    {"id": 3, "title": "Cosmos DB Guide", "text": "Cosmos DB is a NoSQL database with global distribution"},
    {"id": 4, "title": "SQL Guide", "text": "Azure SQL Database is a managed relational database"},
    {"id": 5, "title": "Blob Storage", "text": "Azure Blob Storage stores unstructured data objects"}
]
search.add_documents(docs)

# Search
results = search.search("serverless computing", top_k=3)
for r in results:
    print(f"{r.rank}. {r.document['title']} (score: {r.score:.4f})")
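The Python loop in search works fine for a handful of documents. Because the cosine similarity of unit-length vectors is just a dot product, a larger in-memory index can score every document with a single NumPy matrix-vector product. Here is a sketch of that idea (a standalone helper, not part of the class above):

import numpy as np

def search_vectorized(index: SimpleSemanticSearch, query: str, top_k: int = 5):
    """Score all indexed documents at once with a matrix-vector product."""
    matrix = np.array(index.embeddings)                              # shape: (n_docs, 1536)
    matrix = matrix / np.linalg.norm(matrix, axis=1, keepdims=True)  # normalize each row
    q = np.array(get_embedding(query, index.deployment))
    q = q / np.linalg.norm(q)
    scores = matrix @ q                                              # cosine similarity per document
    top = np.argsort(scores)[::-1][:top_k]                           # indices of the best matches
    return [(index.documents[i], float(scores[i])) for i in top]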
Embedding Use Cases
# 1. Document deduplication
def find_duplicates(documents: List[str], threshold: float = 0.95) -> List[Tuple[int, int]]:
    """Find near-duplicate documents."""
    embeddings = [get_embedding(doc) for doc in documents]
    duplicates = []
    for i in range(len(embeddings)):
        for j in range(i + 1, len(embeddings)):
            sim = cosine_similarity(embeddings[i], embeddings[j])
            if sim >= threshold:
                duplicates.append((i, j))
    return duplicates

# 2. Text clustering
from sklearn.cluster import KMeans

def cluster_documents(documents: List[str], n_clusters: int = 5) -> List[int]:
    """Cluster documents by semantic similarity."""
    embeddings = [get_embedding(doc) for doc in documents]
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    return labels.tolist()

# 3. Anomaly detection
def find_outliers(documents: List[str], threshold: float = 0.5) -> List[int]:
    """Find documents that are outliers (dissimilar to others)."""
    embeddings = [get_embedding(doc) for doc in documents]

    # Calculate average similarity to all other documents
    outlier_indices = []
    for i, emb in enumerate(embeddings):
        similarities = [
            cosine_similarity(emb, other)
            for j, other in enumerate(embeddings)
            if i != j
        ]
        avg_sim = np.mean(similarities)
        if avg_sim < threshold:
            outlier_indices.append(i)
    return outlier_indices
Best Practices
- Cache embeddings: Embedding generation is slow and costs tokens
- Batch requests: Process multiple texts in one API call
- Normalize vectors: ada-002 embeddings are already normalized to unit length, so a plain dot product gives the same ranking as cosine similarity at lower cost
- Choose the right model: ada-002 balances quality and cost
- Handle long text: Chunk or summarize texts that exceed the 8,191-token input limit (see the chunking sketch after this list)
- Monitor costs: Track token usage for embedding calls
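For the "handle long text" point above, here is a minimal chunking sketch, assuming the tiktoken package and the cl100k_base encoding used by text-embedding-ada-002:

import tiktoken
from typing import List

def chunk_text(text: str, max_tokens: int = 8000, overlap: int = 200) -> List[str]:
    """Split text into token-bounded chunks with a small overlap between neighbours."""
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunks.append(enc.decode(tokens[start:end]))
        if end == len(tokens):
            break
        start = end - overlap  # overlap keeps context across chunk boundaries
    return chunks

# Each chunk is embedded separately; results can be stored per-chunk or averaged per-document.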