Introduction to Vector Databases for AI Applications
As AI applications scale, storing and searching millions of embeddings efficiently becomes critical. Vector databases are purpose-built for this challenge. Let’s explore what they are and when to use them.
Why Vector Databases?
Traditional databases are optimized for exact matches and range queries. Vector search requires finding approximate nearest neighbors in high-dimensional space, a fundamentally different problem.
# The challenge: find similar vectors among millions
import numpy as np

# 1 million documents, 1536 dimensions each
embeddings = np.random.rand(1_000_000, 1536)
query = np.random.rand(1536)

# Brute force: O(n) per query - too slow for production
def brute_force_search(query, embeddings, top_k=10):
    # Dot-product scores (equivalent to cosine similarity for normalized vectors)
    similarities = np.dot(embeddings, query)
    # Partial sort to find the top_k scores, then order them descending
    top_indices = np.argpartition(similarities, -top_k)[-top_k:]
    return top_indices[np.argsort(similarities[top_indices])[::-1]]

# Vector databases instead use approximate nearest neighbor (ANN) indexes,
# which answer queries in sublinear time (e.g., roughly O(log n))
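For contrast, here is what an approximate index looks like in practice. This is a minimal sketch using the open-source hnswlib library; the corpus size and the tuning parameters (M, ef_construction, ef) are illustrative, not recommendations:

import hnswlib
import numpy as np

# Smaller corpus so the demo builds quickly
corpus = np.random.rand(100_000, 1536).astype(np.float32)

index = hnswlib.Index(space="cosine", dim=1536)
index.init_index(max_elements=100_000, ef_construction=200, M=16)
index.add_items(corpus, np.arange(100_000))
index.set_ef(50)  # search-time accuracy/speed trade-off

query = np.random.rand(1536).astype(np.float32)
labels, distances = index.knn_query(query, k=10)  # approximate top-10 neighbors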
Vector Database Landscape
| Database | Type | Best For | Managed Option |
|---|---|---|---|
| Pinecone | Managed SaaS | Simplicity, scale | Yes (only) |
| Weaviate | Open source | Flexibility, self-host | Weaviate Cloud |
| Milvus | Open source | High performance | Zilliz Cloud |
| Qdrant | Open source | Rust performance | Qdrant Cloud |
| Azure Cognitive Search | Azure native | Azure integration | Yes |
| pgvector | PostgreSQL ext | Existing Postgres | Any Postgres |
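As a concrete taste of the last row, pgvector exposes vector search through plain SQL. A rough sketch using psycopg and the pgvector Python package; the connection string and the items table are hypothetical:

import numpy as np
import psycopg
from pgvector.psycopg import register_vector

# Assumes a table like: CREATE TABLE items (id bigint, embedding vector(1536))
conn = psycopg.connect("dbname=app")  # illustrative connection string
register_vector(conn)

query_vec = np.random.rand(1536)
rows = conn.execute(
    "SELECT id FROM items ORDER BY embedding <=> %s LIMIT 10",  # <=> is cosine distance
    (query_vec,),
).fetchall()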
Key Concepts
Indexing Algorithms
# HNSW (Hierarchical Navigable Small World)
# - Fast search: O(log n)
# - High accuracy
# - Memory intensive
# IVF (Inverted File Index)
# - Clusters vectors, searches relevant clusters
# - Good for large datasets
# - Trade-off between speed and accuracy
# Product Quantization
# - Compresses vectors
# - Reduces memory usage
# - Slight accuracy loss
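These techniques compose. FAISS, for instance, can layer product quantization on top of an IVF index; here is a small sketch with illustrative parameter values:

import faiss
import numpy as np

d = 128  # dimension, kept small for the demo
xb = np.random.rand(100_000, d).astype(np.float32)

quantizer = faiss.IndexFlatL2(d)  # coarse quantizer used for clustering
# 256 clusters; each vector compressed to 16 sub-vectors of 8 bits
index = faiss.IndexIVFPQ(quantizer, d, 256, 16, 8)
index.train(xb)  # learn cluster centroids and PQ codebooks
index.add(xb)

index.nprobe = 8  # clusters searched per query: the speed vs. accuracy knob
distances, ids = index.search(np.random.rand(1, d).astype(np.float32), 10)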
Distance Metrics
import numpy as np

def cosine_similarity(a, b):
    """Best for semantic similarity (normalized vectors)."""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def euclidean_distance(a, b):
    """Best for actual distance in embedding space."""
    return np.linalg.norm(a - b)

def dot_product(a, b):
    """Best for pre-normalized vectors; fastest."""
    return np.dot(a, b)

# Choose based on your embeddings
# OpenAI embeddings: cosine or dot product (already normalized)
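A quick sanity check on that last comment: for unit-length vectors, cosine similarity and dot product return the same value, so the cheaper dot product suffices:

a = np.random.rand(1536)
b = np.random.rand(1536)
a /= np.linalg.norm(a)  # normalize to unit length
b /= np.linalg.norm(b)
assert np.isclose(cosine_similarity(a, b), dot_product(a, b))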
Generic Vector DB Interface
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Dict, Any, Optional

@dataclass
class VectorDocument:
    """A document with embedding."""
    id: str
    embedding: List[float]
    metadata: Dict[str, Any]
    text: Optional[str] = None

@dataclass
class VectorSearchResult:
    """Search result from vector database."""
    id: str
    score: float
    metadata: Dict[str, Any]
    text: Optional[str] = None

class VectorDBInterface(ABC):
    """Abstract interface for vector databases."""

    @abstractmethod
    def create_collection(
        self,
        name: str,
        dimension: int,
        metric: str = "cosine"
    ) -> bool:
        """Create a new collection/index."""
        pass

    @abstractmethod
    def insert(
        self,
        collection: str,
        documents: List[VectorDocument]
    ) -> List[str]:
        """Insert documents into collection."""
        pass

    @abstractmethod
    def search(
        self,
        collection: str,
        query_vector: List[float],
        top_k: int = 10,
        filters: Optional[Dict] = None
    ) -> List[VectorSearchResult]:
        """Search for similar vectors."""
        pass

    @abstractmethod
    def delete(
        self,
        collection: str,
        ids: List[str]
    ) -> bool:
        """Delete documents by ID."""
        pass

    @abstractmethod
    def update(
        self,
        collection: str,
        document: VectorDocument
    ) -> bool:
        """Update a document."""
        pass
In-Memory Vector Store (For Development)
import numpy as np
from typing import List, Dict, Optional
import heapq

class InMemoryVectorStore(VectorDBInterface):
    """Simple in-memory vector store for development."""

    def __init__(self):
        self.collections: Dict[str, Dict] = {}

    def create_collection(
        self,
        name: str,
        dimension: int,
        metric: str = "cosine"
    ) -> bool:
        self.collections[name] = {
            "dimension": dimension,
            "metric": metric,
            "documents": {}
        }
        return True

    def insert(
        self,
        collection: str,
        documents: List[VectorDocument]
    ) -> List[str]:
        if collection not in self.collections:
            raise ValueError(f"Collection {collection} not found")
        coll = self.collections[collection]
        ids = []
        for doc in documents:
            coll["documents"][doc.id] = {
                "embedding": doc.embedding,
                "metadata": doc.metadata,
                "text": doc.text
            }
            ids.append(doc.id)
        return ids

    def search(
        self,
        collection: str,
        query_vector: List[float],
        top_k: int = 10,
        filters: Optional[Dict] = None
    ) -> List[VectorSearchResult]:
        if collection not in self.collections:
            raise ValueError(f"Collection {collection} not found")
        coll = self.collections[collection]
        query = np.array(query_vector)
        # Score every document (exact brute force - fine for development)
        scores = []
        for doc_id, doc_data in coll["documents"].items():
            # Skip documents that fail the metadata filters
            if filters:
                skip = False
                for key, value in filters.items():
                    if doc_data["metadata"].get(key) != value:
                        skip = True
                        break
                if skip:
                    continue
            embedding = np.array(doc_data["embedding"])
            if coll["metric"] == "cosine":
                score = np.dot(query, embedding) / (
                    np.linalg.norm(query) * np.linalg.norm(embedding)
                )
            elif coll["metric"] == "euclidean":
                # Negate the distance so that a larger score is always better
                score = -np.linalg.norm(query - embedding)
            else:
                score = np.dot(query, embedding)
            scores.append((score, doc_id, doc_data))
        # Keep the top_k highest-scoring documents
        top_results = heapq.nlargest(top_k, scores, key=lambda x: x[0])
        return [
            VectorSearchResult(
                id=doc_id,
                score=score,
                metadata=doc_data["metadata"],
                text=doc_data.get("text")
            )
            for score, doc_id, doc_data in top_results
        ]

    def delete(self, collection: str, ids: List[str]) -> bool:
        if collection not in self.collections:
            return False
        for doc_id in ids:
            self.collections[collection]["documents"].pop(doc_id, None)
        return True

    def update(self, collection: str, document: VectorDocument) -> bool:
        if collection not in self.collections:
            return False
        self.collections[collection]["documents"][document.id] = {
            "embedding": document.embedding,
            "metadata": document.metadata,
            "text": document.text
        }
        return True
# Usage
store = InMemoryVectorStore()
store.create_collection("documents", dimension=1536, metric="cosine")

# Insert documents
docs = [
    VectorDocument(
        id="1",
        embedding=[0.1] * 1536,  # Placeholder - use real embeddings in practice
        metadata={"category": "tech"},
        text="Azure is a cloud platform"
    ),
    VectorDocument(
        id="2",
        embedding=[0.2] * 1536,
        metadata={"category": "tech"},
        text="AWS provides cloud services"
    )
]
store.insert("documents", docs)

# Search
results = store.search(
    "documents",
    query_vector=[0.15] * 1536,
    top_k=5,
    filters={"category": "tech"}
)
Choosing a Vector Database
Decision Factors
def recommend_vector_db(requirements: dict) -> str:
    """Recommend a vector database based on requirements."""
    scale = requirements.get("vectors", 0)
    budget = requirements.get("budget", "medium")
    self_host = requirements.get("self_host", False)
    azure_native = requirements.get("azure_native", False)
    existing_postgres = requirements.get("existing_postgres", False)

    if azure_native:
        return "Azure Cognitive Search"
    if existing_postgres and scale < 1_000_000:
        return "pgvector"
    if not self_host:
        if budget == "low":
            return "Qdrant Cloud (free tier)"
        elif scale > 10_000_000:
            return "Pinecone"
        else:
            return "Weaviate Cloud"
    # Remaining cases are self-hosted
    if scale > 100_000_000:
        return "Milvus"
    return "Qdrant or Weaviate"

# Example
recommendation = recommend_vector_db({
    "vectors": 5_000_000,
    "budget": "medium",
    "self_host": False,
    "azure_native": True
})
print(f"Recommended: {recommendation}")
Performance Characteristics
VECTOR_DB_CHARACTERISTICS = {
    "pinecone": {
        "query_latency_ms": "10-50",
        "indexing_speed": "Fast",
        "max_vectors": "Billions",
        "filtering": "Yes",
        "hybrid_search": "Yes",
        "deployment": "Managed only"
    },
    "weaviate": {
        "query_latency_ms": "10-100",
        "indexing_speed": "Medium",
        "max_vectors": "Millions",
        "filtering": "Yes",
        "hybrid_search": "Yes",
        "deployment": "Self-host or Cloud"
    },
    "milvus": {
        "query_latency_ms": "1-10",
        "indexing_speed": "Fast",
        "max_vectors": "Billions",
        "filtering": "Yes",
        "hybrid_search": "Limited",
        "deployment": "Self-host or Zilliz"
    },
    "qdrant": {
        "query_latency_ms": "5-50",
        "indexing_speed": "Fast",
        "max_vectors": "Hundreds of millions",
        "filtering": "Yes",
        "hybrid_search": "Yes",
        "deployment": "Self-host or Cloud"
    },
    "azure_cognitive_search": {
        "query_latency_ms": "50-200",
        "indexing_speed": "Medium",
        "max_vectors": "Millions",
        "filtering": "Yes",
        "hybrid_search": "Yes",
        "deployment": "Azure managed"
    }
}
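This table is also handy programmatically, for example to shortlist the options that support hybrid search:

hybrid_capable = [
    name for name, traits in VECTOR_DB_CHARACTERISTICS.items()
    if traits["hybrid_search"] == "Yes"
]
print(hybrid_capable)  # ['pinecone', 'weaviate', 'qdrant', 'azure_cognitive_search']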
Best Practices
- Start simple: Use in-memory or pgvector for prototypes
- Choose based on scale: Different DBs for different volumes
- Consider hybrid search: Combine vector + keyword results (see the RRF sketch after this list)
- Plan for filtering: Most queries need metadata filters
- Monitor performance: Track latency and recall
- Design for updates: Some databases handle frequent updates and deletes better than others
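On the hybrid search point, a common way to merge vector and keyword result lists is reciprocal rank fusion (RRF). A minimal sketch, assuming you already have two ranked lists of document IDs (the inputs here are made up):

from collections import defaultdict
from typing import List

def reciprocal_rank_fusion(rankings: List[List[str]], k: int = 60) -> List[str]:
    """Fuse ranked ID lists; k=60 is the conventional RRF constant."""
    scores = defaultdict(float)
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] += 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

vector_hits = ["doc3", "doc1", "doc7"]   # from vector search
keyword_hits = ["doc1", "doc9", "doc3"]  # from keyword/BM25 search
print(reciprocal_rank_fusion([vector_hits, keyword_hits]))  # doc1 and doc3 rise to the top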