
Introduction to Vector Databases for AI Applications

As AI applications scale, storing and searching millions of embeddings efficiently becomes critical. Vector databases are purpose-built for this challenge. Let’s explore what they are and when to use them.

Why Vector Databases?

Traditional databases are optimized for exact matches and range queries. Vector search requires finding approximate nearest neighbors in high-dimensional space, which is a fundamentally different problem.

# The challenge: Find similar vectors among millions
import numpy as np

# 1 million documents, 1536 dimensions each
# (~12 GB as float64; scale n down to run this locally)
embeddings = np.random.rand(1_000_000, 1536)
query = np.random.rand(1536)

# Brute force: O(n) dot products per query, too slow at production scale
def brute_force_search(query, embeddings, top_k=10):
    similarities = np.dot(embeddings, query)
    top_indices = np.argpartition(similarities, -top_k)[-top_k:]
    return top_indices[np.argsort(similarities[top_indices])[::-1]]

# Vector databases instead use approximate nearest neighbor (ANN)
# indexes, which answer queries in sublinear time (roughly O(log n)
# for HNSW) at the cost of slightly imperfect recall
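To make that trade-off concrete, here is what the same search looks like with an HNSW index. This is a minimal sketch assuming the hnswlib package (pip install hnswlib); the parameter values are illustrative, not tuned.

import hnswlib
import numpy as np

dim = 1536
n = 100_000  # smaller than above so the index builds quickly

embeddings = np.random.rand(n, dim).astype(np.float32)

# Build the HNSW graph; M and ef_construction trade build time
# and memory for recall
index = hnswlib.Index(space="cosine", dim=dim)
index.init_index(max_elements=n, ef_construction=200, M=16)
index.add_items(embeddings, np.arange(n))

# ef controls the search-time speed/recall trade-off
index.set_ef(50)
labels, distances = index.knn_query(np.random.rand(dim).astype(np.float32), k=10)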

Vector Database Landscape

| Database | Type | Best For | Managed Option |
| --- | --- | --- | --- |
| Pinecone | Managed SaaS | Simplicity, scale | Yes (only) |
| Weaviate | Open source | Flexibility, self-host | Weaviate Cloud |
| Milvus | Open source | High performance | Zilliz Cloud |
| Qdrant | Open source | Rust performance | Qdrant Cloud |
| Azure Cognitive Search | Azure native | Azure integration | Yes |
| pgvector | PostgreSQL extension | Existing Postgres | Any Postgres |
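If you already run Postgres, pgvector is usually the lowest-friction starting point. A minimal sketch, assuming the pgvector extension is available and psycopg2 is installed; the connection string, table, and column names here are illustrative.

import psycopg2

conn = psycopg2.connect("dbname=mydb user=me")  # hypothetical connection string
cur = conn.cursor()

# One-time setup: enable the extension, create a table with a vector column
cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
cur.execute(
    "CREATE TABLE IF NOT EXISTS items (id bigserial PRIMARY KEY, embedding vector(1536))"
)

# pgvector accepts vectors as '[v1,v2,...]' text literals
vec = "[" + ",".join(str(x) for x in [0.1] * 1536) + "]"
cur.execute("INSERT INTO items (embedding) VALUES (%s::vector)", (vec,))

# Nearest neighbors by cosine distance (the <=> operator)
cur.execute("SELECT id FROM items ORDER BY embedding <=> %s::vector LIMIT 5", (vec,))
print(cur.fetchall())
conn.commit()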

Key Concepts

Indexing Algorithms

# HNSW (Hierarchical Navigable Small World)
# - Fast search: O(log n)
# - High accuracy
# - Memory intensive

# IVF (Inverted File Index)
# - Clusters vectors, searches relevant clusters
# - Good for large datasets
# - Trade-off between speed and accuracy

# Product Quantization
# - Compresses vectors
# - Reduces memory usage
# - Slight accuracy loss
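As a rough sketch of the IVF family in practice, here is the pattern with FAISS (assuming the faiss-cpu package; the nlist and nprobe values are illustrative):

import faiss
import numpy as np

d = 128  # kept small so the example trains quickly
xb = np.random.rand(100_000, d).astype(np.float32)

# IVF: cluster the vectors into nlist cells, then search only nprobe cells
nlist = 100
quantizer = faiss.IndexFlatL2(d)        # coarse quantizer used for clustering
index = faiss.IndexIVFFlat(quantizer, d, nlist)
index.train(xb)                         # learn the cluster centroids
index.add(xb)

index.nprobe = 10                       # more cells probed = better recall, slower
distances, ids = index.search(np.random.rand(1, d).astype(np.float32), 10)
print(ids[0])                           # indices of the 10 approximate neighbors

# Swapping IndexIVFFlat for IndexIVFPQ adds product quantization on top,
# compressing the stored vectors at a small cost in accuracy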

Distance Metrics

import numpy as np

def cosine_similarity(a, b):
    """Best for semantic similarity (normalized vectors)."""
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def euclidean_distance(a, b):
    """Best for actual distance in embedding space."""
    return np.linalg.norm(a - b)

def dot_product(a, b):
    """Best for pre-normalized vectors, fastest."""
    return np.dot(a, b)

# Choose based on your embeddings
# OpenAI embeddings: cosine or dot product (already normalized)
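A quick sanity check on that last point: for unit-length vectors, cosine similarity and the dot product coincide, so the cheaper metric is safe to use. Continuing with the functions above on synthetic vectors:

a = np.random.rand(1536)
b = np.random.rand(1536)

# Normalize to unit length, as OpenAI embeddings already are
a = a / np.linalg.norm(a)
b = b / np.linalg.norm(b)

assert np.isclose(cosine_similarity(a, b), dot_product(a, b))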

Generic Vector DB Interface

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Dict, Any, Optional

@dataclass
class VectorDocument:
    """A document with embedding."""
    id: str
    embedding: List[float]
    metadata: Dict[str, Any]
    text: Optional[str] = None

@dataclass
class VectorSearchResult:
    """Search result from vector database."""
    id: str
    score: float
    metadata: Dict[str, Any]
    text: Optional[str] = None

class VectorDBInterface(ABC):
    """Abstract interface for vector databases."""

    @abstractmethod
    def create_collection(
        self,
        name: str,
        dimension: int,
        metric: str = "cosine"
    ) -> bool:
        """Create a new collection/index."""
        pass

    @abstractmethod
    def insert(
        self,
        collection: str,
        documents: List[VectorDocument]
    ) -> List[str]:
        """Insert documents into collection."""
        pass

    @abstractmethod
    def search(
        self,
        collection: str,
        query_vector: List[float],
        top_k: int = 10,
        filters: Optional[Dict] = None
    ) -> List[VectorSearchResult]:
        """Search for similar vectors."""
        pass

    @abstractmethod
    def delete(
        self,
        collection: str,
        ids: List[str]
    ) -> bool:
        """Delete documents by ID."""
        pass

    @abstractmethod
    def update(
        self,
        collection: str,
        document: VectorDocument
    ) -> bool:
        """Update a document."""
        pass
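For comparison with a real client, here is roughly how those operations map onto Qdrant. This is a sketch assuming the qdrant-client package; method names may differ slightly between client versions.

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams

client = QdrantClient(":memory:")  # in-process mode, handy for local testing

client.create_collection(
    collection_name="documents",
    vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)

client.upsert(
    collection_name="documents",
    points=[PointStruct(id=1, vector=[0.1] * 1536, payload={"category": "tech"})],
)

hits = client.search(
    collection_name="documents",
    query_vector=[0.15] * 1536,
    limit=5,
)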

In-Memory Vector Store (For Development)

import numpy as np
from typing import List, Dict, Optional
import heapq

class InMemoryVectorStore(VectorDBInterface):
    """Simple in-memory vector store for development."""

    def __init__(self):
        self.collections: Dict[str, Dict] = {}

    def create_collection(
        self,
        name: str,
        dimension: int,
        metric: str = "cosine"
    ) -> bool:
        self.collections[name] = {
            "dimension": dimension,
            "metric": metric,
            "documents": {}
        }
        return True

    def insert(
        self,
        collection: str,
        documents: List[VectorDocument]
    ) -> List[str]:
        if collection not in self.collections:
            raise ValueError(f"Collection {collection} not found")

        coll = self.collections[collection]
        ids = []

        for doc in documents:
            # Reject embeddings that don't match the collection's dimension
            if len(doc.embedding) != coll["dimension"]:
                raise ValueError(
                    f"Expected dimension {coll['dimension']}, "
                    f"got {len(doc.embedding)} for document {doc.id}"
                )
            coll["documents"][doc.id] = {
                "embedding": doc.embedding,
                "metadata": doc.metadata,
                "text": doc.text
            }
            ids.append(doc.id)

        return ids

    def search(
        self,
        collection: str,
        query_vector: List[float],
        top_k: int = 10,
        filters: Optional[Dict] = None
    ) -> List[VectorSearchResult]:
        if collection not in self.collections:
            raise ValueError(f"Collection {collection} not found")

        coll = self.collections[collection]
        query = np.array(query_vector)

        # Calculate scores
        scores = []
        for doc_id, doc_data in coll["documents"].items():
            # Apply filters
            if filters:
                skip = False
                for key, value in filters.items():
                    if doc_data["metadata"].get(key) != value:
                        skip = True
                        break
                if skip:
                    continue

            embedding = np.array(doc_data["embedding"])

            if coll["metric"] == "cosine":
                score = np.dot(query, embedding) / (
                    np.linalg.norm(query) * np.linalg.norm(embedding)
                )
            elif coll["metric"] == "euclidean":
                score = -np.linalg.norm(query - embedding)
            else:
                score = np.dot(query, embedding)

            scores.append((score, doc_id, doc_data))

        # Get top k
        top_results = heapq.nlargest(top_k, scores, key=lambda x: x[0])

        return [
            VectorSearchResult(
                id=doc_id,
                score=score,
                metadata=doc_data["metadata"],
                text=doc_data.get("text")
            )
            for score, doc_id, doc_data in top_results
        ]

    def delete(self, collection: str, ids: List[str]) -> bool:
        if collection not in self.collections:
            return False

        for doc_id in ids:
            self.collections[collection]["documents"].pop(doc_id, None)
        return True

    def update(self, collection: str, document: VectorDocument) -> bool:
        if collection not in self.collections:
            return False

        self.collections[collection]["documents"][document.id] = {
            "embedding": document.embedding,
            "metadata": document.metadata,
            "text": document.text
        }
        return True

# Usage
store = InMemoryVectorStore()
store.create_collection("documents", dimension=1536, metric="cosine")

# Insert documents
docs = [
    VectorDocument(
        id="1",
        embedding=[0.1] * 1536,  # Placeholder
        metadata={"category": "tech"},
        text="Azure is a cloud platform"
    ),
    VectorDocument(
        id="2",
        embedding=[0.2] * 1536,
        metadata={"category": "tech"},
        text="AWS provides cloud services"
    )
]
store.insert("documents", docs)

# Search
results = store.search(
    "documents",
    query_vector=[0.15] * 1536,
    top_k=5,
    filters={"category": "tech"}
)

Choosing a Vector Database

Decision Factors

def recommend_vector_db(requirements: dict) -> str:
    """Recommend a vector database based on requirements."""

    scale = requirements.get("vectors", 0)
    budget = requirements.get("budget", "medium")
    self_host = requirements.get("self_host", False)
    azure_native = requirements.get("azure_native", False)
    existing_postgres = requirements.get("existing_postgres", False)

    if azure_native:
        return "Azure Cognitive Search"

    if existing_postgres and scale < 1_000_000:
        return "pgvector"

    if not self_host:
        if budget == "low":
            return "Qdrant Cloud (free tier)"
        if scale > 10_000_000:
            return "Pinecone"
        return "Weaviate Cloud"

    # Self-hosted deployments
    if scale > 100_000_000:
        return "Milvus"
    return "Qdrant or Weaviate"

# Example
recommendation = recommend_vector_db({
    "vectors": 5_000_000,
    "budget": "medium",
    "self_host": False,
    "azure_native": True
})
print(f"Recommended: {recommendation}")

Performance Characteristics

VECTOR_DB_CHARACTERISTICS = {
    "pinecone": {
        "query_latency_ms": "10-50",
        "indexing_speed": "Fast",
        "max_vectors": "Billions",
        "filtering": "Yes",
        "hybrid_search": "Yes",
        "deployment": "Managed only"
    },
    "weaviate": {
        "query_latency_ms": "10-100",
        "indexing_speed": "Medium",
        "max_vectors": "Millions",
        "filtering": "Yes",
        "hybrid_search": "Yes",
        "deployment": "Self-host or Cloud"
    },
    "milvus": {
        "query_latency_ms": "1-10",
        "indexing_speed": "Fast",
        "max_vectors": "Billions",
        "filtering": "Yes",
        "hybrid_search": "Limited",
        "deployment": "Self-host or Zilliz"
    },
    "qdrant": {
        "query_latency_ms": "5-50",
        "indexing_speed": "Fast",
        "max_vectors": "Hundreds of millions",
        "filtering": "Yes",
        "hybrid_search": "Yes",
        "deployment": "Self-host or Cloud"
    },
    "azure_cognitive_search": {
        "query_latency_ms": "50-200",
        "indexing_speed": "Medium",
        "max_vectors": "Millions",
        "filtering": "Yes",
        "hybrid_search": "Yes",
        "deployment": "Azure managed"
    }
}

Best Practices

  1. Start simple: Use in-memory or pgvector for prototypes
  2. Choose based on scale: Different DBs for different volumes
  3. Consider hybrid search: Combine vector + keyword (see the sketch after this list)
  4. Plan for filtering: Most queries need metadata filters
  5. Monitor performance: Track latency and recall
  6. Design for updates: Some DBs handle updates better
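For point 3, a common way to combine vector and keyword results is reciprocal rank fusion (RRF): each document scores 1 / (k + rank) in every ranked list it appears in, and the summed scores decide the final order. A minimal sketch (k=60 is the commonly used constant):

def reciprocal_rank_fusion(result_lists, k=60):
    """Fuse ranked lists of document IDs into a single ranking."""
    scores = {}
    for results in result_lists:
        for rank, doc_id in enumerate(results, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)

# Example: fuse a vector-search ranking with a keyword (BM25) ranking
vector_hits = ["doc2", "doc1", "doc3"]
keyword_hits = ["doc1", "doc4", "doc2"]
print(reciprocal_rank_fusion([vector_hits, keyword_hits]))
# doc1 and doc2 come out on top because both lists rank them highly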


Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.