Back to Blog
3 min read

PostgreSQL with pgvector: Open Source Vector Search

pgvector brings vector search capabilities to PostgreSQL, enabling AI features in existing database infrastructure.

pgvector Implementation

import json
import re
from typing import List, Optional, Tuple

import numpy as np
import psycopg2
from psycopg2.extras import execute_values

class PgVectorStore:
    """PostgreSQL-backed vector store built on the pgvector extension.

    Wraps a single psycopg2 connection and provides table setup, bulk
    insertion of (content, embedding, metadata) rows, cosine-similarity
    search, and hybrid (vector + full-text) search.
    """

    # Identifiers (table names, JSONB keys) cannot be bound as query
    # parameters, so they are interpolated into SQL text. Restrict them
    # to a safe character set to prevent SQL injection.
    _IDENTIFIER_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")

    def __init__(self, connection_string: str):
        """Connect to PostgreSQL and ensure the vector extension exists.

        Args:
            connection_string: libpq-style DSN passed to psycopg2.connect.
        """
        self.conn = psycopg2.connect(connection_string)
        self.setup_extension()

    @classmethod
    def _check_identifier(cls, name: str) -> str:
        """Validate *name* as a safe SQL identifier and return it.

        Raises:
            ValueError: if *name* contains anything outside
                ``[A-Za-z0-9_]`` (it is interpolated into SQL text and
                cannot be parameterized).
        """
        if not cls._IDENTIFIER_RE.match(name):
            raise ValueError(f"Unsafe SQL identifier: {name!r}")
        return name

    def close(self):
        """Close the underlying database connection."""
        self.conn.close()

    def setup_extension(self):
        """Enable the pgvector extension (idempotent)."""
        with self.conn.cursor() as cur:
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
            self.conn.commit()

    def create_table(self, table_name: str, dimensions: int = 1536):
        """Create a document table with a vector column plus an HNSW index.

        Args:
            table_name: Target table (validated as a plain identifier).
            dimensions: Embedding dimensionality (default matches
                OpenAI text-embedding models).
        """
        table_name = self._check_identifier(table_name)
        with self.conn.cursor() as cur:
            cur.execute(f"""
                CREATE TABLE IF NOT EXISTS {table_name} (
                    id SERIAL PRIMARY KEY,
                    content TEXT NOT NULL,
                    embedding vector({int(dimensions)}),
                    metadata JSONB,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)

            # HNSW index for fast approximate cosine-distance search;
            # m / ef_construction trade build time for recall.
            cur.execute(f"""
                CREATE INDEX IF NOT EXISTS {table_name}_embedding_idx
                ON {table_name}
                USING hnsw (embedding vector_cosine_ops)
                WITH (m = 16, ef_construction = 64)
            """)

            self.conn.commit()

    def insert_documents(self, table_name: str, documents: List[Tuple[str, List[float], dict]]):
        """Bulk insert documents with embeddings.

        Args:
            table_name: Target table (validated as a plain identifier).
            documents: Sequence of (content, embedding, metadata) tuples;
                metadata dicts are serialized to JSONB.
        """
        table_name = self._check_identifier(table_name)
        with self.conn.cursor() as cur:
            # execute_values batches all rows into a single INSERT.
            execute_values(
                cur,
                f"""
                INSERT INTO {table_name} (content, embedding, metadata)
                VALUES %s
                """,
                [(content, embedding, json.dumps(metadata))
                 for content, embedding, metadata in documents],
                template="(%s, %s::vector, %s::jsonb)"
            )
            self.conn.commit()

    def similarity_search(
        self,
        table_name: str,
        query_embedding: List[float],
        top_k: int = 10,
        filter_metadata: Optional[dict] = None
    ) -> List[dict]:
        """Return the *top_k* rows nearest to *query_embedding*.

        Args:
            table_name: Table to search (validated identifier).
            query_embedding: Query vector; compared with the ``<=>``
                cosine-distance operator.
            top_k: Maximum number of rows to return.
            filter_metadata: Optional exact-match filters applied to
                top-level JSONB metadata keys (keys are validated as
                identifiers; values are bound as parameters).

        Returns:
            Dicts with ``id``, ``content``, ``metadata`` and
            ``similarity`` (1 - cosine distance, higher is closer).
        """
        table_name = self._check_identifier(table_name)
        with self.conn.cursor() as cur:
            query = f"""
                SELECT
                    id,
                    content,
                    metadata,
                    1 - (embedding <=> %s::vector) AS similarity
                FROM {table_name}
            """

            params = [query_embedding]

            if filter_metadata:
                conditions = []
                for key, value in filter_metadata.items():
                    # JSONB keys become part of the SQL text; validate them.
                    self._check_identifier(key)
                    conditions.append(f"metadata->>'{key}' = %s")
                    params.append(value)
                query += " WHERE " + " AND ".join(conditions)

            # ORDER BY on the raw distance so the HNSW index is used.
            query += """
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """
            params.extend([query_embedding, top_k])

            cur.execute(query, params)

            return [
                {
                    "id": row[0],
                    "content": row[1],
                    "metadata": row[2],
                    "similarity": row[3],
                }
                for row in cur.fetchall()
            ]

    def hybrid_search(
        self,
        table_name: str,
        text_query: str,
        query_embedding: List[float],
        top_k: int = 10
    ) -> List[dict]:
        """Combine full-text and vector search over the same table.

        Each branch over-fetches ``top_k * 2`` candidates, the results
        are merged with a FULL OUTER JOIN, and rows are ranked by the
        sum of the (cosine-similarity, ts_rank) scores.

        Returns:
            Dicts with ``id``, ``content`` and combined ``score``.
        """
        table_name = self._check_identifier(table_name)
        with self.conn.cursor() as cur:
            cur.execute(f"""
                WITH vector_results AS (
                    SELECT id, content, metadata,
                           1 - (embedding <=> %s::vector) AS vector_score
                    FROM {table_name}
                    ORDER BY embedding <=> %s::vector
                    LIMIT %s
                ),
                text_results AS (
                    SELECT id, content, metadata,
                           ts_rank(to_tsvector('english', content),
                                  plainto_tsquery('english', %s)) AS text_score
                    FROM {table_name}
                    WHERE to_tsvector('english', content) @@
                          plainto_tsquery('english', %s)
                    LIMIT %s
                )
                SELECT COALESCE(v.id, t.id) AS id,
                       COALESCE(v.content, t.content) AS content,
                       COALESCE(v.vector_score, 0) + COALESCE(t.text_score, 0) AS score
                FROM vector_results v
                FULL OUTER JOIN text_results t ON v.id = t.id
                ORDER BY score DESC
                LIMIT %s
            """, [query_embedding, query_embedding, top_k * 2,
                  text_query, text_query, top_k * 2, top_k])

            return [{"id": r[0], "content": r[1], "score": r[2]} for r in cur.fetchall()]

pgvector enables AI-ready applications using familiar PostgreSQL infrastructure.

Michael John Peña

Michael John Peña

Senior Data Engineer based in Sydney. Writing about data, cloud, and technology.