mirror of
https://github.com/coleam00/Archon.git
synced 2025-12-24 02:39:17 -05:00
Enhanced the hybrid search strategy with tsvector keyword matching (#539)
This commit is contained in:
@@ -133,6 +133,10 @@ BEGIN
|
||||
DROP FUNCTION IF EXISTS match_archon_crawled_pages(vector, int, jsonb, text) CASCADE;
|
||||
DROP FUNCTION IF EXISTS match_archon_code_examples(vector, int, jsonb, text) CASCADE;
|
||||
|
||||
-- Hybrid search functions (with ts_vector support)
|
||||
DROP FUNCTION IF EXISTS hybrid_search_archon_crawled_pages(vector, text, int, jsonb, text) CASCADE;
|
||||
DROP FUNCTION IF EXISTS hybrid_search_archon_code_examples(vector, text, int, jsonb, text) CASCADE;
|
||||
|
||||
-- Search functions (old without prefix)
|
||||
DROP FUNCTION IF EXISTS match_crawled_pages(vector, int, jsonb, text) CASCADE;
|
||||
DROP FUNCTION IF EXISTS match_code_examples(vector, int, jsonb, text) CASCADE;
|
||||
|
||||
237
migration/add_hybrid_search_tsvector.sql
Normal file
237
migration/add_hybrid_search_tsvector.sql
Normal file
@@ -0,0 +1,237 @@
|
||||
-- =====================================================
|
||||
-- Add Hybrid Search with ts_vector Support
|
||||
-- =====================================================
|
||||
-- This migration adds efficient text search capabilities using PostgreSQL's
|
||||
-- full-text search features (ts_vector) to enable better keyword matching
|
||||
-- in hybrid search operations.
|
||||
-- =====================================================
|
||||
|
||||
-- Enable required extensions (pg_trgm for fuzzy matching)
|
||||
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||||
|
||||
-- =====================================================
|
||||
-- SECTION 1: ADD TEXT SEARCH COLUMNS AND INDEXES
|
||||
-- =====================================================
|
||||
|
||||
-- Add ts_vector columns for full-text search if they don't exist
|
||||
ALTER TABLE archon_crawled_pages
|
||||
ADD COLUMN IF NOT EXISTS content_search_vector tsvector
|
||||
GENERATED ALWAYS AS (to_tsvector('english', content)) STORED;
|
||||
|
||||
ALTER TABLE archon_code_examples
|
||||
ADD COLUMN IF NOT EXISTS content_search_vector tsvector
|
||||
GENERATED ALWAYS AS (to_tsvector('english', content || ' ' || COALESCE(summary, ''))) STORED;
|
||||
|
||||
-- Create GIN indexes for fast text search
|
||||
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_content_search ON archon_crawled_pages USING GIN (content_search_vector);
|
||||
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_content_search ON archon_code_examples USING GIN (content_search_vector);
|
||||
|
||||
-- Create trigram indexes for fuzzy matching (useful for typos and partial matches)
|
||||
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_content_trgm ON archon_crawled_pages USING GIN (content gin_trgm_ops);
|
||||
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_content_trgm ON archon_code_examples USING GIN (content gin_trgm_ops);
|
||||
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_summary_trgm ON archon_code_examples USING GIN (summary gin_trgm_ops);
|
||||
|
||||
-- =====================================================
|
||||
-- SECTION 2: HYBRID SEARCH FUNCTIONS
|
||||
-- =====================================================
|
||||
|
||||
-- Hybrid search function for archon_crawled_pages
|
||||
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages(
|
||||
query_embedding vector(1536),
|
||||
query_text TEXT,
|
||||
match_count INT DEFAULT 10,
|
||||
filter JSONB DEFAULT '{}'::jsonb,
|
||||
source_filter TEXT DEFAULT NULL
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id BIGINT,
|
||||
url VARCHAR,
|
||||
chunk_number INTEGER,
|
||||
content TEXT,
|
||||
metadata JSONB,
|
||||
source_id TEXT,
|
||||
similarity FLOAT,
|
||||
match_type TEXT
|
||||
)
|
||||
LANGUAGE plpgsql
|
||||
AS $$
|
||||
DECLARE
|
||||
max_vector_results INT;
|
||||
max_text_results INT;
|
||||
BEGIN
|
||||
-- Calculate how many results to fetch from each search type
|
||||
max_vector_results := match_count;
|
||||
max_text_results := match_count;
|
||||
|
||||
RETURN QUERY
|
||||
WITH vector_results AS (
|
||||
-- Vector similarity search
|
||||
SELECT
|
||||
cp.id,
|
||||
cp.url,
|
||||
cp.chunk_number,
|
||||
cp.content,
|
||||
cp.metadata,
|
||||
cp.source_id,
|
||||
1 - (cp.embedding <=> query_embedding) AS vector_sim
|
||||
FROM archon_crawled_pages cp
|
||||
WHERE cp.metadata @> filter
|
||||
AND (source_filter IS NULL OR cp.source_id = source_filter)
|
||||
AND cp.embedding IS NOT NULL
|
||||
ORDER BY cp.embedding <=> query_embedding
|
||||
LIMIT max_vector_results
|
||||
),
|
||||
text_results AS (
|
||||
-- Full-text search with ranking
|
||||
SELECT
|
||||
cp.id,
|
||||
cp.url,
|
||||
cp.chunk_number,
|
||||
cp.content,
|
||||
cp.metadata,
|
||||
cp.source_id,
|
||||
ts_rank_cd(cp.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
|
||||
FROM archon_crawled_pages cp
|
||||
WHERE cp.metadata @> filter
|
||||
AND (source_filter IS NULL OR cp.source_id = source_filter)
|
||||
AND cp.content_search_vector @@ plainto_tsquery('english', query_text)
|
||||
ORDER BY text_sim DESC
|
||||
LIMIT max_text_results
|
||||
),
|
||||
combined_results AS (
|
||||
-- Combine results from both searches
|
||||
SELECT
|
||||
COALESCE(v.id, t.id) AS id,
|
||||
COALESCE(v.url, t.url) AS url,
|
||||
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
|
||||
COALESCE(v.content, t.content) AS content,
|
||||
COALESCE(v.metadata, t.metadata) AS metadata,
|
||||
COALESCE(v.source_id, t.source_id) AS source_id,
|
||||
-- Use vector similarity if available, otherwise text similarity
|
||||
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
|
||||
-- Determine match type
|
||||
CASE
|
||||
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
|
||||
WHEN v.id IS NOT NULL THEN 'vector'
|
||||
ELSE 'keyword'
|
||||
END AS match_type
|
||||
FROM vector_results v
|
||||
FULL OUTER JOIN text_results t ON v.id = t.id
|
||||
)
|
||||
SELECT * FROM combined_results
|
||||
ORDER BY similarity DESC
|
||||
LIMIT match_count;
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- Hybrid search function for archon_code_examples
|
||||
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples(
|
||||
query_embedding vector(1536),
|
||||
query_text TEXT,
|
||||
match_count INT DEFAULT 10,
|
||||
filter JSONB DEFAULT '{}'::jsonb,
|
||||
source_filter TEXT DEFAULT NULL
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id BIGINT,
|
||||
url VARCHAR,
|
||||
chunk_number INTEGER,
|
||||
content TEXT,
|
||||
summary TEXT,
|
||||
metadata JSONB,
|
||||
source_id TEXT,
|
||||
similarity FLOAT,
|
||||
match_type TEXT
|
||||
)
|
||||
LANGUAGE plpgsql
|
||||
AS $$
|
||||
DECLARE
|
||||
max_vector_results INT;
|
||||
max_text_results INT;
|
||||
BEGIN
|
||||
-- Calculate how many results to fetch from each search type
|
||||
max_vector_results := match_count;
|
||||
max_text_results := match_count;
|
||||
|
||||
RETURN QUERY
|
||||
WITH vector_results AS (
|
||||
-- Vector similarity search
|
||||
SELECT
|
||||
ce.id,
|
||||
ce.url,
|
||||
ce.chunk_number,
|
||||
ce.content,
|
||||
ce.summary,
|
||||
ce.metadata,
|
||||
ce.source_id,
|
||||
1 - (ce.embedding <=> query_embedding) AS vector_sim
|
||||
FROM archon_code_examples ce
|
||||
WHERE ce.metadata @> filter
|
||||
AND (source_filter IS NULL OR ce.source_id = source_filter)
|
||||
AND ce.embedding IS NOT NULL
|
||||
ORDER BY ce.embedding <=> query_embedding
|
||||
LIMIT max_vector_results
|
||||
),
|
||||
text_results AS (
|
||||
-- Full-text search with ranking (searches both content and summary)
|
||||
SELECT
|
||||
ce.id,
|
||||
ce.url,
|
||||
ce.chunk_number,
|
||||
ce.content,
|
||||
ce.summary,
|
||||
ce.metadata,
|
||||
ce.source_id,
|
||||
ts_rank_cd(ce.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
|
||||
FROM archon_code_examples ce
|
||||
WHERE ce.metadata @> filter
|
||||
AND (source_filter IS NULL OR ce.source_id = source_filter)
|
||||
AND ce.content_search_vector @@ plainto_tsquery('english', query_text)
|
||||
ORDER BY text_sim DESC
|
||||
LIMIT max_text_results
|
||||
),
|
||||
combined_results AS (
|
||||
-- Combine results from both searches
|
||||
SELECT
|
||||
COALESCE(v.id, t.id) AS id,
|
||||
COALESCE(v.url, t.url) AS url,
|
||||
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
|
||||
COALESCE(v.content, t.content) AS content,
|
||||
COALESCE(v.summary, t.summary) AS summary,
|
||||
COALESCE(v.metadata, t.metadata) AS metadata,
|
||||
COALESCE(v.source_id, t.source_id) AS source_id,
|
||||
-- Use vector similarity if available, otherwise text similarity
|
||||
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
|
||||
-- Determine match type
|
||||
CASE
|
||||
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
|
||||
WHEN v.id IS NOT NULL THEN 'vector'
|
||||
ELSE 'keyword'
|
||||
END AS match_type
|
||||
FROM vector_results v
|
||||
FULL OUTER JOIN text_results t ON v.id = t.id
|
||||
)
|
||||
SELECT * FROM combined_results
|
||||
ORDER BY similarity DESC
|
||||
LIMIT match_count;
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- =====================================================
|
||||
-- SECTION 3: UPDATE EXISTING DATA
|
||||
-- =====================================================
|
||||
|
||||
-- Force regeneration of search vectors for existing data
|
||||
-- This is handled automatically by the GENERATED ALWAYS AS columns
|
||||
|
||||
-- Add comment to document the new functionality
|
||||
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Performs hybrid search combining vector similarity and full-text search with configurable weighting';
|
||||
COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Performs hybrid search on code examples combining vector similarity and full-text search';
|
||||
|
||||
-- =====================================================
|
||||
-- MIGRATION COMPLETE
|
||||
-- =====================================================
|
||||
-- Hybrid search with ts_vector is now available!
|
||||
-- The search vectors will be automatically maintained
|
||||
-- as data is inserted or updated.
|
||||
-- =====================================================
|
||||
@@ -15,6 +15,7 @@
|
||||
-- Enable required PostgreSQL extensions
|
||||
CREATE EXTENSION IF NOT EXISTS vector;
|
||||
CREATE EXTENSION IF NOT EXISTS pgcrypto;
|
||||
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
||||
|
||||
-- =====================================================
|
||||
-- SECTION 2: CREDENTIALS AND SETTINGS
|
||||
@@ -203,6 +204,7 @@ CREATE TABLE IF NOT EXISTS archon_crawled_pages (
|
||||
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
|
||||
source_id TEXT NOT NULL,
|
||||
embedding VECTOR(1536), -- OpenAI embeddings are 1536 dimensions
|
||||
content_search_vector tsvector GENERATED ALWAYS AS (to_tsvector('english', content)) STORED,
|
||||
created_at TIMESTAMP WITH TIME ZONE DEFAULT timezone('utc'::text, now()) NOT NULL,
|
||||
|
||||
-- Add a unique constraint to prevent duplicate chunks for the same URL
|
||||
@@ -216,6 +218,8 @@ CREATE TABLE IF NOT EXISTS archon_crawled_pages (
|
||||
CREATE INDEX ON archon_crawled_pages USING ivfflat (embedding vector_cosine_ops);
|
||||
CREATE INDEX idx_archon_crawled_pages_metadata ON archon_crawled_pages USING GIN (metadata);
|
||||
CREATE INDEX idx_archon_crawled_pages_source_id ON archon_crawled_pages (source_id);
|
||||
CREATE INDEX idx_archon_crawled_pages_content_search ON archon_crawled_pages USING GIN (content_search_vector);
|
||||
CREATE INDEX idx_archon_crawled_pages_content_trgm ON archon_crawled_pages USING GIN (content gin_trgm_ops);
|
||||
|
||||
-- Create the code_examples table
|
||||
CREATE TABLE IF NOT EXISTS archon_code_examples (
|
||||
@@ -227,6 +231,7 @@ CREATE TABLE IF NOT EXISTS archon_code_examples (
|
||||
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
|
||||
source_id TEXT NOT NULL,
|
||||
embedding VECTOR(1536), -- OpenAI embeddings are 1536 dimensions
|
||||
content_search_vector tsvector GENERATED ALWAYS AS (to_tsvector('english', content || ' ' || COALESCE(summary, ''))) STORED,
|
||||
created_at TIMESTAMP WITH TIME ZONE DEFAULT timezone('utc'::text, now()) NOT NULL,
|
||||
|
||||
-- Add a unique constraint to prevent duplicate chunks for the same URL
|
||||
@@ -240,6 +245,9 @@ CREATE TABLE IF NOT EXISTS archon_code_examples (
|
||||
CREATE INDEX ON archon_code_examples USING ivfflat (embedding vector_cosine_ops);
|
||||
CREATE INDEX idx_archon_code_examples_metadata ON archon_code_examples USING GIN (metadata);
|
||||
CREATE INDEX idx_archon_code_examples_source_id ON archon_code_examples (source_id);
|
||||
CREATE INDEX idx_archon_code_examples_content_search ON archon_code_examples USING GIN (content_search_vector);
|
||||
CREATE INDEX idx_archon_code_examples_content_trgm ON archon_code_examples USING GIN (content gin_trgm_ops);
|
||||
CREATE INDEX idx_archon_code_examples_summary_trgm ON archon_code_examples USING GIN (summary gin_trgm_ops);
|
||||
|
||||
-- =====================================================
|
||||
-- SECTION 5: SEARCH FUNCTIONS
|
||||
@@ -319,6 +327,196 @@ BEGIN
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- =====================================================
|
||||
-- SECTION 5B: HYBRID SEARCH FUNCTIONS WITH TS_VECTOR
|
||||
-- =====================================================
|
||||
|
||||
-- Hybrid search function for archon_crawled_pages
|
||||
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages(
|
||||
query_embedding vector(1536),
|
||||
query_text TEXT,
|
||||
match_count INT DEFAULT 10,
|
||||
filter JSONB DEFAULT '{}'::jsonb,
|
||||
source_filter TEXT DEFAULT NULL
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id BIGINT,
|
||||
url VARCHAR,
|
||||
chunk_number INTEGER,
|
||||
content TEXT,
|
||||
metadata JSONB,
|
||||
source_id TEXT,
|
||||
similarity FLOAT,
|
||||
match_type TEXT
|
||||
)
|
||||
LANGUAGE plpgsql
|
||||
AS $$
|
||||
DECLARE
|
||||
max_vector_results INT;
|
||||
max_text_results INT;
|
||||
BEGIN
|
||||
-- Calculate how many results to fetch from each search type
|
||||
max_vector_results := match_count;
|
||||
max_text_results := match_count;
|
||||
|
||||
RETURN QUERY
|
||||
WITH vector_results AS (
|
||||
-- Vector similarity search
|
||||
SELECT
|
||||
cp.id,
|
||||
cp.url,
|
||||
cp.chunk_number,
|
||||
cp.content,
|
||||
cp.metadata,
|
||||
cp.source_id,
|
||||
1 - (cp.embedding <=> query_embedding) AS vector_sim
|
||||
FROM archon_crawled_pages cp
|
||||
WHERE cp.metadata @> filter
|
||||
AND (source_filter IS NULL OR cp.source_id = source_filter)
|
||||
AND cp.embedding IS NOT NULL
|
||||
ORDER BY cp.embedding <=> query_embedding
|
||||
LIMIT max_vector_results
|
||||
),
|
||||
text_results AS (
|
||||
-- Full-text search with ranking
|
||||
SELECT
|
||||
cp.id,
|
||||
cp.url,
|
||||
cp.chunk_number,
|
||||
cp.content,
|
||||
cp.metadata,
|
||||
cp.source_id,
|
||||
ts_rank_cd(cp.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
|
||||
FROM archon_crawled_pages cp
|
||||
WHERE cp.metadata @> filter
|
||||
AND (source_filter IS NULL OR cp.source_id = source_filter)
|
||||
AND cp.content_search_vector @@ plainto_tsquery('english', query_text)
|
||||
ORDER BY text_sim DESC
|
||||
LIMIT max_text_results
|
||||
),
|
||||
combined_results AS (
|
||||
-- Combine results from both searches
|
||||
SELECT
|
||||
COALESCE(v.id, t.id) AS id,
|
||||
COALESCE(v.url, t.url) AS url,
|
||||
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
|
||||
COALESCE(v.content, t.content) AS content,
|
||||
COALESCE(v.metadata, t.metadata) AS metadata,
|
||||
COALESCE(v.source_id, t.source_id) AS source_id,
|
||||
-- Use vector similarity if available, otherwise text similarity
|
||||
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
|
||||
-- Determine match type
|
||||
CASE
|
||||
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
|
||||
WHEN v.id IS NOT NULL THEN 'vector'
|
||||
ELSE 'keyword'
|
||||
END AS match_type
|
||||
FROM vector_results v
|
||||
FULL OUTER JOIN text_results t ON v.id = t.id
|
||||
)
|
||||
SELECT * FROM combined_results
|
||||
ORDER BY similarity DESC
|
||||
LIMIT match_count;
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- Hybrid search function for archon_code_examples
|
||||
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples(
|
||||
query_embedding vector(1536),
|
||||
query_text TEXT,
|
||||
match_count INT DEFAULT 10,
|
||||
filter JSONB DEFAULT '{}'::jsonb,
|
||||
source_filter TEXT DEFAULT NULL
|
||||
)
|
||||
RETURNS TABLE (
|
||||
id BIGINT,
|
||||
url VARCHAR,
|
||||
chunk_number INTEGER,
|
||||
content TEXT,
|
||||
summary TEXT,
|
||||
metadata JSONB,
|
||||
source_id TEXT,
|
||||
similarity FLOAT,
|
||||
match_type TEXT
|
||||
)
|
||||
LANGUAGE plpgsql
|
||||
AS $$
|
||||
DECLARE
|
||||
max_vector_results INT;
|
||||
max_text_results INT;
|
||||
BEGIN
|
||||
-- Calculate how many results to fetch from each search type
|
||||
max_vector_results := match_count;
|
||||
max_text_results := match_count;
|
||||
|
||||
RETURN QUERY
|
||||
WITH vector_results AS (
|
||||
-- Vector similarity search
|
||||
SELECT
|
||||
ce.id,
|
||||
ce.url,
|
||||
ce.chunk_number,
|
||||
ce.content,
|
||||
ce.summary,
|
||||
ce.metadata,
|
||||
ce.source_id,
|
||||
1 - (ce.embedding <=> query_embedding) AS vector_sim
|
||||
FROM archon_code_examples ce
|
||||
WHERE ce.metadata @> filter
|
||||
AND (source_filter IS NULL OR ce.source_id = source_filter)
|
||||
AND ce.embedding IS NOT NULL
|
||||
ORDER BY ce.embedding <=> query_embedding
|
||||
LIMIT max_vector_results
|
||||
),
|
||||
text_results AS (
|
||||
-- Full-text search with ranking (searches both content and summary)
|
||||
SELECT
|
||||
ce.id,
|
||||
ce.url,
|
||||
ce.chunk_number,
|
||||
ce.content,
|
||||
ce.summary,
|
||||
ce.metadata,
|
||||
ce.source_id,
|
||||
ts_rank_cd(ce.content_search_vector, plainto_tsquery('english', query_text)) AS text_sim
|
||||
FROM archon_code_examples ce
|
||||
WHERE ce.metadata @> filter
|
||||
AND (source_filter IS NULL OR ce.source_id = source_filter)
|
||||
AND ce.content_search_vector @@ plainto_tsquery('english', query_text)
|
||||
ORDER BY text_sim DESC
|
||||
LIMIT max_text_results
|
||||
),
|
||||
combined_results AS (
|
||||
-- Combine results from both searches
|
||||
SELECT
|
||||
COALESCE(v.id, t.id) AS id,
|
||||
COALESCE(v.url, t.url) AS url,
|
||||
COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
|
||||
COALESCE(v.content, t.content) AS content,
|
||||
COALESCE(v.summary, t.summary) AS summary,
|
||||
COALESCE(v.metadata, t.metadata) AS metadata,
|
||||
COALESCE(v.source_id, t.source_id) AS source_id,
|
||||
-- Use vector similarity if available, otherwise text similarity
|
||||
COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
|
||||
-- Determine match type
|
||||
CASE
|
||||
WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
|
||||
WHEN v.id IS NOT NULL THEN 'vector'
|
||||
ELSE 'keyword'
|
||||
END AS match_type
|
||||
FROM vector_results v
|
||||
FULL OUTER JOIN text_results t ON v.id = t.id
|
||||
)
|
||||
SELECT * FROM combined_results
|
||||
ORDER BY similarity DESC
|
||||
LIMIT match_count;
|
||||
END;
|
||||
$$;
|
||||
|
||||
-- Add comments to document the new functionality
|
||||
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Performs hybrid search combining vector similarity and full-text search with configurable weighting';
|
||||
COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Performs hybrid search on code examples combining vector similarity and full-text search';
|
||||
|
||||
-- =====================================================
|
||||
-- SECTION 6: RLS POLICIES FOR KNOWLEDGE BASE
|
||||
-- =====================================================
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
"""
|
||||
Hybrid Search Strategy
|
||||
|
||||
Implements hybrid search combining vector similarity search with keyword search
|
||||
for improved recall and precision in document and code example retrieval.
|
||||
Implements hybrid search combining vector similarity search with full-text search
|
||||
using PostgreSQL's ts_vector for improved recall and precision in document and
|
||||
code example retrieval.
|
||||
|
||||
Strategy combines:
|
||||
1. Vector/semantic search for conceptual matches
|
||||
2. Keyword search for exact term matches
|
||||
3. Score boosting for results appearing in both searches
|
||||
4. Intelligent result merging with preference ordering
|
||||
2. Full-text search using ts_vector for efficient keyword matching
|
||||
3. Returns union of both result sets for maximum coverage
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
@@ -17,129 +17,17 @@ from supabase import Client
|
||||
|
||||
from ...config.logfire_config import get_logger, safe_span
|
||||
from ..embeddings.embedding_service import create_embedding
|
||||
from .keyword_extractor import build_search_terms, extract_keywords
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class HybridSearchStrategy:
|
||||
"""Strategy class implementing hybrid search combining vector and keyword search"""
|
||||
"""Strategy class implementing hybrid search combining vector and full-text search"""
|
||||
|
||||
def __init__(self, supabase_client: Client, base_strategy):
|
||||
self.supabase_client = supabase_client
|
||||
self.base_strategy = base_strategy
|
||||
|
||||
async def keyword_search(
|
||||
self,
|
||||
query: str,
|
||||
match_count: int,
|
||||
table_name: str = "documents",
|
||||
filter_metadata: dict | None = None,
|
||||
select_fields: str | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Perform intelligent keyword search using extracted keywords.
|
||||
|
||||
This method extracts keywords from the query and searches for documents
|
||||
containing any of those keywords, ranking results by the number of matches.
|
||||
|
||||
Args:
|
||||
query: The search query text
|
||||
match_count: Number of results to return
|
||||
table_name: The table to search (documents, archon_crawled_pages, or archon_code_examples)
|
||||
filter_metadata: Optional metadata filters
|
||||
select_fields: Optional specific fields to select (default: all)
|
||||
|
||||
Returns:
|
||||
List of matching documents ranked by keyword relevance
|
||||
"""
|
||||
try:
|
||||
# Extract keywords from the query
|
||||
keywords = extract_keywords(query, min_length=2, max_keywords=8)
|
||||
|
||||
if not keywords:
|
||||
# Fallback to original query if no keywords extracted
|
||||
keywords = [query]
|
||||
|
||||
logger.debug(f"Extracted keywords from '{query}': {keywords}")
|
||||
|
||||
# Build search terms including variations
|
||||
search_terms = build_search_terms(keywords)[:12] # Limit total search terms
|
||||
|
||||
# For now, we'll search for documents containing ANY of the keywords
|
||||
# and then rank them by how many keywords they contain
|
||||
all_results = []
|
||||
seen_ids = set()
|
||||
|
||||
# Search for each keyword individually to get better coverage
|
||||
for keyword in search_terms[:6]: # Limit to avoid too many queries
|
||||
# Build the query with appropriate fields
|
||||
if select_fields:
|
||||
query_builder = self.supabase_client.from_(table_name).select(select_fields)
|
||||
else:
|
||||
query_builder = self.supabase_client.from_(table_name).select("*")
|
||||
|
||||
# Add keyword search condition with wildcards
|
||||
search_pattern = f"%{keyword}%"
|
||||
|
||||
# Handle different search patterns based on table
|
||||
if table_name == "archon_code_examples":
|
||||
# Search both content and summary for code examples
|
||||
query_builder = query_builder.or_(
|
||||
f"content.ilike.{search_pattern},summary.ilike.{search_pattern}"
|
||||
)
|
||||
else:
|
||||
query_builder = query_builder.ilike("content", search_pattern)
|
||||
|
||||
# Add metadata filters if provided
|
||||
if filter_metadata:
|
||||
if "source" in filter_metadata and table_name in ["documents", "crawled_pages"]:
|
||||
query_builder = query_builder.eq("source_id", filter_metadata["source"])
|
||||
elif "source_id" in filter_metadata:
|
||||
query_builder = query_builder.eq("source_id", filter_metadata["source_id"])
|
||||
|
||||
# Execute query with limit
|
||||
response = query_builder.limit(match_count * 2).execute()
|
||||
|
||||
if response.data:
|
||||
for result in response.data:
|
||||
result_id = result.get("id")
|
||||
if result_id and result_id not in seen_ids:
|
||||
# Count how many keywords match in this result
|
||||
content = result.get("content", "").lower()
|
||||
summary = (
|
||||
result.get("summary", "").lower()
|
||||
if table_name == "archon_code_examples"
|
||||
else ""
|
||||
)
|
||||
combined_text = f"{content} {summary}"
|
||||
|
||||
# Count keyword matches
|
||||
match_score = sum(1 for kw in keywords if kw.lower() in combined_text)
|
||||
|
||||
# Add match score to result
|
||||
result["keyword_match_score"] = match_score
|
||||
result["matched_keyword"] = keyword
|
||||
|
||||
all_results.append(result)
|
||||
seen_ids.add(result_id)
|
||||
|
||||
# Sort results by keyword match score (descending)
|
||||
all_results.sort(key=lambda x: x.get("keyword_match_score", 0), reverse=True)
|
||||
|
||||
# Return top N results
|
||||
final_results = all_results[:match_count]
|
||||
|
||||
logger.debug(
|
||||
f"Keyword search found {len(final_results)} results from {len(all_results)} total matches"
|
||||
)
|
||||
|
||||
return final_results
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Keyword search failed: {e}")
|
||||
return []
|
||||
|
||||
async def search_documents_hybrid(
|
||||
self,
|
||||
query: str,
|
||||
@@ -148,7 +36,8 @@ class HybridSearchStrategy:
|
||||
filter_metadata: dict | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Perform hybrid search on archon_crawled_pages table combining vector and keyword search.
|
||||
Perform hybrid search on archon_crawled_pages table using the PostgreSQL
|
||||
hybrid search function that combines vector and full-text search.
|
||||
|
||||
Args:
|
||||
query: Original search query text
|
||||
@@ -157,41 +46,59 @@ class HybridSearchStrategy:
|
||||
filter_metadata: Optional metadata filter dict
|
||||
|
||||
Returns:
|
||||
List of matching documents with boosted scores for dual matches
|
||||
List of matching documents from both vector and text search
|
||||
"""
|
||||
with safe_span("hybrid_search_documents") as span:
|
||||
try:
|
||||
# 1. Get vector search results using base strategy
|
||||
vector_results = await self.base_strategy.vector_search(
|
||||
query_embedding=query_embedding,
|
||||
match_count=match_count * 2, # Get more for filtering
|
||||
filter_metadata=filter_metadata,
|
||||
table_rpc="match_archon_crawled_pages",
|
||||
)
|
||||
# Prepare filter and source parameters
|
||||
filter_json = filter_metadata or {}
|
||||
source_filter = filter_json.pop("source", None) if "source" in filter_json else None
|
||||
|
||||
# 2. Get keyword search results
|
||||
keyword_results = await self.keyword_search(
|
||||
query=query,
|
||||
match_count=match_count * 2,
|
||||
table_name="archon_crawled_pages",
|
||||
filter_metadata=filter_metadata,
|
||||
select_fields="id, url, chunk_number, content, metadata, source_id",
|
||||
)
|
||||
# Call the hybrid search PostgreSQL function
|
||||
response = self.supabase_client.rpc(
|
||||
"hybrid_search_archon_crawled_pages",
|
||||
{
|
||||
"query_embedding": query_embedding,
|
||||
"query_text": query,
|
||||
"match_count": match_count,
|
||||
"filter": filter_json,
|
||||
"source_filter": source_filter,
|
||||
},
|
||||
).execute()
|
||||
|
||||
# 3. Combine and merge results intelligently
|
||||
combined_results = self._merge_search_results(
|
||||
vector_results, keyword_results, match_count
|
||||
)
|
||||
if not response.data:
|
||||
logger.debug("No results from hybrid search")
|
||||
return []
|
||||
|
||||
span.set_attribute("vector_results_count", len(vector_results))
|
||||
span.set_attribute("keyword_results_count", len(keyword_results))
|
||||
span.set_attribute("final_results_count", len(combined_results))
|
||||
# Format results to match expected structure
|
||||
results = []
|
||||
for row in response.data:
|
||||
result = {
|
||||
"id": row["id"],
|
||||
"url": row["url"],
|
||||
"chunk_number": row["chunk_number"],
|
||||
"content": row["content"],
|
||||
"metadata": row["metadata"],
|
||||
"source_id": row["source_id"],
|
||||
"similarity": row["similarity"],
|
||||
"match_type": row["match_type"],
|
||||
}
|
||||
results.append(result)
|
||||
|
||||
span.set_attribute("results_count", len(results))
|
||||
|
||||
# Log match type distribution for debugging
|
||||
match_types = {}
|
||||
for r in results:
|
||||
mt = r.get("match_type", "unknown")
|
||||
match_types[mt] = match_types.get(mt, 0) + 1
|
||||
|
||||
logger.debug(
|
||||
f"Hybrid document search: {len(vector_results)} vector + {len(keyword_results)} keyword → {len(combined_results)} final"
|
||||
f"Hybrid search returned {len(results)} results. "
|
||||
f"Match types: {match_types}"
|
||||
)
|
||||
|
||||
return combined_results
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Hybrid document search failed: {e}")
|
||||
@@ -206,7 +113,8 @@ class HybridSearchStrategy:
|
||||
source_id: str | None = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Perform hybrid search on archon_code_examples table combining vector and keyword search.
|
||||
Perform hybrid search on archon_code_examples table using the PostgreSQL
|
||||
hybrid search function that combines vector and full-text search.
|
||||
|
||||
Args:
|
||||
query: Search query text
|
||||
@@ -215,147 +123,72 @@ class HybridSearchStrategy:
|
||||
source_id: Optional source ID to filter results
|
||||
|
||||
Returns:
|
||||
List of matching code examples with boosted scores for dual matches
|
||||
List of matching code examples from both vector and text search
|
||||
"""
|
||||
with safe_span("hybrid_search_code_examples") as span:
|
||||
try:
|
||||
# Create query embedding (no enhancement needed)
|
||||
# Create query embedding
|
||||
query_embedding = await create_embedding(query)
|
||||
|
||||
if not query_embedding:
|
||||
logger.error("Failed to create embedding for code example query")
|
||||
return []
|
||||
|
||||
# 1. Get vector search results using base strategy
|
||||
combined_filter = filter_metadata or {}
|
||||
if source_id:
|
||||
combined_filter["source"] = source_id
|
||||
# Prepare filter and source parameters
|
||||
filter_json = filter_metadata or {}
|
||||
# Use source_id parameter if provided, otherwise check filter_metadata
|
||||
final_source_filter = source_id
|
||||
if not final_source_filter and "source" in filter_json:
|
||||
final_source_filter = filter_json.pop("source")
|
||||
|
||||
vector_results = await self.base_strategy.vector_search(
|
||||
query_embedding=query_embedding,
|
||||
match_count=match_count * 2,
|
||||
filter_metadata=combined_filter,
|
||||
table_rpc="match_archon_code_examples",
|
||||
)
|
||||
# Call the hybrid search PostgreSQL function
|
||||
response = self.supabase_client.rpc(
|
||||
"hybrid_search_archon_code_examples",
|
||||
{
|
||||
"query_embedding": query_embedding,
|
||||
"query_text": query,
|
||||
"match_count": match_count,
|
||||
"filter": filter_json,
|
||||
"source_filter": final_source_filter,
|
||||
},
|
||||
).execute()
|
||||
|
||||
# 2. Get keyword search results
|
||||
keyword_filter = filter_metadata or {}
|
||||
if source_id:
|
||||
keyword_filter["source_id"] = source_id
|
||||
if not response.data:
|
||||
logger.debug("No results from hybrid code search")
|
||||
return []
|
||||
|
||||
keyword_results = await self.keyword_search(
|
||||
query=query,
|
||||
match_count=match_count * 2,
|
||||
table_name="archon_code_examples",
|
||||
filter_metadata=keyword_filter,
|
||||
select_fields="id, url, chunk_number, content, summary, metadata, source_id",
|
||||
)
|
||||
# Format results to match expected structure
|
||||
results = []
|
||||
for row in response.data:
|
||||
result = {
|
||||
"id": row["id"],
|
||||
"url": row["url"],
|
||||
"chunk_number": row["chunk_number"],
|
||||
"content": row["content"],
|
||||
"summary": row["summary"],
|
||||
"metadata": row["metadata"],
|
||||
"source_id": row["source_id"],
|
||||
"similarity": row["similarity"],
|
||||
"match_type": row["match_type"],
|
||||
}
|
||||
results.append(result)
|
||||
|
||||
# 3. Combine and merge results intelligently
|
||||
combined_results = self._merge_search_results(
|
||||
vector_results, keyword_results, match_count
|
||||
)
|
||||
span.set_attribute("results_count", len(results))
|
||||
|
||||
span.set_attribute("vector_results_count", len(vector_results))
|
||||
span.set_attribute("keyword_results_count", len(keyword_results))
|
||||
span.set_attribute("final_results_count", len(combined_results))
|
||||
# Log match type distribution for debugging
|
||||
match_types = {}
|
||||
for r in results:
|
||||
mt = r.get("match_type", "unknown")
|
||||
match_types[mt] = match_types.get(mt, 0) + 1
|
||||
|
||||
logger.debug(
|
||||
f"Hybrid code search: {len(vector_results)} vector + {len(keyword_results)} keyword → {len(combined_results)} final"
|
||||
f"Hybrid code search returned {len(results)} results. "
|
||||
f"Match types: {match_types}"
|
||||
)
|
||||
|
||||
return combined_results
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Hybrid code example search failed: {e}")
|
||||
span.set_attribute("error", str(e))
|
||||
return []
|
||||
|
||||
def _merge_search_results(
|
||||
self,
|
||||
vector_results: list[dict[str, Any]],
|
||||
keyword_results: list[dict[str, Any]],
|
||||
match_count: int,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Intelligently merge vector and keyword search results with preference ordering.
|
||||
|
||||
Priority order:
|
||||
1. Results appearing in BOTH searches (highest relevance) - get score boost
|
||||
2. Vector-only results (semantic matches)
|
||||
3. Keyword-only results (exact term matches)
|
||||
|
||||
Args:
|
||||
vector_results: Results from vector/semantic search
|
||||
keyword_results: Results from keyword search
|
||||
match_count: Maximum number of final results to return
|
||||
|
||||
Returns:
|
||||
Merged and prioritized list of results
|
||||
"""
|
||||
seen_ids: set[str] = set()
|
||||
combined_results: list[dict[str, Any]] = []
|
||||
|
||||
# Create lookup for vector results by ID for efficient matching
|
||||
vector_lookup = {r.get("id"): r for r in vector_results if r.get("id")}
|
||||
|
||||
# Phase 1: Add items that appear in BOTH searches (boost their scores)
|
||||
for keyword_result in keyword_results:
|
||||
result_id = keyword_result.get("id")
|
||||
if result_id and result_id in vector_lookup and result_id not in seen_ids:
|
||||
vector_result = vector_lookup[result_id]
|
||||
# Boost similarity score for dual matches (cap at 1.0)
|
||||
boosted_similarity = min(1.0, vector_result.get("similarity", 0) * 1.2)
|
||||
vector_result["similarity"] = boosted_similarity
|
||||
vector_result["match_type"] = "hybrid" # Mark as hybrid match
|
||||
|
||||
combined_results.append(vector_result)
|
||||
seen_ids.add(result_id)
|
||||
|
||||
# Phase 2: Add remaining vector results (semantic matches without exact keywords)
|
||||
for vector_result in vector_results:
|
||||
result_id = vector_result.get("id")
|
||||
if result_id and result_id not in seen_ids and len(combined_results) < match_count:
|
||||
vector_result["match_type"] = "vector"
|
||||
combined_results.append(vector_result)
|
||||
seen_ids.add(result_id)
|
||||
|
||||
# Phase 3: Add pure keyword matches if we need more results
|
||||
for keyword_result in keyword_results:
|
||||
result_id = keyword_result.get("id")
|
||||
if result_id and result_id not in seen_ids and len(combined_results) < match_count:
|
||||
# Convert keyword result to match vector result format
|
||||
# Use keyword match score to influence similarity score
|
||||
keyword_score = keyword_result.get("keyword_match_score", 1)
|
||||
# Scale keyword score to similarity range (0.3 to 0.7 based on matches)
|
||||
scaled_similarity = min(0.7, 0.3 + (keyword_score * 0.1))
|
||||
|
||||
standardized_result = {
|
||||
"id": keyword_result["id"],
|
||||
"url": keyword_result["url"],
|
||||
"chunk_number": keyword_result["chunk_number"],
|
||||
"content": keyword_result["content"],
|
||||
"metadata": keyword_result["metadata"],
|
||||
"source_id": keyword_result["source_id"],
|
||||
"similarity": scaled_similarity,
|
||||
"match_type": "keyword",
|
||||
"keyword_match_score": keyword_score,
|
||||
}
|
||||
|
||||
# Include summary if present (for code examples)
|
||||
if "summary" in keyword_result:
|
||||
standardized_result["summary"] = keyword_result["summary"]
|
||||
|
||||
combined_results.append(standardized_result)
|
||||
seen_ids.add(result_id)
|
||||
|
||||
# Return only up to the requested match count
|
||||
final_results = combined_results[:match_count]
|
||||
|
||||
logger.debug(
|
||||
f"Merge stats - Hybrid: {sum(1 for r in final_results if r.get('match_type') == 'hybrid')}, "
|
||||
f"Vector: {sum(1 for r in final_results if r.get('match_type') == 'vector')}, "
|
||||
f"Keyword: {sum(1 for r in final_results if r.get('match_type') == 'keyword')}"
|
||||
)
|
||||
|
||||
return final_results
|
||||
return []
|
||||
@@ -204,10 +204,19 @@ class RAGService:
|
||||
use_hybrid_search = self.get_bool_setting("USE_HYBRID_SEARCH", False)
|
||||
use_reranking = self.get_bool_setting("USE_RERANKING", False)
|
||||
|
||||
# If reranking is enabled, fetch more candidates for the reranker to evaluate
|
||||
# This allows the reranker to see a broader set of results
|
||||
search_match_count = match_count
|
||||
if use_reranking and self.reranking_strategy:
|
||||
# Fetch 5x the requested amount when reranking is enabled
|
||||
# The reranker will select the best from this larger pool
|
||||
search_match_count = match_count * 5
|
||||
logger.debug(f"Reranking enabled - fetching {search_match_count} candidates for {match_count} final results")
|
||||
|
||||
# Step 1 & 2: Get results (with hybrid search if enabled)
|
||||
results = await self.search_documents(
|
||||
query=query,
|
||||
match_count=match_count,
|
||||
match_count=search_match_count,
|
||||
filter_metadata=filter_metadata,
|
||||
use_hybrid_search=use_hybrid_search,
|
||||
)
|
||||
@@ -234,14 +243,18 @@ class RAGService:
|
||||
reranking_applied = False
|
||||
if self.reranking_strategy and formatted_results:
|
||||
try:
|
||||
# Pass top_k to limit results to the originally requested count
|
||||
formatted_results = await self.reranking_strategy.rerank_results(
|
||||
query, formatted_results, content_key="content"
|
||||
query, formatted_results, content_key="content", top_k=match_count
|
||||
)
|
||||
reranking_applied = True
|
||||
logger.debug(f"Reranking applied to {len(formatted_results)} results")
|
||||
logger.debug(f"Reranking applied: {search_match_count} candidates -> {len(formatted_results)} final results")
|
||||
except Exception as e:
|
||||
logger.warning(f"Reranking failed: {e}")
|
||||
reranking_applied = False
|
||||
# If reranking fails but we fetched extra results, trim to requested count
|
||||
if len(formatted_results) > match_count:
|
||||
formatted_results = formatted_results[:match_count]
|
||||
|
||||
# Build response
|
||||
response_data = {
|
||||
@@ -313,6 +326,12 @@ class RAGService:
|
||||
use_hybrid_search = self.get_bool_setting("USE_HYBRID_SEARCH", False)
|
||||
use_reranking = self.get_bool_setting("USE_RERANKING", False)
|
||||
|
||||
# If reranking is enabled, fetch more candidates
|
||||
search_match_count = match_count
|
||||
if use_reranking and self.reranking_strategy:
|
||||
search_match_count = match_count * 5
|
||||
logger.debug(f"Reranking enabled for code search - fetching {search_match_count} candidates")
|
||||
|
||||
# Prepare filter
|
||||
filter_metadata = {"source": source_id} if source_id and source_id.strip() else None
|
||||
|
||||
@@ -320,7 +339,7 @@ class RAGService:
|
||||
# Use hybrid search for code examples
|
||||
results = await self.hybrid_strategy.search_code_examples_hybrid(
|
||||
query=query,
|
||||
match_count=match_count,
|
||||
match_count=search_match_count,
|
||||
filter_metadata=filter_metadata,
|
||||
source_id=source_id,
|
||||
)
|
||||
@@ -328,7 +347,7 @@ class RAGService:
|
||||
# Use standard agentic search
|
||||
results = await self.agentic_strategy.search_code_examples(
|
||||
query=query,
|
||||
match_count=match_count,
|
||||
match_count=search_match_count,
|
||||
filter_metadata=filter_metadata,
|
||||
source_id=source_id,
|
||||
)
|
||||
@@ -337,10 +356,14 @@ class RAGService:
|
||||
if self.reranking_strategy and results:
|
||||
try:
|
||||
results = await self.reranking_strategy.rerank_results(
|
||||
query, results, content_key="content"
|
||||
query, results, content_key="content", top_k=match_count
|
||||
)
|
||||
logger.debug(f"Code reranking applied: {search_match_count} candidates -> {len(results)} final results")
|
||||
except Exception as e:
|
||||
logger.warning(f"Code reranking failed: {e}")
|
||||
# If reranking fails but we fetched extra results, trim to requested count
|
||||
if len(results) > match_count:
|
||||
results = results[:match_count]
|
||||
|
||||
# Format results
|
||||
formatted_results = []
|
||||
|
||||
@@ -162,38 +162,6 @@ class TestHybridSearchCore:
|
||||
"""Test hybrid strategy initializes"""
|
||||
assert hybrid_strategy is not None
|
||||
assert hasattr(hybrid_strategy, "search_documents_hybrid")
|
||||
assert hasattr(hybrid_strategy, "_merge_search_results")
|
||||
|
||||
def test_merge_results_functionality(self, hybrid_strategy):
    """The merge helper combines vector and keyword hits into one capped list."""
    semantic_hit = dict(
        id="1",
        content="Vector result",
        similarity=0.9,
        url="test1.com",
        chunk_number=1,
        metadata={},
        source_id="src1",
    )
    keyword_hit = dict(
        id="2",
        content="Keyword result",
        url="test2.com",
        chunk_number=1,
        metadata={},
        source_id="src2",
    )

    merged = hybrid_strategy._merge_search_results(
        [semantic_hit], [keyword_hit], match_count=5
    )

    # Result is a list and never exceeds the requested count.
    assert isinstance(merged, list)
    assert len(merged) <= 5
|
||||
|
||||
|
||||
class TestRerankingCore:
|
||||
|
||||
@@ -168,42 +168,6 @@ class TestHybridSearchStrategy:
|
||||
assert hasattr(hybrid_strategy, "search_documents_hybrid")
|
||||
assert hasattr(hybrid_strategy, "search_code_examples_hybrid")
|
||||
|
||||
def test_merge_search_results(self, hybrid_strategy):
    """Merging vector and keyword hits yields a bounded list drawn from both."""
    semantic_hits = [
        dict(
            id="1",
            content="Vector result 1",
            score=0.9,
            url="url1",
            chunk_number=1,
            metadata={},
            source_id="source1",
            similarity=0.9,
        )
    ]
    keyword_hits = [
        dict(
            id="2",
            content="Keyword result 1",
            score=0.8,
            url="url2",
            chunk_number=1,
            metadata={},
            source_id="source2",
        )
    ]

    merged = hybrid_strategy._merge_search_results(
        semantic_hits, keyword_hits, match_count=5
    )

    assert isinstance(merged, list)
    assert len(merged) <= 5
    # Should contain results from both sources
    if merged:
        assert any(
            "Vector result" in str(entry) or "Keyword result" in str(entry)
            for entry in merged
        )
|
||||
|
||||
|
||||
class TestRerankingStrategy:
|
||||
"""Test reranking strategy implementation"""
|
||||
|
||||
Reference in New Issue
Block a user