diff --git a/migration/RESET_DB.sql b/migration/RESET_DB.sql index eff218ee..775464f5 100644 --- a/migration/RESET_DB.sql +++ b/migration/RESET_DB.sql @@ -133,6 +133,10 @@ BEGIN DROP FUNCTION IF EXISTS match_archon_crawled_pages(vector, int, jsonb, text) CASCADE; DROP FUNCTION IF EXISTS match_archon_code_examples(vector, int, jsonb, text) CASCADE; + -- Hybrid search functions (with ts_vector support) + DROP FUNCTION IF EXISTS hybrid_search_archon_crawled_pages(vector, text, int, jsonb, text) CASCADE; + DROP FUNCTION IF EXISTS hybrid_search_archon_code_examples(vector, text, int, jsonb, text) CASCADE; + -- Search functions (old without prefix) DROP FUNCTION IF EXISTS match_crawled_pages(vector, int, jsonb, text) CASCADE; DROP FUNCTION IF EXISTS match_code_examples(vector, int, jsonb, text) CASCADE; diff --git a/migration/add_hybrid_search_tsvector.sql b/migration/add_hybrid_search_tsvector.sql new file mode 100644 index 00000000..8016a667 --- /dev/null +++ b/migration/add_hybrid_search_tsvector.sql @@ -0,0 +1,237 @@ +-- ===================================================== +-- Add Hybrid Search with ts_vector Support +-- ===================================================== +-- This migration adds efficient text search capabilities using PostgreSQL's +-- full-text search features (ts_vector) to enable better keyword matching +-- in hybrid search operations. 
+-- =====================================================
+
+-- Enable required extensions (pg_trgm for fuzzy matching)
+CREATE EXTENSION IF NOT EXISTS pg_trgm;
+
+-- =====================================================
+-- SECTION 1: ADD TEXT SEARCH COLUMNS AND INDEXES
+-- =====================================================
+
+-- Add STORED tsvector columns (adding one rewrites the table; run off-peak on large tables)
+ALTER TABLE archon_crawled_pages
+ADD COLUMN IF NOT EXISTS content_search_vector tsvector
+GENERATED ALWAYS AS (to_tsvector('english', content)) STORED;
+
+ALTER TABLE archon_code_examples
+ADD COLUMN IF NOT EXISTS content_search_vector tsvector
+GENERATED ALWAYS AS (to_tsvector('english', content || ' ' || COALESCE(summary, ''))) STORED;
+
+-- GIN indexes backing the @@ full-text predicate used by the hybrid search functions
+CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_content_search ON archon_crawled_pages USING GIN (content_search_vector);
+CREATE INDEX IF NOT EXISTS idx_archon_code_examples_content_search ON archon_code_examples USING GIN (content_search_vector);
+
+-- Trigram indexes for fuzzy/partial matching; NOTE(review): not used by the functions in this file
+CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_content_trgm ON archon_crawled_pages USING GIN (content gin_trgm_ops);
+CREATE INDEX IF NOT EXISTS idx_archon_code_examples_content_trgm ON archon_code_examples USING GIN (content gin_trgm_ops);
+CREATE INDEX IF NOT EXISTS idx_archon_code_examples_summary_trgm ON archon_code_examples USING GIN (summary gin_trgm_ops);
+
+-- =====================================================
+-- SECTION 2: HYBRID SEARCH FUNCTIONS
+-- =====================================================
+
+-- Hybrid search over archon_crawled_pages: merges the top match_count rows by cosine
+-- similarity with the top match_count full-text matches (one row per id) and returns
+-- the match_count best. similarity is 1 - cosine distance when the vector arm found
+-- the row, else the raw ts_rank_cd (a different scale); match_type names the arm(s).
+CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages(
+    query_embedding vector(1536),
+    query_text TEXT,
+    match_count INT DEFAULT 10,
+    filter JSONB DEFAULT '{}'::jsonb,
+    source_filter TEXT DEFAULT NULL
+)
+RETURNS TABLE (
+    id BIGINT,
+    url VARCHAR,
+    chunk_number INTEGER,
+    content TEXT,
+    metadata JSONB,
+    source_id TEXT,
+    similarity FLOAT,
+    match_type TEXT
+)
+LANGUAGE plpgsql
+AS $$
+DECLARE
+    -- Parsed once so the match predicate and the rank share the same tsquery
+    search_query tsquery := plainto_tsquery('english', query_text);
+BEGIN
+    RETURN QUERY
+    WITH vector_results AS (
+        -- Nearest rows by cosine distance (<=>), restricted by filter/source_filter
+        SELECT
+            cp.id,
+            cp.url,
+            cp.chunk_number,
+            cp.content,
+            cp.metadata,
+            cp.source_id,
+            1 - (cp.embedding <=> query_embedding) AS vector_sim
+        FROM archon_crawled_pages cp
+        WHERE cp.metadata @> filter
+            AND (source_filter IS NULL OR cp.source_id = source_filter)
+            AND cp.embedding IS NOT NULL
+        ORDER BY cp.embedding <=> query_embedding
+        LIMIT match_count
+    ),
+    text_results AS (
+        -- Full-text matches ranked by ts_rank_cd; id breaks rank ties deterministically
+        SELECT
+            cp.id,
+            cp.url,
+            cp.chunk_number,
+            cp.content,
+            cp.metadata,
+            cp.source_id,
+            ts_rank_cd(cp.content_search_vector, search_query) AS text_sim
+        FROM archon_crawled_pages cp
+        WHERE cp.metadata @> filter
+            AND (source_filter IS NULL OR cp.source_id = source_filter)
+            AND cp.content_search_vector @@ search_query
+        ORDER BY text_sim DESC, cp.id
+        LIMIT match_count
+    ),
+    combined_results AS (
+        -- One row per id; a row found by both arms keeps the vector-side values
+        SELECT
+            COALESCE(v.id, t.id) AS id,
+            COALESCE(v.url, t.url) AS url,
+            COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
+            COALESCE(v.content, t.content) AS content,
+            COALESCE(v.metadata, t.metadata) AS metadata,
+            COALESCE(v.source_id, t.source_id) AS source_id,
+            -- Cosine similarity when the vector arm matched, otherwise the text rank
+            COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
+            CASE
+                WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
+                WHEN v.id IS NOT NULL THEN 'vector'
+                ELSE 'keyword'
+            END AS match_type
+        FROM vector_results v
+        FULL OUTER JOIN text_results t ON v.id = t.id
+    )
+    SELECT cr.id, cr.url, cr.chunk_number, cr.content,
+           cr.metadata, cr.source_id, cr.similarity, cr.match_type
+    FROM combined_results cr
+    ORDER BY cr.similarity DESC, cr.id
+    LIMIT match_count;
+END;
+$$;
+
+-- Hybrid search over archon_code_examples: same merge as the crawled-pages function,
+-- but the full-text arm matches content_search_vector built from content and summary,
+-- and summary is returned. similarity is 1 - cosine distance for vector hits, else the
+-- raw ts_rank_cd (a different scale); match_type is 'hybrid', 'vector' or 'keyword'.
+CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples(
+    query_embedding vector(1536),
+    query_text TEXT,
+    match_count INT DEFAULT 10,
+    filter JSONB DEFAULT '{}'::jsonb,
+    source_filter TEXT DEFAULT NULL
+)
+RETURNS TABLE (
+    id BIGINT,
+    url VARCHAR,
+    chunk_number INTEGER,
+    content TEXT,
+    summary TEXT,
+    metadata JSONB,
+    source_id TEXT,
+    similarity FLOAT,
+    match_type TEXT
+)
+LANGUAGE plpgsql
+AS $$
+DECLARE
+    -- Parsed once so the match predicate and the rank share the same tsquery
+    search_query tsquery := plainto_tsquery('english', query_text);
+BEGIN
+    RETURN QUERY
+    WITH vector_results AS (
+        -- Nearest rows by cosine distance (<=>), restricted by filter/source_filter
+        SELECT
+            ce.id,
+            ce.url,
+            ce.chunk_number,
+            ce.content,
+            ce.summary,
+            ce.metadata,
+            ce.source_id,
+            1 - (ce.embedding <=> query_embedding) AS vector_sim
+        FROM archon_code_examples ce
+        WHERE ce.metadata @> filter
+            AND (source_filter IS NULL OR ce.source_id = source_filter)
+            AND ce.embedding IS NOT NULL
+        ORDER BY ce.embedding <=> query_embedding
+        LIMIT match_count
+    ),
+    text_results AS (
+        -- Full-text matches ranked by ts_rank_cd; id breaks rank ties deterministically
+        SELECT
+            ce.id,
+            ce.url,
+            ce.chunk_number,
+            ce.content,
+            ce.summary,
+            ce.metadata,
+            ce.source_id,
+            ts_rank_cd(ce.content_search_vector, search_query) AS text_sim
+        FROM archon_code_examples ce
+        WHERE ce.metadata @> filter
+            AND (source_filter IS NULL OR ce.source_id = source_filter)
+            AND ce.content_search_vector @@ search_query
+        ORDER BY text_sim DESC, ce.id
+        LIMIT match_count
+    ),
+    combined_results AS (
+        -- One row per id; a row found by both arms keeps the vector-side values
+        SELECT
+            COALESCE(v.id, t.id) AS id,
+            COALESCE(v.url, t.url) AS url,
+            COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
+            COALESCE(v.content, t.content) AS content,
+            COALESCE(v.summary, t.summary) AS summary,
+            COALESCE(v.metadata, t.metadata) AS metadata,
+            COALESCE(v.source_id, t.source_id) AS source_id,
+            -- Cosine similarity when the vector arm matched, otherwise the text rank
+            COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
+            CASE
+                WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
+                WHEN v.id IS NOT NULL THEN 'vector'
+                ELSE 'keyword'
+            END AS match_type
+        FROM vector_results v
+        FULL OUTER JOIN text_results t ON v.id = t.id
+    )
+    SELECT cr.id, cr.url, cr.chunk_number, cr.content, cr.summary,
+           cr.metadata, cr.source_id, cr.similarity, cr.match_type
+    FROM combined_results cr
+    ORDER BY cr.similarity DESC, cr.id
+    LIMIT match_count;
+END;
+$$;
+
+-- =====================================================
+-- SECTION 3: UPDATE EXISTING DATA
+-- =====================================================
+
+-- No backfill statement is needed: STORED generated columns are computed for
+-- existing rows when the column is added and recomputed on every insert/update.
+
+-- Document the functions in the catalog
+COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Hybrid search over crawled pages: merges vector-similarity and full-text (tsvector) matches and labels each row with match_type (hybrid, vector or keyword)';
+COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Hybrid search over code examples: merges vector-similarity and full-text (tsvector) matches on content and summary; labels each row with match_type';
+
+-- =====================================================
+-- MIGRATION COMPLETE
+-- =====================================================
+-- Hybrid search with ts_vector is now available!
+-- The search vectors will be automatically maintained
+-- as data is inserted or updated.
+-- ===================================================== \ No newline at end of file diff --git a/migration/complete_setup.sql b/migration/complete_setup.sql index 4b3550bd..723180c2 100644 --- a/migration/complete_setup.sql +++ b/migration/complete_setup.sql @@ -15,6 +15,7 @@ -- Enable required PostgreSQL extensions CREATE EXTENSION IF NOT EXISTS vector; CREATE EXTENSION IF NOT EXISTS pgcrypto; +CREATE EXTENSION IF NOT EXISTS pg_trgm; -- ===================================================== -- SECTION 2: CREDENTIALS AND SETTINGS @@ -203,6 +204,7 @@ CREATE TABLE IF NOT EXISTS archon_crawled_pages ( metadata JSONB NOT NULL DEFAULT '{}'::jsonb, source_id TEXT NOT NULL, embedding VECTOR(1536), -- OpenAI embeddings are 1536 dimensions + content_search_vector tsvector GENERATED ALWAYS AS (to_tsvector('english', content)) STORED, created_at TIMESTAMP WITH TIME ZONE DEFAULT timezone('utc'::text, now()) NOT NULL, -- Add a unique constraint to prevent duplicate chunks for the same URL @@ -216,6 +218,8 @@ CREATE TABLE IF NOT EXISTS archon_crawled_pages ( CREATE INDEX ON archon_crawled_pages USING ivfflat (embedding vector_cosine_ops); CREATE INDEX idx_archon_crawled_pages_metadata ON archon_crawled_pages USING GIN (metadata); CREATE INDEX idx_archon_crawled_pages_source_id ON archon_crawled_pages (source_id); +CREATE INDEX idx_archon_crawled_pages_content_search ON archon_crawled_pages USING GIN (content_search_vector); +CREATE INDEX idx_archon_crawled_pages_content_trgm ON archon_crawled_pages USING GIN (content gin_trgm_ops); -- Create the code_examples table CREATE TABLE IF NOT EXISTS archon_code_examples ( @@ -227,6 +231,7 @@ CREATE TABLE IF NOT EXISTS archon_code_examples ( metadata JSONB NOT NULL DEFAULT '{}'::jsonb, source_id TEXT NOT NULL, embedding VECTOR(1536), -- OpenAI embeddings are 1536 dimensions + content_search_vector tsvector GENERATED ALWAYS AS (to_tsvector('english', content || ' ' || COALESCE(summary, ''))) STORED, created_at TIMESTAMP WITH TIME ZONE 
DEFAULT timezone('utc'::text, now()) NOT NULL,
 
     -- Add a unique constraint to prevent duplicate chunks for the same URL
@@ -240,6 +245,9 @@ CREATE TABLE IF NOT EXISTS archon_code_examples (
 CREATE INDEX ON archon_code_examples USING ivfflat (embedding vector_cosine_ops);
 CREATE INDEX idx_archon_code_examples_metadata ON archon_code_examples USING GIN (metadata);
 CREATE INDEX idx_archon_code_examples_source_id ON archon_code_examples (source_id);
+CREATE INDEX idx_archon_code_examples_content_search ON archon_code_examples USING GIN (content_search_vector);
+CREATE INDEX idx_archon_code_examples_content_trgm ON archon_code_examples USING GIN (content gin_trgm_ops);
+CREATE INDEX idx_archon_code_examples_summary_trgm ON archon_code_examples USING GIN (summary gin_trgm_ops);
 
 -- =====================================================
 -- SECTION 5: SEARCH FUNCTIONS
@@ -319,6 +327,196 @@ BEGIN
 END;
 $$;
 
+-- =====================================================
+-- SECTION 5B: HYBRID SEARCH FUNCTIONS WITH TS_VECTOR
+-- =====================================================
+
+-- Hybrid search over archon_crawled_pages: merges the top match_count rows by cosine
+-- similarity with the top match_count full-text matches (one row per id) and returns
+-- the match_count best. similarity is 1 - cosine distance when the vector arm found
+-- the row, else the raw ts_rank_cd (a different scale); match_type names the arm(s).
+CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages(
+    query_embedding vector(1536),
+    query_text TEXT,
+    match_count INT DEFAULT 10,
+    filter JSONB DEFAULT '{}'::jsonb,
+    source_filter TEXT DEFAULT NULL
+)
+RETURNS TABLE (
+    id BIGINT,
+    url VARCHAR,
+    chunk_number INTEGER,
+    content TEXT,
+    metadata JSONB,
+    source_id TEXT,
+    similarity FLOAT,
+    match_type TEXT
+)
+LANGUAGE plpgsql
+AS $$
+DECLARE
+    -- Parsed once so the match predicate and the rank share the same tsquery
+    search_query tsquery := plainto_tsquery('english', query_text);
+BEGIN
+    RETURN QUERY
+    WITH vector_results AS (
+        -- Nearest rows by cosine distance (<=>), restricted by filter/source_filter
+        SELECT
+            cp.id,
+            cp.url,
+            cp.chunk_number,
+            cp.content,
+            cp.metadata,
+            cp.source_id,
+            1 - (cp.embedding <=> query_embedding) AS vector_sim
+        FROM archon_crawled_pages cp
+        WHERE cp.metadata @> filter
+            AND (source_filter IS NULL OR cp.source_id = source_filter)
+            AND cp.embedding IS NOT NULL
+        ORDER BY cp.embedding <=> query_embedding
+        LIMIT match_count
+    ),
+    text_results AS (
+        -- Full-text matches ranked by ts_rank_cd; id breaks rank ties deterministically
+        SELECT
+            cp.id,
+            cp.url,
+            cp.chunk_number,
+            cp.content,
+            cp.metadata,
+            cp.source_id,
+            ts_rank_cd(cp.content_search_vector, search_query) AS text_sim
+        FROM archon_crawled_pages cp
+        WHERE cp.metadata @> filter
+            AND (source_filter IS NULL OR cp.source_id = source_filter)
+            AND cp.content_search_vector @@ search_query
+        ORDER BY text_sim DESC, cp.id
+        LIMIT match_count
+    ),
+    combined_results AS (
+        -- One row per id; a row found by both arms keeps the vector-side values
+        SELECT
+            COALESCE(v.id, t.id) AS id,
+            COALESCE(v.url, t.url) AS url,
+            COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
+            COALESCE(v.content, t.content) AS content,
+            COALESCE(v.metadata, t.metadata) AS metadata,
+            COALESCE(v.source_id, t.source_id) AS source_id,
+            -- Cosine similarity when the vector arm matched, otherwise the text rank
+            COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
+            CASE
+                WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
+                WHEN v.id IS NOT NULL THEN 'vector'
+                ELSE 'keyword'
+            END AS match_type
+        FROM vector_results v
+        FULL OUTER JOIN text_results t ON v.id = t.id
+    )
+    SELECT cr.id, cr.url, cr.chunk_number, cr.content,
+           cr.metadata, cr.source_id, cr.similarity, cr.match_type
+    FROM combined_results cr
+    ORDER BY cr.similarity DESC, cr.id
+    LIMIT match_count;
+END;
+$$;
+
+-- Hybrid search over archon_code_examples: same merge as the crawled-pages function,
+-- but the full-text arm matches content_search_vector built from content and summary,
+-- and summary is returned. similarity is 1 - cosine distance for vector hits, else the
+-- raw ts_rank_cd (a different scale); match_type is 'hybrid', 'vector' or 'keyword'.
+CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples(
+    query_embedding vector(1536),
+    query_text TEXT,
+    match_count INT DEFAULT 10,
+    filter JSONB DEFAULT '{}'::jsonb,
+    source_filter TEXT DEFAULT NULL
+)
+RETURNS TABLE (
+    id BIGINT,
+    url VARCHAR,
+    chunk_number INTEGER,
+    content TEXT,
+    summary TEXT,
+    metadata JSONB,
+    source_id TEXT,
+    similarity FLOAT,
+    match_type TEXT
+)
+LANGUAGE plpgsql
+AS $$
+DECLARE
+    -- Parsed once so the match predicate and the rank share the same tsquery
+    search_query tsquery := plainto_tsquery('english', query_text);
+BEGIN
+    RETURN QUERY
+    WITH vector_results AS (
+        -- Nearest rows by cosine distance (<=>), restricted by filter/source_filter
+        SELECT
+            ce.id,
+            ce.url,
+            ce.chunk_number,
+            ce.content,
+            ce.summary,
+            ce.metadata,
+            ce.source_id,
+            1 - (ce.embedding <=> query_embedding) AS vector_sim
+        FROM archon_code_examples ce
+        WHERE ce.metadata @> filter
+            AND (source_filter IS NULL OR ce.source_id = source_filter)
+            AND ce.embedding IS NOT NULL
+        ORDER BY ce.embedding <=> query_embedding
+        LIMIT match_count
+    ),
+    text_results AS (
+        -- Full-text matches ranked by ts_rank_cd; id breaks rank ties deterministically
+        SELECT
+            ce.id,
+            ce.url,
+            ce.chunk_number,
+            ce.content,
+            ce.summary,
+            ce.metadata,
+            ce.source_id,
+            ts_rank_cd(ce.content_search_vector, search_query) AS text_sim
+        FROM archon_code_examples ce
+        WHERE ce.metadata @> filter
+            AND (source_filter IS NULL OR ce.source_id = source_filter)
+            AND ce.content_search_vector @@ search_query
+        ORDER BY text_sim DESC, ce.id
+        LIMIT match_count
+    ),
+    combined_results AS (
+        -- One row per id; a row found by both arms keeps the vector-side values
+        SELECT
+            COALESCE(v.id, t.id) AS id,
+            COALESCE(v.url, t.url) AS url,
+            COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
+            COALESCE(v.content, t.content) AS content,
+            COALESCE(v.summary, t.summary) AS summary,
+            COALESCE(v.metadata, t.metadata) AS metadata,
+            COALESCE(v.source_id, t.source_id) AS source_id,
+            -- Cosine similarity when the vector arm matched, otherwise the text rank
+            COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
+            CASE
+                WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
+                WHEN v.id IS NOT NULL THEN 'vector'
+                ELSE 'keyword'
+            END AS match_type
+        FROM vector_results v
+        FULL OUTER JOIN text_results t ON v.id = t.id
+    )
+    SELECT cr.id, cr.url, cr.chunk_number, cr.content, cr.summary,
+           cr.metadata, cr.source_id, cr.similarity, cr.match_type
+    FROM combined_results cr
+    ORDER BY cr.similarity DESC, cr.id
+    LIMIT match_count;
+END;
+$$;
+
+-- Document the functions in the catalog
+COMMENT ON FUNCTION hybrid_search_archon_crawled_pages IS 'Hybrid search over crawled pages: merges vector-similarity and full-text (tsvector) matches and labels each row with match_type (hybrid, vector or keyword)';
+COMMENT ON FUNCTION hybrid_search_archon_code_examples IS 'Hybrid search over code examples: merges vector-similarity and full-text (tsvector) matches on content and summary; labels each row with match_type';
+
 -- =====================================================
 -- SECTION 6: RLS POLICIES FOR KNOWLEDGE BASE
 -- =====================================================
diff --git a/python/src/server/services/search/hybrid_search_strategy.py b/python/src/server/services/search/hybrid_search_strategy.py
index b3995b84..caad26e6 100644
--- a/python/src/server/services/search/hybrid_search_strategy.py
+++ b/python/src/server/services/search/hybrid_search_strategy.py
@@ -1,14 +1,14 @@
 """
 Hybrid Search Strategy
 
-Implements hybrid search combining vector similarity search with keyword search
-for improved recall and precision in document and code example retrieval.
+Implements hybrid search combining vector similarity search with full-text search
+using PostgreSQL's ts_vector for improved recall and precision in document and
+code example retrieval.
 
 Strategy combines:
 1. Vector/semantic search for conceptual matches
-2. Keyword search for exact term matches
-3. Score boosting for results appearing in both searches
-4. Intelligent result merging with preference ordering
+2. Full-text search using ts_vector for efficient keyword matching
+3. 
Returns union of both result sets for maximum coverage """ from typing import Any @@ -17,129 +17,17 @@ from supabase import Client from ...config.logfire_config import get_logger, safe_span from ..embeddings.embedding_service import create_embedding -from .keyword_extractor import build_search_terms, extract_keywords logger = get_logger(__name__) class HybridSearchStrategy: - """Strategy class implementing hybrid search combining vector and keyword search""" + """Strategy class implementing hybrid search combining vector and full-text search""" def __init__(self, supabase_client: Client, base_strategy): self.supabase_client = supabase_client self.base_strategy = base_strategy - async def keyword_search( - self, - query: str, - match_count: int, - table_name: str = "documents", - filter_metadata: dict | None = None, - select_fields: str | None = None, - ) -> list[dict[str, Any]]: - """ - Perform intelligent keyword search using extracted keywords. - - This method extracts keywords from the query and searches for documents - containing any of those keywords, ranking results by the number of matches. 
- - Args: - query: The search query text - match_count: Number of results to return - table_name: The table to search (documents, archon_crawled_pages, or archon_code_examples) - filter_metadata: Optional metadata filters - select_fields: Optional specific fields to select (default: all) - - Returns: - List of matching documents ranked by keyword relevance - """ - try: - # Extract keywords from the query - keywords = extract_keywords(query, min_length=2, max_keywords=8) - - if not keywords: - # Fallback to original query if no keywords extracted - keywords = [query] - - logger.debug(f"Extracted keywords from '{query}': {keywords}") - - # Build search terms including variations - search_terms = build_search_terms(keywords)[:12] # Limit total search terms - - # For now, we'll search for documents containing ANY of the keywords - # and then rank them by how many keywords they contain - all_results = [] - seen_ids = set() - - # Search for each keyword individually to get better coverage - for keyword in search_terms[:6]: # Limit to avoid too many queries - # Build the query with appropriate fields - if select_fields: - query_builder = self.supabase_client.from_(table_name).select(select_fields) - else: - query_builder = self.supabase_client.from_(table_name).select("*") - - # Add keyword search condition with wildcards - search_pattern = f"%{keyword}%" - - # Handle different search patterns based on table - if table_name == "archon_code_examples": - # Search both content and summary for code examples - query_builder = query_builder.or_( - f"content.ilike.{search_pattern},summary.ilike.{search_pattern}" - ) - else: - query_builder = query_builder.ilike("content", search_pattern) - - # Add metadata filters if provided - if filter_metadata: - if "source" in filter_metadata and table_name in ["documents", "crawled_pages"]: - query_builder = query_builder.eq("source_id", filter_metadata["source"]) - elif "source_id" in filter_metadata: - query_builder = 
query_builder.eq("source_id", filter_metadata["source_id"]) - - # Execute query with limit - response = query_builder.limit(match_count * 2).execute() - - if response.data: - for result in response.data: - result_id = result.get("id") - if result_id and result_id not in seen_ids: - # Count how many keywords match in this result - content = result.get("content", "").lower() - summary = ( - result.get("summary", "").lower() - if table_name == "archon_code_examples" - else "" - ) - combined_text = f"{content} {summary}" - - # Count keyword matches - match_score = sum(1 for kw in keywords if kw.lower() in combined_text) - - # Add match score to result - result["keyword_match_score"] = match_score - result["matched_keyword"] = keyword - - all_results.append(result) - seen_ids.add(result_id) - - # Sort results by keyword match score (descending) - all_results.sort(key=lambda x: x.get("keyword_match_score", 0), reverse=True) - - # Return top N results - final_results = all_results[:match_count] - - logger.debug( - f"Keyword search found {len(final_results)} results from {len(all_results)} total matches" - ) - - return final_results - - except Exception as e: - logger.error(f"Keyword search failed: {e}") - return [] - async def search_documents_hybrid( self, query: str, @@ -148,7 +36,8 @@ class HybridSearchStrategy: filter_metadata: dict | None = None, ) -> list[dict[str, Any]]: """ - Perform hybrid search on archon_crawled_pages table combining vector and keyword search. + Perform hybrid search on archon_crawled_pages table using the PostgreSQL + hybrid search function that combines vector and full-text search. Args: query: Original search query text @@ -157,41 +46,59 @@ class HybridSearchStrategy: filter_metadata: Optional metadata filter dict Returns: - List of matching documents with boosted scores for dual matches + List of matching documents from both vector and text search """ with safe_span("hybrid_search_documents") as span: try: - # 1. 
Get vector search results using base strategy - vector_results = await self.base_strategy.vector_search( - query_embedding=query_embedding, - match_count=match_count * 2, # Get more for filtering - filter_metadata=filter_metadata, - table_rpc="match_archon_crawled_pages", - ) + # Prepare filter and source parameters + filter_json = filter_metadata or {} + source_filter = filter_json.pop("source", None) if "source" in filter_json else None - # 2. Get keyword search results - keyword_results = await self.keyword_search( - query=query, - match_count=match_count * 2, - table_name="archon_crawled_pages", - filter_metadata=filter_metadata, - select_fields="id, url, chunk_number, content, metadata, source_id", - ) + # Call the hybrid search PostgreSQL function + response = self.supabase_client.rpc( + "hybrid_search_archon_crawled_pages", + { + "query_embedding": query_embedding, + "query_text": query, + "match_count": match_count, + "filter": filter_json, + "source_filter": source_filter, + }, + ).execute() - # 3. 
Combine and merge results intelligently - combined_results = self._merge_search_results( - vector_results, keyword_results, match_count - ) + if not response.data: + logger.debug("No results from hybrid search") + return [] - span.set_attribute("vector_results_count", len(vector_results)) - span.set_attribute("keyword_results_count", len(keyword_results)) - span.set_attribute("final_results_count", len(combined_results)) + # Format results to match expected structure + results = [] + for row in response.data: + result = { + "id": row["id"], + "url": row["url"], + "chunk_number": row["chunk_number"], + "content": row["content"], + "metadata": row["metadata"], + "source_id": row["source_id"], + "similarity": row["similarity"], + "match_type": row["match_type"], + } + results.append(result) + + span.set_attribute("results_count", len(results)) + + # Log match type distribution for debugging + match_types = {} + for r in results: + mt = r.get("match_type", "unknown") + match_types[mt] = match_types.get(mt, 0) + 1 logger.debug( - f"Hybrid document search: {len(vector_results)} vector + {len(keyword_results)} keyword → {len(combined_results)} final" + f"Hybrid search returned {len(results)} results. " + f"Match types: {match_types}" ) - return combined_results + return results except Exception as e: logger.error(f"Hybrid document search failed: {e}") @@ -206,7 +113,8 @@ class HybridSearchStrategy: source_id: str | None = None, ) -> list[dict[str, Any]]: """ - Perform hybrid search on archon_code_examples table combining vector and keyword search. + Perform hybrid search on archon_code_examples table using the PostgreSQL + hybrid search function that combines vector and full-text search. 
Args: query: Search query text @@ -215,147 +123,72 @@ class HybridSearchStrategy: source_id: Optional source ID to filter results Returns: - List of matching code examples with boosted scores for dual matches + List of matching code examples from both vector and text search """ with safe_span("hybrid_search_code_examples") as span: try: - # Create query embedding (no enhancement needed) + # Create query embedding query_embedding = await create_embedding(query) if not query_embedding: logger.error("Failed to create embedding for code example query") return [] - # 1. Get vector search results using base strategy - combined_filter = filter_metadata or {} - if source_id: - combined_filter["source"] = source_id + # Prepare filter and source parameters + filter_json = filter_metadata or {} + # Use source_id parameter if provided, otherwise check filter_metadata + final_source_filter = source_id + if not final_source_filter and "source" in filter_json: + final_source_filter = filter_json.pop("source") - vector_results = await self.base_strategy.vector_search( - query_embedding=query_embedding, - match_count=match_count * 2, - filter_metadata=combined_filter, - table_rpc="match_archon_code_examples", - ) + # Call the hybrid search PostgreSQL function + response = self.supabase_client.rpc( + "hybrid_search_archon_code_examples", + { + "query_embedding": query_embedding, + "query_text": query, + "match_count": match_count, + "filter": filter_json, + "source_filter": final_source_filter, + }, + ).execute() - # 2. 
Get keyword search results - keyword_filter = filter_metadata or {} - if source_id: - keyword_filter["source_id"] = source_id + if not response.data: + logger.debug("No results from hybrid code search") + return [] - keyword_results = await self.keyword_search( - query=query, - match_count=match_count * 2, - table_name="archon_code_examples", - filter_metadata=keyword_filter, - select_fields="id, url, chunk_number, content, summary, metadata, source_id", - ) + # Format results to match expected structure + results = [] + for row in response.data: + result = { + "id": row["id"], + "url": row["url"], + "chunk_number": row["chunk_number"], + "content": row["content"], + "summary": row["summary"], + "metadata": row["metadata"], + "source_id": row["source_id"], + "similarity": row["similarity"], + "match_type": row["match_type"], + } + results.append(result) - # 3. Combine and merge results intelligently - combined_results = self._merge_search_results( - vector_results, keyword_results, match_count - ) + span.set_attribute("results_count", len(results)) - span.set_attribute("vector_results_count", len(vector_results)) - span.set_attribute("keyword_results_count", len(keyword_results)) - span.set_attribute("final_results_count", len(combined_results)) + # Log match type distribution for debugging + match_types = {} + for r in results: + mt = r.get("match_type", "unknown") + match_types[mt] = match_types.get(mt, 0) + 1 logger.debug( - f"Hybrid code search: {len(vector_results)} vector + {len(keyword_results)} keyword → {len(combined_results)} final" + f"Hybrid code search returned {len(results)} results. 
" + f"Match types: {match_types}" ) - return combined_results + return results except Exception as e: logger.error(f"Hybrid code example search failed: {e}") span.set_attribute("error", str(e)) - return [] - - def _merge_search_results( - self, - vector_results: list[dict[str, Any]], - keyword_results: list[dict[str, Any]], - match_count: int, - ) -> list[dict[str, Any]]: - """ - Intelligently merge vector and keyword search results with preference ordering. - - Priority order: - 1. Results appearing in BOTH searches (highest relevance) - get score boost - 2. Vector-only results (semantic matches) - 3. Keyword-only results (exact term matches) - - Args: - vector_results: Results from vector/semantic search - keyword_results: Results from keyword search - match_count: Maximum number of final results to return - - Returns: - Merged and prioritized list of results - """ - seen_ids: set[str] = set() - combined_results: list[dict[str, Any]] = [] - - # Create lookup for vector results by ID for efficient matching - vector_lookup = {r.get("id"): r for r in vector_results if r.get("id")} - - # Phase 1: Add items that appear in BOTH searches (boost their scores) - for keyword_result in keyword_results: - result_id = keyword_result.get("id") - if result_id and result_id in vector_lookup and result_id not in seen_ids: - vector_result = vector_lookup[result_id] - # Boost similarity score for dual matches (cap at 1.0) - boosted_similarity = min(1.0, vector_result.get("similarity", 0) * 1.2) - vector_result["similarity"] = boosted_similarity - vector_result["match_type"] = "hybrid" # Mark as hybrid match - - combined_results.append(vector_result) - seen_ids.add(result_id) - - # Phase 2: Add remaining vector results (semantic matches without exact keywords) - for vector_result in vector_results: - result_id = vector_result.get("id") - if result_id and result_id not in seen_ids and len(combined_results) < match_count: - vector_result["match_type"] = "vector" - 
combined_results.append(vector_result) - seen_ids.add(result_id) - - # Phase 3: Add pure keyword matches if we need more results - for keyword_result in keyword_results: - result_id = keyword_result.get("id") - if result_id and result_id not in seen_ids and len(combined_results) < match_count: - # Convert keyword result to match vector result format - # Use keyword match score to influence similarity score - keyword_score = keyword_result.get("keyword_match_score", 1) - # Scale keyword score to similarity range (0.3 to 0.7 based on matches) - scaled_similarity = min(0.7, 0.3 + (keyword_score * 0.1)) - - standardized_result = { - "id": keyword_result["id"], - "url": keyword_result["url"], - "chunk_number": keyword_result["chunk_number"], - "content": keyword_result["content"], - "metadata": keyword_result["metadata"], - "source_id": keyword_result["source_id"], - "similarity": scaled_similarity, - "match_type": "keyword", - "keyword_match_score": keyword_score, - } - - # Include summary if present (for code examples) - if "summary" in keyword_result: - standardized_result["summary"] = keyword_result["summary"] - - combined_results.append(standardized_result) - seen_ids.add(result_id) - - # Return only up to the requested match count - final_results = combined_results[:match_count] - - logger.debug( - f"Merge stats - Hybrid: {sum(1 for r in final_results if r.get('match_type') == 'hybrid')}, " - f"Vector: {sum(1 for r in final_results if r.get('match_type') == 'vector')}, " - f"Keyword: {sum(1 for r in final_results if r.get('match_type') == 'keyword')}" - ) - - return final_results + return [] \ No newline at end of file diff --git a/python/src/server/services/search/rag_service.py b/python/src/server/services/search/rag_service.py index cdc89c23..cf89cffe 100644 --- a/python/src/server/services/search/rag_service.py +++ b/python/src/server/services/search/rag_service.py @@ -204,10 +204,19 @@ class RAGService: use_hybrid_search = 
self.get_bool_setting("USE_HYBRID_SEARCH", False) use_reranking = self.get_bool_setting("USE_RERANKING", False) + # If reranking is enabled, fetch more candidates for the reranker to evaluate + # This allows the reranker to see a broader set of results + search_match_count = match_count + if use_reranking and self.reranking_strategy: + # Fetch 5x the requested amount when reranking is enabled + # The reranker will select the best from this larger pool + search_match_count = match_count * 5 + logger.debug(f"Reranking enabled - fetching {search_match_count} candidates for {match_count} final results") + # Step 1 & 2: Get results (with hybrid search if enabled) results = await self.search_documents( query=query, - match_count=match_count, + match_count=search_match_count, filter_metadata=filter_metadata, use_hybrid_search=use_hybrid_search, ) @@ -234,14 +243,18 @@ class RAGService: reranking_applied = False if self.reranking_strategy and formatted_results: try: + # Pass top_k to limit results to the originally requested count formatted_results = await self.reranking_strategy.rerank_results( - query, formatted_results, content_key="content" + query, formatted_results, content_key="content", top_k=match_count ) reranking_applied = True - logger.debug(f"Reranking applied to {len(formatted_results)} results") + logger.debug(f"Reranking applied: {search_match_count} candidates -> {len(formatted_results)} final results") except Exception as e: logger.warning(f"Reranking failed: {e}") reranking_applied = False + # If reranking fails but we fetched extra results, trim to requested count + if len(formatted_results) > match_count: + formatted_results = formatted_results[:match_count] # Build response response_data = { @@ -313,6 +326,12 @@ class RAGService: use_hybrid_search = self.get_bool_setting("USE_HYBRID_SEARCH", False) use_reranking = self.get_bool_setting("USE_RERANKING", False) + # If reranking is enabled, fetch more candidates + search_match_count = match_count + if 
use_reranking and self.reranking_strategy: + search_match_count = match_count * 5 + logger.debug(f"Reranking enabled for code search - fetching {search_match_count} candidates") + # Prepare filter filter_metadata = {"source": source_id} if source_id and source_id.strip() else None @@ -320,7 +339,7 @@ class RAGService: # Use hybrid search for code examples results = await self.hybrid_strategy.search_code_examples_hybrid( query=query, - match_count=match_count, + match_count=search_match_count, filter_metadata=filter_metadata, source_id=source_id, ) @@ -328,7 +347,7 @@ class RAGService: # Use standard agentic search results = await self.agentic_strategy.search_code_examples( query=query, - match_count=match_count, + match_count=search_match_count, filter_metadata=filter_metadata, source_id=source_id, ) @@ -337,10 +356,14 @@ class RAGService: if self.reranking_strategy and results: try: results = await self.reranking_strategy.rerank_results( - query, results, content_key="content" + query, results, content_key="content", top_k=match_count ) + logger.debug(f"Code reranking applied: {search_match_count} candidates -> {len(results)} final results") except Exception as e: logger.warning(f"Code reranking failed: {e}") + # If reranking fails but we fetched extra results, trim to requested count + if len(results) > match_count: + results = results[:match_count] # Format results formatted_results = [] diff --git a/python/tests/test_rag_simple.py b/python/tests/test_rag_simple.py index e8322e29..c9cecfdc 100644 --- a/python/tests/test_rag_simple.py +++ b/python/tests/test_rag_simple.py @@ -162,38 +162,6 @@ class TestHybridSearchCore: """Test hybrid strategy initializes""" assert hybrid_strategy is not None assert hasattr(hybrid_strategy, "search_documents_hybrid") - assert hasattr(hybrid_strategy, "_merge_search_results") - - def test_merge_results_functionality(self, hybrid_strategy): - """Test result merging logic""" - vector_results = [ - { - "id": "1", - "content": "Vector 
result", - "similarity": 0.9, - "url": "test1.com", - "chunk_number": 1, - "metadata": {}, - "source_id": "src1", - } - ] - keyword_results = [ - { - "id": "2", - "content": "Keyword result", - "url": "test2.com", - "chunk_number": 1, - "metadata": {}, - "source_id": "src2", - } - ] - - merged = hybrid_strategy._merge_search_results( - vector_results, keyword_results, match_count=5 - ) - - assert isinstance(merged, list) - assert len(merged) <= 5 class TestRerankingCore: diff --git a/python/tests/test_rag_strategies.py b/python/tests/test_rag_strategies.py index ff9dc90e..27fbd67b 100644 --- a/python/tests/test_rag_strategies.py +++ b/python/tests/test_rag_strategies.py @@ -168,42 +168,6 @@ class TestHybridSearchStrategy: assert hasattr(hybrid_strategy, "search_documents_hybrid") assert hasattr(hybrid_strategy, "search_code_examples_hybrid") - def test_merge_search_results(self, hybrid_strategy): - """Test search result merging""" - vector_results = [ - { - "id": "1", - "content": "Vector result 1", - "score": 0.9, - "url": "url1", - "chunk_number": 1, - "metadata": {}, - "source_id": "source1", - "similarity": 0.9, - } - ] - keyword_results = [ - { - "id": "2", - "content": "Keyword result 1", - "score": 0.8, - "url": "url2", - "chunk_number": 1, - "metadata": {}, - "source_id": "source2", - } - ] - - merged = hybrid_strategy._merge_search_results( - vector_results, keyword_results, match_count=5 - ) - - assert isinstance(merged, list) - assert len(merged) <= 5 - # Should contain results from both sources - if merged: - assert any("Vector result" in str(r) or "Keyword result" in str(r) for r in merged) - class TestRerankingStrategy: """Test reranking strategy implementation"""