Enhanced the hybrid search strategy with tsvector keyword matching (#539)

This commit is contained in:
Cole Medin
2025-09-10 05:23:49 -06:00
committed by GitHub
parent 012d2c58ed
commit 926b6f5a7b
7 changed files with 568 additions and 341 deletions

View File

@@ -133,6 +133,10 @@ BEGIN
DROP FUNCTION IF EXISTS match_archon_crawled_pages(vector, int, jsonb, text) CASCADE;
DROP FUNCTION IF EXISTS match_archon_code_examples(vector, int, jsonb, text) CASCADE;
-- Hybrid search functions (with ts_vector support)
-- Argument types follow the functions' parameter order:
-- (query_embedding, query_text, match_count, filter, source_filter)
DROP FUNCTION IF EXISTS hybrid_search_archon_crawled_pages(vector, text, int, jsonb, text) CASCADE;
DROP FUNCTION IF EXISTS hybrid_search_archon_code_examples(vector, text, int, jsonb, text) CASCADE;
-- Search functions (old without prefix)
DROP FUNCTION IF EXISTS match_crawled_pages(vector, int, jsonb, text) CASCADE;
DROP FUNCTION IF EXISTS match_code_examples(vector, int, jsonb, text) CASCADE;

View File

@@ -0,0 +1,237 @@
-- =====================================================
-- Add Hybrid Search with ts_vector Support
-- =====================================================
-- This migration adds efficient text search capabilities using PostgreSQL's
-- full-text search features (ts_vector) to enable better keyword matching
-- in hybrid search operations.
-- =====================================================
-- pg_trgm supplies the gin_trgm_ops operator class used by the trigram indexes below
CREATE EXTENSION IF NOT EXISTS pg_trgm;

-- =====================================================
-- SECTION 1: ADD TEXT SEARCH COLUMNS AND INDEXES
-- =====================================================

-- Stored, generated tsvector over the page content (English configuration).
-- IF NOT EXISTS keeps the statement re-runnable.
ALTER TABLE archon_crawled_pages
    ADD COLUMN IF NOT EXISTS content_search_vector tsvector
    GENERATED ALWAYS AS (
        to_tsvector('english', content)
    ) STORED;

-- Same for code examples, but the vector covers content plus summary;
-- a NULL summary is treated as an empty string.
ALTER TABLE archon_code_examples
    ADD COLUMN IF NOT EXISTS content_search_vector tsvector
    GENERATED ALWAYS AS (
        to_tsvector('english', content || ' ' || COALESCE(summary, ''))
    ) STORED;

-- GIN indexes backing the @@ full-text match on the generated columns
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_content_search
    ON archon_crawled_pages
    USING GIN (content_search_vector);

CREATE INDEX IF NOT EXISTS idx_archon_code_examples_content_search
    ON archon_code_examples
    USING GIN (content_search_vector);

-- Trigram GIN indexes on the raw text columns (fuzzy / partial matching)
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_content_trgm
    ON archon_crawled_pages
    USING GIN (content gin_trgm_ops);

CREATE INDEX IF NOT EXISTS idx_archon_code_examples_content_trgm
    ON archon_code_examples
    USING GIN (content gin_trgm_ops);

CREATE INDEX IF NOT EXISTS idx_archon_code_examples_summary_trgm
    ON archon_code_examples
    USING GIN (summary gin_trgm_ops);
-- =====================================================
-- SECTION 2: HYBRID SEARCH FUNCTIONS
-- =====================================================
-- Hybrid search function for archon_crawled_pages
--
-- Runs two independent top-N searches and merges them on row id:
--   * vector:  rows ordered by pgvector distance (<=>) to query_embedding
--   * keyword: rows whose content_search_vector matches query_text
--
-- Parameters:
--   query_embedding  embedding of the query
--   query_text       raw query text, parsed with plainto_tsquery('english', ...)
--   match_count      row cap for each branch and for the merged result
--   filter           JSONB the row metadata must contain (@>); '{}' matches all
--   source_filter    exact source_id to restrict to; NULL disables the restriction
--
-- Returns one row per matching chunk. similarity is 1 - distance when the
-- vector branch found the row, otherwise the ts_rank_cd score. match_type is
-- 'hybrid' (both branches), 'vector', or 'keyword'.
--
-- NOTE(review): vector_sim and text_sim are different measures (distance-based
-- similarity vs. ts_rank_cd with default normalization), so the final ordering
-- compares values that are not on a common scale -- confirm this is intended.
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages(
    query_embedding vector(1536),
    query_text TEXT,
    match_count INT DEFAULT 10,
    filter JSONB DEFAULT '{}'::jsonb,
    source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
    id BIGINT,
    url VARCHAR,
    chunk_number INTEGER,
    content TEXT,
    metadata JSONB,
    source_id TEXT,
    similarity FLOAT,
    match_type TEXT
)
LANGUAGE plpgsql
-- Read-only: label it STABLE instead of inheriting the VOLATILE default.
STABLE
AS $$
DECLARE
    -- Parse the keyword query once; it is used for both matching and ranking.
    parsed_query tsquery := plainto_tsquery('english', query_text);
BEGIN
    RETURN QUERY
    WITH vector_results AS (
        -- Vector similarity search
        SELECT
            cp.id,
            cp.url,
            cp.chunk_number,
            cp.content,
            cp.metadata,
            cp.source_id,
            1 - (cp.embedding <=> query_embedding) AS vector_sim
        FROM archon_crawled_pages cp
        WHERE cp.metadata @> filter
          AND (source_filter IS NULL OR cp.source_id = source_filter)
          AND cp.embedding IS NOT NULL
        -- Kept as the bare distance expression (no secondary key) so a
        -- pgvector index on embedding can serve ORDER BY ... LIMIT.
        ORDER BY cp.embedding <=> query_embedding
        LIMIT match_count
    ),
    text_results AS (
        -- Full-text search with ranking
        SELECT
            cp.id,
            cp.url,
            cp.chunk_number,
            cp.content,
            cp.metadata,
            cp.source_id,
            ts_rank_cd(cp.content_search_vector, parsed_query) AS text_sim
        FROM archon_crawled_pages cp
        WHERE cp.metadata @> filter
          AND (source_filter IS NULL OR cp.source_id = source_filter)
          AND cp.content_search_vector @@ parsed_query
        -- id breaks rank ties so the LIMIT cut-off is deterministic
        ORDER BY text_sim DESC, cp.id
        LIMIT match_count
    ),
    combined_results AS (
        -- Combine results from both searches; a row found by both appears once
        SELECT
            COALESCE(v.id, t.id) AS id,
            COALESCE(v.url, t.url) AS url,
            COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
            COALESCE(v.content, t.content) AS content,
            COALESCE(v.metadata, t.metadata) AS metadata,
            COALESCE(v.source_id, t.source_id) AS source_id,
            -- Use vector similarity if available, otherwise text similarity
            COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
            -- Determine match type
            CASE
                WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
                WHEN v.id IS NOT NULL THEN 'vector'
                ELSE 'keyword'
            END AS match_type
        FROM vector_results v
        FULL OUTER JOIN text_results t ON v.id = t.id
    )
    -- Explicit, qualified column list: keeps the result aligned with
    -- RETURNS TABLE and avoids clashing with the OUT parameter names.
    SELECT
        cr.id,
        cr.url,
        cr.chunk_number,
        cr.content,
        cr.metadata,
        cr.source_id,
        cr.similarity,
        cr.match_type
    FROM combined_results cr
    ORDER BY cr.similarity DESC, cr.id
    LIMIT match_count;
END;
$$;
-- Hybrid search function for archon_code_examples
--
-- Runs two independent top-N searches and merges them on row id:
--   * vector:  rows ordered by pgvector distance (<=>) to query_embedding
--   * keyword: rows whose content_search_vector matches query_text
--
-- Parameters:
--   query_embedding  embedding of the query
--   query_text       raw query text, parsed with plainto_tsquery('english', ...)
--   match_count      row cap for each branch and for the merged result
--   filter           JSONB the row metadata must contain (@>); '{}' matches all
--   source_filter    exact source_id to restrict to; NULL disables the restriction
--
-- Returns one row per matching example. similarity is 1 - distance when the
-- vector branch found the row, otherwise the ts_rank_cd score. match_type is
-- 'hybrid' (both branches), 'vector', or 'keyword'.
--
-- NOTE(review): vector_sim and text_sim are different measures (distance-based
-- similarity vs. ts_rank_cd with default normalization), so the final ordering
-- compares values that are not on a common scale -- confirm this is intended.
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples(
    query_embedding vector(1536),
    query_text TEXT,
    match_count INT DEFAULT 10,
    filter JSONB DEFAULT '{}'::jsonb,
    source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
    id BIGINT,
    url VARCHAR,
    chunk_number INTEGER,
    content TEXT,
    summary TEXT,
    metadata JSONB,
    source_id TEXT,
    similarity FLOAT,
    match_type TEXT
)
LANGUAGE plpgsql
-- Read-only: label it STABLE instead of inheriting the VOLATILE default.
STABLE
AS $$
DECLARE
    -- Parse the keyword query once; it is used for both matching and ranking.
    parsed_query tsquery := plainto_tsquery('english', query_text);
BEGIN
    RETURN QUERY
    WITH vector_results AS (
        -- Vector similarity search
        SELECT
            ce.id,
            ce.url,
            ce.chunk_number,
            ce.content,
            ce.summary,
            ce.metadata,
            ce.source_id,
            1 - (ce.embedding <=> query_embedding) AS vector_sim
        FROM archon_code_examples ce
        WHERE ce.metadata @> filter
          AND (source_filter IS NULL OR ce.source_id = source_filter)
          AND ce.embedding IS NOT NULL
        -- Kept as the bare distance expression (no secondary key) so a
        -- pgvector index on embedding can serve ORDER BY ... LIMIT.
        ORDER BY ce.embedding <=> query_embedding
        LIMIT match_count
    ),
    text_results AS (
        -- Full-text search with ranking over content_search_vector
        SELECT
            ce.id,
            ce.url,
            ce.chunk_number,
            ce.content,
            ce.summary,
            ce.metadata,
            ce.source_id,
            ts_rank_cd(ce.content_search_vector, parsed_query) AS text_sim
        FROM archon_code_examples ce
        WHERE ce.metadata @> filter
          AND (source_filter IS NULL OR ce.source_id = source_filter)
          AND ce.content_search_vector @@ parsed_query
        -- id breaks rank ties so the LIMIT cut-off is deterministic
        ORDER BY text_sim DESC, ce.id
        LIMIT match_count
    ),
    combined_results AS (
        -- Combine results from both searches; a row found by both appears once
        SELECT
            COALESCE(v.id, t.id) AS id,
            COALESCE(v.url, t.url) AS url,
            COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
            COALESCE(v.content, t.content) AS content,
            COALESCE(v.summary, t.summary) AS summary,
            COALESCE(v.metadata, t.metadata) AS metadata,
            COALESCE(v.source_id, t.source_id) AS source_id,
            -- Use vector similarity if available, otherwise text similarity
            COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
            -- Determine match type
            CASE
                WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
                WHEN v.id IS NOT NULL THEN 'vector'
                ELSE 'keyword'
            END AS match_type
        FROM vector_results v
        FULL OUTER JOIN text_results t ON v.id = t.id
    )
    -- Explicit, qualified column list: keeps the result aligned with
    -- RETURNS TABLE and avoids clashing with the OUT parameter names.
    SELECT
        cr.id,
        cr.url,
        cr.chunk_number,
        cr.content,
        cr.summary,
        cr.metadata,
        cr.source_id,
        cr.similarity,
        cr.match_type
    FROM combined_results cr
    ORDER BY cr.similarity DESC, cr.id
    LIMIT match_count;
END;
$$;
-- =====================================================
-- SECTION 3: UPDATE EXISTING DATA
-- =====================================================
-- No explicit backfill is required: content_search_vector is a STORED
-- generated column, so its value is computed for existing rows when the
-- column is added and recomputed on every INSERT/UPDATE.
-- Document the new functions. The argument list is spelled out so the
-- statements stay unambiguous if an overload is ever added.
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages(vector, text, int, jsonb, text) IS 'Performs hybrid search combining vector similarity and full-text search; each row is tagged with match_type (hybrid, vector, or keyword)';
COMMENT ON FUNCTION hybrid_search_archon_code_examples(vector, text, int, jsonb, text) IS 'Performs hybrid search on code examples combining vector similarity and full-text search';
-- =====================================================
-- MIGRATION COMPLETE
-- =====================================================
-- Hybrid search with ts_vector is now available!
-- The search vectors will be automatically maintained
-- as data is inserted or updated.
-- =====================================================

View File

@@ -15,6 +15,7 @@
-- Enable required PostgreSQL extensions
-- vector: VECTOR column type and the ivfflat index access method
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pgcrypto;
-- pg_trgm: gin_trgm_ops operator class used by the trigram indexes
CREATE EXTENSION IF NOT EXISTS pg_trgm;
-- =====================================================
-- SECTION 2: CREDENTIALS AND SETTINGS
@@ -203,6 +204,7 @@ CREATE TABLE IF NOT EXISTS archon_crawled_pages (
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
source_id TEXT NOT NULL,
embedding VECTOR(1536), -- OpenAI embeddings are 1536 dimensions
content_search_vector tsvector GENERATED ALWAYS AS (to_tsvector('english', content)) STORED,
created_at TIMESTAMP WITH TIME ZONE DEFAULT timezone('utc'::text, now()) NOT NULL,
-- Add a unique constraint to prevent duplicate chunks for the same URL
@@ -216,6 +218,8 @@ CREATE TABLE IF NOT EXISTS archon_crawled_pages (
-- Approximate nearest-neighbour index for cosine-distance ordering on embedding
CREATE INDEX ON archon_crawled_pages USING ivfflat (embedding vector_cosine_ops);
CREATE INDEX idx_archon_crawled_pages_metadata ON archon_crawled_pages USING GIN (metadata);
CREATE INDEX idx_archon_crawled_pages_source_id ON archon_crawled_pages (source_id);
-- Full-text index over the generated tsvector column
CREATE INDEX idx_archon_crawled_pages_content_search ON archon_crawled_pages USING GIN (content_search_vector);
-- Trigram index on raw content (requires the pg_trgm extension)
CREATE INDEX idx_archon_crawled_pages_content_trgm ON archon_crawled_pages USING GIN (content gin_trgm_ops);
-- Create the code_examples table
CREATE TABLE IF NOT EXISTS archon_code_examples (
@@ -227,6 +231,7 @@ CREATE TABLE IF NOT EXISTS archon_code_examples (
metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
source_id TEXT NOT NULL,
embedding VECTOR(1536), -- OpenAI embeddings are 1536 dimensions
content_search_vector tsvector GENERATED ALWAYS AS (to_tsvector('english', content || ' ' || COALESCE(summary, ''))) STORED,
created_at TIMESTAMP WITH TIME ZONE DEFAULT timezone('utc'::text, now()) NOT NULL,
-- Add a unique constraint to prevent duplicate chunks for the same URL
@@ -240,6 +245,9 @@ CREATE TABLE IF NOT EXISTS archon_code_examples (
-- Approximate nearest-neighbour index for cosine-distance ordering on embedding
CREATE INDEX ON archon_code_examples USING ivfflat (embedding vector_cosine_ops);
CREATE INDEX idx_archon_code_examples_metadata ON archon_code_examples USING GIN (metadata);
CREATE INDEX idx_archon_code_examples_source_id ON archon_code_examples (source_id);
-- Full-text index over the generated tsvector column (content + summary)
CREATE INDEX idx_archon_code_examples_content_search ON archon_code_examples USING GIN (content_search_vector);
-- Trigram indexes on the raw text columns (require the pg_trgm extension)
CREATE INDEX idx_archon_code_examples_content_trgm ON archon_code_examples USING GIN (content gin_trgm_ops);
CREATE INDEX idx_archon_code_examples_summary_trgm ON archon_code_examples USING GIN (summary gin_trgm_ops);
-- =====================================================
-- SECTION 5: SEARCH FUNCTIONS
@@ -319,6 +327,196 @@ BEGIN
END;
$$;
-- =====================================================
-- SECTION 5B: HYBRID SEARCH FUNCTIONS WITH TS_VECTOR
-- =====================================================
-- Hybrid search function for archon_crawled_pages
--
-- Runs two independent top-N searches and merges them on row id:
--   * vector:  rows ordered by pgvector distance (<=>) to query_embedding
--   * keyword: rows whose content_search_vector matches query_text
--
-- Parameters:
--   query_embedding  embedding of the query
--   query_text       raw query text, parsed with plainto_tsquery('english', ...)
--   match_count      row cap for each branch and for the merged result
--   filter           JSONB the row metadata must contain (@>); '{}' matches all
--   source_filter    exact source_id to restrict to; NULL disables the restriction
--
-- Returns one row per matching chunk. similarity is 1 - distance when the
-- vector branch found the row, otherwise the ts_rank_cd score. match_type is
-- 'hybrid' (both branches), 'vector', or 'keyword'.
--
-- NOTE(review): vector_sim and text_sim are different measures (distance-based
-- similarity vs. ts_rank_cd with default normalization), so the final ordering
-- compares values that are not on a common scale -- confirm this is intended.
CREATE OR REPLACE FUNCTION hybrid_search_archon_crawled_pages(
    query_embedding vector(1536),
    query_text TEXT,
    match_count INT DEFAULT 10,
    filter JSONB DEFAULT '{}'::jsonb,
    source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
    id BIGINT,
    url VARCHAR,
    chunk_number INTEGER,
    content TEXT,
    metadata JSONB,
    source_id TEXT,
    similarity FLOAT,
    match_type TEXT
)
LANGUAGE plpgsql
-- Read-only: label it STABLE instead of inheriting the VOLATILE default.
STABLE
AS $$
DECLARE
    -- Parse the keyword query once; it is used for both matching and ranking.
    parsed_query tsquery := plainto_tsquery('english', query_text);
BEGIN
    RETURN QUERY
    WITH vector_results AS (
        -- Vector similarity search
        SELECT
            cp.id,
            cp.url,
            cp.chunk_number,
            cp.content,
            cp.metadata,
            cp.source_id,
            1 - (cp.embedding <=> query_embedding) AS vector_sim
        FROM archon_crawled_pages cp
        WHERE cp.metadata @> filter
          AND (source_filter IS NULL OR cp.source_id = source_filter)
          AND cp.embedding IS NOT NULL
        -- Kept as the bare distance expression (no secondary key) so the
        -- ivfflat index on embedding can serve ORDER BY ... LIMIT.
        ORDER BY cp.embedding <=> query_embedding
        LIMIT match_count
    ),
    text_results AS (
        -- Full-text search with ranking
        SELECT
            cp.id,
            cp.url,
            cp.chunk_number,
            cp.content,
            cp.metadata,
            cp.source_id,
            ts_rank_cd(cp.content_search_vector, parsed_query) AS text_sim
        FROM archon_crawled_pages cp
        WHERE cp.metadata @> filter
          AND (source_filter IS NULL OR cp.source_id = source_filter)
          AND cp.content_search_vector @@ parsed_query
        -- id breaks rank ties so the LIMIT cut-off is deterministic
        ORDER BY text_sim DESC, cp.id
        LIMIT match_count
    ),
    combined_results AS (
        -- Combine results from both searches; a row found by both appears once
        SELECT
            COALESCE(v.id, t.id) AS id,
            COALESCE(v.url, t.url) AS url,
            COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
            COALESCE(v.content, t.content) AS content,
            COALESCE(v.metadata, t.metadata) AS metadata,
            COALESCE(v.source_id, t.source_id) AS source_id,
            -- Use vector similarity if available, otherwise text similarity
            COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
            -- Determine match type
            CASE
                WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
                WHEN v.id IS NOT NULL THEN 'vector'
                ELSE 'keyword'
            END AS match_type
        FROM vector_results v
        FULL OUTER JOIN text_results t ON v.id = t.id
    )
    -- Explicit, qualified column list: keeps the result aligned with
    -- RETURNS TABLE and avoids clashing with the OUT parameter names.
    SELECT
        cr.id,
        cr.url,
        cr.chunk_number,
        cr.content,
        cr.metadata,
        cr.source_id,
        cr.similarity,
        cr.match_type
    FROM combined_results cr
    ORDER BY cr.similarity DESC, cr.id
    LIMIT match_count;
END;
$$;
-- Hybrid search function for archon_code_examples
--
-- Runs two independent top-N searches and merges them on row id:
--   * vector:  rows ordered by pgvector distance (<=>) to query_embedding
--   * keyword: rows whose content_search_vector (content + summary) matches
--              query_text
--
-- Parameters:
--   query_embedding  embedding of the query
--   query_text       raw query text, parsed with plainto_tsquery('english', ...)
--   match_count      row cap for each branch and for the merged result
--   filter           JSONB the row metadata must contain (@>); '{}' matches all
--   source_filter    exact source_id to restrict to; NULL disables the restriction
--
-- Returns one row per matching example. similarity is 1 - distance when the
-- vector branch found the row, otherwise the ts_rank_cd score. match_type is
-- 'hybrid' (both branches), 'vector', or 'keyword'.
--
-- NOTE(review): vector_sim and text_sim are different measures (distance-based
-- similarity vs. ts_rank_cd with default normalization), so the final ordering
-- compares values that are not on a common scale -- confirm this is intended.
CREATE OR REPLACE FUNCTION hybrid_search_archon_code_examples(
    query_embedding vector(1536),
    query_text TEXT,
    match_count INT DEFAULT 10,
    filter JSONB DEFAULT '{}'::jsonb,
    source_filter TEXT DEFAULT NULL
)
RETURNS TABLE (
    id BIGINT,
    url VARCHAR,
    chunk_number INTEGER,
    content TEXT,
    summary TEXT,
    metadata JSONB,
    source_id TEXT,
    similarity FLOAT,
    match_type TEXT
)
LANGUAGE plpgsql
-- Read-only: label it STABLE instead of inheriting the VOLATILE default.
STABLE
AS $$
DECLARE
    -- Parse the keyword query once; it is used for both matching and ranking.
    parsed_query tsquery := plainto_tsquery('english', query_text);
BEGIN
    RETURN QUERY
    WITH vector_results AS (
        -- Vector similarity search
        SELECT
            ce.id,
            ce.url,
            ce.chunk_number,
            ce.content,
            ce.summary,
            ce.metadata,
            ce.source_id,
            1 - (ce.embedding <=> query_embedding) AS vector_sim
        FROM archon_code_examples ce
        WHERE ce.metadata @> filter
          AND (source_filter IS NULL OR ce.source_id = source_filter)
          AND ce.embedding IS NOT NULL
        -- Kept as the bare distance expression (no secondary key) so the
        -- ivfflat index on embedding can serve ORDER BY ... LIMIT.
        ORDER BY ce.embedding <=> query_embedding
        LIMIT match_count
    ),
    text_results AS (
        -- Full-text search with ranking (searches both content and summary)
        SELECT
            ce.id,
            ce.url,
            ce.chunk_number,
            ce.content,
            ce.summary,
            ce.metadata,
            ce.source_id,
            ts_rank_cd(ce.content_search_vector, parsed_query) AS text_sim
        FROM archon_code_examples ce
        WHERE ce.metadata @> filter
          AND (source_filter IS NULL OR ce.source_id = source_filter)
          AND ce.content_search_vector @@ parsed_query
        -- id breaks rank ties so the LIMIT cut-off is deterministic
        ORDER BY text_sim DESC, ce.id
        LIMIT match_count
    ),
    combined_results AS (
        -- Combine results from both searches; a row found by both appears once
        SELECT
            COALESCE(v.id, t.id) AS id,
            COALESCE(v.url, t.url) AS url,
            COALESCE(v.chunk_number, t.chunk_number) AS chunk_number,
            COALESCE(v.content, t.content) AS content,
            COALESCE(v.summary, t.summary) AS summary,
            COALESCE(v.metadata, t.metadata) AS metadata,
            COALESCE(v.source_id, t.source_id) AS source_id,
            -- Use vector similarity if available, otherwise text similarity
            COALESCE(v.vector_sim, t.text_sim, 0)::float8 AS similarity,
            -- Determine match type
            CASE
                WHEN v.id IS NOT NULL AND t.id IS NOT NULL THEN 'hybrid'
                WHEN v.id IS NOT NULL THEN 'vector'
                ELSE 'keyword'
            END AS match_type
        FROM vector_results v
        FULL OUTER JOIN text_results t ON v.id = t.id
    )
    -- Explicit, qualified column list: keeps the result aligned with
    -- RETURNS TABLE and avoids clashing with the OUT parameter names.
    SELECT
        cr.id,
        cr.url,
        cr.chunk_number,
        cr.content,
        cr.summary,
        cr.metadata,
        cr.source_id,
        cr.similarity,
        cr.match_type
    FROM combined_results cr
    ORDER BY cr.similarity DESC, cr.id
    LIMIT match_count;
END;
$$;
-- Document the hybrid search functions. The argument list is spelled out so
-- the statements stay unambiguous if an overload is ever added.
COMMENT ON FUNCTION hybrid_search_archon_crawled_pages(vector, text, int, jsonb, text) IS 'Performs hybrid search combining vector similarity and full-text search; each row is tagged with match_type (hybrid, vector, or keyword)';
COMMENT ON FUNCTION hybrid_search_archon_code_examples(vector, text, int, jsonb, text) IS 'Performs hybrid search on code examples combining vector similarity and full-text search';
-- =====================================================
-- SECTION 6: RLS POLICIES FOR KNOWLEDGE BASE
-- =====================================================