From 3e5eef5503abb8b01c7995586b6abb3a4a6eeaea Mon Sep 17 00:00:00 2001 From: Cole Medin Date: Sat, 20 Sep 2025 13:33:15 -0500 Subject: [PATCH] Splitting up the Ollama migration scripts --- migration/0.1.0/003_ollama_add_columns.sql | 35 ++ migration/0.1.0/003_ollama_implementation.sql | 518 ------------------ migration/0.1.0/004_ollama_migrate_data.sql | 70 +++ .../0.1.0/005_ollama_create_functions.sql | 172 ++++++ .../006_ollama_create_indexes_optional.sql | 67 +++ ...l => 007_add_priority_column_to_tasks.sql} | 0 ...ing.sql => 008_add_migration_tracking.sql} | 13 +- migration/0.1.0/DB_UPGRADE_INSTRUCTIONS.md | 65 ++- migration/complete_setup.sql | 9 +- 9 files changed, 408 insertions(+), 541 deletions(-) create mode 100644 migration/0.1.0/003_ollama_add_columns.sql delete mode 100644 migration/0.1.0/003_ollama_implementation.sql create mode 100644 migration/0.1.0/004_ollama_migrate_data.sql create mode 100644 migration/0.1.0/005_ollama_create_functions.sql create mode 100644 migration/0.1.0/006_ollama_create_indexes_optional.sql rename migration/0.1.0/{004_add_priority_column_to_tasks.sql => 007_add_priority_column_to_tasks.sql} (100%) rename migration/0.1.0/{005_add_migration_tracking.sql => 008_add_migration_tracking.sql} (87%) diff --git a/migration/0.1.0/003_ollama_add_columns.sql b/migration/0.1.0/003_ollama_add_columns.sql new file mode 100644 index 00000000..d55afb08 --- /dev/null +++ b/migration/0.1.0/003_ollama_add_columns.sql @@ -0,0 +1,35 @@ +-- ====================================================================== +-- Migration 003: Ollama Implementation - Add Columns +-- Adds multi-dimensional embedding support columns +-- ====================================================================== + +-- Increase memory for this session +SET maintenance_work_mem = '256MB'; + +BEGIN; + +-- Add multi-dimensional embedding columns to archon_crawled_pages +ALTER TABLE archon_crawled_pages +ADD COLUMN IF NOT EXISTS embedding_384 VECTOR(384), +ADD COLUMN IF NOT EXISTS embedding_768 VECTOR(768), +ADD COLUMN IF NOT EXISTS embedding_1024 VECTOR(1024), +ADD COLUMN IF NOT EXISTS embedding_1536 VECTOR(1536), +ADD COLUMN IF NOT EXISTS embedding_3072 VECTOR(3072), +ADD COLUMN IF NOT EXISTS llm_chat_model TEXT, +ADD COLUMN IF NOT EXISTS embedding_model TEXT, +ADD COLUMN IF NOT EXISTS embedding_dimension INTEGER; + +-- Add multi-dimensional embedding columns to archon_code_examples +ALTER TABLE archon_code_examples +ADD COLUMN IF NOT EXISTS embedding_384 VECTOR(384), +ADD COLUMN IF NOT EXISTS embedding_768 VECTOR(768), +ADD COLUMN IF NOT EXISTS embedding_1024 VECTOR(1024), +ADD COLUMN IF NOT EXISTS embedding_1536 VECTOR(1536), +ADD COLUMN IF NOT EXISTS embedding_3072 VECTOR(3072), +ADD COLUMN IF NOT EXISTS llm_chat_model TEXT, +ADD COLUMN IF NOT EXISTS embedding_model TEXT, +ADD COLUMN IF NOT EXISTS embedding_dimension INTEGER; + +COMMIT; + +SELECT 'Ollama columns added successfully' AS status; \ No newline at end of file diff --git a/migration/0.1.0/003_ollama_implementation.sql b/migration/0.1.0/003_ollama_implementation.sql deleted file mode 100644 index 30a4f486..00000000 --- a/migration/0.1.0/003_ollama_implementation.sql +++ /dev/null @@ -1,518 +0,0 @@ --- ====================================================================== --- UPGRADE TO MODEL TRACKING AND MULTI-DIMENSIONAL EMBEDDINGS --- ====================================================================== --- This migration upgrades existing Archon installations to support: --- 1. Multi-dimensional embedding columns (768, 1024, 1536, 3072) --- 2. Model tracking fields (llm_chat_model, embedding_model, embedding_dimension) --- 3. 384-dimension support for smaller embedding models --- 4. Enhanced search functions for multi-dimensional support --- ====================================================================== --- --- IMPORTANT: Run this ONLY if you have an existing Archon installation --- that was created BEFORE the multi-dimensional embedding support. --- --- This script is SAFE to run multiple times - it uses IF NOT EXISTS checks. --- ====================================================================== - -BEGIN; - --- ====================================================================== --- SECTION 1: ADD MULTI-DIMENSIONAL EMBEDDING COLUMNS --- ====================================================================== - --- Add multi-dimensional embedding columns to archon_crawled_pages -ALTER TABLE archon_crawled_pages -ADD COLUMN IF NOT EXISTS embedding_384 VECTOR(384), -- Small embedding models -ADD COLUMN IF NOT EXISTS embedding_768 VECTOR(768), -- Google/Ollama models -ADD COLUMN IF NOT EXISTS embedding_1024 VECTOR(1024), -- Ollama large models -ADD COLUMN IF NOT EXISTS embedding_1536 VECTOR(1536), -- OpenAI standard models -ADD COLUMN IF NOT EXISTS embedding_3072 VECTOR(3072); -- OpenAI large models - --- Add multi-dimensional embedding columns to archon_code_examples -ALTER TABLE archon_code_examples -ADD COLUMN IF NOT EXISTS embedding_384 VECTOR(384), -- Small embedding models -ADD COLUMN IF NOT EXISTS embedding_768 VECTOR(768), -- Google/Ollama models -ADD COLUMN IF NOT EXISTS embedding_1024 VECTOR(1024), -- Ollama large models -ADD COLUMN IF NOT EXISTS embedding_1536 VECTOR(1536), -- OpenAI standard models -ADD COLUMN IF NOT EXISTS embedding_3072 VECTOR(3072); -- OpenAI large models - --- ====================================================================== --- SECTION 2: ADD MODEL TRACKING COLUMNS --- ====================================================================== - --- Add model tracking columns to archon_crawled_pages -ALTER TABLE archon_crawled_pages -ADD COLUMN IF NOT EXISTS llm_chat_model TEXT, -- LLM model used for processing (e.g., 'gpt-4', 'llama3:8b') -ADD COLUMN IF NOT EXISTS embedding_model TEXT, -- Embedding model used (e.g., 'text-embedding-3-large', 'all-MiniLM-L6-v2') -ADD COLUMN IF NOT EXISTS embedding_dimension INTEGER; -- Dimension of the embedding used (384, 768, 1024, 1536, 3072) - --- Add model tracking columns to archon_code_examples -ALTER TABLE archon_code_examples -ADD COLUMN IF NOT EXISTS llm_chat_model TEXT, -- LLM model used for processing (e.g., 'gpt-4', 'llama3:8b') -ADD COLUMN IF NOT EXISTS embedding_model TEXT, -- Embedding model used (e.g., 'text-embedding-3-large', 'all-MiniLM-L6-v2') -ADD COLUMN IF NOT EXISTS embedding_dimension INTEGER; -- Dimension of the embedding used (384, 768, 1024, 1536, 3072) - --- ====================================================================== --- SECTION 3: MIGRATE EXISTING EMBEDDING DATA --- ====================================================================== - --- Check if there's existing embedding data in old 'embedding' column -DO $$ -DECLARE - crawled_pages_count INTEGER; - code_examples_count INTEGER; - dimension_detected INTEGER; -BEGIN - -- Check if old embedding column exists and has data - SELECT COUNT(*) INTO crawled_pages_count - FROM information_schema.columns - WHERE table_name = 'archon_crawled_pages' - AND column_name = 'embedding'; - - SELECT COUNT(*) INTO code_examples_count - FROM information_schema.columns - WHERE table_name = 'archon_code_examples' - AND column_name = 'embedding'; - - -- If old embedding columns exist, migrate the data - IF crawled_pages_count > 0 THEN - RAISE NOTICE 'Found existing embedding column in archon_crawled_pages - migrating data...'; - - -- Detect dimension from first non-null embedding - SELECT vector_dims(embedding) INTO dimension_detected - FROM archon_crawled_pages - WHERE embedding IS NOT NULL - LIMIT 1; - - IF dimension_detected IS NOT NULL THEN - RAISE NOTICE 'Detected embedding dimension: %', dimension_detected; - - -- Migrate based on detected dimension - CASE dimension_detected - WHEN 384 THEN - UPDATE archon_crawled_pages - SET embedding_384 = embedding, - embedding_dimension = 384, - embedding_model = COALESCE(embedding_model, 'legacy-384d-model') - WHERE embedding IS NOT NULL AND embedding_384 IS NULL; - - WHEN 768 THEN - UPDATE archon_crawled_pages - SET embedding_768 = embedding, - embedding_dimension = 768, - embedding_model = COALESCE(embedding_model, 'legacy-768d-model') - WHERE embedding IS NOT NULL AND embedding_768 IS NULL; - - WHEN 1024 THEN - UPDATE archon_crawled_pages - SET embedding_1024 = embedding, - embedding_dimension = 1024, - embedding_model = COALESCE(embedding_model, 'legacy-1024d-model') - WHERE embedding IS NOT NULL AND embedding_1024 IS NULL; - - WHEN 1536 THEN - UPDATE archon_crawled_pages - SET embedding_1536 = embedding, - embedding_dimension = 1536, - embedding_model = COALESCE(embedding_model, 'text-embedding-3-small') - WHERE embedding IS NOT NULL AND embedding_1536 IS NULL; - - WHEN 3072 THEN - UPDATE archon_crawled_pages - SET embedding_3072 = embedding, - embedding_dimension = 3072, - embedding_model = COALESCE(embedding_model, 'text-embedding-3-large') - WHERE embedding IS NOT NULL AND embedding_3072 IS NULL; - - ELSE - RAISE NOTICE 'Unsupported embedding dimension detected: %. Skipping migration.', dimension_detected; - END CASE; - - RAISE NOTICE 'Migrated existing embeddings to dimension-specific columns'; - END IF; - END IF; - - -- Migrate code examples if they exist - IF code_examples_count > 0 THEN - RAISE NOTICE 'Found existing embedding column in archon_code_examples - migrating data...'; - - -- Detect dimension from first non-null embedding - SELECT vector_dims(embedding) INTO dimension_detected - FROM archon_code_examples - WHERE embedding IS NOT NULL - LIMIT 1; - - IF dimension_detected IS NOT NULL THEN - RAISE NOTICE 'Detected code examples embedding dimension: %', dimension_detected; - - -- Migrate based on detected dimension - CASE dimension_detected - WHEN 384 THEN - UPDATE archon_code_examples - SET embedding_384 = embedding, - embedding_dimension = 384, - embedding_model = COALESCE(embedding_model, 'legacy-384d-model') - WHERE embedding IS NOT NULL AND embedding_384 IS NULL; - - WHEN 768 THEN - UPDATE archon_code_examples - SET embedding_768 = embedding, - embedding_dimension = 768, - embedding_model = COALESCE(embedding_model, 'legacy-768d-model') - WHERE embedding IS NOT NULL AND embedding_768 IS NULL; - - WHEN 1024 THEN - UPDATE archon_code_examples - SET embedding_1024 = embedding, - embedding_dimension = 1024, - embedding_model = COALESCE(embedding_model, 'legacy-1024d-model') - WHERE embedding IS NOT NULL AND embedding_1024 IS NULL; - - WHEN 1536 THEN - UPDATE archon_code_examples - SET embedding_1536 = embedding, - embedding_dimension = 1536, - embedding_model = COALESCE(embedding_model, 'text-embedding-3-small') - WHERE embedding IS NOT NULL AND embedding_1536 IS NULL; - - WHEN 3072 THEN - UPDATE archon_code_examples - SET embedding_3072 = embedding, - embedding_dimension = 3072, - embedding_model = COALESCE(embedding_model, 'text-embedding-3-large') - WHERE embedding IS NOT NULL AND embedding_3072 IS NULL; - - ELSE - RAISE NOTICE 'Unsupported code examples embedding dimension: %. Skipping migration.', dimension_detected; - END CASE; - - RAISE NOTICE 'Migrated existing code example embeddings to dimension-specific columns'; - END IF; - END IF; -END $$; - --- ====================================================================== --- SECTION 4: CLEANUP LEGACY EMBEDDING COLUMNS --- ====================================================================== - --- Remove old embedding columns after successful migration -DO $$ -DECLARE - crawled_pages_count INTEGER; - code_examples_count INTEGER; -BEGIN - -- Check if old embedding column exists in crawled pages - SELECT COUNT(*) INTO crawled_pages_count - FROM information_schema.columns - WHERE table_name = 'archon_crawled_pages' - AND column_name = 'embedding'; - - -- Check if old embedding column exists in code examples - SELECT COUNT(*) INTO code_examples_count - FROM information_schema.columns - WHERE table_name = 'archon_code_examples' - AND column_name = 'embedding'; - - -- Drop old embedding column from crawled pages if it exists - IF crawled_pages_count > 0 THEN - RAISE NOTICE 'Dropping legacy embedding column from archon_crawled_pages...'; - ALTER TABLE archon_crawled_pages DROP COLUMN embedding; - RAISE NOTICE 'Successfully removed legacy embedding column from archon_crawled_pages'; - END IF; - - -- Drop old embedding column from code examples if it exists - IF code_examples_count > 0 THEN - RAISE NOTICE 'Dropping legacy embedding column from archon_code_examples...'; - ALTER TABLE archon_code_examples DROP COLUMN embedding; - RAISE NOTICE 'Successfully removed legacy embedding column from archon_code_examples'; - END IF; - - -- Drop any indexes on the old embedding column if they exist - DROP INDEX IF EXISTS idx_archon_crawled_pages_embedding; - DROP INDEX IF EXISTS idx_archon_code_examples_embedding; - - RAISE NOTICE 'Legacy column cleanup completed'; -END $$; - --- ====================================================================== --- SECTION 5: CREATE OPTIMIZED INDEXES --- ====================================================================== - --- Create indexes for archon_crawled_pages (multi-dimensional support) -CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_384 -ON archon_crawled_pages USING ivfflat (embedding_384 vector_cosine_ops) -WITH (lists = 100); - -CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_768 -ON archon_crawled_pages USING ivfflat (embedding_768 vector_cosine_ops) -WITH (lists = 100); - -CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_1024 -ON archon_crawled_pages USING ivfflat (embedding_1024 vector_cosine_ops) -WITH (lists = 100); - -CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_1536 -ON archon_crawled_pages USING ivfflat (embedding_1536 vector_cosine_ops) -WITH (lists = 100); - --- Note: 3072-dimensional embeddings cannot have vector indexes due to PostgreSQL vector extension 2000 dimension limit --- The embedding_3072 column exists but cannot be indexed with current pgvector version --- Brute force search will be used for 3072-dimensional vectors --- CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_3072 --- ON archon_crawled_pages USING hnsw (embedding_3072 vector_cosine_ops); - --- Create indexes for archon_code_examples (multi-dimensional support) -CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_384 -ON archon_code_examples USING ivfflat (embedding_384 vector_cosine_ops) -WITH (lists = 100); - -CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_768 -ON archon_code_examples USING ivfflat (embedding_768 vector_cosine_ops) -WITH (lists = 100); - -CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_1024 -ON archon_code_examples USING ivfflat (embedding_1024 vector_cosine_ops) -WITH (lists = 100); - -CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_1536 -ON archon_code_examples USING ivfflat (embedding_1536 vector_cosine_ops) -WITH (lists = 100); - --- Note: 3072-dimensional embeddings cannot have vector indexes due to PostgreSQL vector extension 2000 dimension limit --- The embedding_3072 column exists but cannot be indexed with current pgvector version --- Brute force search will be used for 3072-dimensional vectors --- CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_3072 --- ON archon_code_examples USING hnsw (embedding_3072 vector_cosine_ops); - --- Create indexes for model tracking columns -CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_model -ON archon_crawled_pages (embedding_model); - -CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_dimension -ON archon_crawled_pages (embedding_dimension); - -CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_llm_chat_model -ON archon_crawled_pages (llm_chat_model); - -CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_model -ON archon_code_examples (embedding_model); - -CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_dimension -ON archon_code_examples (embedding_dimension); - -CREATE INDEX IF NOT EXISTS idx_archon_code_examples_llm_chat_model -ON archon_code_examples (llm_chat_model); - --- ====================================================================== --- SECTION 6: HELPER FUNCTIONS FOR MULTI-DIMENSIONAL SUPPORT --- ====================================================================== - --- Function to detect embedding dimension from vector -CREATE OR REPLACE FUNCTION detect_embedding_dimension(embedding_vector vector) -RETURNS INTEGER AS $$ -BEGIN - RETURN vector_dims(embedding_vector); -END; -$$ LANGUAGE plpgsql IMMUTABLE; - --- Function to get the appropriate column name for a dimension -CREATE OR REPLACE FUNCTION get_embedding_column_name(dimension INTEGER) -RETURNS TEXT AS $$ -BEGIN - CASE dimension - WHEN 384 THEN RETURN 'embedding_384'; - WHEN 768 THEN RETURN 'embedding_768'; - WHEN 1024 THEN RETURN 'embedding_1024'; - WHEN 1536 THEN RETURN 'embedding_1536'; - WHEN 3072 THEN RETURN 'embedding_3072'; - ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %. Supported dimensions are: 384, 768, 1024, 1536, 3072', dimension; - END CASE; -END; -$$ LANGUAGE plpgsql IMMUTABLE; - --- ====================================================================== --- SECTION 7: ENHANCED SEARCH FUNCTIONS --- ====================================================================== - --- Create multi-dimensional function to search for documentation chunks -CREATE OR REPLACE FUNCTION match_archon_crawled_pages_multi ( - query_embedding VECTOR, - embedding_dimension INTEGER, - match_count INT DEFAULT 10, - filter JSONB DEFAULT '{}'::jsonb, - source_filter TEXT DEFAULT NULL -) RETURNS TABLE ( - id BIGINT, - url VARCHAR, - chunk_number INTEGER, - content TEXT, - metadata JSONB, - source_id TEXT, - similarity FLOAT -) -LANGUAGE plpgsql -AS $$ -#variable_conflict use_column -DECLARE - sql_query TEXT; - embedding_column TEXT; -BEGIN - -- Determine which embedding column to use based on dimension - CASE embedding_dimension - WHEN 384 THEN embedding_column := 'embedding_384'; - WHEN 768 THEN embedding_column := 'embedding_768'; - WHEN 1024 THEN embedding_column := 'embedding_1024'; - WHEN 1536 THEN embedding_column := 'embedding_1536'; - WHEN 3072 THEN embedding_column := 'embedding_3072'; - ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension; - END CASE; - - -- Build dynamic query - sql_query := format(' - SELECT id, url, chunk_number, content, metadata, source_id, - 1 - (%I <=> $1) AS similarity - FROM archon_crawled_pages - WHERE (%I IS NOT NULL) - AND metadata @> $3 - AND ($4 IS NULL OR source_id = $4) - ORDER BY %I <=> $1 - LIMIT $2', - embedding_column, embedding_column, embedding_column); - - -- Execute dynamic query - RETURN QUERY EXECUTE sql_query USING query_embedding, match_count, filter, source_filter; -END; -$$; - --- Create multi-dimensional function to search for code examples -CREATE OR REPLACE FUNCTION match_archon_code_examples_multi ( - query_embedding VECTOR, - embedding_dimension INTEGER, - match_count INT DEFAULT 10, - filter JSONB DEFAULT '{}'::jsonb, - source_filter TEXT DEFAULT NULL -) RETURNS TABLE ( - id BIGINT, - url VARCHAR, - chunk_number INTEGER, - content TEXT, - summary TEXT, - metadata JSONB, - source_id TEXT, - similarity FLOAT -) -LANGUAGE plpgsql -AS $$ -#variable_conflict use_column -DECLARE - sql_query TEXT; - embedding_column TEXT; -BEGIN - -- Determine which embedding column to use based on dimension - CASE embedding_dimension - WHEN 384 THEN embedding_column := 'embedding_384'; - WHEN 768 THEN embedding_column := 'embedding_768'; - WHEN 1024 THEN embedding_column := 'embedding_1024'; - WHEN 1536 THEN embedding_column := 'embedding_1536'; - WHEN 3072 THEN embedding_column := 'embedding_3072'; - ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension; - END CASE; - - -- Build dynamic query - sql_query := format(' - SELECT id, url, chunk_number, content, summary, metadata, source_id, - 1 - (%I <=> $1) AS similarity - FROM archon_code_examples - WHERE (%I IS NOT NULL) - AND metadata @> $3 - AND ($4 IS NULL OR source_id = $4) - ORDER BY %I <=> $1 - LIMIT $2', - embedding_column, embedding_column, embedding_column); - - -- Execute dynamic query - RETURN QUERY EXECUTE sql_query USING query_embedding, match_count, filter, source_filter; -END; -$$; - --- ====================================================================== --- SECTION 8: LEGACY COMPATIBILITY FUNCTIONS --- ====================================================================== - --- Legacy compatibility function for crawled pages (defaults to 1536D) -CREATE OR REPLACE FUNCTION match_archon_crawled_pages ( - query_embedding VECTOR(1536), - match_count INT DEFAULT 10, - filter JSONB DEFAULT '{}'::jsonb, - source_filter TEXT DEFAULT NULL -) RETURNS TABLE ( - id BIGINT, - url VARCHAR, - chunk_number INTEGER, - content TEXT, - metadata JSONB, - source_id TEXT, - similarity FLOAT -) -LANGUAGE plpgsql -AS $$ -BEGIN - RETURN QUERY SELECT * FROM match_archon_crawled_pages_multi(query_embedding, 1536, match_count, filter, source_filter); -END; -$$; - --- Legacy compatibility function for code examples (defaults to 1536D) -CREATE OR REPLACE FUNCTION match_archon_code_examples ( - query_embedding VECTOR(1536), - match_count INT DEFAULT 10, - filter JSONB DEFAULT '{}'::jsonb, - source_filter TEXT DEFAULT NULL -) RETURNS TABLE ( - id BIGINT, - url VARCHAR, - chunk_number INTEGER, - content TEXT, - summary TEXT, - metadata JSONB, - source_id TEXT, - similarity FLOAT -) -LANGUAGE plpgsql -AS $$ -BEGIN - RETURN QUERY SELECT * FROM match_archon_code_examples_multi(query_embedding, 1536, match_count, filter, source_filter); -END; -$$; - -COMMIT; - --- ====================================================================== --- MIGRATION COMPLETE - SUPABASE-FRIENDLY STATUS REPORT --- ====================================================================== --- This final SELECT statement consolidates all status information for --- display in Supabase SQL Editor (users only see the last query result) - -SELECT - '🎉 ARCHON MODEL TRACKING UPGRADE COMPLETED! 🎉' AS status, - 'Successfully upgraded your Archon installation' AS message, - ARRAY[ - '✅ Multi-dimensional embedding support (384, 768, 1024, 1536, 3072)', - '✅ Model tracking fields (llm_chat_model, embedding_model, embedding_dimension)', - '✅ Optimized indexes for improved search performance', - '✅ Enhanced search functions with dimension-aware querying', - '✅ Legacy compatibility maintained for existing code', - '✅ Existing embedding data migrated (if any was found)', - '✅ Support for 3072-dimensional vectors (using brute force search)' - ] AS features_added, - ARRAY[ - '• Multiple embedding providers (OpenAI, Ollama, Google, etc.)', - '• Automatic model detection and tracking', - '• Improved search accuracy with dimension-specific indexing', - '• Full audit trail of which models processed your data' - ] AS capabilities_enabled, - ARRAY[ - '1. Restart your Archon services: docker compose restart', - '2. New crawls will automatically use the enhanced features', - '3. Check the Settings page to configure your preferred models', - '4. Run validate_migration.sql to verify everything works' - ] AS next_steps; \ No newline at end of file diff --git a/migration/0.1.0/004_ollama_migrate_data.sql b/migration/0.1.0/004_ollama_migrate_data.sql new file mode 100644 index 00000000..226f86d3 --- /dev/null +++ b/migration/0.1.0/004_ollama_migrate_data.sql @@ -0,0 +1,70 @@ +-- ====================================================================== +-- Migration 004: Ollama Implementation - Migrate Data +-- Migrates existing embeddings to new multi-dimensional columns +-- ====================================================================== + +BEGIN; + +-- Migrate existing embedding data from old column (if exists) +DO $$ +DECLARE + crawled_pages_count INTEGER; + code_examples_count INTEGER; + dimension_detected INTEGER; +BEGIN + -- Check if old embedding column exists + SELECT COUNT(*) INTO crawled_pages_count + FROM information_schema.columns + WHERE table_name = 'archon_crawled_pages' + AND column_name = 'embedding'; + + IF crawled_pages_count > 0 THEN + -- Detect dimension + SELECT vector_dims(embedding) INTO dimension_detected + FROM archon_crawled_pages + WHERE embedding IS NOT NULL + LIMIT 1; + + IF dimension_detected = 1536 THEN + UPDATE archon_crawled_pages + SET embedding_1536 = embedding, + embedding_dimension = 1536, + embedding_model = COALESCE(embedding_model, 'text-embedding-3-small') + WHERE embedding IS NOT NULL AND embedding_1536 IS NULL; + END IF; + + -- Drop old column + ALTER TABLE archon_crawled_pages DROP COLUMN IF EXISTS embedding; + END IF; + + -- Same for code_examples + SELECT COUNT(*) INTO code_examples_count + FROM information_schema.columns + WHERE table_name = 'archon_code_examples' + AND column_name = 'embedding'; + + IF code_examples_count > 0 THEN + SELECT vector_dims(embedding) INTO dimension_detected + FROM archon_code_examples + WHERE embedding IS NOT NULL + LIMIT 1; + + IF dimension_detected = 1536 THEN + UPDATE archon_code_examples + SET embedding_1536 = embedding, + embedding_dimension = 1536, + embedding_model = COALESCE(embedding_model, 'text-embedding-3-small') + WHERE embedding IS NOT NULL AND embedding_1536 IS NULL; + END IF; + + ALTER TABLE archon_code_examples DROP COLUMN IF EXISTS embedding; + END IF; +END $$; + +-- Drop old indexes if they exist +DROP INDEX IF EXISTS idx_archon_crawled_pages_embedding; +DROP INDEX IF EXISTS idx_archon_code_examples_embedding; + +COMMIT; + +SELECT 'Ollama data migrated successfully' AS status; \ No newline at end of file diff --git a/migration/0.1.0/005_ollama_create_functions.sql b/migration/0.1.0/005_ollama_create_functions.sql new file mode 100644 index 00000000..0426cdf6 --- /dev/null +++ b/migration/0.1.0/005_ollama_create_functions.sql @@ -0,0 +1,172 @@ +-- ====================================================================== +-- Migration 005: Ollama Implementation - Create Functions +-- Creates search functions for multi-dimensional embeddings +-- ====================================================================== + +BEGIN; + +-- Helper function to detect embedding dimension +CREATE OR REPLACE FUNCTION detect_embedding_dimension(embedding_vector vector) +RETURNS INTEGER AS $$ +BEGIN + RETURN vector_dims(embedding_vector); +END; +$$ LANGUAGE plpgsql IMMUTABLE; + +-- Helper function to get column name for dimension +CREATE OR REPLACE FUNCTION get_embedding_column_name(dimension INTEGER) +RETURNS TEXT AS $$ +BEGIN + CASE dimension + WHEN 384 THEN RETURN 'embedding_384'; + WHEN 768 THEN RETURN 'embedding_768'; + WHEN 1024 THEN RETURN 'embedding_1024'; + WHEN 1536 THEN RETURN 'embedding_1536'; + WHEN 3072 THEN RETURN 'embedding_3072'; + ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', dimension; + END CASE; +END; +$$ LANGUAGE plpgsql IMMUTABLE; + +-- Multi-dimensional search for crawled pages +CREATE OR REPLACE FUNCTION match_archon_crawled_pages_multi ( + query_embedding VECTOR, + embedding_dimension INTEGER, + match_count INT DEFAULT 10, + filter JSONB DEFAULT '{}'::jsonb, + source_filter TEXT DEFAULT NULL +) RETURNS TABLE ( + id BIGINT, + url VARCHAR, + chunk_number INTEGER, + content TEXT, + metadata JSONB, + source_id TEXT, + similarity FLOAT +) +LANGUAGE plpgsql +AS $$ +#variable_conflict use_column +DECLARE + sql_query TEXT; + embedding_column TEXT; +BEGIN + CASE embedding_dimension + WHEN 384 THEN embedding_column := 'embedding_384'; + WHEN 768 THEN embedding_column := 'embedding_768'; + WHEN 1024 THEN embedding_column := 'embedding_1024'; + WHEN 1536 THEN embedding_column := 'embedding_1536'; + WHEN 3072 THEN embedding_column := 'embedding_3072'; + ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension; + END CASE; + + sql_query := format(' + SELECT id, url, chunk_number, content, metadata, source_id, + 1 - (%I <=> $1) AS similarity + FROM archon_crawled_pages + WHERE (%I IS NOT NULL) + AND metadata @> $3 + AND ($4 IS NULL OR source_id = $4) + ORDER BY %I <=> $1 + LIMIT $2', + embedding_column, embedding_column, embedding_column); + + RETURN QUERY EXECUTE sql_query USING query_embedding, match_count, filter, source_filter; +END; +$$; + +-- Multi-dimensional search for code examples +CREATE OR REPLACE FUNCTION match_archon_code_examples_multi ( + query_embedding VECTOR, + embedding_dimension INTEGER, + match_count INT DEFAULT 10, + filter JSONB DEFAULT '{}'::jsonb, + source_filter TEXT DEFAULT NULL +) RETURNS TABLE ( + id BIGINT, + url VARCHAR, + chunk_number INTEGER, + content TEXT, + summary TEXT, + metadata JSONB, + source_id TEXT, + similarity FLOAT +) +LANGUAGE plpgsql +AS $$ +#variable_conflict use_column +DECLARE + sql_query TEXT; + embedding_column TEXT; +BEGIN + CASE embedding_dimension + WHEN 384 THEN embedding_column := 'embedding_384'; + WHEN 768 THEN embedding_column := 'embedding_768'; + WHEN 1024 THEN embedding_column := 'embedding_1024'; + WHEN 1536 THEN embedding_column := 'embedding_1536'; + WHEN 3072 THEN embedding_column := 'embedding_3072'; + ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension; + END CASE; + + sql_query := format(' + SELECT id, url, chunk_number, content, summary, metadata, source_id, + 1 - (%I <=> $1) AS similarity + FROM archon_code_examples + WHERE (%I IS NOT NULL) + AND metadata @> $3 + AND ($4 IS NULL OR source_id = $4) + ORDER BY %I <=> $1 + LIMIT $2', + embedding_column, embedding_column, embedding_column); + + RETURN QUERY EXECUTE sql_query USING query_embedding, match_count, filter, source_filter; +END; +$$; + +-- Legacy compatibility (defaults to 1536D) +CREATE OR REPLACE FUNCTION match_archon_crawled_pages ( + query_embedding VECTOR(1536), + match_count INT DEFAULT 10, + filter JSONB DEFAULT '{}'::jsonb, + source_filter TEXT DEFAULT NULL +) RETURNS TABLE ( + id BIGINT, + url VARCHAR, + chunk_number INTEGER, + content TEXT, + metadata JSONB, + source_id TEXT, + similarity FLOAT +) +LANGUAGE plpgsql +AS $$ +BEGIN + RETURN QUERY SELECT * FROM match_archon_crawled_pages_multi(query_embedding, 1536, match_count, filter, source_filter); +END; +$$; + +CREATE OR REPLACE FUNCTION match_archon_code_examples ( + query_embedding VECTOR(1536), + match_count INT DEFAULT 10, + filter JSONB DEFAULT '{}'::jsonb, + source_filter TEXT DEFAULT NULL +) RETURNS TABLE ( + id BIGINT, + url VARCHAR, + chunk_number INTEGER, + content TEXT, + summary TEXT, + metadata JSONB, + source_id TEXT, + similarity FLOAT +) +LANGUAGE plpgsql +AS $$ +BEGIN + RETURN QUERY SELECT * FROM match_archon_code_examples_multi(query_embedding, 1536, match_count, filter, source_filter); +END; +$$; + +COMMIT; + +SELECT 'Ollama functions created successfully' AS status; \ No newline at end of file diff --git a/migration/0.1.0/006_ollama_create_indexes_optional.sql b/migration/0.1.0/006_ollama_create_indexes_optional.sql new file mode 100644 index 00000000..d8a38080 --- /dev/null +++ b/migration/0.1.0/006_ollama_create_indexes_optional.sql @@ -0,0 +1,67 @@ +-- ====================================================================== +-- Migration 006: Ollama Implementation - Create Indexes (Optional) +-- Creates vector indexes for performance (may timeout on large datasets) +-- ====================================================================== + +-- IMPORTANT: This migration creates vector indexes which are memory-intensive +-- If this fails, you can skip it and the system will use brute-force search +-- You can create these indexes later via direct database connection + +SET maintenance_work_mem = '512MB'; +SET statement_timeout = '10min'; + +-- Create ONE index at a time to avoid memory issues +-- Comment out any that fail and continue with the next + +-- Index 1 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_1536 +ON archon_crawled_pages USING ivfflat (embedding_1536 vector_cosine_ops) +WITH (lists = 100); + +-- Index 2 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_1536 +ON archon_code_examples USING ivfflat (embedding_1536 vector_cosine_ops) +WITH (lists = 100); + +-- Index 3 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_768 +ON archon_crawled_pages USING ivfflat (embedding_768 vector_cosine_ops) +WITH (lists = 100); + +-- Index 4 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_768 +ON archon_code_examples USING ivfflat (embedding_768 vector_cosine_ops) +WITH (lists = 100); + +-- Index 5 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_384 +ON archon_crawled_pages USING ivfflat (embedding_384 vector_cosine_ops) +WITH (lists = 100); + +-- Index 6 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_384 +ON archon_code_examples USING ivfflat (embedding_384 vector_cosine_ops) +WITH (lists = 100); + +-- Index 7 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_1024 +ON archon_crawled_pages USING ivfflat (embedding_1024 vector_cosine_ops) +WITH (lists = 100); + +-- Index 8 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_1024 +ON archon_code_examples USING ivfflat (embedding_1024 vector_cosine_ops) +WITH (lists = 100); + +-- Simple B-tree indexes (these are fast) +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_model ON archon_crawled_pages (embedding_model); +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_dimension ON archon_crawled_pages (embedding_dimension); +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_llm_chat_model ON archon_crawled_pages (llm_chat_model); +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_model ON archon_code_examples (embedding_model); +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_dimension ON archon_code_examples (embedding_dimension); +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_llm_chat_model ON archon_code_examples (llm_chat_model); + +RESET maintenance_work_mem; +RESET statement_timeout; + +SELECT 'Ollama indexes created (or skipped if timed out - that issue will be obvious in Supabase)' AS status; \ No newline at end of file diff --git a/migration/0.1.0/004_add_priority_column_to_tasks.sql b/migration/0.1.0/007_add_priority_column_to_tasks.sql similarity index 100% rename from migration/0.1.0/004_add_priority_column_to_tasks.sql rename to migration/0.1.0/007_add_priority_column_to_tasks.sql diff --git a/migration/0.1.0/005_add_migration_tracking.sql b/migration/0.1.0/008_add_migration_tracking.sql similarity index 87% rename from migration/0.1.0/005_add_migration_tracking.sql rename to migration/0.1.0/008_add_migration_tracking.sql index eed1e909..5cac0c72 100644 --- a/migration/0.1.0/005_add_migration_tracking.sql +++ b/migration/0.1.0/008_add_migration_tracking.sql @@ -1,4 +1,4 @@ --- Migration: 005_add_migration_tracking.sql +-- Migration: 008_add_migration_tracking.sql -- Description: Create archon_migrations table for tracking applied database migrations -- Version: 0.1.0 -- Author: Archon Team @@ -30,18 +30,21 @@ COMMENT ON COLUMN archon_migrations.checksum IS 'Optional MD5 checksum of migrat -- Record this migration as applied (self-recording pattern) -- This allows the migration system to bootstrap itself INSERT INTO archon_migrations (version, migration_name) -VALUES ('0.1.0', '005_add_migration_tracking') +VALUES ('0.1.0', '008_add_migration_tracking') ON CONFLICT (version, migration_name) DO NOTHING; --- Retroactively record previously applied migrations (001-004) +-- Retroactively record previously applied migrations (001-007) -- Since these migrations couldn't self-record (table didn't exist yet), -- we record them here to ensure the migration system knows they've been applied INSERT INTO archon_migrations (version, migration_name) VALUES ('0.1.0', '001_add_source_url_display_name'), ('0.1.0', '002_add_hybrid_search_tsvector'), - ('0.1.0', '003_ollama_implementation'), - ('0.1.0', '004_add_priority_column_to_tasks') + ('0.1.0', '003_ollama_add_columns'), + ('0.1.0', '004_ollama_migrate_data'), + ('0.1.0', '005_ollama_create_functions'), + ('0.1.0', '006_ollama_create_indexes_optional'), + ('0.1.0', '007_add_priority_column_to_tasks') ON CONFLICT (version, migration_name) DO NOTHING; -- Enable Row Level Security on migrations table diff --git a/migration/0.1.0/DB_UPGRADE_INSTRUCTIONS.md b/migration/0.1.0/DB_UPGRADE_INSTRUCTIONS.md index 9c14a809..5523d26a 100644 --- a/migration/0.1.0/DB_UPGRADE_INSTRUCTIONS.md +++ b/migration/0.1.0/DB_UPGRADE_INSTRUCTIONS.md @@ -16,6 +16,8 @@ Creates timestamped backup tables of all your existing data: ### 2. Migration Scripts (Run in Order) +You only have to run the ones you haven't already! If you don't remember exactly, it is okay to rerun migration scripts. + **2.1. `001_add_source_url_display_name.sql`** - Adds display name field to sources table - Improves UI presentation of crawled sources @@ -25,17 +27,34 @@ Creates timestamped backup tables of all your existing data: - Implements hybrid search with tsvector columns - Creates optimized search indexes -**2.3. `003_ollama_implementation.sql`** -- Multi-dimensional embedding support (384, 768, 1024, 1536, 3072 dimensions) -- Model tracking fields (`llm_chat_model`, `embedding_model`, `embedding_dimension`) -- Optimized indexes for improved search performance -- Enhanced search functions with dimension-aware querying -- Automatic migration of existing embedding data +**2.3. `003_ollama_add_columns.sql`** +- Adds multi-dimensional embedding columns (384, 768, 1024, 1536, 3072 dimensions) +- Adds model tracking fields (`llm_chat_model`, `embedding_model`, `embedding_dimension`) -**2.4. `004_add_priority_column_to_tasks.sql`** +**2.4. `004_ollama_migrate_data.sql`** +- Migrates existing embeddings to new multi-dimensional columns +- Drops old embedding column after migration +- Removes obsolete indexes + +**2.5. `005_ollama_create_functions.sql`** +- Creates search functions for multi-dimensional embeddings +- Adds helper functions for dimension detection +- Maintains backward compatibility with legacy search functions + +**2.6. `006_ollama_create_indexes_optional.sql`** +- Creates vector indexes for performance (may timeout on large datasets) +- Creates B-tree indexes for model fields +- Can be skipped if timeout occurs (system will use brute-force search) + +**2.7. `007_add_priority_column_to_tasks.sql`** - Adds priority field to tasks table - Enables task prioritization in project management +**2.8. `008_add_migration_tracking.sql`** +- Creates migration tracking table +- Records all applied migrations +- Enables migration version control + ## Migration Process (Follow This Order!) ### Step 1: Backup Your Data @@ -49,8 +68,12 @@ Creates timestamped backup tables of all your existing data: -- Run each script in sequence: -- 1. Run: 001_add_source_url_display_name.sql -- 2. Run: 002_add_hybrid_search_tsvector.sql --- 3. Run: 003_ollama_implementation.sql --- 4. Run: 004_add_priority_column_to_tasks.sql +-- 3. Run: 003_ollama_add_columns.sql +-- 4. Run: 004_ollama_migrate_data.sql +-- 5. Run: 005_ollama_create_functions.sql +-- 6. Run: 006_ollama_create_indexes_optional.sql (optional - may timeout) +-- 7. Run: 007_add_priority_column_to_tasks.sql +-- 8. Run: 008_add_migration_tracking.sql ``` ### Step 3: Restart Services @@ -75,8 +98,12 @@ psql -h your-supabase-host -p 5432 -U postgres -d postgres # Run the migrations in order \i /path/to/001_add_source_url_display_name.sql \i /path/to/002_add_hybrid_search_tsvector.sql -\i /path/to/003_ollama_implementation.sql -\i /path/to/004_add_priority_column_to_tasks.sql +\i /path/to/003_ollama_add_columns.sql +\i /path/to/004_ollama_migrate_data.sql +\i /path/to/005_ollama_create_functions.sql +\i /path/to/006_ollama_create_indexes_optional.sql +\i /path/to/007_add_priority_column_to_tasks.sql +\i /path/to/008_add_migration_tracking.sql # Exit \q @@ -87,14 +114,22 @@ psql -h your-supabase-host -p 5432 -U postgres -d postgres # Copy migrations to container docker cp 001_add_source_url_display_name.sql supabase-db:/tmp/ docker cp 002_add_hybrid_search_tsvector.sql supabase-db:/tmp/ -docker cp 003_ollama_implementation.sql supabase-db:/tmp/ -docker cp 004_add_priority_column_to_tasks.sql supabase-db:/tmp/ +docker cp 003_ollama_add_columns.sql supabase-db:/tmp/ +docker cp 004_ollama_migrate_data.sql supabase-db:/tmp/ +docker cp 005_ollama_create_functions.sql supabase-db:/tmp/ +docker cp 006_ollama_create_indexes_optional.sql supabase-db:/tmp/ +docker cp 007_add_priority_column_to_tasks.sql supabase-db:/tmp/ +docker cp 008_add_migration_tracking.sql supabase-db:/tmp/ # Execute migrations in order docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/001_add_source_url_display_name.sql docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/002_add_hybrid_search_tsvector.sql -docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/003_ollama_implementation.sql -docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/004_add_priority_column_to_tasks.sql +docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/003_ollama_add_columns.sql +docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/004_ollama_migrate_data.sql +docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/005_ollama_create_functions.sql +docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/006_ollama_create_indexes_optional.sql +docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/007_add_priority_column_to_tasks.sql +docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/008_add_migration_tracking.sql ``` ## Migration Safety diff --git a/migration/complete_setup.sql b/migration/complete_setup.sql index 1c09107d..c8ee8e61 100644 --- a/migration/complete_setup.sql +++ b/migration/complete_setup.sql @@ -982,9 +982,12 @@ INSERT INTO archon_migrations (version, migration_name) VALUES ('0.1.0', '001_add_source_url_display_name'), ('0.1.0', '002_add_hybrid_search_tsvector'), - ('0.1.0', '003_ollama_implementation'), - ('0.1.0', '004_add_priority_column_to_tasks'), - ('0.1.0', '005_add_migration_tracking') + ('0.1.0', '003_ollama_add_columns'), + ('0.1.0', '004_ollama_migrate_data'), + ('0.1.0', '005_ollama_create_functions'), + ('0.1.0', '006_ollama_create_indexes_optional'), + ('0.1.0', '007_add_priority_column_to_tasks'), + ('0.1.0', '008_add_migration_tracking') ON CONFLICT (version, migration_name) DO NOTHING; -- Enable Row Level Security on migrations table