diff --git a/migration/MIGRATION_GUIDE.md b/migration/MIGRATION_GUIDE.md new file mode 100644 index 00000000..82d731f5 --- /dev/null +++ b/migration/MIGRATION_GUIDE.md @@ -0,0 +1,128 @@ +# Archon Database Migration Guide + +## Problem: Supabase SQL Editor Timeout + +The full migration script times out in Supabase SQL editor due to memory-intensive vector index creation. + +## Solution: Run Migration in Steps + +### Method 1: Use Step-by-Step Scripts (Recommended) + +Run these scripts in order in the Supabase SQL editor: + +1. **Step 1**: `step1_add_columns.sql` - Adds new columns (fast, ~5 seconds) +2. **Step 2**: `step2_migrate_data.sql` - Migrates existing data (fast, ~10 seconds) +3. **Step 3**: `step3_create_functions.sql` - Creates search functions (fast, ~5 seconds) +4. **Step 4**: `step4_create_indexes_optional.sql` - Creates indexes (may timeout - OPTIONAL) + +**Note**: If Step 4 times out, the system will still work using brute-force search. You can create indexes later. + +### Method 2: Direct Database Connection + +Connect directly to your Supabase database using psql or a database client: + +#### Get Connection String +1. Go to Supabase Dashboard → Settings → Database +2. Copy the connection string (use "Session pooler" for migrations) +3. It looks like: `postgresql://postgres.[project-ref]:[password]@aws-0-[region].pooler.supabase.com:6543/postgres` + +#### Using psql +```bash +# Connect to database +psql "postgresql://postgres.[project-ref]:[password]@aws-0-[region].pooler.supabase.com:6543/postgres" + +# Run the full migration +\i migration/upgrade_database_with_memory_fix.sql + +# Or run individual steps +\i migration/step1_add_columns.sql +\i migration/step2_migrate_data.sql +\i migration/step3_create_functions.sql +\i migration/step4_create_indexes_optional.sql +``` + +#### Using TablePlus/DBeaver/pgAdmin +1. Create new connection with your connection string +2. Open and run each SQL file in order +3. Monitor execution time and memory usage + +### Method 3: Use Supabase CLI + +```bash +# Install Supabase CLI +npm install -g supabase + +# Login and link project +supabase login +supabase link --project-ref [your-project-ref] + +# Run migration +supabase db push migration/upgrade_database_with_memory_fix.sql +``` + +### Method 4: Skip Vector Indexes Entirely + +If you have a small dataset (<10,000 documents), you can skip Step 4 entirely. The system will use brute-force search which is fast enough for small datasets. + +## Verification + +After migration, run this query to verify: + +```sql +SELECT + EXISTS(SELECT 1 FROM information_schema.columns + WHERE table_name = 'archon_crawled_pages' + AND column_name = 'embedding_1536') as has_1536_column, + EXISTS(SELECT 1 FROM information_schema.routines + WHERE routine_name = 'match_archon_crawled_pages_multi') as has_multi_function, + COUNT(*) as index_count +FROM pg_indexes +WHERE tablename IN ('archon_crawled_pages', 'archon_code_examples') +AND indexname LIKE '%embedding%'; +``` + +Expected result: +- `has_1536_column`: true +- `has_multi_function`: true +- `index_count`: 8+ (or 0 if you skipped Step 4) + +## Troubleshooting + +### "Memory required" error +- Increase `maintenance_work_mem` in the script +- Use direct database connection instead of SQL editor +- Create indexes one at a time + +### "Statement timeout" error +- Run scripts in smaller steps +- Use direct database connection +- Increase `statement_timeout` setting + +### "Permission denied" error +- Ensure you're using the service role key +- Check database permissions in Supabase dashboard + +## Post-Migration + +After successful migration: + +1. **Restart services**: + ```bash + docker compose restart + ``` + +2. **Test the system**: + - Check if RAG search works in the UI + - Try crawling a new website + - Verify embeddings are being created + +3. **Monitor performance**: + - If searches are slow without indexes, create them via direct connection + - Consider using smaller embedding dimensions (384 or 768) for faster performance + +## Need Help? + +If you encounter issues: +1. Check Supabase logs: Dashboard → Logs → Postgres +2. Verify your Supabase plan has sufficient resources +3. Contact Supabase support for memory limit increases (paid plans only) \ No newline at end of file diff --git a/migration/step1_add_columns.sql b/migration/step1_add_columns.sql new file mode 100644 index 00000000..4b8dd91b --- /dev/null +++ b/migration/step1_add_columns.sql @@ -0,0 +1,35 @@ +-- ====================================================================== +-- STEP 1: ADD COLUMNS ONLY (No indexes yet) +-- Run this first - should complete quickly +-- ====================================================================== + +-- Increase memory for this session +SET maintenance_work_mem = '256MB'; + +BEGIN; + +-- Add multi-dimensional embedding columns to archon_crawled_pages +ALTER TABLE archon_crawled_pages +ADD COLUMN IF NOT EXISTS embedding_384 VECTOR(384), +ADD COLUMN IF NOT EXISTS embedding_768 VECTOR(768), +ADD COLUMN IF NOT EXISTS embedding_1024 VECTOR(1024), +ADD COLUMN IF NOT EXISTS embedding_1536 VECTOR(1536), +ADD COLUMN IF NOT EXISTS embedding_3072 VECTOR(3072), +ADD COLUMN IF NOT EXISTS llm_chat_model TEXT, +ADD COLUMN IF NOT EXISTS embedding_model TEXT, +ADD COLUMN IF NOT EXISTS embedding_dimension INTEGER; + +-- Add multi-dimensional embedding columns to archon_code_examples +ALTER TABLE archon_code_examples +ADD COLUMN IF NOT EXISTS embedding_384 VECTOR(384), +ADD COLUMN IF NOT EXISTS embedding_768 VECTOR(768), +ADD COLUMN IF NOT EXISTS embedding_1024 VECTOR(1024), +ADD COLUMN IF NOT EXISTS embedding_1536 VECTOR(1536), +ADD COLUMN IF NOT EXISTS embedding_3072 VECTOR(3072), +ADD COLUMN IF NOT EXISTS llm_chat_model TEXT, +ADD COLUMN IF NOT EXISTS embedding_model TEXT, +ADD COLUMN IF NOT EXISTS embedding_dimension INTEGER; + +COMMIT; + +SELECT 'Step 1 Complete: Columns added successfully' AS status; \ No newline at end of file diff --git a/migration/step2_migrate_data.sql b/migration/step2_migrate_data.sql new file mode 100644 index 00000000..1191e176 --- /dev/null +++ b/migration/step2_migrate_data.sql @@ -0,0 +1,70 @@ +-- ====================================================================== +-- STEP 2: MIGRATE EXISTING DATA (if any) +-- Run after Step 1 completes +-- ====================================================================== + +BEGIN; + +-- Migrate existing embedding data from old column (if exists) +DO $$ +DECLARE + crawled_pages_count INTEGER; + code_examples_count INTEGER; + dimension_detected INTEGER; +BEGIN + -- Check if old embedding column exists + SELECT COUNT(*) INTO crawled_pages_count + FROM information_schema.columns + WHERE table_name = 'archon_crawled_pages' + AND column_name = 'embedding'; + + IF crawled_pages_count > 0 THEN + -- Detect dimension + SELECT vector_dims(embedding) INTO dimension_detected + FROM archon_crawled_pages + WHERE embedding IS NOT NULL + LIMIT 1; + + IF dimension_detected = 1536 THEN + UPDATE archon_crawled_pages + SET embedding_1536 = embedding, + embedding_dimension = 1536, + embedding_model = COALESCE(embedding_model, 'text-embedding-3-small') + WHERE embedding IS NOT NULL AND embedding_1536 IS NULL; + END IF; + + -- Drop old column + ALTER TABLE archon_crawled_pages DROP COLUMN IF EXISTS embedding; + END IF; + + -- Same for code_examples + SELECT COUNT(*) INTO code_examples_count + FROM information_schema.columns + WHERE table_name = 'archon_code_examples' + AND column_name = 'embedding'; + + IF code_examples_count > 0 THEN + SELECT vector_dims(embedding) INTO dimension_detected + FROM archon_code_examples + WHERE embedding IS NOT NULL + LIMIT 1; + + IF dimension_detected = 1536 THEN + UPDATE archon_code_examples + SET embedding_1536 = embedding, + embedding_dimension = 1536, + embedding_model = COALESCE(embedding_model, 'text-embedding-3-small') + WHERE embedding IS NOT NULL AND embedding_1536 IS NULL; + END IF; + + ALTER TABLE archon_code_examples DROP COLUMN IF EXISTS embedding; + END IF; +END $$; + +-- Drop old indexes if they exist +DROP INDEX IF EXISTS idx_archon_crawled_pages_embedding; +DROP INDEX IF EXISTS idx_archon_code_examples_embedding; + +COMMIT; + +SELECT 'Step 2 Complete: Data migrated successfully' AS status; \ No newline at end of file diff --git a/migration/step3_create_functions.sql b/migration/step3_create_functions.sql new file mode 100644 index 00000000..2632d074 --- /dev/null +++ b/migration/step3_create_functions.sql @@ -0,0 +1,172 @@ +-- ====================================================================== +-- STEP 3: CREATE SEARCH FUNCTIONS +-- Run after Step 2 completes +-- ====================================================================== + +BEGIN; + +-- Helper function to detect embedding dimension +CREATE OR REPLACE FUNCTION detect_embedding_dimension(embedding_vector vector) +RETURNS INTEGER AS $$ +BEGIN + RETURN vector_dims(embedding_vector); +END; +$$ LANGUAGE plpgsql IMMUTABLE; + +-- Helper function to get column name for dimension +CREATE OR REPLACE FUNCTION get_embedding_column_name(dimension INTEGER) +RETURNS TEXT AS $$ +BEGIN + CASE dimension + WHEN 384 THEN RETURN 'embedding_384'; + WHEN 768 THEN RETURN 'embedding_768'; + WHEN 1024 THEN RETURN 'embedding_1024'; + WHEN 1536 THEN RETURN 'embedding_1536'; + WHEN 3072 THEN RETURN 'embedding_3072'; + ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', dimension; + END CASE; +END; +$$ LANGUAGE plpgsql IMMUTABLE; + +-- Multi-dimensional search for crawled pages +CREATE OR REPLACE FUNCTION match_archon_crawled_pages_multi ( + query_embedding VECTOR, + embedding_dimension INTEGER, + match_count INT DEFAULT 10, + filter JSONB DEFAULT '{}'::jsonb, + source_filter TEXT DEFAULT NULL +) RETURNS TABLE ( + id BIGINT, + url VARCHAR, + chunk_number INTEGER, + content TEXT, + metadata JSONB, + source_id TEXT, + similarity FLOAT +) +LANGUAGE plpgsql +AS $$ +#variable_conflict use_column +DECLARE + sql_query TEXT; + embedding_column TEXT; +BEGIN + CASE embedding_dimension + WHEN 384 THEN embedding_column := 'embedding_384'; + WHEN 768 THEN embedding_column := 'embedding_768'; + WHEN 1024 THEN embedding_column := 'embedding_1024'; + WHEN 1536 THEN embedding_column := 'embedding_1536'; + WHEN 3072 THEN embedding_column := 'embedding_3072'; + ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension; + END CASE; + + sql_query := format(' + SELECT id, url, chunk_number, content, metadata, source_id, + 1 - (%I <=> $1) AS similarity + FROM archon_crawled_pages + WHERE (%I IS NOT NULL) + AND metadata @> $3 + AND ($4 IS NULL OR source_id = $4) + ORDER BY %I <=> $1 + LIMIT $2', + embedding_column, embedding_column, embedding_column); + + RETURN QUERY EXECUTE sql_query USING query_embedding, match_count, filter, source_filter; +END; +$$; + +-- Multi-dimensional search for code examples +CREATE OR REPLACE FUNCTION match_archon_code_examples_multi ( + query_embedding VECTOR, + embedding_dimension INTEGER, + match_count INT DEFAULT 10, + filter JSONB DEFAULT '{}'::jsonb, + source_filter TEXT DEFAULT NULL +) RETURNS TABLE ( + id BIGINT, + url VARCHAR, + chunk_number INTEGER, + content TEXT, + summary TEXT, + metadata JSONB, + source_id TEXT, + similarity FLOAT +) +LANGUAGE plpgsql +AS $$ +#variable_conflict use_column +DECLARE + sql_query TEXT; + embedding_column TEXT; +BEGIN + CASE embedding_dimension + WHEN 384 THEN embedding_column := 'embedding_384'; + WHEN 768 THEN embedding_column := 'embedding_768'; + WHEN 1024 THEN embedding_column := 'embedding_1024'; + WHEN 1536 THEN embedding_column := 'embedding_1536'; + WHEN 3072 THEN embedding_column := 'embedding_3072'; + ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension; + END CASE; + + sql_query := format(' + SELECT id, url, chunk_number, content, summary, metadata, source_id, + 1 - (%I <=> $1) AS similarity + FROM archon_code_examples + WHERE (%I IS NOT NULL) + AND metadata @> $3 + AND ($4 IS NULL OR source_id = $4) + ORDER BY %I <=> $1 + LIMIT $2', + embedding_column, embedding_column, embedding_column); + + RETURN QUERY EXECUTE sql_query USING query_embedding, match_count, filter, source_filter; +END; +$$; + +-- Legacy compatibility (defaults to 1536D) +CREATE OR REPLACE FUNCTION match_archon_crawled_pages ( + query_embedding VECTOR(1536), + match_count INT DEFAULT 10, + filter JSONB DEFAULT '{}'::jsonb, + source_filter TEXT DEFAULT NULL +) RETURNS TABLE ( + id BIGINT, + url VARCHAR, + chunk_number INTEGER, + content TEXT, + metadata JSONB, + source_id TEXT, + similarity FLOAT +) +LANGUAGE plpgsql +AS $$ +BEGIN + RETURN QUERY SELECT * FROM match_archon_crawled_pages_multi(query_embedding, 1536, match_count, filter, source_filter); +END; +$$; + +CREATE OR REPLACE FUNCTION match_archon_code_examples ( + query_embedding VECTOR(1536), + match_count INT DEFAULT 10, + filter JSONB DEFAULT '{}'::jsonb, + source_filter TEXT DEFAULT NULL +) RETURNS TABLE ( + id BIGINT, + url VARCHAR, + chunk_number INTEGER, + content TEXT, + summary TEXT, + metadata JSONB, + source_id TEXT, + similarity FLOAT +) +LANGUAGE plpgsql +AS $$ +BEGIN + RETURN QUERY SELECT * FROM match_archon_code_examples_multi(query_embedding, 1536, match_count, filter, source_filter); +END; +$$; + +COMMIT; + +SELECT 'Step 3 Complete: Functions created successfully' AS status; \ No newline at end of file diff --git a/migration/step4_create_indexes_optional.sql b/migration/step4_create_indexes_optional.sql new file mode 100644 index 00000000..b97d1fd7 --- /dev/null +++ b/migration/step4_create_indexes_optional.sql @@ -0,0 +1,67 @@ +-- ====================================================================== +-- STEP 4: CREATE INDEXES (OPTIONAL - May timeout on large datasets) +-- Run this LAST, or skip and run via direct connection +-- ====================================================================== + +-- IMPORTANT: This step creates vector indexes which are memory-intensive +-- If this fails, you can skip it and the system will use brute-force search +-- You can create these indexes later via direct database connection + +SET maintenance_work_mem = '512MB'; +SET statement_timeout = '10min'; + +-- Create ONE index at a time to avoid memory issues +-- Comment out any that fail and continue with the next + +-- Index 1 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_1536 +ON archon_crawled_pages USING ivfflat (embedding_1536 vector_cosine_ops) +WITH (lists = 100); + +-- Index 2 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_1536 +ON archon_code_examples USING ivfflat (embedding_1536 vector_cosine_ops) +WITH (lists = 100); + +-- Index 3 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_768 +ON archon_crawled_pages USING ivfflat (embedding_768 vector_cosine_ops) +WITH (lists = 100); + +-- Index 4 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_768 +ON archon_code_examples USING ivfflat (embedding_768 vector_cosine_ops) +WITH (lists = 100); + +-- Index 5 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_384 +ON archon_crawled_pages USING ivfflat (embedding_384 vector_cosine_ops) +WITH (lists = 100); + +-- Index 6 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_384 +ON archon_code_examples USING ivfflat (embedding_384 vector_cosine_ops) +WITH (lists = 100); + +-- Index 7 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_1024 +ON archon_crawled_pages USING ivfflat (embedding_1024 vector_cosine_ops) +WITH (lists = 100); + +-- Index 8 of 8 +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_1024 +ON archon_code_examples USING ivfflat (embedding_1024 vector_cosine_ops) +WITH (lists = 100); + +-- Simple B-tree indexes (these are fast) +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_model ON archon_crawled_pages (embedding_model); +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_dimension ON archon_crawled_pages (embedding_dimension); +CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_llm_chat_model ON archon_crawled_pages (llm_chat_model); +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_model ON archon_code_examples (embedding_model); +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_dimension ON archon_code_examples (embedding_dimension); +CREATE INDEX IF NOT EXISTS idx_archon_code_examples_llm_chat_model ON archon_code_examples (llm_chat_model); + +RESET maintenance_work_mem; +RESET statement_timeout; + +SELECT 'Step 4 Complete: Indexes created (or skipped if timed out)' AS status; \ No newline at end of file