Splitting up the Ollama migration scripts

This commit is contained in:
Cole Medin
2025-09-20 13:33:15 -05:00
parent dd79254689
commit 3e5eef5503
9 changed files with 408 additions and 541 deletions

View File

@@ -0,0 +1,35 @@
-- ======================================================================
-- Migration 003: Ollama Implementation - Add Columns
-- Adds multi-dimensional embedding support columns
-- ======================================================================
-- Increase memory for this session
SET maintenance_work_mem = '256MB';
BEGIN;
-- Add multi-dimensional embedding columns to archon_crawled_pages
ALTER TABLE archon_crawled_pages
ADD COLUMN IF NOT EXISTS embedding_384 VECTOR(384),
ADD COLUMN IF NOT EXISTS embedding_768 VECTOR(768),
ADD COLUMN IF NOT EXISTS embedding_1024 VECTOR(1024),
ADD COLUMN IF NOT EXISTS embedding_1536 VECTOR(1536),
ADD COLUMN IF NOT EXISTS embedding_3072 VECTOR(3072),
ADD COLUMN IF NOT EXISTS llm_chat_model TEXT,
ADD COLUMN IF NOT EXISTS embedding_model TEXT,
ADD COLUMN IF NOT EXISTS embedding_dimension INTEGER;
-- Add multi-dimensional embedding columns to archon_code_examples
ALTER TABLE archon_code_examples
ADD COLUMN IF NOT EXISTS embedding_384 VECTOR(384),
ADD COLUMN IF NOT EXISTS embedding_768 VECTOR(768),
ADD COLUMN IF NOT EXISTS embedding_1024 VECTOR(1024),
ADD COLUMN IF NOT EXISTS embedding_1536 VECTOR(1536),
ADD COLUMN IF NOT EXISTS embedding_3072 VECTOR(3072),
ADD COLUMN IF NOT EXISTS llm_chat_model TEXT,
ADD COLUMN IF NOT EXISTS embedding_model TEXT,
ADD COLUMN IF NOT EXISTS embedding_dimension INTEGER;
COMMIT;
SELECT 'Ollama columns added successfully' AS status;

View File

@@ -1,518 +0,0 @@
-- ======================================================================
-- UPGRADE TO MODEL TRACKING AND MULTI-DIMENSIONAL EMBEDDINGS
-- ======================================================================
-- This migration upgrades existing Archon installations to support:
-- 1. Multi-dimensional embedding columns (768, 1024, 1536, 3072)
-- 2. Model tracking fields (llm_chat_model, embedding_model, embedding_dimension)
-- 3. 384-dimension support for smaller embedding models
-- 4. Enhanced search functions for multi-dimensional support
-- ======================================================================
--
-- IMPORTANT: Run this ONLY if you have an existing Archon installation
-- that was created BEFORE the multi-dimensional embedding support.
--
-- This script is SAFE to run multiple times - it uses IF NOT EXISTS checks.
-- ======================================================================
BEGIN;
-- ======================================================================
-- SECTION 1: ADD MULTI-DIMENSIONAL EMBEDDING COLUMNS
-- ======================================================================
-- Add multi-dimensional embedding columns to archon_crawled_pages
ALTER TABLE archon_crawled_pages
ADD COLUMN IF NOT EXISTS embedding_384 VECTOR(384), -- Small embedding models
ADD COLUMN IF NOT EXISTS embedding_768 VECTOR(768), -- Google/Ollama models
ADD COLUMN IF NOT EXISTS embedding_1024 VECTOR(1024), -- Ollama large models
ADD COLUMN IF NOT EXISTS embedding_1536 VECTOR(1536), -- OpenAI standard models
ADD COLUMN IF NOT EXISTS embedding_3072 VECTOR(3072); -- OpenAI large models
-- Add multi-dimensional embedding columns to archon_code_examples
ALTER TABLE archon_code_examples
ADD COLUMN IF NOT EXISTS embedding_384 VECTOR(384), -- Small embedding models
ADD COLUMN IF NOT EXISTS embedding_768 VECTOR(768), -- Google/Ollama models
ADD COLUMN IF NOT EXISTS embedding_1024 VECTOR(1024), -- Ollama large models
ADD COLUMN IF NOT EXISTS embedding_1536 VECTOR(1536), -- OpenAI standard models
ADD COLUMN IF NOT EXISTS embedding_3072 VECTOR(3072); -- OpenAI large models
-- ======================================================================
-- SECTION 2: ADD MODEL TRACKING COLUMNS
-- ======================================================================
-- Add model tracking columns to archon_crawled_pages
ALTER TABLE archon_crawled_pages
ADD COLUMN IF NOT EXISTS llm_chat_model TEXT, -- LLM model used for processing (e.g., 'gpt-4', 'llama3:8b')
ADD COLUMN IF NOT EXISTS embedding_model TEXT, -- Embedding model used (e.g., 'text-embedding-3-large', 'all-MiniLM-L6-v2')
ADD COLUMN IF NOT EXISTS embedding_dimension INTEGER; -- Dimension of the embedding used (384, 768, 1024, 1536, 3072)
-- Add model tracking columns to archon_code_examples
ALTER TABLE archon_code_examples
ADD COLUMN IF NOT EXISTS llm_chat_model TEXT, -- LLM model used for processing (e.g., 'gpt-4', 'llama3:8b')
ADD COLUMN IF NOT EXISTS embedding_model TEXT, -- Embedding model used (e.g., 'text-embedding-3-large', 'all-MiniLM-L6-v2')
ADD COLUMN IF NOT EXISTS embedding_dimension INTEGER; -- Dimension of the embedding used (384, 768, 1024, 1536, 3072)
-- ======================================================================
-- SECTION 3: MIGRATE EXISTING EMBEDDING DATA
-- ======================================================================
-- Check if there's existing embedding data in old 'embedding' column
DO $$
DECLARE
crawled_pages_count INTEGER;
code_examples_count INTEGER;
dimension_detected INTEGER;
BEGIN
-- Check if old embedding column exists and has data
SELECT COUNT(*) INTO crawled_pages_count
FROM information_schema.columns
WHERE table_name = 'archon_crawled_pages'
AND column_name = 'embedding';
SELECT COUNT(*) INTO code_examples_count
FROM information_schema.columns
WHERE table_name = 'archon_code_examples'
AND column_name = 'embedding';
-- If old embedding columns exist, migrate the data
IF crawled_pages_count > 0 THEN
RAISE NOTICE 'Found existing embedding column in archon_crawled_pages - migrating data...';
-- Detect dimension from first non-null embedding
SELECT vector_dims(embedding) INTO dimension_detected
FROM archon_crawled_pages
WHERE embedding IS NOT NULL
LIMIT 1;
IF dimension_detected IS NOT NULL THEN
RAISE NOTICE 'Detected embedding dimension: %', dimension_detected;
-- Migrate based on detected dimension
CASE dimension_detected
WHEN 384 THEN
UPDATE archon_crawled_pages
SET embedding_384 = embedding,
embedding_dimension = 384,
embedding_model = COALESCE(embedding_model, 'legacy-384d-model')
WHERE embedding IS NOT NULL AND embedding_384 IS NULL;
WHEN 768 THEN
UPDATE archon_crawled_pages
SET embedding_768 = embedding,
embedding_dimension = 768,
embedding_model = COALESCE(embedding_model, 'legacy-768d-model')
WHERE embedding IS NOT NULL AND embedding_768 IS NULL;
WHEN 1024 THEN
UPDATE archon_crawled_pages
SET embedding_1024 = embedding,
embedding_dimension = 1024,
embedding_model = COALESCE(embedding_model, 'legacy-1024d-model')
WHERE embedding IS NOT NULL AND embedding_1024 IS NULL;
WHEN 1536 THEN
UPDATE archon_crawled_pages
SET embedding_1536 = embedding,
embedding_dimension = 1536,
embedding_model = COALESCE(embedding_model, 'text-embedding-3-small')
WHERE embedding IS NOT NULL AND embedding_1536 IS NULL;
WHEN 3072 THEN
UPDATE archon_crawled_pages
SET embedding_3072 = embedding,
embedding_dimension = 3072,
embedding_model = COALESCE(embedding_model, 'text-embedding-3-large')
WHERE embedding IS NOT NULL AND embedding_3072 IS NULL;
ELSE
RAISE NOTICE 'Unsupported embedding dimension detected: %. Skipping migration.', dimension_detected;
END CASE;
RAISE NOTICE 'Migrated existing embeddings to dimension-specific columns';
END IF;
END IF;
-- Migrate code examples if they exist
IF code_examples_count > 0 THEN
RAISE NOTICE 'Found existing embedding column in archon_code_examples - migrating data...';
-- Detect dimension from first non-null embedding
SELECT vector_dims(embedding) INTO dimension_detected
FROM archon_code_examples
WHERE embedding IS NOT NULL
LIMIT 1;
IF dimension_detected IS NOT NULL THEN
RAISE NOTICE 'Detected code examples embedding dimension: %', dimension_detected;
-- Migrate based on detected dimension
CASE dimension_detected
WHEN 384 THEN
UPDATE archon_code_examples
SET embedding_384 = embedding,
embedding_dimension = 384,
embedding_model = COALESCE(embedding_model, 'legacy-384d-model')
WHERE embedding IS NOT NULL AND embedding_384 IS NULL;
WHEN 768 THEN
UPDATE archon_code_examples
SET embedding_768 = embedding,
embedding_dimension = 768,
embedding_model = COALESCE(embedding_model, 'legacy-768d-model')
WHERE embedding IS NOT NULL AND embedding_768 IS NULL;
WHEN 1024 THEN
UPDATE archon_code_examples
SET embedding_1024 = embedding,
embedding_dimension = 1024,
embedding_model = COALESCE(embedding_model, 'legacy-1024d-model')
WHERE embedding IS NOT NULL AND embedding_1024 IS NULL;
WHEN 1536 THEN
UPDATE archon_code_examples
SET embedding_1536 = embedding,
embedding_dimension = 1536,
embedding_model = COALESCE(embedding_model, 'text-embedding-3-small')
WHERE embedding IS NOT NULL AND embedding_1536 IS NULL;
WHEN 3072 THEN
UPDATE archon_code_examples
SET embedding_3072 = embedding,
embedding_dimension = 3072,
embedding_model = COALESCE(embedding_model, 'text-embedding-3-large')
WHERE embedding IS NOT NULL AND embedding_3072 IS NULL;
ELSE
RAISE NOTICE 'Unsupported code examples embedding dimension: %. Skipping migration.', dimension_detected;
END CASE;
RAISE NOTICE 'Migrated existing code example embeddings to dimension-specific columns';
END IF;
END IF;
END $$;
-- ======================================================================
-- SECTION 4: CLEANUP LEGACY EMBEDDING COLUMNS
-- ======================================================================
-- Remove old embedding columns after successful migration
DO $$
DECLARE
crawled_pages_count INTEGER;
code_examples_count INTEGER;
BEGIN
-- Check if old embedding column exists in crawled pages
SELECT COUNT(*) INTO crawled_pages_count
FROM information_schema.columns
WHERE table_name = 'archon_crawled_pages'
AND column_name = 'embedding';
-- Check if old embedding column exists in code examples
SELECT COUNT(*) INTO code_examples_count
FROM information_schema.columns
WHERE table_name = 'archon_code_examples'
AND column_name = 'embedding';
-- Drop old embedding column from crawled pages if it exists
IF crawled_pages_count > 0 THEN
RAISE NOTICE 'Dropping legacy embedding column from archon_crawled_pages...';
ALTER TABLE archon_crawled_pages DROP COLUMN embedding;
RAISE NOTICE 'Successfully removed legacy embedding column from archon_crawled_pages';
END IF;
-- Drop old embedding column from code examples if it exists
IF code_examples_count > 0 THEN
RAISE NOTICE 'Dropping legacy embedding column from archon_code_examples...';
ALTER TABLE archon_code_examples DROP COLUMN embedding;
RAISE NOTICE 'Successfully removed legacy embedding column from archon_code_examples';
END IF;
-- Drop any indexes on the old embedding column if they exist
DROP INDEX IF EXISTS idx_archon_crawled_pages_embedding;
DROP INDEX IF EXISTS idx_archon_code_examples_embedding;
RAISE NOTICE 'Legacy column cleanup completed';
END $$;
-- ======================================================================
-- SECTION 5: CREATE OPTIMIZED INDEXES
-- ======================================================================
-- Create indexes for archon_crawled_pages (multi-dimensional support)
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_384
ON archon_crawled_pages USING ivfflat (embedding_384 vector_cosine_ops)
WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_768
ON archon_crawled_pages USING ivfflat (embedding_768 vector_cosine_ops)
WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_1024
ON archon_crawled_pages USING ivfflat (embedding_1024 vector_cosine_ops)
WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_1536
ON archon_crawled_pages USING ivfflat (embedding_1536 vector_cosine_ops)
WITH (lists = 100);
-- Note: 3072-dimensional embeddings cannot have vector indexes due to PostgreSQL vector extension 2000 dimension limit
-- The embedding_3072 column exists but cannot be indexed with current pgvector version
-- Brute force search will be used for 3072-dimensional vectors
-- CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_3072
-- ON archon_crawled_pages USING hnsw (embedding_3072 vector_cosine_ops);
-- Create indexes for archon_code_examples (multi-dimensional support)
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_384
ON archon_code_examples USING ivfflat (embedding_384 vector_cosine_ops)
WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_768
ON archon_code_examples USING ivfflat (embedding_768 vector_cosine_ops)
WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_1024
ON archon_code_examples USING ivfflat (embedding_1024 vector_cosine_ops)
WITH (lists = 100);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_1536
ON archon_code_examples USING ivfflat (embedding_1536 vector_cosine_ops)
WITH (lists = 100);
-- Note: 3072-dimensional embeddings cannot have vector indexes due to PostgreSQL vector extension 2000 dimension limit
-- The embedding_3072 column exists but cannot be indexed with current pgvector version
-- Brute force search will be used for 3072-dimensional vectors
-- CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_3072
-- ON archon_code_examples USING hnsw (embedding_3072 vector_cosine_ops);
-- Create indexes for model tracking columns
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_model
ON archon_crawled_pages (embedding_model);
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_dimension
ON archon_crawled_pages (embedding_dimension);
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_llm_chat_model
ON archon_crawled_pages (llm_chat_model);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_model
ON archon_code_examples (embedding_model);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_dimension
ON archon_code_examples (embedding_dimension);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_llm_chat_model
ON archon_code_examples (llm_chat_model);
-- ======================================================================
-- SECTION 6: HELPER FUNCTIONS FOR MULTI-DIMENSIONAL SUPPORT
-- ======================================================================
-- Function to detect embedding dimension from vector
CREATE OR REPLACE FUNCTION detect_embedding_dimension(embedding_vector vector)
RETURNS INTEGER AS $$
BEGIN
RETURN vector_dims(embedding_vector);
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- Function to get the appropriate column name for a dimension
CREATE OR REPLACE FUNCTION get_embedding_column_name(dimension INTEGER)
RETURNS TEXT AS $$
BEGIN
CASE dimension
WHEN 384 THEN RETURN 'embedding_384';
WHEN 768 THEN RETURN 'embedding_768';
WHEN 1024 THEN RETURN 'embedding_1024';
WHEN 1536 THEN RETURN 'embedding_1536';
WHEN 3072 THEN RETURN 'embedding_3072';
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %. Supported dimensions are: 384, 768, 1024, 1536, 3072', dimension;
END CASE;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- ======================================================================
-- SECTION 7: ENHANCED SEARCH FUNCTIONS
-- ======================================================================
-- Create multi-dimensional function to search for documentation chunks
CREATE OR REPLACE FUNCTION match_archon_crawled_pages_multi (
query_embedding VECTOR,
embedding_dimension INTEGER,
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
) RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT
)
LANGUAGE plpgsql
AS $$
#variable_conflict use_column
DECLARE
sql_query TEXT;
embedding_column TEXT;
BEGIN
-- Determine which embedding column to use based on dimension
CASE embedding_dimension
WHEN 384 THEN embedding_column := 'embedding_384';
WHEN 768 THEN embedding_column := 'embedding_768';
WHEN 1024 THEN embedding_column := 'embedding_1024';
WHEN 1536 THEN embedding_column := 'embedding_1536';
WHEN 3072 THEN embedding_column := 'embedding_3072';
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
END CASE;
-- Build dynamic query
sql_query := format('
SELECT id, url, chunk_number, content, metadata, source_id,
1 - (%I <=> $1) AS similarity
FROM archon_crawled_pages
WHERE (%I IS NOT NULL)
AND metadata @> $3
AND ($4 IS NULL OR source_id = $4)
ORDER BY %I <=> $1
LIMIT $2',
embedding_column, embedding_column, embedding_column);
-- Execute dynamic query
RETURN QUERY EXECUTE sql_query USING query_embedding, match_count, filter, source_filter;
END;
$$;
-- Create multi-dimensional function to search for code examples
CREATE OR REPLACE FUNCTION match_archon_code_examples_multi (
query_embedding VECTOR,
embedding_dimension INTEGER,
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
) RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
summary TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT
)
LANGUAGE plpgsql
AS $$
#variable_conflict use_column
DECLARE
sql_query TEXT;
embedding_column TEXT;
BEGIN
-- Determine which embedding column to use based on dimension
CASE embedding_dimension
WHEN 384 THEN embedding_column := 'embedding_384';
WHEN 768 THEN embedding_column := 'embedding_768';
WHEN 1024 THEN embedding_column := 'embedding_1024';
WHEN 1536 THEN embedding_column := 'embedding_1536';
WHEN 3072 THEN embedding_column := 'embedding_3072';
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
END CASE;
-- Build dynamic query
sql_query := format('
SELECT id, url, chunk_number, content, summary, metadata, source_id,
1 - (%I <=> $1) AS similarity
FROM archon_code_examples
WHERE (%I IS NOT NULL)
AND metadata @> $3
AND ($4 IS NULL OR source_id = $4)
ORDER BY %I <=> $1
LIMIT $2',
embedding_column, embedding_column, embedding_column);
-- Execute dynamic query
RETURN QUERY EXECUTE sql_query USING query_embedding, match_count, filter, source_filter;
END;
$$;
-- ======================================================================
-- SECTION 8: LEGACY COMPATIBILITY FUNCTIONS
-- ======================================================================
-- Legacy compatibility function for crawled pages (defaults to 1536D)
CREATE OR REPLACE FUNCTION match_archon_crawled_pages (
query_embedding VECTOR(1536),
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
) RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT
)
LANGUAGE plpgsql
AS $$
BEGIN
RETURN QUERY SELECT * FROM match_archon_crawled_pages_multi(query_embedding, 1536, match_count, filter, source_filter);
END;
$$;
-- Legacy compatibility function for code examples (defaults to 1536D)
CREATE OR REPLACE FUNCTION match_archon_code_examples (
query_embedding VECTOR(1536),
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
) RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
summary TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT
)
LANGUAGE plpgsql
AS $$
BEGIN
RETURN QUERY SELECT * FROM match_archon_code_examples_multi(query_embedding, 1536, match_count, filter, source_filter);
END;
$$;
COMMIT;
-- ======================================================================
-- MIGRATION COMPLETE - SUPABASE-FRIENDLY STATUS REPORT
-- ======================================================================
-- This final SELECT statement consolidates all status information for
-- display in Supabase SQL Editor (users only see the last query result)
SELECT
'🎉 ARCHON MODEL TRACKING UPGRADE COMPLETED! 🎉' AS status,
'Successfully upgraded your Archon installation' AS message,
ARRAY[
'✅ Multi-dimensional embedding support (384, 768, 1024, 1536, 3072)',
'✅ Model tracking fields (llm_chat_model, embedding_model, embedding_dimension)',
'✅ Optimized indexes for improved search performance',
'✅ Enhanced search functions with dimension-aware querying',
'✅ Legacy compatibility maintained for existing code',
'✅ Existing embedding data migrated (if any was found)',
'✅ Support for 3072-dimensional vectors (using brute force search)'
] AS features_added,
ARRAY[
'• Multiple embedding providers (OpenAI, Ollama, Google, etc.)',
'• Automatic model detection and tracking',
'• Improved search accuracy with dimension-specific indexing',
'• Full audit trail of which models processed your data'
] AS capabilities_enabled,
ARRAY[
'1. Restart your Archon services: docker compose restart',
'2. New crawls will automatically use the enhanced features',
'3. Check the Settings page to configure your preferred models',
'4. Run validate_migration.sql to verify everything works'
] AS next_steps;

View File

@@ -0,0 +1,70 @@
-- ======================================================================
-- Migration 004: Ollama Implementation - Migrate Data
-- Migrates existing embeddings to new multi-dimensional columns
-- ======================================================================
BEGIN;
-- Migrate existing embedding data from old column (if exists)
DO $$
DECLARE
crawled_pages_count INTEGER;
code_examples_count INTEGER;
dimension_detected INTEGER;
BEGIN
-- Check if old embedding column exists
SELECT COUNT(*) INTO crawled_pages_count
FROM information_schema.columns
WHERE table_name = 'archon_crawled_pages'
AND column_name = 'embedding';
IF crawled_pages_count > 0 THEN
-- Detect dimension
SELECT vector_dims(embedding) INTO dimension_detected
FROM archon_crawled_pages
WHERE embedding IS NOT NULL
LIMIT 1;
IF dimension_detected = 1536 THEN
UPDATE archon_crawled_pages
SET embedding_1536 = embedding,
embedding_dimension = 1536,
embedding_model = COALESCE(embedding_model, 'text-embedding-3-small')
WHERE embedding IS NOT NULL AND embedding_1536 IS NULL;
END IF;
-- Drop old column
ALTER TABLE archon_crawled_pages DROP COLUMN IF EXISTS embedding;
END IF;
-- Same for code_examples
SELECT COUNT(*) INTO code_examples_count
FROM information_schema.columns
WHERE table_name = 'archon_code_examples'
AND column_name = 'embedding';
IF code_examples_count > 0 THEN
SELECT vector_dims(embedding) INTO dimension_detected
FROM archon_code_examples
WHERE embedding IS NOT NULL
LIMIT 1;
IF dimension_detected = 1536 THEN
UPDATE archon_code_examples
SET embedding_1536 = embedding,
embedding_dimension = 1536,
embedding_model = COALESCE(embedding_model, 'text-embedding-3-small')
WHERE embedding IS NOT NULL AND embedding_1536 IS NULL;
END IF;
ALTER TABLE archon_code_examples DROP COLUMN IF EXISTS embedding;
END IF;
END $$;
-- Drop old indexes if they exist
DROP INDEX IF EXISTS idx_archon_crawled_pages_embedding;
DROP INDEX IF EXISTS idx_archon_code_examples_embedding;
COMMIT;
SELECT 'Ollama data migrated successfully' AS status;

View File

@@ -0,0 +1,172 @@
-- ======================================================================
-- Migration 005: Ollama Implementation - Create Functions
-- Creates search functions for multi-dimensional embeddings
-- ======================================================================
BEGIN;
-- Helper function to detect embedding dimension
CREATE OR REPLACE FUNCTION detect_embedding_dimension(embedding_vector vector)
RETURNS INTEGER AS $$
BEGIN
RETURN vector_dims(embedding_vector);
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- Helper function to get column name for dimension
CREATE OR REPLACE FUNCTION get_embedding_column_name(dimension INTEGER)
RETURNS TEXT AS $$
BEGIN
CASE dimension
WHEN 384 THEN RETURN 'embedding_384';
WHEN 768 THEN RETURN 'embedding_768';
WHEN 1024 THEN RETURN 'embedding_1024';
WHEN 1536 THEN RETURN 'embedding_1536';
WHEN 3072 THEN RETURN 'embedding_3072';
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', dimension;
END CASE;
END;
$$ LANGUAGE plpgsql IMMUTABLE;
-- Multi-dimensional search for crawled pages
CREATE OR REPLACE FUNCTION match_archon_crawled_pages_multi (
query_embedding VECTOR,
embedding_dimension INTEGER,
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
) RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT
)
LANGUAGE plpgsql
AS $$
#variable_conflict use_column
DECLARE
sql_query TEXT;
embedding_column TEXT;
BEGIN
CASE embedding_dimension
WHEN 384 THEN embedding_column := 'embedding_384';
WHEN 768 THEN embedding_column := 'embedding_768';
WHEN 1024 THEN embedding_column := 'embedding_1024';
WHEN 1536 THEN embedding_column := 'embedding_1536';
WHEN 3072 THEN embedding_column := 'embedding_3072';
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
END CASE;
sql_query := format('
SELECT id, url, chunk_number, content, metadata, source_id,
1 - (%I <=> $1) AS similarity
FROM archon_crawled_pages
WHERE (%I IS NOT NULL)
AND metadata @> $3
AND ($4 IS NULL OR source_id = $4)
ORDER BY %I <=> $1
LIMIT $2',
embedding_column, embedding_column, embedding_column);
RETURN QUERY EXECUTE sql_query USING query_embedding, match_count, filter, source_filter;
END;
$$;
-- Multi-dimensional search for code examples
CREATE OR REPLACE FUNCTION match_archon_code_examples_multi (
query_embedding VECTOR,
embedding_dimension INTEGER,
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
) RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
summary TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT
)
LANGUAGE plpgsql
AS $$
#variable_conflict use_column
DECLARE
sql_query TEXT;
embedding_column TEXT;
BEGIN
CASE embedding_dimension
WHEN 384 THEN embedding_column := 'embedding_384';
WHEN 768 THEN embedding_column := 'embedding_768';
WHEN 1024 THEN embedding_column := 'embedding_1024';
WHEN 1536 THEN embedding_column := 'embedding_1536';
WHEN 3072 THEN embedding_column := 'embedding_3072';
ELSE RAISE EXCEPTION 'Unsupported embedding dimension: %', embedding_dimension;
END CASE;
sql_query := format('
SELECT id, url, chunk_number, content, summary, metadata, source_id,
1 - (%I <=> $1) AS similarity
FROM archon_code_examples
WHERE (%I IS NOT NULL)
AND metadata @> $3
AND ($4 IS NULL OR source_id = $4)
ORDER BY %I <=> $1
LIMIT $2',
embedding_column, embedding_column, embedding_column);
RETURN QUERY EXECUTE sql_query USING query_embedding, match_count, filter, source_filter;
END;
$$;
-- Legacy compatibility (defaults to 1536D)
CREATE OR REPLACE FUNCTION match_archon_crawled_pages (
query_embedding VECTOR(1536),
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
) RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT
)
LANGUAGE plpgsql
AS $$
BEGIN
RETURN QUERY SELECT * FROM match_archon_crawled_pages_multi(query_embedding, 1536, match_count, filter, source_filter);
END;
$$;
CREATE OR REPLACE FUNCTION match_archon_code_examples (
query_embedding VECTOR(1536),
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'::jsonb,
source_filter TEXT DEFAULT NULL
) RETURNS TABLE (
id BIGINT,
url VARCHAR,
chunk_number INTEGER,
content TEXT,
summary TEXT,
metadata JSONB,
source_id TEXT,
similarity FLOAT
)
LANGUAGE plpgsql
AS $$
BEGIN
RETURN QUERY SELECT * FROM match_archon_code_examples_multi(query_embedding, 1536, match_count, filter, source_filter);
END;
$$;
COMMIT;
SELECT 'Ollama functions created successfully' AS status;

View File

@@ -0,0 +1,67 @@
-- ======================================================================
-- Migration 006: Ollama Implementation - Create Indexes (Optional)
-- Creates vector indexes for performance (may timeout on large datasets)
-- ======================================================================
-- IMPORTANT: This migration creates vector indexes which are memory-intensive
-- If this fails, you can skip it and the system will use brute-force search
-- You can create these indexes later via direct database connection
SET maintenance_work_mem = '512MB';
SET statement_timeout = '10min';
-- Create ONE index at a time to avoid memory issues
-- Comment out any that fail and continue with the next
-- Index 1 of 8
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_1536
ON archon_crawled_pages USING ivfflat (embedding_1536 vector_cosine_ops)
WITH (lists = 100);
-- Index 2 of 8
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_1536
ON archon_code_examples USING ivfflat (embedding_1536 vector_cosine_ops)
WITH (lists = 100);
-- Index 3 of 8
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_768
ON archon_crawled_pages USING ivfflat (embedding_768 vector_cosine_ops)
WITH (lists = 100);
-- Index 4 of 8
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_768
ON archon_code_examples USING ivfflat (embedding_768 vector_cosine_ops)
WITH (lists = 100);
-- Index 5 of 8
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_384
ON archon_crawled_pages USING ivfflat (embedding_384 vector_cosine_ops)
WITH (lists = 100);
-- Index 6 of 8
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_384
ON archon_code_examples USING ivfflat (embedding_384 vector_cosine_ops)
WITH (lists = 100);
-- Index 7 of 8
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_1024
ON archon_crawled_pages USING ivfflat (embedding_1024 vector_cosine_ops)
WITH (lists = 100);
-- Index 8 of 8
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_1024
ON archon_code_examples USING ivfflat (embedding_1024 vector_cosine_ops)
WITH (lists = 100);
-- Simple B-tree indexes (these are fast)
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_model ON archon_crawled_pages (embedding_model);
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_embedding_dimension ON archon_crawled_pages (embedding_dimension);
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_llm_chat_model ON archon_crawled_pages (llm_chat_model);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_model ON archon_code_examples (embedding_model);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_embedding_dimension ON archon_code_examples (embedding_dimension);
CREATE INDEX IF NOT EXISTS idx_archon_code_examples_llm_chat_model ON archon_code_examples (llm_chat_model);
RESET maintenance_work_mem;
RESET statement_timeout;
SELECT 'Ollama indexes created (or skipped if timed out - that issue will be obvious in Supabase)' AS status;

View File

@@ -1,4 +1,4 @@
-- Migration: 005_add_migration_tracking.sql
-- Migration: 008_add_migration_tracking.sql
-- Description: Create archon_migrations table for tracking applied database migrations
-- Version: 0.1.0
-- Author: Archon Team
@@ -30,18 +30,21 @@ COMMENT ON COLUMN archon_migrations.checksum IS 'Optional MD5 checksum of migrat
-- Record this migration as applied (self-recording pattern)
-- This allows the migration system to bootstrap itself
INSERT INTO archon_migrations (version, migration_name)
VALUES ('0.1.0', '005_add_migration_tracking')
VALUES ('0.1.0', '008_add_migration_tracking')
ON CONFLICT (version, migration_name) DO NOTHING;
-- Retroactively record previously applied migrations (001-004)
-- Retroactively record previously applied migrations (001-007)
-- Since these migrations couldn't self-record (table didn't exist yet),
-- we record them here to ensure the migration system knows they've been applied
INSERT INTO archon_migrations (version, migration_name)
VALUES
('0.1.0', '001_add_source_url_display_name'),
('0.1.0', '002_add_hybrid_search_tsvector'),
('0.1.0', '003_ollama_implementation'),
('0.1.0', '004_add_priority_column_to_tasks')
('0.1.0', '003_ollama_add_columns'),
('0.1.0', '004_ollama_migrate_data'),
('0.1.0', '005_ollama_create_functions'),
('0.1.0', '006_ollama_create_indexes_optional'),
('0.1.0', '007_add_priority_column_to_tasks')
ON CONFLICT (version, migration_name) DO NOTHING;
-- Enable Row Level Security on migrations table

View File

@@ -16,6 +16,8 @@ Creates timestamped backup tables of all your existing data:
### 2. Migration Scripts (Run in Order)
You only have to run the ones you haven't already! If you don't remember exactly, it is okay to rerun migration scripts.
**2.1. `001_add_source_url_display_name.sql`**
- Adds display name field to sources table
- Improves UI presentation of crawled sources
@@ -25,17 +27,34 @@ Creates timestamped backup tables of all your existing data:
- Implements hybrid search with tsvector columns
- Creates optimized search indexes
**2.3. `003_ollama_implementation.sql`**
- Multi-dimensional embedding support (384, 768, 1024, 1536, 3072 dimensions)
- Model tracking fields (`llm_chat_model`, `embedding_model`, `embedding_dimension`)
- Optimized indexes for improved search performance
- Enhanced search functions with dimension-aware querying
- Automatic migration of existing embedding data
**2.3. `003_ollama_add_columns.sql`**
- Adds multi-dimensional embedding columns (384, 768, 1024, 1536, 3072 dimensions)
- Adds model tracking fields (`llm_chat_model`, `embedding_model`, `embedding_dimension`)
**2.4. `004_add_priority_column_to_tasks.sql`**
**2.4. `004_ollama_migrate_data.sql`**
- Migrates existing embeddings to new multi-dimensional columns
- Drops old embedding column after migration
- Removes obsolete indexes
**2.5. `005_ollama_create_functions.sql`**
- Creates search functions for multi-dimensional embeddings
- Adds helper functions for dimension detection
- Maintains backward compatibility with legacy search functions
**2.6. `006_ollama_create_indexes_optional.sql`**
- Creates vector indexes for performance (may timeout on large datasets)
- Creates B-tree indexes for model fields
- Can be skipped if timeout occurs (system will use brute-force search)
**2.7. `007_add_priority_column_to_tasks.sql`**
- Adds priority field to tasks table
- Enables task prioritization in project management
**2.8. `008_add_migration_tracking.sql`**
- Creates migration tracking table
- Records all applied migrations
- Enables migration version control
## Migration Process (Follow This Order!)
### Step 1: Backup Your Data
@@ -49,8 +68,12 @@ Creates timestamped backup tables of all your existing data:
-- Run each script in sequence:
-- 1. Run: 001_add_source_url_display_name.sql
-- 2. Run: 002_add_hybrid_search_tsvector.sql
-- 3. Run: 003_ollama_implementation.sql
-- 4. Run: 004_add_priority_column_to_tasks.sql
-- 3. Run: 003_ollama_add_columns.sql
-- 4. Run: 004_ollama_migrate_data.sql
-- 5. Run: 005_ollama_create_functions.sql
-- 6. Run: 006_ollama_create_indexes_optional.sql (optional - may timeout)
-- 7. Run: 007_add_priority_column_to_tasks.sql
-- 8. Run: 008_add_migration_tracking.sql
```
### Step 3: Restart Services
@@ -75,8 +98,12 @@ psql -h your-supabase-host -p 5432 -U postgres -d postgres
# Run the migrations in order
\i /path/to/001_add_source_url_display_name.sql
\i /path/to/002_add_hybrid_search_tsvector.sql
\i /path/to/003_ollama_implementation.sql
\i /path/to/004_add_priority_column_to_tasks.sql
\i /path/to/003_ollama_add_columns.sql
\i /path/to/004_ollama_migrate_data.sql
\i /path/to/005_ollama_create_functions.sql
\i /path/to/006_ollama_create_indexes_optional.sql
\i /path/to/007_add_priority_column_to_tasks.sql
\i /path/to/008_add_migration_tracking.sql
# Exit
\q
@@ -87,14 +114,22 @@ psql -h your-supabase-host -p 5432 -U postgres -d postgres
# Copy migrations to container
docker cp 001_add_source_url_display_name.sql supabase-db:/tmp/
docker cp 002_add_hybrid_search_tsvector.sql supabase-db:/tmp/
docker cp 003_ollama_implementation.sql supabase-db:/tmp/
docker cp 004_add_priority_column_to_tasks.sql supabase-db:/tmp/
docker cp 003_ollama_add_columns.sql supabase-db:/tmp/
docker cp 004_ollama_migrate_data.sql supabase-db:/tmp/
docker cp 005_ollama_create_functions.sql supabase-db:/tmp/
docker cp 006_ollama_create_indexes_optional.sql supabase-db:/tmp/
docker cp 007_add_priority_column_to_tasks.sql supabase-db:/tmp/
docker cp 008_add_migration_tracking.sql supabase-db:/tmp/
# Execute migrations in order
docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/001_add_source_url_display_name.sql
docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/002_add_hybrid_search_tsvector.sql
docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/003_ollama_implementation.sql
docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/004_add_priority_column_to_tasks.sql
docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/003_ollama_add_columns.sql
docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/004_ollama_migrate_data.sql
docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/005_ollama_create_functions.sql
docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/006_ollama_create_indexes_optional.sql
docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/007_add_priority_column_to_tasks.sql
docker exec -it supabase-db psql -U postgres -d postgres -f /tmp/008_add_migration_tracking.sql
```
## Migration Safety

View File

@@ -982,9 +982,12 @@ INSERT INTO archon_migrations (version, migration_name)
VALUES
('0.1.0', '001_add_source_url_display_name'),
('0.1.0', '002_add_hybrid_search_tsvector'),
('0.1.0', '003_ollama_implementation'),
('0.1.0', '004_add_priority_column_to_tasks'),
('0.1.0', '005_add_migration_tracking')
('0.1.0', '003_ollama_add_columns'),
('0.1.0', '004_ollama_migrate_data'),
('0.1.0', '005_ollama_create_functions'),
('0.1.0', '006_ollama_create_indexes_optional'),
('0.1.0', '007_add_priority_column_to_tasks'),
('0.1.0', '008_add_migration_tracking')
ON CONFLICT (version, migration_name) DO NOTHING;
-- Enable Row Level Security on migrations table