Mirror of https://github.com/coleam00/Archon.git (synced 2026-01-04 21:58:47 -05:00)

Merge branch 'ui-changes-backup' into merge-ui-socket-fixes
@@ -316,8 +316,8 @@ def register_task_tools(mcp: FastMCP):

        Args:
            task_id: UUID of the task to update
            title: New title (optional)
            description: New description (optional)
            title: New task title (optional)
            description: New task description (optional)
            status: New status - "todo" | "doing" | "review" | "done" (optional)
            assignee: New assignee (optional)
            task_order: New priority order (optional)

@@ -358,7 +358,7 @@ def register_task_tools(mcp: FastMCP):

        if not update_fields:
            return MCPErrorFormatter.format_error(
                error_type="validation_error",
                message="No fields to update",
                message="No fields provided to update",
                suggestion="Provide at least one field to update",
            )
@@ -18,9 +18,7 @@ from pydantic import BaseModel

logger = logging.getLogger(__name__)

# Import Socket.IO instance
from ..socketio_app import get_socketio_instance

sio = get_socketio_instance()
from ..socketio_app import sio

# Create router
router = APIRouter(prefix="/api/agent-chat", tags=["agent-chat"])
File diff suppressed because it is too large
@@ -8,12 +8,10 @@ No other modules should import from this file.

import asyncio

from ..config.logfire_config import get_logger
from ..socketio_app import get_socketio_instance
from ..socketio_app import sio

logger = get_logger(__name__)

# Get Socket.IO instance
sio = get_socketio_instance()


# Core broadcast functions
@@ -13,13 +13,10 @@ from ..config.logfire_config import get_logger

from ..services.background_task_manager import get_task_manager
from ..services.projects.project_service import ProjectService
from ..services.projects.source_linking_service import SourceLinkingService
from ..socketio_app import get_socketio_instance
from ..socketio_app import sio

logger = get_logger(__name__)

# Get Socket.IO instance
sio = get_socketio_instance()
logger.info(f"🔗 [SOCKETIO] Socket.IO instance ID: {id(sio)}")

# Rate limiting for Socket.IO broadcasts
_last_broadcast_times: dict[str, float] = {}
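The common thread in the hunks above is replacing per-module get_socketio_instance() calls with a single sio object imported from socketio_app, so every router and service shares one Socket.IO server. A minimal sketch of what such a module-level singleton can look like, assuming the python-socketio library; everything beyond the name sio is illustrative rather than taken from this repository:

# socketio_app.py (sketch): build one AsyncServer at import time so that
# `from ..socketio_app import sio` yields the same instance everywhere.
import socketio

sio = socketio.AsyncServer(async_mode="asgi", cors_allowed_origins="*")

def get_socketio_instance() -> socketio.AsyncServer:
    # Kept only for backwards compatibility; returns the module-level singleton.
    return sio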
@@ -217,14 +217,21 @@ class CodeExtractionService:

        Returns:
            List of code blocks with metadata
        """
        import asyncio
        import time

        # Progress will be reported during the loop below

        all_code_blocks = []
        total_docs = len(crawl_results)
        completed_docs = 0

        # PERFORMANCE: Track extraction time per document
        MAX_EXTRACTION_TIME_PER_DOC = 5.0  # 5 seconds max per document

        for doc in crawl_results:
            try:
                doc_start_time = time.time()
                source_url = doc["url"]
                html_content = doc.get("html", "")
                md = doc.get("markdown", "")
@@ -234,9 +241,7 @@ class CodeExtractionService:

                    f"Document content check | url={source_url} | has_html={bool(html_content)} | has_markdown={bool(md)} | html_len={len(html_content) if html_content else 0} | md_len={len(md) if md else 0}"
                )

                # Get dynamic minimum length based on document context
                # Extract some context from the document for analysis
                doc_context = md[:1000] if md else html_content[:1000] if html_content else ""
                # Dynamic minimum length is handled inside the extraction methods

                # Check markdown first to see if it has code blocks
                if md:
@@ -287,15 +292,32 @@ class CodeExtractionService:

                # If not a text file or no code blocks found, try HTML extraction first
                if len(code_blocks) == 0 and html_content and not is_text_file:
                    safe_logfire_info(
                        f"Trying HTML extraction first | url={source_url} | html_length={len(html_content)}"
                    )
                    html_code_blocks = await self._extract_html_code_blocks(html_content)
                    if html_code_blocks:
                        code_blocks = html_code_blocks
                    # PERFORMANCE: Check if we've already spent too much time on this document
                    elapsed_time = time.time() - doc_start_time
                    if elapsed_time > MAX_EXTRACTION_TIME_PER_DOC:
                        safe_logfire_info(
                            f"Found {len(code_blocks)} code blocks from HTML | url={source_url}"
                            f"⏱️ Skipping HTML extraction for {source_url} - already spent {elapsed_time:.1f}s"
                        )
                    else:
                        safe_logfire_info(
                            f"Trying HTML extraction first | url={source_url} | html_length={len(html_content)}"
                        )
                        # Create a timeout for HTML extraction
                        remaining_time = MAX_EXTRACTION_TIME_PER_DOC - elapsed_time
                        try:
                            html_code_blocks = await asyncio.wait_for(
                                self._extract_html_code_blocks(html_content, source_url),
                                timeout=remaining_time
                            )
                            if html_code_blocks:
                                code_blocks = html_code_blocks
                                safe_logfire_info(
                                    f"Found {len(code_blocks)} code blocks from HTML | url={source_url}"
                                )
                        except asyncio.TimeoutError:
                            safe_logfire_info(
                                f"⏱️ HTML extraction timed out after {remaining_time:.1f}s for {source_url}"
                            )

                # If still no code blocks, try markdown extraction as fallback
                if len(code_blocks) == 0 and md and "```" in md:
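The block above wraps HTML extraction in a per-document time budget. A standalone sketch of the same pattern, assuming an extraction coroutine passed in by the caller; the function and parameter names here are illustrative:

import asyncio
import time

MAX_EXTRACTION_TIME_PER_DOC = 5.0  # same per-document budget used above

async def extract_with_budget(extract_coro, html: str, url: str, started_at: float) -> list:
    """Run an HTML-extraction coroutine only while the document's time budget lasts."""
    elapsed = time.time() - started_at
    if elapsed > MAX_EXTRACTION_TIME_PER_DOC:
        return []  # budget already spent on earlier steps; skip HTML extraction
    try:
        # Bound the remaining work so one slow document cannot stall the whole crawl.
        return await asyncio.wait_for(extract_coro(html, url), timeout=MAX_EXTRACTION_TIME_PER_DOC - elapsed)
    except asyncio.TimeoutError:
        return []  # treat a timeout as "no code blocks found" and move on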
@@ -322,6 +344,14 @@ class CodeExtractionService:

                # Update progress only after completing document extraction
                completed_docs += 1
                extraction_time = time.time() - doc_start_time
                if extraction_time > 2.0:  # Log slow extractions
                    safe_logfire_info(
                        f"⏱️ Document extraction took {extraction_time:.1f}s | url={source_url} | "
                        f"html_size={len(html_content) if html_content else 0} | "
                        f"blocks_found={len([b for b in all_code_blocks if b['source_url'] == source_url])}"
                    )

                if progress_callback and total_docs > 0:
                    # Calculate progress within the specified range
                    raw_progress = completed_docs / total_docs
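The raw_progress calculation above feeds a stage-local fraction into an overall progress window. A small self-contained illustration of that mapping; map_progress is an illustrative name, not a function from the repository:

def map_progress(completed: int, total: int, start: int, end: int) -> int:
    """Map completed/total into the [start, end] slice of the overall progress bar."""
    if total <= 0:
        return start
    raw = completed / total  # 0.0 .. 1.0 within this stage
    return start + int(raw * (end - start))

# A stage that owns the 20-60 window reports 40 when it is half done.
assert map_progress(5, 10, 20, 60) == 40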
@@ -343,13 +373,14 @@ class CodeExtractionService:

        return all_code_blocks

    async def _extract_html_code_blocks(self, content: str) -> list[dict[str, Any]]:
    async def _extract_html_code_blocks(self, content: str, source_url: str = "") -> list[dict[str, Any]]:
        """
        Extract code blocks from HTML patterns in content.
        This is a fallback when markdown conversion didn't preserve code blocks.

        Args:
            content: The content to search for HTML code patterns
            source_url: The URL of the document being processed
            min_length: Minimum length for code blocks

        Returns:

@@ -359,6 +390,20 @@ class CodeExtractionService:

        # Add detailed logging
        safe_logfire_info(f"Processing HTML of length {len(content)} for code extraction")

        # PERFORMANCE OPTIMIZATION: Skip extremely large HTML files or chunk them
        MAX_HTML_SIZE = 1_000_000  # 1MB limit for single-pass processing (increased from 500KB)
        if len(content) > MAX_HTML_SIZE:
            safe_logfire_info(
                f"⚠️ HTML content is very large ({len(content)} bytes). "
                f"Limiting to first {MAX_HTML_SIZE} bytes to prevent timeout."
            )
            # For very large files, focus on the first portion where code examples are likely to be
            content = content[:MAX_HTML_SIZE]
            # Try to find a good cutoff point (end of a tag)
            last_tag_end = content.rfind('>')
            if last_tag_end > MAX_HTML_SIZE - 1000:
                content = content[:last_tag_end + 1]

        # Check if we have actual content
        if len(content) < 1000:
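The truncation step above caps very large HTML and then backs up to the last complete tag so the regex patterns never see a half-open tag. A compact standalone sketch of that step; cap_html is an illustrative name:

MAX_HTML_SIZE = 1_000_000  # 1 MB cap, matching the change above

def cap_html(content: str, max_size: int = MAX_HTML_SIZE) -> str:
    """Truncate oversized HTML and trim back to the last complete tag."""
    if len(content) <= max_size:
        return content
    truncated = content[:max_size]
    last_tag_end = truncated.rfind('>')
    # Only back up if a tag boundary exists near the cut; otherwise keep the hard cut.
    if last_tag_end > max_size - 1000:
        truncated = truncated[:last_tag_end + 1]
    return truncated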
@@ -510,9 +555,71 @@ class CodeExtractionService:

            ),
        ]

        for pattern_tuple in patterns:
        # PERFORMANCE: Early exit checks to avoid unnecessary regex processing
        # Check more content (20KB instead of 5KB) and add URL-based exceptions
        check_size = min(20000, len(content))  # Check first 20KB or entire content if smaller
        has_code_indicators = any(indicator in content[:check_size] for indicator in
            ['<pre', '<code', 'language-', 'hljs', 'prism', 'shiki', 'highlight'])

        # Never skip certain documentation sites that we know have code
        is_known_code_site = any(domain in source_url.lower() for domain in
            ['milkdown', 'github.com', 'gitlab', 'docs.', 'dev.', 'api.'])

        if not has_code_indicators and not is_known_code_site:
            safe_logfire_info(f"No code indicators found in first {check_size} chars and not a known code site, skipping HTML extraction | url={source_url}")
            return []

        if is_known_code_site and not has_code_indicators:
            safe_logfire_info(f"Known code site but no indicators in first {check_size} chars, continuing anyway | url={source_url}")

        # PERFORMANCE: Limit number of patterns to check based on detected libraries
        patterns_to_check = []
        content_lower = content[:10000].lower()  # Check first 10KB for library detection

        # Selectively add patterns based on what's detected
        if 'milkdown' in content_lower:
            patterns_to_check.extend([p for p in patterns if 'milkdown' in p[1]])
        if 'monaco' in content_lower:
            patterns_to_check.extend([p for p in patterns if 'monaco' in p[1]])
        if 'codemirror' in content_lower or 'cm-' in content_lower:
            patterns_to_check.extend([p for p in patterns if 'codemirror' in p[1]])
        if 'prism' in content_lower:
            patterns_to_check.extend([p for p in patterns if 'prism' in p[1]])
        if 'hljs' in content_lower or 'highlight' in content_lower:
            patterns_to_check.extend([p for p in patterns if 'hljs' in p[1] or 'highlight' in p[1]])
        if 'shiki' in content_lower or 'astro' in content_lower:
            patterns_to_check.extend([p for p in patterns if 'shiki' in p[1] or 'astro' in p[1]])

        # Always include standard patterns as fallback (get ALL standard/generic patterns, not just last 5)
        standard_patterns = [p for p in patterns if any(tag in p[1] for tag in ['standard', 'generic', 'prism', 'hljs'])]
        patterns_to_check.extend(standard_patterns)

        # Remove duplicates while preserving order
        seen = set()
        unique_patterns = []
        for p in patterns_to_check:
            if p[1] not in seen:
                unique_patterns.append(p)
                seen.add(p[1])
        patterns_to_check = unique_patterns

        # If we have very few patterns and it's a known code site, add more generic patterns
        if len(patterns_to_check) < 5 and is_known_code_site:
            safe_logfire_info(f"Known code site with few patterns ({len(patterns_to_check)}), adding more generic patterns")
            patterns_to_check = patterns  # Use all patterns for known code sites

        safe_logfire_info(f"Checking {len(patterns_to_check)} relevant patterns out of {len(patterns)} total")

        for pattern_tuple in patterns_to_check:
            pattern_str, source_type = pattern_tuple
            matches = list(re.finditer(pattern_str, content, re.DOTALL | re.IGNORECASE))

            # PERFORMANCE: Use re.finditer with smaller chunks for very long content
            # Only use DOTALL for patterns that really need it (multi-line blocks)
            flags = re.IGNORECASE
            if 'monaco' in source_type or 'codemirror' in source_type:
                flags |= re.DOTALL  # These need DOTALL for multi-line matching

            matches = list(re.finditer(pattern_str, content, flags))

            # Log pattern matches for Milkdown patterns and CodeMirror
            if matches and (
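A condensed sketch of the library-aware prefilter introduced above. The (regex, source_type) pairs, the marker table, and select_patterns itself are illustrative; the real pattern strings live in the service and are not reproduced here:

def select_patterns(content: str, patterns: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Keep only the (regex, source_type) pairs whose library markers appear in the page head."""
    head = content[:10000].lower()
    markers = {
        "milkdown": ("milkdown",),
        "monaco": ("monaco",),
        "codemirror": ("codemirror", "cm-"),
        "prism": ("prism",),
        "hljs": ("hljs", "highlight"),
        "shiki": ("shiki", "astro"),
    }
    selected = [
        p for p in patterns
        if any(tag in p[1] and any(m in head for m in ms) for tag, ms in markers.items())
    ]
    # Always keep generic fallbacks, then de-duplicate by source_type while preserving order.
    selected += [p for p in patterns if "standard" in p[1] or "generic" in p[1]]
    seen: set[str] = set()
    return [p for p in selected if not (p[1] in seen or seen.add(p[1]))]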
@@ -433,6 +433,9 @@ class CrawlingService:

            )

            # Complete - send both the progress update and completion event
            # CRITICAL: This is the ONLY place that should send status="completed"!
            # All crawl strategies (batch, recursive, etc.) should use "finished" or other words.
            # The frontend disconnects when it sees status="completed", so this must be the final step.
            await update_mapped_progress(
                "completed",
                100,
@@ -73,7 +73,8 @@ class BatchCrawlStrategy:

        except Exception as e:
            # For non-critical errors (e.g., network issues), use defaults but log prominently
            logger.error(
                f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True
                f"Failed to load crawl settings from database: {e}, using defaults",
                exc_info=True
            )
            batch_size = 50
        if max_concurrent is None:
@@ -98,101 +99,93 @@ class BatchCrawlStrategy:

                wait_for_images=False,  # Skip images for faster crawling
                scan_full_page=True,  # Trigger lazy loading
                exclude_all_images=False,
                remove_overlay_elements=True,
                process_iframes=True,
            )
        else:
            # Configuration for regular batch crawling
            # Regular sites use standard configuration
            crawl_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                stream=True,  # Enable streaming
                stream=True,
                markdown_generator=self.markdown_generator,
                wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "45000")),
                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")),
                delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "0.5")),
                scan_full_page=True,
                wait_for_images=False,
                scan_full_page=False,  # Don't scan full page for non-doc sites
                exclude_all_images=False,
            )

        # Transform URLs if needed
        processed_urls = [transform_url_func(url) for url in urls]

        # Create memory adaptive dispatcher
        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=memory_threshold,
            max_sessions=max_concurrent,
            memory_threshold_mb=memory_threshold,
            check_interval=check_interval,
            max_session_permit=max_concurrent,
        )

        async def report_progress(percentage: int, message: str, **kwargs):
            """Helper to report progress if callback is available"""
        # Crawl URLs in batches using arun_many
        results = []
        total_urls = len(processed_urls)

        for batch_start in range(0, total_urls, batch_size):
            batch_end = min(batch_start + batch_size, total_urls)
            batch = processed_urls[batch_start:batch_end]

            # Calculate progress for this batch
            if progress_callback:
                step_info = {"currentStep": message, "stepMessage": message, **kwargs}
                await progress_callback("crawling", percentage, message, step_info=step_info)

        total_urls = len(urls)
        await report_progress(start_progress, f"Starting to crawl {total_urls} URLs...")

        # Use configured batch size
        successful_results = []
        processed = 0

        # Transform all URLs at the beginning
        url_mapping = {}  # Map transformed URLs back to original
        transformed_urls = []
        for url in urls:
            transformed = transform_url_func(url)
            transformed_urls.append(transformed)
            url_mapping[transformed] = url

        for i in range(0, total_urls, batch_size):
            batch_urls = transformed_urls[i : i + batch_size]
            batch_start = i
            batch_end = min(i + batch_size, total_urls)

            # Report batch start with smooth progress
            progress_percentage = start_progress + int(
                (i / total_urls) * (end_progress - start_progress)
            )
            await report_progress(
                progress_percentage,
                f"Processing batch {batch_start + 1}-{batch_end} of {total_urls} URLs...",
            )

            # Crawl this batch using arun_many with streaming
            logger.info(
                f"Starting parallel crawl of batch {batch_start + 1}-{batch_end} ({len(batch_urls)} URLs)"
            )
            batch_results = await self.crawler.arun_many(
                urls=batch_urls, config=crawl_config, dispatcher=dispatcher
            )

            # Handle streaming results
            async for result in batch_results:
                processed += 1
                if result.success and result.markdown:
                    # Map back to original URL
                    original_url = url_mapping.get(result.url, result.url)
                    successful_results.append({
                        "url": original_url,
                        "markdown": result.markdown,
                        "html": result.html,  # Use raw HTML
                    })
                else:
                    logger.warning(
                        f"Failed to crawl {result.url}: {getattr(result, 'error_message', 'Unknown error')}"
                    )

                # Report individual URL progress with smooth increments
                progress_percentage = start_progress + int(
                    (processed / total_urls) * (end_progress - start_progress)
                batch_progress = start_progress + ((batch_start / total_urls) * (end_progress - start_progress))
                await progress_callback(
                    "batch_crawling",
                    int(batch_progress),
                    f"Crawling batch {batch_start // batch_size + 1} ({batch_start + 1}-{batch_end}/{total_urls} URLs)"
                )
                # Report more frequently for smoother progress
                if (
                    processed % 5 == 0 or processed == total_urls
                ):  # Report every 5 URLs or at the end
                    await report_progress(
                        progress_percentage,
                        f"Crawled {processed}/{total_urls} pages ({len(successful_results)} successful)",
                    )

        await report_progress(
            end_progress,
            f"Batch crawling completed: {len(successful_results)}/{total_urls} pages successful",
        )
        return successful_results
            # Run batch crawl
            try:
                batch_results = await self.crawler.arun_many(
                    batch,
                    config=crawl_config,
                    dispatcher=dispatcher
                )

                # Process results
                for result in batch_results:
                    if result.success:
                        results.append({
                            "url": result.url,
                            "markdown": result.markdown_v2.raw_markdown if result.markdown_v2 else "",
                            "success": True,
                            "metadata": result.extracted_content if hasattr(result, 'extracted_content') else {}
                        })
                    else:
                        logger.warning(f"Failed to crawl {result.url}: {result.error_message}")
                        results.append({
                            "url": result.url,
                            "markdown": "",
                            "success": False,
                            "error": result.error_message
                        })

            except Exception as e:
                logger.error(f"Batch crawl error: {e}", exc_info=True)
                # Add failed results for this batch
                for url in batch:
                    results.append({
                        "url": url,
                        "markdown": "",
                        "success": False,
                        "error": str(e)
                    })

            # Update progress after batch completion
            # IMPORTANT: Use "finished" not "completed" - only the final orchestrator should send "completed"
            if progress_callback:
                batch_progress = start_progress + ((batch_end / total_urls) * (end_progress - start_progress))
                await progress_callback(
                    "batch_crawling",
                    int(batch_progress),
                    f"Finished batch {batch_start // batch_size + 1}"
                )

        return results
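The rewritten batch path transforms URLs before crawling and uses a dict to map each result back to its original URL. A minimal illustration of that bookkeeping; crawl_plan and the https transform are made up for the example:

def crawl_plan(urls: list[str], transform_url) -> tuple[list[str], dict[str, str]]:
    """Build the transformed URL list plus a reverse map so results keep their original URLs."""
    url_mapping: dict[str, str] = {}
    transformed: list[str] = []
    for url in urls:
        t = transform_url(url)
        transformed.append(t)
        url_mapping[t] = url
    return transformed, url_mapping

transformed, mapping = crawl_plan(
    ["http://docs.example.com/a", "http://docs.example.com/b"],
    lambda u: u.replace("http://", "https://", 1),
)
assert mapping["https://docs.example.com/a"] == "http://docs.example.com/a"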
@@ -60,7 +60,7 @@ class RecursiveCrawlStrategy:

        if not self.crawler:
            logger.error("No crawler instance available for recursive crawling")
            if progress_callback:
                await progress_callback("error", 0, "Crawler not available")
                await progress_callback("error", 0, "Crawler not available", step_info={"currentStep": "error", "stepMessage": "Crawler not available"})
            return []

        # Load settings from database - fail fast on configuration errors

@@ -78,7 +78,8 @@ class RecursiveCrawlStrategy:

        except Exception as e:
            # For non-critical errors (e.g., network issues), use defaults but log prominently
            logger.error(
                f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True
                f"Failed to load crawl settings from database: {e}, using defaults",
                exc_info=True
            )
            batch_size = 50
        if max_concurrent is None:

@@ -126,11 +127,19 @@ class RecursiveCrawlStrategy:

        )

        async def report_progress(percentage: int, message: str, **kwargs):
            """Helper to report progress if callback is available"""
            """Helper to report progress if callback is available

            IMPORTANT: Never use "complete" or "completed" in messages here!
            This is just an intermediate step in the overall crawl process.
            Only the final orchestrator should send "completed" status.
            """
            if progress_callback:
                # Add step information for multi-progress tracking
                step_info = {"currentStep": message, "stepMessage": message, **kwargs}
                await progress_callback("crawling", percentage, message, **step_info)
                step_info = {
                    "currentStep": message,
                    "stepMessage": message
                }
                await progress_callback("crawling", percentage, message, step_info=step_info, **kwargs)

        visited = set()

@@ -169,14 +178,6 @@ class RecursiveCrawlStrategy:

            batch_urls = urls_to_crawl[batch_idx : batch_idx + batch_size]
            batch_end_idx = min(batch_idx + batch_size, len(urls_to_crawl))

            # Transform URLs and create mapping for this batch
            url_mapping = {}
            transformed_batch_urls = []
            for url in batch_urls:
                transformed = transform_url_func(url)
                transformed_batch_urls.append(transformed)
                url_mapping[transformed] = url

            # Calculate progress for this batch within the depth
            batch_progress = depth_start + int(
                (batch_idx / len(urls_to_crawl)) * (depth_end - depth_start)

@@ -191,14 +192,20 @@ class RecursiveCrawlStrategy:

            # Use arun_many for native parallel crawling with streaming
            logger.info(f"Starting parallel crawl of {len(batch_urls)} URLs with arun_many")
            batch_results = await self.crawler.arun_many(
                urls=transformed_batch_urls, config=run_config, dispatcher=dispatcher
                urls=batch_urls,
                config=run_config,
                dispatcher=dispatcher
            )

            # Handle streaming results from arun_many
            i = 0
            async for result in batch_results:
                # Map back to original URL using the mapping dict
                original_url = url_mapping.get(result.url, result.url)
                # Map back to original URL if transformed
                original_url = result.url
                for orig_url in batch_urls:
                    if transform_url_func(orig_url) == result.url:
                        original_url = orig_url
                        break

                norm_url = normalize_url(original_url)
                visited.add(norm_url)

@@ -213,14 +220,14 @@ class RecursiveCrawlStrategy:

                    depth_successful += 1

                    # Find internal links for next depth
                    links = getattr(result, "links", {}) or {}
                    for link in links.get("internal", []):
                    for link in result.links.get("internal", []):
                        next_url = normalize_url(link["href"])
                        # Skip binary files and already visited URLs
                        is_binary = self.url_handler.is_binary_file(next_url)
                        if next_url not in visited and not is_binary:
                        if next_url not in visited and not self.url_handler.is_binary_file(
                            next_url
                        ):
                            next_level_urls.add(next_url)
                        elif is_binary:
                        elif self.url_handler.is_binary_file(next_url):
                            logger.debug(f"Skipping binary file from crawl queue: {next_url}")
                else:
                    logger.warning(

@@ -243,14 +250,16 @@ class RecursiveCrawlStrategy:

            current_urls = next_level_urls

            # Report completion of this depth
            # Report completion of this depth - IMPORTANT: Use "finished" not "completed"!
            await report_progress(
                depth_end,
                f"Depth {depth + 1} completed: {depth_successful} pages crawled, {len(next_level_urls)} URLs found for next depth",
                f"Depth {depth + 1} finished: {depth_successful} pages crawled, {len(next_level_urls)} URLs found for next depth",
            )

        # IMPORTANT: Use "finished" not "complete" - only the final orchestrator should say "completed"
        await report_progress(
            end_progress,
            f"Recursive crawling completed: {len(results_all)} total pages crawled across {max_depth} depth levels",
            f"Recursive crawl finished: {len(results_all)} pages successfully crawled",
        )
        return results_all

        return results_all
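Several hunks above enforce one contract: crawl strategies report "finished" while only the orchestrator ever sends status="completed", because the frontend disconnects as soon as it sees "completed". A minimal sketch of a guard built on that assumption; safe_progress and RESERVED_FINAL_STATUS are illustrative names, not part of this codebase:

RESERVED_FINAL_STATUS = "completed"  # only the final orchestrator may emit this

async def safe_progress(progress_callback, status: str, percentage: int, message: str) -> None:
    """Forward strategy progress without ever claiming the orchestrator-only final status."""
    if progress_callback is None:
        return
    if status == RESERVED_FINAL_STATUS:
        status = "finished"  # keep the frontend's socket open until the orchestrator really finishes
    step_info = {"currentStep": message, "stepMessage": message}
    await progress_callback(status, percentage, message, step_info=step_info)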
@@ -1,472 +1,473 @@
|
||||
"""
|
||||
Knowledge Item Service
|
||||
|
||||
Handles all knowledge item CRUD operations and data transformations.
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from ...config.logfire_config import safe_logfire_error, safe_logfire_info
|
||||
|
||||
|
||||
class KnowledgeItemService:
|
||||
"""
|
||||
Service for managing knowledge items including listing, filtering, updating, and deletion.
|
||||
"""
|
||||
|
||||
def __init__(self, supabase_client):
|
||||
"""
|
||||
Initialize the knowledge item service.
|
||||
|
||||
Args:
|
||||
supabase_client: The Supabase client for database operations
|
||||
"""
|
||||
self.supabase = supabase_client
|
||||
|
||||
async def list_items(
|
||||
self,
|
||||
page: int = 1,
|
||||
per_page: int = 20,
|
||||
knowledge_type: str | None = None,
|
||||
search: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
List knowledge items with pagination and filtering.
|
||||
|
||||
Args:
|
||||
page: Page number (1-based)
|
||||
per_page: Items per page
|
||||
knowledge_type: Filter by knowledge type
|
||||
search: Search term for filtering
|
||||
|
||||
Returns:
|
||||
Dict containing items, pagination info, and total count
|
||||
"""
|
||||
try:
|
||||
# Build the query with filters at database level for better performance
|
||||
query = self.supabase.from_("archon_sources").select("*")
|
||||
|
||||
# Apply knowledge type filter at database level if provided
|
||||
if knowledge_type:
|
||||
query = query.eq("metadata->>knowledge_type", knowledge_type)
|
||||
|
||||
# Apply search filter at database level if provided
|
||||
if search:
|
||||
search_pattern = f"%{search}%"
|
||||
query = query.or_(
|
||||
f"title.ilike.{search_pattern},summary.ilike.{search_pattern},source_id.ilike.{search_pattern}"
|
||||
)
|
||||
|
||||
# Get total count before pagination
|
||||
# Clone the query for counting
|
||||
count_query = self.supabase.from_("archon_sources").select(
|
||||
"*", count="exact", head=True
|
||||
)
|
||||
|
||||
# Apply same filters to count query
|
||||
if knowledge_type:
|
||||
count_query = count_query.eq("metadata->>knowledge_type", knowledge_type)
|
||||
|
||||
if search:
|
||||
search_pattern = f"%{search}%"
|
||||
count_query = count_query.or_(
|
||||
f"title.ilike.{search_pattern},summary.ilike.{search_pattern},source_id.ilike.{search_pattern}"
|
||||
)
|
||||
|
||||
count_result = count_query.execute()
|
||||
total = count_result.count if hasattr(count_result, "count") else 0
|
||||
|
||||
# Apply pagination at database level
|
||||
start_idx = (page - 1) * per_page
|
||||
query = query.range(start_idx, start_idx + per_page - 1)
|
||||
|
||||
# Execute query
|
||||
result = query.execute()
|
||||
sources = result.data if result.data else []
|
||||
|
||||
# Get source IDs for batch queries
|
||||
source_ids = [source["source_id"] for source in sources]
|
||||
|
||||
# Debug log source IDs
|
||||
safe_logfire_info(f"Source IDs for batch query: {source_ids}")
|
||||
|
||||
# Batch fetch related data to avoid N+1 queries
|
||||
first_urls = {}
|
||||
code_example_counts = {}
|
||||
chunk_counts = {}
|
||||
|
||||
if source_ids:
|
||||
# Batch fetch first URLs
|
||||
urls_result = (
|
||||
self.supabase.from_("archon_crawled_pages")
|
||||
.select("source_id, url")
|
||||
.in_("source_id", source_ids)
|
||||
.execute()
|
||||
)
|
||||
|
||||
# Group URLs by source_id (take first one for each)
|
||||
for item in urls_result.data or []:
|
||||
if item["source_id"] not in first_urls:
|
||||
first_urls[item["source_id"]] = item["url"]
|
||||
|
||||
# Get code example counts per source - NO CONTENT, just counts!
|
||||
# Fetch counts individually for each source
|
||||
for source_id in source_ids:
|
||||
count_result = (
|
||||
self.supabase.from_("archon_code_examples")
|
||||
.select("id", count="exact", head=True)
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
code_example_counts[source_id] = (
|
||||
count_result.count if hasattr(count_result, "count") else 0
|
||||
)
|
||||
|
||||
# Ensure all sources have a count (default to 0)
|
||||
for source_id in source_ids:
|
||||
if source_id not in code_example_counts:
|
||||
code_example_counts[source_id] = 0
|
||||
chunk_counts[source_id] = 0 # Default to 0 to avoid timeout
|
||||
|
||||
safe_logfire_info(f"Code example counts: {code_example_counts}")
|
||||
|
||||
# Transform sources to items with batched data
|
||||
items = []
|
||||
for source in sources:
|
||||
source_id = source["source_id"]
|
||||
source_metadata = source.get("metadata", {})
|
||||
|
||||
# Use batched data instead of individual queries
|
||||
first_page_url = first_urls.get(source_id, f"source://{source_id}")
|
||||
code_examples_count = code_example_counts.get(source_id, 0)
|
||||
chunks_count = chunk_counts.get(source_id, 0)
|
||||
|
||||
# Determine source type
|
||||
source_type = self._determine_source_type(source_metadata, first_page_url)
|
||||
|
||||
item = {
|
||||
"id": source_id,
|
||||
"title": source.get("title", source.get("summary", "Untitled")),
|
||||
"url": first_page_url,
|
||||
"source_id": source_id,
|
||||
"code_examples": [{"count": code_examples_count}]
|
||||
if code_examples_count > 0
|
||||
else [], # Minimal array just for count display
|
||||
"metadata": {
|
||||
"knowledge_type": source_metadata.get("knowledge_type", "technical"),
|
||||
"tags": source_metadata.get("tags", []),
|
||||
"source_type": source_type,
|
||||
"status": "active",
|
||||
"description": source_metadata.get(
|
||||
"description", source.get("summary", "")
|
||||
),
|
||||
"chunks_count": chunks_count,
|
||||
"word_count": source.get("total_word_count", 0),
|
||||
"estimated_pages": round(source.get("total_word_count", 0) / 250, 1),
|
||||
"pages_tooltip": f"{round(source.get('total_word_count', 0) / 250, 1)} pages (≈ {source.get('total_word_count', 0):,} words)",
|
||||
"last_scraped": source.get("updated_at"),
|
||||
"file_name": source_metadata.get("file_name"),
|
||||
"file_type": source_metadata.get("file_type"),
|
||||
"update_frequency": source_metadata.get("update_frequency", 7),
|
||||
"code_examples_count": code_examples_count,
|
||||
**source_metadata,
|
||||
},
|
||||
"created_at": source.get("created_at"),
|
||||
"updated_at": source.get("updated_at"),
|
||||
}
|
||||
items.append(item)
|
||||
|
||||
safe_logfire_info(
|
||||
f"Knowledge items retrieved | total={total} | page={page} | filtered_count={len(items)}"
|
||||
)
|
||||
|
||||
return {
|
||||
"items": items,
|
||||
"total": total,
|
||||
"page": page,
|
||||
"per_page": per_page,
|
||||
"pages": (total + per_page - 1) // per_page,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Failed to list knowledge items | error={str(e)}")
|
||||
raise
|
||||
|
||||
async def get_item(self, source_id: str) -> dict[str, Any] | None:
|
||||
"""
|
||||
Get a single knowledge item by source ID.
|
||||
|
||||
Args:
|
||||
source_id: The source ID to retrieve
|
||||
|
||||
Returns:
|
||||
Knowledge item dict or None if not found
|
||||
"""
|
||||
try:
|
||||
safe_logfire_info(f"Getting knowledge item | source_id={source_id}")
|
||||
|
||||
# Get the source record
|
||||
result = (
|
||||
self.supabase.from_("archon_sources")
|
||||
.select("*")
|
||||
.eq("source_id", source_id)
|
||||
.single()
|
||||
.execute()
|
||||
)
|
||||
|
||||
if not result.data:
|
||||
return None
|
||||
|
||||
# Transform the source to item format
|
||||
item = await self._transform_source_to_item(result.data)
|
||||
return item
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(
|
||||
f"Failed to get knowledge item | error={str(e)} | source_id={source_id}"
|
||||
)
|
||||
return None
|
||||
|
||||
async def update_item(
|
||||
self, source_id: str, updates: dict[str, Any]
|
||||
) -> tuple[bool, dict[str, Any]]:
|
||||
"""
|
||||
Update a knowledge item's metadata.
|
||||
|
||||
Args:
|
||||
source_id: The source ID to update
|
||||
updates: Dictionary of fields to update
|
||||
|
||||
Returns:
|
||||
Tuple of (success, result)
|
||||
"""
|
||||
try:
|
||||
safe_logfire_info(
|
||||
f"Updating knowledge item | source_id={source_id} | updates={updates}"
|
||||
)
|
||||
|
||||
# Prepare update data
|
||||
update_data = {}
|
||||
|
||||
# Handle title updates
|
||||
if "title" in updates:
|
||||
update_data["title"] = updates["title"]
|
||||
|
||||
# Handle metadata updates
|
||||
metadata_fields = [
|
||||
"description",
|
||||
"knowledge_type",
|
||||
"tags",
|
||||
"status",
|
||||
"update_frequency",
|
||||
"group_name",
|
||||
]
|
||||
metadata_updates = {k: v for k, v in updates.items() if k in metadata_fields}
|
||||
|
||||
if metadata_updates:
|
||||
# Get current metadata
|
||||
current_response = (
|
||||
self.supabase.table("archon_sources")
|
||||
.select("metadata")
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
if current_response.data:
|
||||
current_metadata = current_response.data[0].get("metadata", {})
|
||||
current_metadata.update(metadata_updates)
|
||||
update_data["metadata"] = current_metadata
|
||||
else:
|
||||
update_data["metadata"] = metadata_updates
|
||||
|
||||
# Perform the update
|
||||
result = (
|
||||
self.supabase.table("archon_sources")
|
||||
.update(update_data)
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
|
||||
if result.data:
|
||||
safe_logfire_info(f"Knowledge item updated successfully | source_id={source_id}")
|
||||
return True, {
|
||||
"success": True,
|
||||
"message": f"Successfully updated knowledge item {source_id}",
|
||||
"source_id": source_id,
|
||||
}
|
||||
else:
|
||||
safe_logfire_error(f"Knowledge item not found | source_id={source_id}")
|
||||
return False, {"error": f"Knowledge item {source_id} not found"}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(
|
||||
f"Failed to update knowledge item | error={str(e)} | source_id={source_id}"
|
||||
)
|
||||
return False, {"error": str(e)}
|
||||
|
||||
async def get_available_sources(self) -> dict[str, Any]:
|
||||
"""
|
||||
Get all available sources with their details.
|
||||
|
||||
Returns:
|
||||
Dict containing sources list and count
|
||||
"""
|
||||
try:
|
||||
# Query the sources table
|
||||
result = self.supabase.from_("archon_sources").select("*").order("source_id").execute()
|
||||
|
||||
# Format the sources
|
||||
sources = []
|
||||
if result.data:
|
||||
for source in result.data:
|
||||
sources.append({
|
||||
"source_id": source.get("source_id"),
|
||||
"title": source.get("title", source.get("summary", "Untitled")),
|
||||
"summary": source.get("summary"),
|
||||
"metadata": source.get("metadata", {}),
|
||||
"total_words": source.get("total_words", source.get("total_word_count", 0)),
|
||||
"update_frequency": source.get("update_frequency", 7),
|
||||
"created_at": source.get("created_at"),
|
||||
"updated_at": source.get("updated_at", source.get("created_at")),
|
||||
})
|
||||
|
||||
return {"success": True, "sources": sources, "count": len(sources)}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Failed to get available sources | error={str(e)}")
|
||||
return {"success": False, "error": str(e), "sources": [], "count": 0}
|
||||
|
||||
async def _get_all_sources(self) -> list[dict[str, Any]]:
|
||||
"""Get all sources from the database."""
|
||||
result = await self.get_available_sources()
|
||||
return result.get("sources", [])
|
||||
|
||||
async def _transform_source_to_item(self, source: dict[str, Any]) -> dict[str, Any]:
|
||||
"""
|
||||
Transform a source record into a knowledge item with enriched data.
|
||||
|
||||
Args:
|
||||
source: The source record from database
|
||||
|
||||
Returns:
|
||||
Transformed knowledge item
|
||||
"""
|
||||
source_metadata = source.get("metadata", {})
|
||||
source_id = source["source_id"]
|
||||
|
||||
# Get first page URL
|
||||
first_page_url = await self._get_first_page_url(source_id)
|
||||
|
||||
# Determine source type
|
||||
source_type = self._determine_source_type(source_metadata, first_page_url)
|
||||
|
||||
# Get code examples
|
||||
code_examples = await self._get_code_examples(source_id)
|
||||
|
||||
return {
|
||||
"id": source_id,
|
||||
"title": source.get("title", source.get("summary", "Untitled")),
|
||||
"url": first_page_url,
|
||||
"source_id": source_id,
|
||||
"code_examples": code_examples,
|
||||
"metadata": {
|
||||
# Spread source_metadata first, then override with computed values
|
||||
**source_metadata,
|
||||
"knowledge_type": source_metadata.get("knowledge_type", "technical"),
|
||||
"tags": source_metadata.get("tags", []),
|
||||
"source_type": source_type, # This should be the correctly determined source_type
|
||||
"status": "active",
|
||||
"description": source_metadata.get("description", source.get("summary", "")),
|
||||
"chunks_count": await self._get_chunks_count(source_id), # Get actual chunk count
|
||||
"word_count": source.get("total_words", 0),
|
||||
"estimated_pages": round(
|
||||
source.get("total_words", 0) / 250, 1
|
||||
), # Average book page = 250 words
|
||||
"pages_tooltip": f"{round(source.get('total_words', 0) / 250, 1)} pages (≈ {source.get('total_words', 0):,} words)",
|
||||
"last_scraped": source.get("updated_at"),
|
||||
"file_name": source_metadata.get("file_name"),
|
||||
"file_type": source_metadata.get("file_type"),
|
||||
"update_frequency": source.get("update_frequency", 7),
|
||||
"code_examples_count": len(code_examples),
|
||||
},
|
||||
"created_at": source.get("created_at"),
|
||||
"updated_at": source.get("updated_at"),
|
||||
}
|
||||
|
||||
async def _get_first_page_url(self, source_id: str) -> str:
|
||||
"""Get the first page URL for a source."""
|
||||
try:
|
||||
pages_response = (
|
||||
self.supabase.from_("archon_crawled_pages")
|
||||
.select("url")
|
||||
.eq("source_id", source_id)
|
||||
.limit(1)
|
||||
.execute()
|
||||
)
|
||||
|
||||
if pages_response.data:
|
||||
return pages_response.data[0].get("url", f"source://{source_id}")
|
||||
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return f"source://{source_id}"
|
||||
|
||||
async def _get_code_examples(self, source_id: str) -> list[dict[str, Any]]:
|
||||
"""Get code examples for a source."""
|
||||
try:
|
||||
code_examples_response = (
|
||||
self.supabase.from_("archon_code_examples")
|
||||
.select("id, content, summary, metadata")
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
|
||||
return code_examples_response.data if code_examples_response.data else []
|
||||
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
def _determine_source_type(self, metadata: dict[str, Any], url: str) -> str:
|
||||
"""Determine the source type from metadata or URL pattern."""
|
||||
stored_source_type = metadata.get("source_type")
|
||||
if stored_source_type:
|
||||
return stored_source_type
|
||||
|
||||
# Legacy fallback - check URL pattern
|
||||
return "file" if url.startswith("file://") else "url"
|
||||
|
||||
def _filter_by_search(self, items: list[dict[str, Any]], search: str) -> list[dict[str, Any]]:
|
||||
"""Filter items by search term."""
|
||||
search_lower = search.lower()
|
||||
return [
|
||||
item
|
||||
for item in items
|
||||
if search_lower in item["title"].lower()
|
||||
or search_lower in item["metadata"].get("description", "").lower()
|
||||
or any(search_lower in tag.lower() for tag in item["metadata"].get("tags", []))
|
||||
]
|
||||
|
||||
def _filter_by_knowledge_type(
|
||||
self, items: list[dict[str, Any]], knowledge_type: str
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Filter items by knowledge type."""
|
||||
return [item for item in items if item["metadata"].get("knowledge_type") == knowledge_type]
|
||||
|
||||
async def _get_chunks_count(self, source_id: str) -> int:
|
||||
"""Get the actual number of chunks for a source."""
|
||||
try:
|
||||
# Count the actual rows in crawled_pages for this source
|
||||
result = (
|
||||
self.supabase.table("archon_crawled_pages")
|
||||
.select("*", count="exact")
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
|
||||
# Return the count of pages (chunks)
|
||||
return result.count if result.count else 0
|
||||
|
||||
except Exception as e:
|
||||
# If we can't get chunk count, return 0
|
||||
safe_logfire_info(f"Failed to get chunk count for {source_id}: {e}")
|
||||
return 0
|
||||
"""
|
||||
Knowledge Item Service
|
||||
|
||||
Handles all knowledge item CRUD operations and data transformations.
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from ...config.logfire_config import safe_logfire_error, safe_logfire_info
|
||||
|
||||
|
||||
class KnowledgeItemService:
|
||||
"""
|
||||
Service for managing knowledge items including listing, filtering, updating, and deletion.
|
||||
"""
|
||||
|
||||
def __init__(self, supabase_client):
|
||||
"""
|
||||
Initialize the knowledge item service.
|
||||
|
||||
Args:
|
||||
supabase_client: The Supabase client for database operations
|
||||
"""
|
||||
self.supabase = supabase_client
|
||||
|
||||
async def list_items(
|
||||
self,
|
||||
page: int = 1,
|
||||
per_page: int = 20,
|
||||
knowledge_type: str | None = None,
|
||||
search: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
List knowledge items with pagination and filtering.
|
||||
|
||||
Args:
|
||||
page: Page number (1-based)
|
||||
per_page: Items per page
|
||||
knowledge_type: Filter by knowledge type
|
||||
search: Search term for filtering
|
||||
|
||||
Returns:
|
||||
Dict containing items, pagination info, and total count
|
||||
"""
|
||||
try:
|
||||
# Build the query with filters at database level for better performance
|
||||
query = self.supabase.from_("archon_sources").select("*")
|
||||
|
||||
# Apply knowledge type filter at database level if provided
|
||||
if knowledge_type:
|
||||
query = query.eq("metadata->>knowledge_type", knowledge_type)
|
||||
|
||||
# Apply search filter at database level if provided
|
||||
if search:
|
||||
search_pattern = f"%{search}%"
|
||||
query = query.or_(
|
||||
f"title.ilike.{search_pattern},summary.ilike.{search_pattern},source_id.ilike.{search_pattern}"
|
||||
)
|
||||
|
||||
# Get total count before pagination
|
||||
# Clone the query for counting
|
||||
count_query = self.supabase.from_("archon_sources").select(
|
||||
"*", count="exact", head=True
|
||||
)
|
||||
|
||||
# Apply same filters to count query
|
||||
if knowledge_type:
|
||||
count_query = count_query.eq("metadata->>knowledge_type", knowledge_type)
|
||||
|
||||
if search:
|
||||
search_pattern = f"%{search}%"
|
||||
count_query = count_query.or_(
|
||||
f"title.ilike.{search_pattern},summary.ilike.{search_pattern},source_id.ilike.{search_pattern}"
|
||||
)
|
||||
|
||||
count_result = count_query.execute()
|
||||
total = count_result.count if hasattr(count_result, "count") else 0
|
||||
|
||||
# Apply pagination at database level
|
||||
start_idx = (page - 1) * per_page
|
||||
query = query.range(start_idx, start_idx + per_page - 1)
|
||||
|
||||
# Execute query
|
||||
result = query.execute()
|
||||
sources = result.data if result.data else []
|
||||
|
||||
# Get source IDs for batch queries
|
||||
source_ids = [source["source_id"] for source in sources]
|
||||
|
||||
# Debug log source IDs
|
||||
safe_logfire_info(f"Source IDs for batch query: {source_ids}")
|
||||
|
||||
# Batch fetch related data to avoid N+1 queries
|
||||
first_urls = {}
|
||||
code_example_counts = {}
|
||||
chunk_counts = {}
|
||||
|
||||
if source_ids:
|
||||
# Batch fetch first URLs
|
||||
urls_result = (
|
||||
self.supabase.from_("archon_crawled_pages")
|
||||
.select("source_id, url")
|
||||
.in_("source_id", source_ids)
|
||||
.execute()
|
||||
)
|
||||
|
||||
# Group URLs by source_id (take first one for each)
|
||||
for item in urls_result.data or []:
|
||||
if item["source_id"] not in first_urls:
|
||||
first_urls[item["source_id"]] = item["url"]
|
||||
|
||||
# Get code example counts per source - NO CONTENT, just counts!
|
||||
# Fetch counts individually for each source
|
||||
for source_id in source_ids:
|
||||
count_result = (
|
||||
self.supabase.from_("archon_code_examples")
|
||||
.select("id", count="exact", head=True)
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
code_example_counts[source_id] = (
|
||||
count_result.count if hasattr(count_result, "count") else 0
|
||||
)
|
||||
|
||||
# Ensure all sources have a count (default to 0)
|
||||
for source_id in source_ids:
|
||||
if source_id not in code_example_counts:
|
||||
code_example_counts[source_id] = 0
|
||||
chunk_counts[source_id] = 0 # Default to 0 to avoid timeout
|
||||
|
||||
safe_logfire_info("Code example counts", code_counts=code_example_counts)
|
||||
|
||||
# Transform sources to items with batched data
|
||||
items = []
|
||||
for source in sources:
|
||||
source_id = source["source_id"]
|
||||
source_metadata = source.get("metadata", {})
|
||||
|
||||
# Use batched data instead of individual queries
|
||||
first_page_url = first_urls.get(source_id, f"source://{source_id}")
|
||||
# Use original crawl URL instead of first page URL
|
||||
original_url = source_metadata.get("original_url") or first_page_url
|
||||
code_examples_count = code_example_counts.get(source_id, 0)
|
||||
chunks_count = chunk_counts.get(source_id, 0)
|
||||
|
||||
# Determine source type
|
||||
source_type = self._determine_source_type(source_metadata, original_url)
|
||||
|
||||
item = {
|
||||
"id": source_id,
|
||||
"title": source.get("title", source.get("summary", "Untitled")),
|
||||
"url": original_url,
|
||||
"source_id": source_id,
|
||||
"code_examples": [{"count": code_examples_count}]
|
||||
if code_examples_count > 0
|
||||
else [], # Minimal array just for count display
|
||||
"metadata": {
|
||||
"knowledge_type": source_metadata.get("knowledge_type", "technical"),
|
||||
"tags": source_metadata.get("tags", []),
|
||||
"source_type": source_type,
|
||||
"status": "active",
|
||||
"description": source_metadata.get(
|
||||
"description", source.get("summary", "")
|
||||
),
|
||||
"chunks_count": chunks_count,
|
||||
"word_count": source.get("total_word_count", 0),
|
||||
"estimated_pages": round(source.get("total_word_count", 0) / 250, 1),
|
||||
"pages_tooltip": f"{round(source.get('total_word_count', 0) / 250, 1)} pages (≈ {source.get('total_word_count', 0):,} words)",
|
||||
"last_scraped": source.get("updated_at"),
|
||||
"file_name": source_metadata.get("file_name"),
|
||||
"file_type": source_metadata.get("file_type"),
|
||||
"update_frequency": source_metadata.get("update_frequency", 7),
|
||||
"code_examples_count": code_examples_count,
|
||||
**source_metadata,
|
||||
},
|
||||
"created_at": source.get("created_at"),
|
||||
"updated_at": source.get("updated_at"),
|
||||
}
|
||||
items.append(item)
|
||||
|
||||
safe_logfire_info(
|
||||
f"Knowledge items retrieved | total={total} | page={page} | filtered_count={len(items)}"
|
||||
)
|
||||
|
||||
return {
|
||||
"items": items,
|
||||
"total": total,
|
||||
"page": page,
|
||||
"per_page": per_page,
|
||||
"pages": (total + per_page - 1) // per_page,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Failed to list knowledge items | error={str(e)}")
|
||||
raise
|
||||
|
||||
async def get_item(self, source_id: str) -> dict[str, Any] | None:
|
||||
"""
|
||||
Get a single knowledge item by source ID.
|
||||
|
||||
Args:
|
||||
source_id: The source ID to retrieve
|
||||
|
||||
Returns:
|
||||
Knowledge item dict or None if not found
|
||||
"""
|
||||
try:
|
||||
safe_logfire_info(f"Getting knowledge item | source_id={source_id}")
|
||||
|
||||
# Get the source record
|
||||
result = (
|
||||
self.supabase.from_("archon_sources")
|
||||
.select("*")
|
||||
.eq("source_id", source_id)
|
||||
.single()
|
||||
.execute()
|
||||
)
|
||||
|
||||
if not result.data:
|
||||
return None
|
||||
|
||||
# Transform the source to item format
|
||||
item = await self._transform_source_to_item(result.data)
|
||||
return item
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(
|
||||
f"Failed to get knowledge item | error={str(e)} | source_id={source_id}"
|
||||
)
|
||||
return None
|
||||
|
||||
async def update_item(
|
||||
self, source_id: str, updates: dict[str, Any]
|
||||
) -> tuple[bool, dict[str, Any]]:
|
||||
"""
|
||||
Update a knowledge item's metadata.
|
||||
|
||||
Args:
|
||||
source_id: The source ID to update
|
||||
updates: Dictionary of fields to update
|
||||
|
||||
Returns:
|
||||
Tuple of (success, result)
|
||||
"""
|
||||
try:
|
||||
safe_logfire_info(
|
||||
f"Updating knowledge item | source_id={source_id} | updates={updates}"
|
||||
)
|
||||
|
||||
# Prepare update data
|
||||
update_data = {}
|
||||
|
||||
# Handle title updates
|
||||
if "title" in updates:
|
||||
update_data["title"] = updates["title"]
|
||||
|
||||
# Handle metadata updates
|
||||
metadata_fields = [
|
||||
"description",
|
||||
"knowledge_type",
|
||||
"tags",
|
||||
"status",
|
||||
"update_frequency",
|
||||
"group_name",
|
||||
]
|
||||
metadata_updates = {k: v for k, v in updates.items() if k in metadata_fields}
|
||||
|
||||
if metadata_updates:
|
||||
# Get current metadata
|
||||
current_response = (
|
||||
self.supabase.table("archon_sources")
|
||||
.select("metadata")
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
if current_response.data:
|
||||
current_metadata = current_response.data[0].get("metadata", {})
|
||||
current_metadata.update(metadata_updates)
|
||||
update_data["metadata"] = current_metadata
|
||||
else:
|
||||
update_data["metadata"] = metadata_updates
|
||||
|
||||
# Perform the update
|
||||
result = (
|
||||
self.supabase.table("archon_sources")
|
||||
.update(update_data)
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
|
||||
if result.data:
|
||||
safe_logfire_info(f"Knowledge item updated successfully | source_id={source_id}")
|
||||
return True, {
|
||||
"success": True,
|
||||
"message": f"Successfully updated knowledge item {source_id}",
|
||||
"source_id": source_id,
|
||||
}
|
||||
else:
|
||||
safe_logfire_error(f"Knowledge item not found | source_id={source_id}")
|
||||
return False, {"error": f"Knowledge item {source_id} not found"}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(
|
||||
f"Failed to update knowledge item | error={str(e)} | source_id={source_id}"
|
||||
)
|
||||
return False, {"error": str(e)}
|
||||
|
||||
async def get_available_sources(self) -> dict[str, Any]:
|
||||
"""
|
||||
Get all available sources with their details.
|
||||
|
||||
Returns:
|
||||
Dict containing sources list and count
|
||||
"""
|
||||
try:
|
||||
# Query the sources table
|
||||
result = self.supabase.from_("archon_sources").select("*").order("source_id").execute()
|
||||
|
||||
# Format the sources
|
||||
sources = []
|
||||
if result.data:
|
||||
for source in result.data:
|
||||
sources.append({
|
||||
"source_id": source.get("source_id"),
|
||||
"title": source.get("title", source.get("summary", "Untitled")),
|
||||
"summary": source.get("summary"),
|
||||
"metadata": source.get("metadata", {}),
|
||||
"total_words": source.get("total_words", source.get("total_word_count", 0)),
|
||||
"update_frequency": source.get("update_frequency", 7),
|
||||
"created_at": source.get("created_at"),
|
||||
"updated_at": source.get("updated_at", source.get("created_at")),
|
||||
})
|
||||
|
||||
return {"success": True, "sources": sources, "count": len(sources)}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Failed to get available sources | error={str(e)}")
|
||||
return {"success": False, "error": str(e), "sources": [], "count": 0}
|
||||
|
||||
async def _get_all_sources(self) -> list[dict[str, Any]]:
|
||||
"""Get all sources from the database."""
|
||||
result = await self.get_available_sources()
|
||||
return result.get("sources", [])
|
||||
|
||||
async def _transform_source_to_item(self, source: dict[str, Any]) -> dict[str, Any]:
|
||||
"""
|
||||
Transform a source record into a knowledge item with enriched data.
|
||||
|
||||
Args:
|
||||
source: The source record from database
|
||||
|
||||
Returns:
|
||||
Transformed knowledge item
|
||||
"""
|
||||
source_metadata = source.get("metadata", {})
|
||||
source_id = source["source_id"]
|
||||
|
||||
# Get first page URL
|
||||
first_page_url = await self._get_first_page_url(source_id)
|
||||
|
||||
# Determine source type
|
||||
        source_type = self._determine_source_type(source_metadata, first_page_url)

        # Get code examples
        code_examples = await self._get_code_examples(source_id)

        return {
            "id": source_id,
            "title": source.get("title", source.get("summary", "Untitled")),
            "url": first_page_url,
            "source_id": source_id,
            "code_examples": code_examples,
            "metadata": {
                "knowledge_type": source_metadata.get("knowledge_type", "technical"),
                "tags": source_metadata.get("tags", []),
                "source_type": source_type,
                "status": "active",
                "description": source_metadata.get("description", source.get("summary", "")),
                "chunks_count": await self._get_chunks_count(source_id),  # Get actual chunk count
                "word_count": source.get("total_words", 0),
                "estimated_pages": round(
                    source.get("total_words", 0) / 250, 1
                ),  # Average book page = 250 words
                "pages_tooltip": f"{round(source.get('total_words', 0) / 250, 1)} pages (≈ {source.get('total_words', 0):,} words)",
                "last_scraped": source.get("updated_at"),
                "file_name": source_metadata.get("file_name"),
                "file_type": source_metadata.get("file_type"),
                "update_frequency": source.get("update_frequency", 7),
                "code_examples_count": len(code_examples),
                **source_metadata,
            },
            "created_at": source.get("created_at"),
            "updated_at": source.get("updated_at"),
        }

    async def _get_first_page_url(self, source_id: str) -> str:
        """Get the first page URL for a source."""
        try:
            pages_response = (
                self.supabase.from_("archon_crawled_pages")
                .select("url")
                .eq("source_id", source_id)
                .limit(1)
                .execute()
            )

            if pages_response.data:
                return pages_response.data[0].get("url", f"source://{source_id}")

        except Exception:
            pass

        return f"source://{source_id}"

    async def _get_code_examples(self, source_id: str) -> list[dict[str, Any]]:
        """Get code examples for a source."""
        try:
            code_examples_response = (
                self.supabase.from_("archon_code_examples")
                .select("id, content, summary, metadata")
                .eq("source_id", source_id)
                .execute()
            )

            return code_examples_response.data if code_examples_response.data else []

        except Exception:
            return []

    def _determine_source_type(self, metadata: dict[str, Any], url: str) -> str:
        """Determine the source type from metadata or URL pattern."""
        stored_source_type = metadata.get("source_type")
        if stored_source_type:
            return stored_source_type

        # Legacy fallback - check URL pattern
        return "file" if url.startswith("file://") else "url"

    def _filter_by_search(self, items: list[dict[str, Any]], search: str) -> list[dict[str, Any]]:
        """Filter items by search term."""
        search_lower = search.lower()
        return [
            item
            for item in items
            if search_lower in item["title"].lower()
            or search_lower in item["metadata"].get("description", "").lower()
            or any(search_lower in tag.lower() for tag in item["metadata"].get("tags", []))
        ]

    def _filter_by_knowledge_type(
        self, items: list[dict[str, Any]], knowledge_type: str
    ) -> list[dict[str, Any]]:
        """Filter items by knowledge type."""
        return [item for item in items if item["metadata"].get("knowledge_type") == knowledge_type]

    async def _get_chunks_count(self, source_id: str) -> int:
        """Get the actual number of chunks for a source."""
        try:
            # Count the actual rows in crawled_pages for this source
            result = (
                self.supabase.table("archon_crawled_pages")
                .select("*", count="exact")
                .eq("source_id", source_id)
                .execute()
            )

            # Return the count of pages (chunks)
            return result.count if result.count else 0

        except Exception as e:
            # If we can't get chunk count, return 0
            safe_logfire_info(f"Failed to get chunk count for {source_id}: {e}")
            return 0
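A minimal standalone sketch (not part of the diff) of the exact-count query pattern that _get_chunks_count relies on. The table name and query shape come from the code above; the client setup, the count_chunks helper name, and the environment variable names are illustrative assumptions.

import os

from supabase import Client, create_client


def count_chunks(supabase: Client, source_id: str) -> int:
    """Return the number of crawled-page rows (chunks) for a source, or 0 on failure."""
    try:
        result = (
            supabase.table("archon_crawled_pages")
            .select("*", count="exact")  # ask PostgREST for an exact row count
            .eq("source_id", source_id)
            .execute()
        )
        return result.count or 0
    except Exception:
        return 0


if __name__ == "__main__":
    # Env var names are placeholders; adjust to your deployment.
    client = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_SERVICE_KEY"])
    print(count_chunks(client, "example-source-id"))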
@@ -11,13 +11,10 @@ from datetime import datetime
from typing import Any

from ...config.logfire_config import get_logger
from ...socketio_app import get_socketio_instance
from ...socketio_app import sio

logger = get_logger(__name__)

# Get Socket.IO instance
sio = get_socketio_instance()
logger.info(f"🔗 [PROGRESS] Socket.IO instance ID: {id(sio)}")


class ProgressService:
@@ -17,9 +17,7 @@ logger = get_logger(__name__)

# Import Socket.IO instance directly to avoid circular imports
try:
    from ...socketio_app import get_socketio_instance

    _sio = get_socketio_instance()
    from ...socketio_app import sio as _sio
    _broadcast_available = True
    logger.info("✅ Socket.IO broadcasting is AVAILABLE - real-time updates enabled")
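The two import hunks above replace the get_socketio_instance() accessor with a direct import of the module-level server. A small sketch of that pattern, assuming the module layout shown in this diff; the broadcast_progress helper and its event name are illustrative, not part of the change.

import socketio

# socketio_app.py exposes a single module-level server object
sio = socketio.AsyncServer(async_mode="asgi", cors_allowed_origins="*")


# A consumer module would instead do: from ..socketio_app import sio
async def broadcast_progress(progress_id: str, data: dict) -> None:
    """Emit a progress update to every connected client (illustrative event name)."""
    await sio.emit("progress_update", {"id": progress_id, **data})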
@@ -870,14 +870,17 @@ async def add_code_examples_to_supabase(

        # Prepare batch data - only for successful embeddings
        batch_data = []
        used_indices = set()  # Track which indices have been mapped to prevent duplicates

        for j, (embedding, text) in enumerate(
            zip(valid_embeddings, successful_texts, strict=False)
        ):
            # Find the original index
            # Find the original index (skip already used indices)
            orig_idx = None
            for k, orig_text in enumerate(batch_texts):
                if orig_text == text:
                if orig_text == text and k not in used_indices:
                    orig_idx = k
                    used_indices.add(k)  # Mark this index as used
                    break

            if orig_idx is None:
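A standalone sketch (outside the diff) of the de-duplicated index mapping this hunk introduces: when the same text appears more than once in a batch, each successful embedding must map back to a distinct original row, so indices that have already been claimed are skipped. The helper name is illustrative.

def map_to_original_indices(batch_texts: list[str], successful_texts: list[str]) -> list[int | None]:
    """Map each successful text back to a distinct index in the original batch."""
    used_indices: set[int] = set()
    mapping: list[int | None] = []
    for text in successful_texts:
        orig_idx = None
        for k, orig_text in enumerate(batch_texts):
            if orig_text == text and k not in used_indices:
                orig_idx = k
                used_indices.add(k)  # claim this slot so duplicates map to different rows
                break
        mapping.append(orig_idx)
    return mapping


# With a duplicated text, the two matches map to indices 0 and 2 rather than 0 twice.
assert map_to_original_indices(["a", "b", "a"], ["a", "a"]) == [0, 2]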
@@ -266,20 +266,23 @@ async def add_documents_to_supabase(
                search_logger.warning(
                    f"Skipping batch {batch_num} - no successful embeddings created"
                )
                completed_batches += 1
                # Don't increment completed_batches when skipping - this causes progress to jump
                continue

            # Prepare batch data - only for successful embeddings
            batch_data = []
            used_indices = set()  # Track which indices have been mapped to prevent duplicates

            # Map successful texts back to their original indices
            for j, (embedding, text) in enumerate(
                zip(batch_embeddings, successful_texts, strict=False)
            ):
                # Find the original index of this text
                # Find the original index of this text (skip already used indices)
                orig_idx = None
                for idx, orig_text in enumerate(contextual_contents):
                    if orig_text == text:
                    if orig_text == text and idx not in used_indices:
                        orig_idx = idx
                        used_indices.add(idx)  # Mark this index as used
                        break

                if orig_idx is None:

@@ -370,6 +373,9 @@ async def add_documents_to_supabase(
                search_logger.info(
                    f"Individual inserts: {successful_inserts}/{len(batch_data)} successful"
                )
                # Even if we had to fall back to individual inserts, count this batch as processed
                if successful_inserts > 0:
                    completed_batches += 1

            # Minimal delay between batches to prevent overwhelming
            if i + batch_size < len(contents):
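A hedged sketch of the progress arithmetic behind the completed_batches change above. It assumes the reported fraction is completed_batches / total_batches; under that assumption, counting a skipped batch as completed makes the progress bar jump ahead of the rows actually stored.

def progress_percent(completed_batches: int, total_batches: int) -> float:
    """Percentage of batches reported as done (assumed formula, for illustration)."""
    if total_batches <= 0:
        return 0.0
    return round(100.0 * completed_batches / total_batches, 1)


# 10 batches, 3 stored so far and 1 skipped: counting the skip would report 40% done
# while only 3 batches of rows exist; leaving it out reports 30%.
assert progress_percent(3, 10) == 30.0
assert progress_percent(4, 10) == 40.0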
@@ -84,17 +84,10 @@ class RateLimiter:
        self.semaphore = asyncio.Semaphore(config.max_concurrent)
        self._lock = asyncio.Lock()

    async def acquire(self, estimated_tokens: int = 8000, progress_callback: Callable | None = None) -> bool:
        """Acquire permission to make API call with token awareness

        Args:
            estimated_tokens: Estimated number of tokens for the operation
            progress_callback: Optional async callback for progress updates during wait
        """
        while True:  # Loop instead of recursion to avoid stack overflow
            wait_time_to_sleep = None

            async with self._lock:
    async def acquire(self, estimated_tokens: int = 8000) -> bool:
        """Acquire permission to make API call with token awareness"""
        async with self._lock:
            while True:  # Use a loop instead of recursion
                now = time.time()

                # Clean old entries

@@ -106,41 +99,30 @@ class RateLimiter:
                self.request_times.append(now)
                self.token_usage.append((now, estimated_tokens))
                return True

                # Calculate wait time if we can't make the request

                # Calculate wait time
                wait_time = self._calculate_wait_time(estimated_tokens)
                if wait_time > 0:
                    logfire_logger.info(
                        f"Rate limiting: waiting {wait_time:.1f}s",
                        extra={
                            "tokens": estimated_tokens,
                            "current_usage": self._get_current_usage(),
                        }
                    )
                    wait_time_to_sleep = wait_time
                else:
                if wait_time <= 0:
                    return False

            # Sleep outside the lock to avoid deadlock
            if wait_time_to_sleep is not None:
                # For long waits, break into smaller chunks with progress updates
                if wait_time_to_sleep > 5 and progress_callback:
                    chunks = int(wait_time_to_sleep / 5)  # 5 second chunks
                    for i in range(chunks):
                        await asyncio.sleep(5)
                        remaining = wait_time_to_sleep - (i + 1) * 5
                        if progress_callback:
                            await progress_callback({
                                "type": "rate_limit_wait",
                                "remaining_seconds": max(0, remaining),
                                "message": f"waiting {max(0, remaining):.1f}s more..."
                            })
                    # Sleep any remaining time
                    if wait_time_to_sleep % 5 > 0:
                        await asyncio.sleep(wait_time_to_sleep % 5)
                else:
                    await asyncio.sleep(wait_time_to_sleep)
                # Continue the loop to try again

                logfire_logger.info(
                    f"Rate limiting: waiting {wait_time:.1f}s",
                    extra={
                        "tokens": estimated_tokens,
                        "current_usage": self._get_current_usage(),
                    }
                )

                # Release the lock while sleeping to allow other operations
                self._lock.release()
                try:
                    await asyncio.sleep(wait_time)
                    logfire_logger.info(f"Rate limiting: resuming after {wait_time:.1f}s wait")
                finally:
                    # Re-acquire the lock before continuing
                    await self._lock.acquire()

                # Loop will continue and re-check conditions

    def _can_make_request(self, estimated_tokens: int) -> bool:
        """Check if request can be made within limits"""
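A self-contained sketch (not part of the diff) of the waiting strategy the rewritten acquire() uses: check the limits while holding the lock, release the lock for the sleep so other coroutines are not blocked, then re-acquire and loop instead of recursing. The 5-second window and 3-request budget are illustrative, not the service's real limits.

import asyncio
import time


class TinyRateLimiter:
    def __init__(self, max_requests: int = 3, window: float = 5.0) -> None:
        self.max_requests = max_requests
        self.window = window
        self.request_times: list[float] = []
        self._lock = asyncio.Lock()

    async def acquire(self) -> None:
        async with self._lock:
            while True:  # loop instead of recursion
                now = time.time()
                # Drop timestamps that have aged out of the window
                self.request_times = [t for t in self.request_times if now - t < self.window]
                if len(self.request_times) < self.max_requests:
                    self.request_times.append(now)
                    return
                wait_time = self.window - (now - self.request_times[0])
                # Sleep without holding the lock so other tasks can check the limiter.
                self._lock.release()
                try:
                    await asyncio.sleep(wait_time)
                finally:
                    await self._lock.acquire()


async def main() -> None:
    limiter = TinyRateLimiter()
    for i in range(4):  # the 4th call waits until the oldest timestamp leaves the window
        await limiter.acquire()
        print(f"request {i} allowed at {time.strftime('%X')}")


if __name__ == "__main__":
    asyncio.run(main())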
@@ -540,15 +522,10 @@ class ThreadingService:
        logfire_logger.info("Threading service stopped")

    @asynccontextmanager
    async def rate_limited_operation(self, estimated_tokens: int = 8000, progress_callback: Callable | None = None):
        """Context manager for rate-limited operations

        Args:
            estimated_tokens: Estimated number of tokens for the operation
            progress_callback: Optional async callback for progress updates during wait
        """
    async def rate_limited_operation(self, estimated_tokens: int = 8000):
        """Context manager for rate-limited operations"""
        async with self.rate_limiter.semaphore:
            can_proceed = await self.rate_limiter.acquire(estimated_tokens, progress_callback)
            can_proceed = await self.rate_limiter.acquire(estimated_tokens)
            if not can_proceed:
                raise Exception("Rate limit exceeded")
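A self-contained sketch (not part of the diff) of the @asynccontextmanager shape rate_limited_operation keeps after this hunk: a semaphore bounds concurrency and a gate runs before the body executes. The simple spacing lock here stands in for the real RateLimiter, and every name is illustrative.

import asyncio
import time
from contextlib import asynccontextmanager


class TinyThreadingService:
    """Illustrative stand-in for the ThreadingService touched above."""

    def __init__(self, max_concurrent: int = 2, min_interval: float = 0.5) -> None:
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self._min_interval = min_interval
        self._last_call = 0.0
        self._lock = asyncio.Lock()

    @asynccontextmanager
    async def rate_limited_operation(self, estimated_tokens: int = 8000):
        async with self.semaphore:  # bound concurrent operations
            async with self._lock:  # simple call spacing in place of the real limiter
                delay = self._min_interval - (time.time() - self._last_call)
                if delay > 0:
                    await asyncio.sleep(delay)
                self._last_call = time.time()
            yield


async def worker(service: TinyThreadingService, i: int) -> None:
    async with service.rate_limited_operation(estimated_tokens=1000):
        print(f"operation {i} running at {time.time():.2f}")


async def main() -> None:
    service = TinyThreadingService()
    await asyncio.gather(*(worker(service, i) for i in range(4)))


if __name__ == "__main__":
    asyncio.run(main())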
@@ -676,4 +653,4 @@ async def stop_threading_service():
    global _threading_service
    if _threading_service:
        await _threading_service.stop()
        _threading_service = None
    _threading_service = None
@@ -26,17 +26,6 @@ sio = socketio.AsyncServer(
    ping_interval=60,  # 1 minute - check connection every minute
)

# Global Socket.IO instance for use across modules
_socketio_instance: socketio.AsyncServer | None = None


def get_socketio_instance() -> socketio.AsyncServer:
    """Get the global Socket.IO server instance."""
    global _socketio_instance
    if _socketio_instance is None:
        _socketio_instance = sio
    return _socketio_instance


def create_socketio_app(app: FastAPI) -> socketio.ASGIApp:
    """
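On the provider side, the hunk above deletes the lazy accessor and leaves the module-level sio as the single instance. A minimal sketch of that arrangement using the python-socketio ASGI integration this file already relies on; the server options shown are illustrative.

import socketio
from fastapi import FastAPI

sio = socketio.AsyncServer(async_mode="asgi", cors_allowed_origins="*")


def create_socketio_app(app: FastAPI) -> socketio.ASGIApp:
    """Mount the Socket.IO server in front of the FastAPI app."""
    return socketio.ASGIApp(sio, other_asgi_app=app)


app = FastAPI()
socket_app = create_socketio_app(app)  # serve this with uvicorn instead of `app`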
@@ -63,3 +52,24 @@ def create_socketio_app(app: FastAPI) -> socketio.ASGIApp:
    sio.app = app

    return socket_app


# Default Socket.IO event handlers
@sio.event
async def connect(sid, environ):
    """Handle new client connections."""
    logger.info(f"Client connected: {sid}")
    safe_logfire_info(f"Client connected: {sid}")


@sio.event
async def disconnect(sid):
    """Handle client disconnections."""
    logger.info(f"Client disconnected: {sid}")
    safe_logfire_info(f"Client disconnected: {sid}")


@sio.event
async def message(sid, data):
    """Handle incoming messages."""
    logger.info(f"Received message from {sid}: {data}")
    await sio.emit("response", {"data": "Message received!"}, to=sid)
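A hedged client-side sketch exercising the default handlers added above: connect, emit a "message" event, and print the "response" the server sends back. The URL and port are placeholders, not necessarily what Archon uses.

import asyncio

import socketio

client = socketio.AsyncClient()


@client.on("response")
async def on_response(data):
    print("server replied:", data)


async def main() -> None:
    await client.connect("http://localhost:8181")  # placeholder host/port
    await client.emit("message", {"hello": "archon"})
    await asyncio.sleep(1)  # give the server a moment to respond
    await client.disconnect()


if __name__ == "__main__":
    asyncio.run(main())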