diff --git a/archon-ui-main/src/components/knowledge-base/KnowledgeItemCard.tsx b/archon-ui-main/src/components/knowledge-base/KnowledgeItemCard.tsx index ede71170..349b6d20 100644 --- a/archon-ui-main/src/components/knowledge-base/KnowledgeItemCard.tsx +++ b/archon-ui-main/src/components/knowledge-base/KnowledgeItemCard.tsx @@ -151,6 +151,7 @@ export const KnowledgeItemCard = ({ const [showEditModal, setShowEditModal] = useState(false); const [loadedCodeExamples, setLoadedCodeExamples] = useState(null); const [isLoadingCodeExamples, setIsLoadingCodeExamples] = useState(false); + const [isRecrawling, setIsRecrawling] = useState(false); const statusColorMap = { active: 'green', @@ -210,8 +211,14 @@ export const KnowledgeItemCard = ({ }; const handleRefresh = () => { - if (onRefresh) { + if (onRefresh && !isRecrawling) { + setIsRecrawling(true); onRefresh(item.source_id); + // Temporary fix: Auto-reset after timeout + // TODO: Reset based on actual crawl completion status from polling + setTimeout(() => { + setIsRecrawling(false); + }, 60000); // Reset after 60 seconds as a fallback } }; @@ -369,15 +376,18 @@ export const KnowledgeItemCard = ({ {item.metadata.source_type === 'url' && ( )} diff --git a/python/src/server/api_routes/progress_api.py b/python/src/server/api_routes/progress_api.py index b52206e0..fa5db271 100644 --- a/python/src/server/api_routes/progress_api.py +++ b/python/src/server/api_routes/progress_api.py @@ -54,6 +54,9 @@ async def get_progress( # Convert to dict with camelCase fields for API response response_data = progress_response.model_dump(by_alias=True, exclude_none=True) + # Debug logging for code extraction fields + if operation_type == "crawl" and operation.get("status") == "code_extraction": + logger.info(f"Code extraction response fields: completedSummaries={response_data.get('completedSummaries')}, totalSummaries={response_data.get('totalSummaries')}, codeBlocksFound={response_data.get('codeBlocksFound')}") # Generate ETag from stable data (excluding timestamp) etag_data = {k: v for k, v in response_data.items() if k != "timestamp"} diff --git a/python/src/server/models/progress_models.py b/python/src/server/models/progress_models.py index 177fe74b..11cc9e1a 100644 --- a/python/src/server/models/progress_models.py +++ b/python/src/server/models/progress_models.py @@ -72,8 +72,8 @@ class CrawlProgressResponse(BaseProgressResponse): status: Literal[ "starting", "analyzing", "crawling", "processing", - "source_creation", "document_storage", "code_extraction", - "finalization", "completed", "failed", "cancelled", "stopping" + "source_creation", "document_storage", "code_extraction", "code_storage", + "finalization", "completed", "failed", "cancelled", "stopping", "error" ] # Crawl-specific fields @@ -230,6 +230,12 @@ def create_progress_response( # Create the response, the model will handle field mapping try: + # Debug logging for code extraction fields + if operation_type == "crawl" and "completed_summaries" in progress_data: + from ..config.logfire_config import get_logger + logger = get_logger(__name__) + logger.info(f"Code extraction progress fields present: completed_summaries={progress_data.get('completed_summaries')}, total_summaries={progress_data.get('total_summaries')}") + return model_class(**progress_data) except Exception as e: # Log validation errors for debugging diff --git a/python/src/server/services/crawling/code_extraction_service.py b/python/src/server/services/crawling/code_extraction_service.py index be654161..ebeda18b 100644 --- a/python/src/server/services/crawling/code_extraction_service.py +++ b/python/src/server/services/crawling/code_extraction_service.py @@ -139,6 +139,7 @@ class CodeExtractionService: progress_callback: Callable | None = None, start_progress: int = 0, end_progress: int = 100, + cancellation_check: Callable[[], None] | None = None, ) -> int: """ Extract code examples from crawled documents and store them. @@ -164,7 +165,7 @@ class CodeExtractionService: # Extract code blocks from all documents all_code_blocks = await self._extract_code_blocks_from_documents( - crawl_results, source_id, progress_callback, start_progress, extract_end + crawl_results, source_id, progress_callback, start_progress, extract_end, cancellation_check ) if not all_code_blocks: @@ -191,7 +192,7 @@ class CodeExtractionService: # Generate summaries for code blocks with mapped progress summary_results = await self._generate_code_summaries( - all_code_blocks, progress_callback, extract_end, summary_end + all_code_blocks, progress_callback, extract_end, summary_end, cancellation_check ) # Prepare code examples for storage @@ -209,6 +210,7 @@ class CodeExtractionService: progress_callback: Callable | None = None, start_progress: int = 0, end_progress: int = 100, + cancellation_check: Callable[[], None] | None = None, ) -> list[dict[str, Any]]: """ Extract code blocks from all documents. @@ -227,6 +229,10 @@ class CodeExtractionService: completed_docs = 0 for doc in crawl_results: + # Check for cancellation before processing each document + if cancellation_check: + cancellation_check() + try: source_url = doc["url"] html_content = doc.get("html", "") @@ -1348,6 +1354,7 @@ class CodeExtractionService: progress_callback: Callable | None = None, start_progress: int = 0, end_progress: int = 100, + cancellation_check: Callable[[], None] | None = None, ) -> list[dict[str, str]]: """ Generate summaries for all code blocks. @@ -1391,6 +1398,10 @@ class CodeExtractionService: if progress_callback: # Create a wrapper that maps the progress to the correct range async def mapped_callback(data: dict): + # Check for cancellation during summary generation + if cancellation_check: + cancellation_check() + # Map the progress from generate_code_summaries_batch (0-100) to our range if "progress" in data or "percentage" in data: raw_progress = data.get("progress", data.get("percentage", 0)) @@ -1408,9 +1419,35 @@ class CodeExtractionService: summary_progress_callback = mapped_callback - return await generate_code_summaries_batch( - code_blocks_for_summaries, max_workers, progress_callback=summary_progress_callback - ) + try: + results = await generate_code_summaries_batch( + code_blocks_for_summaries, max_workers, progress_callback=summary_progress_callback + ) + + # Ensure all results are valid dicts + validated_results = [] + for result in results: + if isinstance(result, dict): + validated_results.append(result) + else: + # Handle non-dict results (CancelledError, etc.) + validated_results.append({ + "example_name": "Code Example", + "summary": "Code example for demonstration purposes." + }) + + return validated_results + except asyncio.CancelledError: + # If cancelled, return default summaries for all blocks + default_summaries = [] + for item in all_code_blocks: + block = item["block"] + language = block.get("language", "") + default_summaries.append({ + "example_name": f"Code Example{f' ({language})' if language else ''}", + "summary": "Code example for demonstration purposes.", + }) + return default_summaries def _prepare_code_examples_for_storage( self, all_code_blocks: list[dict[str, Any]], summary_results: list[dict[str, str]] @@ -1432,8 +1469,14 @@ class CodeExtractionService: source_url = code_item["source_url"] source_id = code_item["source_id"] - summary = summary_result.get("summary", "Code example for demonstration purposes.") - example_name = summary_result.get("example_name", "Code Example") + # Handle cancellation errors or invalid summary results + if isinstance(summary_result, dict): + summary = summary_result.get("summary", "Code example for demonstration purposes.") + example_name = summary_result.get("example_name", "Code Example") + else: + # Handle CancelledError or other non-dict results + summary = "Code example for demonstration purposes." + example_name = "Code Example" code_urls.append(source_url) code_chunk_numbers.append(len(code_examples)) diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py index 84fa05bb..cddb331d 100644 --- a/python/src/server/services/crawling/crawling_service.py +++ b/python/src/server/services/crawling/crawling_service.py @@ -413,6 +413,9 @@ class CrawlingService: # Extract code examples if requested code_examples_count = 0 if request.get("extract_code_examples", True) and actual_chunks_stored > 0: + # Check for cancellation before starting code extraction + self._check_cancellation() + await update_mapped_progress("code_extraction", 0, "Starting code extraction...") # Create progress callback for code extraction @@ -437,8 +440,12 @@ class CrawlingService: code_progress_callback, 85, 95, + self._check_cancellation, ) + # Check for cancellation after code extraction + self._check_cancellation() + # Send heartbeat after code extraction await send_heartbeat_if_needed() diff --git a/python/src/server/services/crawling/document_storage_operations.py b/python/src/server/services/crawling/document_storage_operations.py index 1cbe017b..047acf73 100644 --- a/python/src/server/services/crawling/document_storage_operations.py +++ b/python/src/server/services/crawling/document_storage_operations.py @@ -334,6 +334,7 @@ class DocumentStorageOperations: progress_callback: Callable | None = None, start_progress: int = 85, end_progress: int = 95, + cancellation_check: Callable[[], None] | None = None, ) -> int: """ Extract code examples from crawled documents and store them. @@ -345,12 +346,13 @@ class DocumentStorageOperations: progress_callback: Optional callback for progress updates start_progress: Starting progress percentage end_progress: Ending progress percentage + cancellation_check: Optional function to check for cancellation Returns: Number of code examples stored """ result = await self.code_extraction_service.extract_and_store_code_examples( - crawl_results, url_to_full_document, source_id, progress_callback, start_progress, end_progress + crawl_results, url_to_full_document, source_id, progress_callback, start_progress, end_progress, cancellation_check ) return result diff --git a/python/src/server/services/storage/document_storage_service.py b/python/src/server/services/storage/document_storage_service.py index f127d8de..4fc07a18 100644 --- a/python/src/server/services/storage/document_storage_service.py +++ b/python/src/server/services/storage/document_storage_service.py @@ -175,7 +175,7 @@ async def add_documents_to_supabase( "total_batches": total_batches, "completed_batches": completed_batches, "chunks_in_batch": len(batch_contents), - "max_workers": max_workers if use_contextual_embeddings else 0, + "active_workers": max_workers if use_contextual_embeddings else 1, } ) except Exception as e: @@ -353,7 +353,7 @@ async def add_documents_to_supabase( "total_batches": total_batches, "current_batch": batch_num, "chunks_processed": len(batch_data), - "max_workers": max_workers if use_contextual_embeddings else 0, + "active_workers": max_workers if use_contextual_embeddings else 1, } await report_progress(complete_msg, new_progress, batch_info) break