From c16498ceabd1416c07de569fd4c9868fb8f01123 Mon Sep 17 00:00:00 2001 From: sean-eskerium Date: Wed, 20 Aug 2025 21:16:12 -0400 Subject: [PATCH] - Fixing the crawl errors for large crawled files like the NUXT docs. - Removing the "Completed" steps in reporting. - Cleanup Sockets from PR 250 and 395 with Code Rabbit cleanup suggestions. --- archon-ui-main/test/config/api.test.ts | 79 +-------- .../crawling/code_extraction_service.py | 133 +++++++++++++-- .../services/crawling/crawling_service.py | 3 + .../services/crawling/strategies/batch.py | 160 +++++++++--------- .../services/crawling/strategies/recursive.py | 31 +++- .../src/server/services/threading_service.py | 52 +++--- python/src/server/socketio_app.py | 31 ++-- 7 files changed, 279 insertions(+), 210 deletions(-) diff --git a/archon-ui-main/test/config/api.test.ts b/archon-ui-main/test/config/api.test.ts index ac06c78e..95e2e992 100644 --- a/archon-ui-main/test/config/api.test.ts +++ b/archon-ui-main/test/config/api.test.ts @@ -47,10 +47,10 @@ describe('API Configuration', () => { delete (import.meta.env as any).VITE_API_URL; delete (import.meta.env as any).ARCHON_SERVER_PORT; - const { getApiUrl } = await import('../../src/config/api'); - - expect(() => getApiUrl()).toThrow('ARCHON_SERVER_PORT environment variable is required'); - expect(() => getApiUrl()).toThrow('Default value: 8181'); + // The error will be thrown during module import because API_FULL_URL calls getApiUrl() + await expect(async () => { + await import('../../src/config/api'); + }).rejects.toThrow('ARCHON_SERVER_PORT environment variable is required'); }); it('should use ARCHON_SERVER_PORT when set in development', async () => { @@ -156,73 +156,4 @@ describe('API Configuration', () => { }); }); -describe('MCP Client Service Configuration', () => { - let originalEnv: any; - - beforeEach(() => { - originalEnv = { ...import.meta.env }; - vi.resetModules(); - }); - - afterEach(() => { - Object.keys(import.meta.env).forEach(key => { - delete (import.meta.env as any)[key]; - }); - Object.assign(import.meta.env, originalEnv); - }); - - it('should throw error when ARCHON_MCP_PORT is not set', async () => { - delete (import.meta.env as any).ARCHON_MCP_PORT; - - const { MCPClientService } = await import('../../src/services/mcpClientService'); - const service = new MCPClientService(); - - await expect(service.createArchonClient()).rejects.toThrow('ARCHON_MCP_PORT environment variable is required'); - await expect(service.createArchonClient()).rejects.toThrow('Default value: 8051'); - }); - - it('should use ARCHON_MCP_PORT when set', async () => { - (import.meta.env as any).ARCHON_MCP_PORT = '9051'; - (import.meta.env as any).ARCHON_SERVER_PORT = '8181'; - - // Mock window.location - Object.defineProperty(window, 'location', { - value: { - protocol: 'http:', - hostname: 'localhost' - }, - writable: true - }); - - // Mock the API call - global.fetch = vi.fn().mockResolvedValue({ - ok: true, - json: async () => ({ - id: 'test-id', - name: 'Archon', - transport_type: 'http', - connection_status: 'connected' - }) - }); - - const { MCPClientService } = await import('../../src/services/mcpClientService'); - const service = new MCPClientService(); - - try { - await service.createArchonClient(); - - // Verify the fetch was called with the correct URL - expect(global.fetch).toHaveBeenCalledWith( - expect.stringContaining('/api/mcp/clients'), - expect.objectContaining({ - method: 'POST', - body: expect.stringContaining('9051') - }) - ); - } catch (error) { - // If it fails due to actual API call, that's okay for this test - // We're mainly testing that it constructs the URL correctly - expect(error).toBeDefined(); - } - }); -}); \ No newline at end of file +// MCP Client Service Configuration tests removed - service not currently in use \ No newline at end of file diff --git a/python/src/server/services/crawling/code_extraction_service.py b/python/src/server/services/crawling/code_extraction_service.py index 71e12ebe..d0649baf 100644 --- a/python/src/server/services/crawling/code_extraction_service.py +++ b/python/src/server/services/crawling/code_extraction_service.py @@ -211,14 +211,21 @@ class CodeExtractionService: Returns: List of code blocks with metadata """ + import asyncio + import time + # Progress will be reported during the loop below all_code_blocks = [] total_docs = len(crawl_results) completed_docs = 0 + + # PERFORMANCE: Track extraction time per document + MAX_EXTRACTION_TIME_PER_DOC = 5.0 # 5 seconds max per document for doc in crawl_results: try: + doc_start_time = time.time() source_url = doc["url"] html_content = doc.get("html", "") md = doc.get("markdown", "") @@ -228,9 +235,7 @@ class CodeExtractionService: f"Document content check | url={source_url} | has_html={bool(html_content)} | has_markdown={bool(md)} | html_len={len(html_content) if html_content else 0} | md_len={len(md) if md else 0}" ) - # Get dynamic minimum length based on document context - # Extract some context from the document for analysis - doc_context = md[:1000] if md else html_content[:1000] if html_content else "" + # Dynamic minimum length is handled inside the extraction methods # Check markdown first to see if it has code blocks if md: @@ -281,15 +286,32 @@ class CodeExtractionService: # If not a text file or no code blocks found, try HTML extraction first if len(code_blocks) == 0 and html_content and not is_text_file: - safe_logfire_info( - f"Trying HTML extraction first | url={source_url} | html_length={len(html_content)}" - ) - html_code_blocks = await self._extract_html_code_blocks(html_content) - if html_code_blocks: - code_blocks = html_code_blocks + # PERFORMANCE: Check if we've already spent too much time on this document + elapsed_time = time.time() - doc_start_time + if elapsed_time > MAX_EXTRACTION_TIME_PER_DOC: safe_logfire_info( - f"Found {len(code_blocks)} code blocks from HTML | url={source_url}" + f"⏱️ Skipping HTML extraction for {source_url} - already spent {elapsed_time:.1f}s" ) + else: + safe_logfire_info( + f"Trying HTML extraction first | url={source_url} | html_length={len(html_content)}" + ) + # Create a timeout for HTML extraction + remaining_time = MAX_EXTRACTION_TIME_PER_DOC - elapsed_time + try: + html_code_blocks = await asyncio.wait_for( + self._extract_html_code_blocks(html_content, source_url), + timeout=remaining_time + ) + if html_code_blocks: + code_blocks = html_code_blocks + safe_logfire_info( + f"Found {len(code_blocks)} code blocks from HTML | url={source_url}" + ) + except asyncio.TimeoutError: + safe_logfire_info( + f"⏱️ HTML extraction timed out after {remaining_time:.1f}s for {source_url}" + ) # If still no code blocks, try markdown extraction as fallback if len(code_blocks) == 0 and md and "```" in md: @@ -319,6 +341,14 @@ class CodeExtractionService: # Update progress only after completing document extraction completed_docs += 1 + extraction_time = time.time() - doc_start_time + if extraction_time > 2.0: # Log slow extractions + safe_logfire_info( + f"⏱️ Document extraction took {extraction_time:.1f}s | url={source_url} | " + f"html_size={len(html_content) if html_content else 0} | " + f"blocks_found={len([b for b in all_code_blocks if b['source_url'] == source_url])}" + ) + if progress_callback and total_docs > 0: # Calculate progress within the specified range raw_progress = completed_docs / total_docs @@ -340,13 +370,14 @@ class CodeExtractionService: return all_code_blocks - async def _extract_html_code_blocks(self, content: str) -> list[dict[str, Any]]: + async def _extract_html_code_blocks(self, content: str, source_url: str = "") -> list[dict[str, Any]]: """ Extract code blocks from HTML patterns in content. This is a fallback when markdown conversion didn't preserve code blocks. Args: content: The content to search for HTML code patterns + source_url: The URL of the document being processed min_length: Minimum length for code blocks Returns: @@ -356,6 +387,20 @@ class CodeExtractionService: # Add detailed logging safe_logfire_info(f"Processing HTML of length {len(content)} for code extraction") + + # PERFORMANCE OPTIMIZATION: Skip extremely large HTML files or chunk them + MAX_HTML_SIZE = 1_000_000 # 1MB limit for single-pass processing (increased from 500KB) + if len(content) > MAX_HTML_SIZE: + safe_logfire_info( + f"⚠️ HTML content is very large ({len(content)} bytes). " + f"Limiting to first {MAX_HTML_SIZE} bytes to prevent timeout." + ) + # For very large files, focus on the first portion where code examples are likely to be + content = content[:MAX_HTML_SIZE] + # Try to find a good cutoff point (end of a tag) + last_tag_end = content.rfind('>') + if last_tag_end > MAX_HTML_SIZE - 1000: + content = content[:last_tag_end + 1] # Check if we have actual content if len(content) < 1000: @@ -507,9 +552,71 @@ class CodeExtractionService: ), ] - for pattern_tuple in patterns: + # PERFORMANCE: Early exit checks to avoid unnecessary regex processing + # Check more content (20KB instead of 5KB) and add URL-based exceptions + check_size = min(20000, len(content)) # Check first 20KB or entire content if smaller + has_code_indicators = any(indicator in content[:check_size] for indicator in + [' bool: """Acquire permission to make API call with token awareness""" async with self._lock: - now = time.time() + while True: # Use a loop instead of recursion + now = time.time() - # Clean old entries - self._clean_old_entries(now) + # Clean old entries + self._clean_old_entries(now) - # Check if we can make the request - if not self._can_make_request(estimated_tokens): + # Check if we can make the request + if self._can_make_request(estimated_tokens): + # Record the request + self.request_times.append(now) + self.token_usage.append((now, estimated_tokens)) + return True + + # Calculate wait time wait_time = self._calculate_wait_time(estimated_tokens) - if wait_time > 0: - logfire_logger.info( - f"Rate limiting: waiting {wait_time:.1f}s", - extra={ - "tokens": estimated_tokens, - "current_usage": self._get_current_usage(), - } - ) - await asyncio.sleep(wait_time) - return await self.acquire(estimated_tokens) - return False + if wait_time <= 0: + return False - # Record the request - self.request_times.append(now) - self.token_usage.append((now, estimated_tokens)) - return True + logfire_logger.info( + f"Rate limiting: waiting {wait_time:.1f}s", + extra={ + "tokens": estimated_tokens, + "current_usage": self._get_current_usage(), + } + ) + + # Release the lock while sleeping to allow other operations + self._lock.release() + try: + await asyncio.sleep(wait_time) + logfire_logger.info(f"Rate limiting: resuming after {wait_time:.1f}s wait") + finally: + # Re-acquire the lock before continuing + await self._lock.acquire() + + # Loop will continue and re-check conditions def _can_make_request(self, estimated_tokens: int) -> bool: """Check if request can be made within limits""" diff --git a/python/src/server/socketio_app.py b/python/src/server/socketio_app.py index 0028d66b..9231751d 100644 --- a/python/src/server/socketio_app.py +++ b/python/src/server/socketio_app.py @@ -26,16 +26,6 @@ sio = socketio.AsyncServer( ping_interval=60, # 1 minute - check connection every minute ) -# Global Socket.IO instance for use across modules -_socketio_instance: socketio.AsyncServer | None = None - -def get_socketio_instance() -> socketio.AsyncServer: - """Get the global Socket.IO server instance.""" - global _socketio_instance - if _socketio_instance is None: - _socketio_instance = sio - return _socketio_instance - def create_socketio_app(app: FastAPI) -> socketio.ASGIApp: """ @@ -62,3 +52,24 @@ def create_socketio_app(app: FastAPI) -> socketio.ASGIApp: sio.app = app return socket_app + +# Default Socket.IO event handlers +@sio.event +async def connect(sid, environ): + """Handle new client connections.""" + logger.info(f"Client connected: {sid}") + safe_logfire_info(f"Client connected: {sid}") + + +@sio.event +async def disconnect(sid): + """Handle client disconnections.""" + logger.info(f"Client disconnected: {sid}") + safe_logfire_info(f"Client disconnected: {sid}") + + +@sio.event +async def message(sid, data): + """Handle incoming messages.""" + logger.info(f"Received message from {sid}: {data}") + await sio.emit("response", {"data": "Message received!"}, to=sid) \ No newline at end of file