Mirror of https://github.com/coleam00/Archon.git (synced 2026-01-09 08:08:20 -05:00)
Merge branch 'ui-changes-backup' into merge-ui-socket-fixes
@@ -217,14 +217,21 @@ class CodeExtractionService:
         Returns:
             List of code blocks with metadata
         """
+        import asyncio
+        import time
+
         # Progress will be reported during the loop below
 
         all_code_blocks = []
         total_docs = len(crawl_results)
         completed_docs = 0
 
+        # PERFORMANCE: Track extraction time per document
+        MAX_EXTRACTION_TIME_PER_DOC = 5.0  # 5 seconds max per document
+
         for doc in crawl_results:
             try:
+                doc_start_time = time.time()
                 source_url = doc["url"]
                 html_content = doc.get("html", "")
                 md = doc.get("markdown", "")
@@ -234,9 +241,7 @@ class CodeExtractionService:
                     f"Document content check | url={source_url} | has_html={bool(html_content)} | has_markdown={bool(md)} | html_len={len(html_content) if html_content else 0} | md_len={len(md) if md else 0}"
                 )
 
-                # Get dynamic minimum length based on document context
-                # Extract some context from the document for analysis
-                doc_context = md[:1000] if md else html_content[:1000] if html_content else ""
+                # Dynamic minimum length is handled inside the extraction methods
 
                 # Check markdown first to see if it has code blocks
                 if md:
@@ -287,15 +292,32 @@ class CodeExtractionService:
 
                 # If not a text file or no code blocks found, try HTML extraction first
                 if len(code_blocks) == 0 and html_content and not is_text_file:
-                    safe_logfire_info(
-                        f"Trying HTML extraction first | url={source_url} | html_length={len(html_content)}"
-                    )
-                    html_code_blocks = await self._extract_html_code_blocks(html_content)
-                    if html_code_blocks:
-                        code_blocks = html_code_blocks
+                    # PERFORMANCE: Check if we've already spent too much time on this document
+                    elapsed_time = time.time() - doc_start_time
+                    if elapsed_time > MAX_EXTRACTION_TIME_PER_DOC:
                         safe_logfire_info(
-                            f"Found {len(code_blocks)} code blocks from HTML | url={source_url}"
+                            f"⏱️ Skipping HTML extraction for {source_url} - already spent {elapsed_time:.1f}s"
                         )
+                    else:
+                        safe_logfire_info(
+                            f"Trying HTML extraction first | url={source_url} | html_length={len(html_content)}"
+                        )
+                        # Create a timeout for HTML extraction
+                        remaining_time = MAX_EXTRACTION_TIME_PER_DOC - elapsed_time
+                        try:
+                            html_code_blocks = await asyncio.wait_for(
+                                self._extract_html_code_blocks(html_content, source_url),
+                                timeout=remaining_time
+                            )
+                            if html_code_blocks:
+                                code_blocks = html_code_blocks
+                                safe_logfire_info(
+                                    f"Found {len(code_blocks)} code blocks from HTML | url={source_url}"
+                                )
+                        except asyncio.TimeoutError:
+                            safe_logfire_info(
+                                f"⏱️ HTML extraction timed out after {remaining_time:.1f}s for {source_url}"
+                            )
 
                 # If still no code blocks, try markdown extraction as fallback
                 if len(code_blocks) == 0 and md and "```" in md:
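
Note on the hunk above: the new code gives each document a fixed extraction budget and spends whatever is left of that budget on the HTML pass via asyncio.wait_for. The sketch below shows the same budget-and-timeout pattern in isolation; slow_extract, extract_with_budget, and the simulated 10-second delay are illustrative stand-ins, not code from this commit.

import asyncio
import time

MAX_EXTRACTION_TIME_PER_DOC = 5.0  # same per-document budget as in the diff

async def slow_extract(html: str) -> list[str]:
    # Stand-in for _extract_html_code_blocks: pretend the regex pass is slow.
    await asyncio.sleep(10)
    return ["<code>...</code>"]

async def extract_with_budget(html: str) -> list[str]:
    doc_start_time = time.time()
    # ... earlier markdown/text extraction for the same document would run here ...
    elapsed = time.time() - doc_start_time
    if elapsed > MAX_EXTRACTION_TIME_PER_DOC:
        return []  # budget already spent on earlier steps, skip the HTML pass
    remaining = MAX_EXTRACTION_TIME_PER_DOC - elapsed
    try:
        # Cap the HTML pass at whatever budget is left for this document.
        return await asyncio.wait_for(slow_extract(html), timeout=remaining)
    except asyncio.TimeoutError:
        return []  # give up on this document rather than stall the whole crawl

if __name__ == "__main__":
    print(asyncio.run(extract_with_budget("<pre>print('hi')</pre>")))  # -> []
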
@@ -322,6 +344,14 @@ class CodeExtractionService:
 
                 # Update progress only after completing document extraction
                 completed_docs += 1
+                extraction_time = time.time() - doc_start_time
+                if extraction_time > 2.0:  # Log slow extractions
+                    safe_logfire_info(
+                        f"⏱️ Document extraction took {extraction_time:.1f}s | url={source_url} | "
+                        f"html_size={len(html_content) if html_content else 0} | "
+                        f"blocks_found={len([b for b in all_code_blocks if b['source_url'] == source_url])}"
+                    )
+
                 if progress_callback and total_docs > 0:
                     # Calculate progress within the specified range
                     raw_progress = completed_docs / total_docs
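
The hunk above ends at raw_progress = completed_docs / total_docs, and the comment says this value is then scaled into a specified range before the callback fires. The rest of that calculation is not visible here, so the following is only a minimal sketch of that kind of range scaling; the start_pct/end_pct bounds and the function name are assumptions.

def scale_progress(completed: int, total: int, start_pct: float = 20.0, end_pct: float = 40.0) -> float:
    # Map the raw fraction of finished documents into a sub-range of the
    # overall pipeline progress bar (names and bounds here are illustrative).
    raw_progress = completed / total if total else 1.0
    return start_pct + raw_progress * (end_pct - start_pct)

# e.g. 3 of 10 documents done inside a 20-40% band -> 26.0
print(scale_progress(3, 10))
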
@@ -343,13 +373,14 @@ class CodeExtractionService:
 
         return all_code_blocks
 
-    async def _extract_html_code_blocks(self, content: str) -> list[dict[str, Any]]:
+    async def _extract_html_code_blocks(self, content: str, source_url: str = "") -> list[dict[str, Any]]:
         """
         Extract code blocks from HTML patterns in content.
         This is a fallback when markdown conversion didn't preserve code blocks.
 
         Args:
             content: The content to search for HTML code patterns
+            source_url: The URL of the document being processed
             min_length: Minimum length for code blocks
 
         Returns:
@@ -359,6 +390,20 @@ class CodeExtractionService:
 
         # Add detailed logging
         safe_logfire_info(f"Processing HTML of length {len(content)} for code extraction")
 
+        # PERFORMANCE OPTIMIZATION: Skip extremely large HTML files or chunk them
+        MAX_HTML_SIZE = 1_000_000  # 1MB limit for single-pass processing (increased from 500KB)
+        if len(content) > MAX_HTML_SIZE:
+            safe_logfire_info(
+                f"⚠️ HTML content is very large ({len(content)} bytes). "
+                f"Limiting to first {MAX_HTML_SIZE} bytes to prevent timeout."
+            )
+            # For very large files, focus on the first portion where code examples are likely to be
+            content = content[:MAX_HTML_SIZE]
+            # Try to find a good cutoff point (end of a tag)
+            last_tag_end = content.rfind('>')
+            if last_tag_end > MAX_HTML_SIZE - 1000:
+                content = content[:last_tag_end + 1]
+
         # Check if we have actual content
         if len(content) < 1000:
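
The size cap above truncates oversized HTML and then backs up to the last '>' so the regex pass does not start from a half-cut tag. A self-contained sketch of that truncation step follows, reusing the same constants as the diff; cap_html is an illustrative wrapper, not a function from this file.

MAX_HTML_SIZE = 1_000_000  # same 1MB cap as in the diff

def cap_html(content: str, max_size: int = MAX_HTML_SIZE) -> str:
    # Keep only the first max_size characters, then back up to the last
    # complete tag so downstream regexes do not see a tag cut in half.
    if len(content) <= max_size:
        return content
    content = content[:max_size]
    last_tag_end = content.rfind(">")
    if last_tag_end > max_size - 1000:  # only trim when the cutoff is near the cap
        content = content[:last_tag_end + 1]
    return content

print(len(cap_html("<pre>" + "x" * 2_000_000 + "</pre>")))  # stays at or below 1_000_000
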
@@ -510,9 +555,71 @@ class CodeExtractionService:
             ),
         ]
 
-        for pattern_tuple in patterns:
+        # PERFORMANCE: Early exit checks to avoid unnecessary regex processing
+        # Check more content (20KB instead of 5KB) and add URL-based exceptions
+        check_size = min(20000, len(content))  # Check first 20KB or entire content if smaller
+        has_code_indicators = any(indicator in content[:check_size] for indicator in
+                                  ['<pre', '<code', 'language-', 'hljs', 'prism', 'shiki', 'highlight'])
+
+        # Never skip certain documentation sites that we know have code
+        is_known_code_site = any(domain in source_url.lower() for domain in
+                                 ['milkdown', 'github.com', 'gitlab', 'docs.', 'dev.', 'api.'])
+
+        if not has_code_indicators and not is_known_code_site:
+            safe_logfire_info(f"No code indicators found in first {check_size} chars and not a known code site, skipping HTML extraction | url={source_url}")
+            return []
+
+        if is_known_code_site and not has_code_indicators:
+            safe_logfire_info(f"Known code site but no indicators in first {check_size} chars, continuing anyway | url={source_url}")
+
+        # PERFORMANCE: Limit number of patterns to check based on detected libraries
+        patterns_to_check = []
+        content_lower = content[:10000].lower()  # Check first 10KB for library detection
+
+        # Selectively add patterns based on what's detected
+        if 'milkdown' in content_lower:
+            patterns_to_check.extend([p for p in patterns if 'milkdown' in p[1]])
+        if 'monaco' in content_lower:
+            patterns_to_check.extend([p for p in patterns if 'monaco' in p[1]])
+        if 'codemirror' in content_lower or 'cm-' in content_lower:
+            patterns_to_check.extend([p for p in patterns if 'codemirror' in p[1]])
+        if 'prism' in content_lower:
+            patterns_to_check.extend([p for p in patterns if 'prism' in p[1]])
+        if 'hljs' in content_lower or 'highlight' in content_lower:
+            patterns_to_check.extend([p for p in patterns if 'hljs' in p[1] or 'highlight' in p[1]])
+        if 'shiki' in content_lower or 'astro' in content_lower:
+            patterns_to_check.extend([p for p in patterns if 'shiki' in p[1] or 'astro' in p[1]])
+
+        # Always include standard patterns as fallback (get ALL standard/generic patterns, not just last 5)
+        standard_patterns = [p for p in patterns if any(tag in p[1] for tag in ['standard', 'generic', 'prism', 'hljs'])]
+        patterns_to_check.extend(standard_patterns)
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_patterns = []
+        for p in patterns_to_check:
+            if p[1] not in seen:
+                unique_patterns.append(p)
+                seen.add(p[1])
+        patterns_to_check = unique_patterns
+
+        # If we have very few patterns and it's a known code site, add more generic patterns
+        if len(patterns_to_check) < 5 and is_known_code_site:
+            safe_logfire_info(f"Known code site with few patterns ({len(patterns_to_check)}), adding more generic patterns")
+            patterns_to_check = patterns  # Use all patterns for known code sites
+
+        safe_logfire_info(f"Checking {len(patterns_to_check)} relevant patterns out of {len(patterns)} total")
+
+        for pattern_tuple in patterns_to_check:
             pattern_str, source_type = pattern_tuple
-            matches = list(re.finditer(pattern_str, content, re.DOTALL | re.IGNORECASE))
+
+            # PERFORMANCE: Use re.finditer with smaller chunks for very long content
+            # Only use DOTALL for patterns that really need it (multi-line blocks)
+            flags = re.IGNORECASE
+            if 'monaco' in source_type or 'codemirror' in source_type:
+                flags |= re.DOTALL  # These need DOTALL for multi-line matching
+
+            matches = list(re.finditer(pattern_str, content, flags))
 
             # Log pattern matches for Milkdown patterns and CodeMirror
             if matches and (
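
The hunk above pre-filters the pattern table by which highlighter libraries the page appears to use, de-duplicates the selection while preserving order, and only enables re.DOTALL for pattern families that need multi-line matching. Below is a compact, runnable sketch of that selection logic with a toy pattern table; the regexes and the select_patterns helper are placeholders for illustration, not the patterns defined in this file.

import re

# Toy (pattern, source_type) pairs standing in for the real pattern table.
patterns = [
    (r'<div class="prism[^"]*">(.*?)</div>', "prism"),
    (r'<code class="hljs[^"]*">(.*?)</code>', "hljs"),
    (r'<pre><code[^>]*>(.*?)</code></pre>', "standard"),
]

def select_patterns(content: str) -> list[tuple[str, str]]:
    content_lower = content[:10000].lower()
    selected = []
    # Only pull in library-specific patterns when the library is detected.
    if "prism" in content_lower:
        selected.extend(p for p in patterns if "prism" in p[1])
    if "hljs" in content_lower or "highlight" in content_lower:
        selected.extend(p for p in patterns if "hljs" in p[1])
    # Always fall back to the generic patterns.
    selected.extend(p for p in patterns if p[1] == "standard")
    # De-duplicate on source_type while preserving first-seen order.
    seen: set[str] = set()
    unique = []
    for p in selected:
        if p[1] not in seen:
            unique.append(p)
            seen.add(p[1])
    return unique

html = '<pre><code class="hljs python">print("hi")</code></pre>'
for pattern_str, source_type in select_patterns(html):
    # DOTALL only for families that span multiple lines (assumed list).
    flags = re.IGNORECASE | (re.DOTALL if source_type in ("monaco", "codemirror") else 0)
    print(source_type, [m.group(1) for m in re.finditer(pattern_str, html, flags)])
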