From 476fa57061ad61e0349dce29d0c469e073aa6c8e Mon Sep 17 00:00:00 2001 From: leex279 Date: Thu, 6 Nov 2025 00:39:37 +0100 Subject: [PATCH] Enhanced fix: Handle Crawl4AI space injection in HTML MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extended the HTML span space injection fix to handle Crawl4AI's markdown generator which adds spaces between span tags when converting HTML to markdown. The issue: Crawl4AI converts HTML like: <span>@</span><span>/</span><span>lib</span> Into markdown with spaces: @ / lib This caused import paths '@/lib/auth' to become '@ / lib / auth' in the database. Solution: - Collapse whitespace around programming punctuation (/, ., :, @, -, >, ?) - Preserve intentional spaces (like "a + b" or "() => {}") - Handle quotes and string delimiters 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../crawling/code_extraction_service.py | 66 +++++++++++++++---- 1 file changed, 52 insertions(+), 14 deletions(-) diff --git a/python/src/server/services/crawling/code_extraction_service.py b/python/src/server/services/crawling/code_extraction_service.py index 0d17d5e7..b7fc8f17 100644 --- a/python/src/server/services/crawling/code_extraction_service.py +++ b/python/src/server/services/crawling/code_extraction_service.py @@ -1280,26 +1280,64 @@ class CodeExtractionService: # First, handle span tags that wrap individual tokens # Check if spans are being used for syntax highlighting by detecting # programming punctuation in/around spans (not just adjacent spans) + + # Check for multiple span tags (strong indicator of syntax highlighting) + # If there are 3+ span tags, it's almost certainly syntax highlighting + span_count = text.count(" " + text_normalized = re.sub(r'\s+', ' ', text) + syntax_highlight_indicators = [ - "/", "/", # Slashes (import paths, URLs) - ".", ".", # Dots (method chaining, paths) - ":", ":", # Colons (URLs, types) - "@", "@", # At-signs (Next.js imports) - "-", "-", # Hyphens (operators, 
kebab-case) - ">", ">", # Greater than (arrows, comparison) - "=", "=", # Equals (assignment, comparison) - "+", "+", # Plus (operators, concatenation) - "*", "*", # Asterisk (multiplication, pointers) - "&", "&", # Ampersand (logical AND, references) - "|", "|", # Pipe (logical OR, union types) - "?", "?", # Question mark (ternary, optional) + " /", "/", " /", # Slashes (import paths, URLs) + ".", ".", " .", # Dots (method chaining, paths) + ":", ":", " :", # Colons (URLs, types) + "@", "@", " @", # At-signs (Next.js imports) + "-", "-", " -", # Hyphens (operators, kebab-case) + ">", ">", # Greater than (arrows, comparison) + "=", "=", " =", # Equals (assignment, comparison) + "+", "+", # Plus (operators, concatenation) + "*", "*", # Asterisk (multiplication, pointers) + "&", "&", # Ampersand (logical AND, references) + "|", "|", # Pipe (logical OR, union types) + "?", "?", # Question mark (ternary, optional) ] - is_syntax_highlighted = any(indicator in text for indicator in syntax_highlight_indicators) + is_syntax_highlighted = (span_count >= 3) or any(indicator in text_normalized for indicator in syntax_highlight_indicators) if is_syntax_highlighted: # Syntax highlighting detected - remove all spans without adding spaces + # First, collapse whitespace between spans ONLY around programming punctuation + # This fixes Crawl4AI adding spaces: @ -> @ + # But preserves intentional spaces: + -> + + punctuation_patterns = [ + (r'\s+(/)\s*\1\s+(\.)\s*\1\s+(:)\s*\1\s+(@)\s*\1\s+(-)\s*\1\s+(>)\s*\1\s+(\?)\s*\1\s+]*>(/)', r'\1'), + (r'\s+]*>(\.)', r'\1'), + (r'\s+]*>(:)', r'\1'), + (r'\s+]*>(@)', r'\1'), + (r'(/)\s+\s+\s+\s+\s+]*>(['"``])''', r"\1"), + (r'''(['"``])\s+", "", text) text = re.sub(r"]*>", "", text) else: