From 52cb47e8a464667d5b6f167511365372626562c4 Mon Sep 17 00:00:00 2001
From: leex279 <thomas@thirty3.de>
Date: Thu, 6 Nov 2025 00:22:30 +0100
Subject: [PATCH] Fix HTML span space injection in code extraction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enhanced _decode_html_entities() to detect syntax highlighting by checking
for programming punctuation in/around spans, not just adjacent spans.

This fixes unwanted space injection in:
- Import paths: @/lib/db (was becoming "@/ lib / db")
- URLs: https://example.com/path (was becoming "https://example.com/ path")
- Method chains: object.method().chain() (was becoming "object. method(). chain()")

Changes:
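- Added 12 punctuation-based syntax highlighting indicators (/, ., :, @, -, >, =, +, *, &, |, ?),
  each checked on both sides of a span boundary, alongside the existing adjacent-span check (13 in total)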
- Enhanced regex with negative lookahead to prevent space insertion near punctuation
- Maintains backward compatibility for non-syntax-highlighted HTML

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../crawling/code_extraction_service.py       | 36 +++++++++++++++----
 1 file changed, 30 insertions(+), 6 deletions(-)
diff --git a/python/src/server/services/crawling/code_extraction_service.py b/python/src/server/services/crawling/code_extraction_service.py
index b1705b02..0d17d5e7 100644
--- a/python/src/server/services/crawling/code_extraction_service.py
+++ b/python/src/server/services/crawling/code_extraction_service.py
@@ -1278,15 +1278,39 @@ class CodeExtractionService:
         import re
 
         # First, handle span tags that wrap individual tokens
-        # Check if spans are being used for syntax highlighting (no spaces between tags)
-        if "</span><span" in text:
-            # This indicates syntax highlighting - preserve the structure
+        # Check if spans are being used for syntax highlighting by detecting
+        # programming punctuation in/around spans (not just adjacent spans)
+        syntax_highlight_indicators = [
+            "</span><span",      # Adjacent spans (most common pattern)
+            "</span>/", "/</span>",   # Slashes (import paths, URLs)
+            "</span>.", ".</span>",   # Dots (method chaining, paths)
+            "</span>:", ":</span>",   # Colons (URLs, types)
+            "</span>@", "@</span>",   # At-signs (Next.js imports)
+            "</span>-", "-</span>",   # Hyphens (operators, kebab-case)
+            "</span>>", "></span>",   # Greater than (arrows, comparison)
+            "</span>=", "=</span>",   # Equals (assignment, comparison)
+            "</span>+", "+</span>",   # Plus (operators, concatenation)
+            "</span>*", "*</span>",   # Asterisk (multiplication, pointers)
+            "</span>&", "&</span>",   # Ampersand (logical AND, references)
+            "</span>|", "|</span>",   # Pipe (logical OR, union types)
+            "</span>?", "?</span>",   # Question mark (ternary, optional)
+        ]
+
+        is_syntax_highlighted = any(indicator in text for indicator in syntax_highlight_indicators)
+
+        if is_syntax_highlighted:
+            # Syntax highlighting detected - remove all spans without adding spaces
             text = re.sub(r"</span>", "", text)
             text = re.sub(r"<span[^>]*>", "", text)
         else:
-            # Normal span usage - might need spacing
-            # Only add space if there isn't already whitespace
-            text = re.sub(r"</span>(?=[A-Za-z0-9])", " ", text)
+            # Normal HTML span usage - add spaces only between word boundaries
+            # Use negative lookahead to NEVER add space before:
+            # - whitespace, tags, or programming punctuation
+            text = re.sub(
+                r"</span>(?!\s|<|/|\.|\:|\@|\-|\>|\=|\+|\*|\&|\||\?|\$)(?=[A-Za-z0-9])",
+                " ",
+                text
+            )
             text = re.sub(r"<span[^>]*>", "", text)
 
         # Remove any other HTML tags but preserve their content