From 52cb47e8a464667d5b6f167511365372626562c4 Mon Sep 17 00:00:00 2001 From: leex279 Date: Thu, 6 Nov 2025 00:22:30 +0100 Subject: [PATCH] Fix HTML span space injection in code extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhanced _decode_html_entities() to detect syntax highlighting by checking for programming punctuation in/around spans, not just adjacent spans. This fixes unwanted space injection in: - Import paths: @/lib/db (was becoming "@/ lib / db") - URLs: https://example.com/path (was becoming "https://example.com/ path") - Method chains: object.method().chain() (was becoming "object. method(). chain()") Changes: - Added 13 syntax highlighting indicators (/, ., :, @, -, >, =, +, *, &, |, ?) - Enhanced regex with negative lookahead to prevent space insertion near punctuation - Maintains backward compatibility for non-syntax-highlighted HTML 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../crawling/code_extraction_service.py | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/python/src/server/services/crawling/code_extraction_service.py b/python/src/server/services/crawling/code_extraction_service.py index b1705b02..0d17d5e7 100644 --- a/python/src/server/services/crawling/code_extraction_service.py +++ b/python/src/server/services/crawling/code_extraction_service.py @@ -1278,15 +1278,39 @@ class CodeExtractionService: import re # First, handle span tags that wrap individual tokens - # Check if spans are being used for syntax highlighting (no spaces between tags) - if "/", "/", # Slashes (import paths, URLs) + ".", ".", # Dots (method chaining, paths) + ":", ":", # Colons (URLs, types) + "@", "@", # At-signs (Next.js imports) + "-", "-", # Hyphens (operators, kebab-case) + ">", ">", # Greater than (arrows, comparison) + "=", "=", # Equals (assignment, comparison) + "+", "+", # Plus (operators, concatenation) + "*", "*", # Asterisk (multiplication, pointers) + "&", "&", # Ampersand (logical AND, references) + "|", "|", # Pipe (logical OR, union types) + "?", "?", # Question mark (ternary, optional) + ] + + is_syntax_highlighted = any(indicator in text for indicator in syntax_highlight_indicators) + + if is_syntax_highlighted: + # Syntax highlighting detected - remove all spans without adding spaces text = re.sub(r"", "", text) text = re.sub(r"]*>", "", text) else: - # Normal span usage - might need spacing - # Only add space if there isn't already whitespace - text = re.sub(r"(?=[A-Za-z0-9])", " ", text) + # Normal HTML span usage - add spaces only between word boundaries + # Use negative lookahead to NEVER add space before: + # - whitespace, tags, or programming punctuation + text = re.sub( + r"(?!\s|<|/|\.|\:|\@|\-|\>|\=|\+|\*|\&|\||\?|\$)(?=[A-Za-z0-9])", + " ", + text + ) text = re.sub(r"]*>", "", text) # Remove any other HTML tags but preserve their content