From 4a33319949d1916855672391c9a7c0109cdf9d02 Mon Sep 17 00:00:00 2001
From: leex279 <thomas@thirty3.de>
Date: Thu, 6 Nov 2025 09:01:37 +0100
Subject: [PATCH] Add comprehensive debug logging for space injection issue

Added debug logs at key points in the pipeline:
- Crawl4AI markdown output (before content_fixer)
- content_fixer input/output (before/after fixes)
- Code extraction markdown input (what's stored in DB)
- HTML entity decoding input/output (span removal)

This will help trace where spurious spaces (e.g. 'next / headers') are
introduced or preserved as content moves through the pipeline.
---
 .../crawling/code_extraction_service.py       | 18 +++++++++++++
 .../crawling/helpers/content_fixer.py         | 25 ++++++++++++++++---
 .../crawling/strategies/single_page.py        | 14 +++++++++++
 3 files changed, 54 insertions(+), 3 deletions(-)

diff --git a/python/src/server/services/crawling/code_extraction_service.py b/python/src/server/services/crawling/code_extraction_service.py
index b7fc8f17..b613d351 100644
--- a/python/src/server/services/crawling/code_extraction_service.py
+++ b/python/src/server/services/crawling/code_extraction_service.py
@@ -302,6 +302,15 @@ class CodeExtractionService:
                     f"Document content check | url={source_url} | has_html={bool(html_content)} | has_markdown={bool(md)} | html_len={len(html_content) if html_content else 0} | md_len={len(md) if md else 0}"
                 )
 
+                # DEBUG: Check for spaces in markdown code blocks
+                if md and "```" in md:
+                    code_start = md.find('```')
+                    if code_start >= 0:
+                        code_sample = md[code_start:code_start+400]
+                        if ' / ' in code_sample or ' - ' in code_sample:
+                            safe_logfire_info(f"⚠️ DEBUG CODE EXTRACTION: Found spaces in markdown for {source_url}")
+                            safe_logfire_info(f"📝 DEBUG: Markdown sample with spaces: {code_sample[:200]}")
+
                 # Get dynamic minimum length based on document context
 
                 # Check markdown first to see if it has code blocks
@@ -1277,6 +1286,11 @@ class CodeExtractionService:
         """Decode common HTML entities and clean HTML tags from code."""
         import re
 
+        # DEBUG: Log if we're processing HTML with spaces
+        if '<span' in text and (' / ' in text or ' - ' in text):
+            safe_logfire_info(f"🔍 DEBUG _decode_html_entities: Processing HTML with spaces")
+            safe_logfire_info(f"📝 DEBUG: Input text sample (first 300 chars): {text[:300]}")
+
         # First, handle span tags that wrap individual tokens
         # Check if spans are being used for syntax highlighting by detecting
         # programming punctuation in/around spans (not just adjacent spans)
@@ -1387,6 +1401,10 @@ class CodeExtractionService:
 
         text = "\n".join(cleaned_lines)
 
+        # DEBUG: Log output if its first 300 chars still contain a <span tag or a spaced ' / ' alongside 'import'
+        if '<span' in text[:300] or (' / ' in text[:300] and 'import' in text[:300]):
+            safe_logfire_info(f"✅ DEBUG _decode_html_entities: Output text sample (first 300 chars): {text[:300]}")
+
         return text
 
     def _clean_code_content(self, code: str, language: str = "") -> str:
diff --git a/python/src/server/services/crawling/helpers/content_fixer.py b/python/src/server/services/crawling/helpers/content_fixer.py
index 399bfff0..c70d2c29 100644
--- a/python/src/server/services/crawling/helpers/content_fixer.py
+++ b/python/src/server/services/crawling/helpers/content_fixer.py
@@ -5,6 +5,10 @@ Handles post-processing of content from Crawl4AI to fix known issues.
 """
 import re
 
+from ....config.logfire_config import get_logger
+
+logger = get_logger(__name__)
+
 
 def fix_code_span_spaces(markdown: str) -> str:
     """
@@ -32,7 +36,13 @@ def fix_code_span_spaces(markdown: str) -> str:
 
     def fix_code_block(match):
         language = match.group(1) or ''
-        code = match.group(2)
+        code_before = match.group(2)
+        code = code_before
+
+        # DEBUG: Log the code block before any fixes
+        if ' / ' in code or ' - ' in code:
+            logger.info(f"🔍 DEBUG: Found code block with spaces to fix (lang={language})")
+            logger.info(f"📝 DEBUG: Code BEFORE fixes (first 200 chars): {code[:200]}")
 
         # Fix import/require paths: 'next / headers' -> 'next/headers'
         code = re.sub(r"'([^']*?)\s+/\s+([^']*?)'", r"'\1/\2'", code)
@@ -54,9 +64,18 @@ def fix_code_span_spaces(markdown: str) -> str:
         code = re.sub(r"'([a-z]+)\s+-\s+([a-z]+)'", r"'\1-\2'", code)
         code = re.sub(r'"([a-z]+)\s+-\s+([a-z]+)"', r'"\1-\2"', code)
 
+        # DEBUG: Log the code block after fixes if it changed
+        if code != code_before:
+            logger.info(f"✅ DEBUG: Code AFTER fixes (first 200 chars): {code[:200]}")
+
         return f'```{language}\n{code}\n```'
 
     # Process all code blocks
-    markdown = re.sub(code_block_pattern, fix_code_block, markdown, flags=re.DOTALL)
+    fixed_markdown = re.sub(code_block_pattern, fix_code_block, markdown, flags=re.DOTALL)
 
-    return markdown
+    # DEBUG: Log summary if changes were made
+    if fixed_markdown != markdown:
+        changes = len(markdown) - len(fixed_markdown)
+        logger.info(f"✨ DEBUG: Content fixer made changes (size diff: {changes} chars)")
+
+    return fixed_markdown
diff --git a/python/src/server/services/crawling/strategies/single_page.py b/python/src/server/services/crawling/strategies/single_page.py
index 6bba8cfc..6ee92871 100644
--- a/python/src/server/services/crawling/strategies/single_page.py
+++ b/python/src/server/services/crawling/strategies/single_page.py
@@ -305,8 +305,22 @@ class SinglePageCrawlStrategy:
                     processed_pages=1
                 )
 
+                # DEBUG: Log markdown from Crawl4AI before fixing
+                if result.markdown and ('```' in result.markdown):
+                    logger.info(f"🌐 DEBUG: Crawl4AI returned markdown for {original_url}")
+                    # Find first code block to sample
+                    code_start = result.markdown.find('```')
+                    if code_start >= 0:
+                        code_sample = result.markdown[code_start:code_start+300]
+                        logger.info(f"📄 DEBUG: Raw markdown sample from Crawl4AI: {code_sample}")
+
                 # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds
                 cleaned_markdown = fix_code_span_spaces(result.markdown)
+
+                # DEBUG: Log after fixing
+                if cleaned_markdown != result.markdown:
+                    logger.info(f"🔧 DEBUG: Markdown was modified by content_fixer for {original_url}")
+
                 return [{'url': original_url, 'markdown': cleaned_markdown, 'html': result.html}]
             else:
                 logger.error(f"Failed to crawl {url}: {result.error_message}")