Mirror of https://github.com/coleam00/Archon.git, synced 2025-12-23 18:29:18 -05:00
Add comprehensive debug logging for space injection issue
Added debug logs at key points in the pipeline:
- Crawl4AI markdown output (before content_fixer)
- content_fixer input/output (before/after fixes)
- Code extraction markdown input (what's stored in DB)
- HTML entity decoding input/output (span removal)

This will help trace where spaces are being added/preserved in the content.
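For context on the symptom being traced: when HTML that uses <span> tags for syntax highlighting is flattened to text with a whitespace separator, a token such as 'next/headers' comes out as 'next / headers'. Below is a minimal sketch of that effect, assuming beautifulsoup4 is available; the exact Crawl4AI/BeautifulSoup code path is not shown in this commit.

# Illustration only: a space separator between syntax-highlighting spans
# injects spaces into code tokens. Not the actual Crawl4AI extraction code.
from bs4 import BeautifulSoup

html = "<code><span>'</span><span>next</span><span>/</span><span>headers</span><span>'</span></code>"
soup = BeautifulSoup(html, "html.parser")
print(soup.get_text(" "))  # prints: ' next / headers '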
@@ -302,6 +302,15 @@ class CodeExtractionService:
             f"Document content check | url={source_url} | has_html={bool(html_content)} | has_markdown={bool(md)} | html_len={len(html_content) if html_content else 0} | md_len={len(md) if md else 0}"
         )
+
+        # DEBUG: Check for spaces in markdown code blocks
+        if md and "```" in md:
+            code_start = md.find('```')
+            if code_start >= 0:
+                code_sample = md[code_start:code_start+400]
+                if ' / ' in code_sample or ' - ' in code_sample:
+                    safe_logfire_info(f"⚠️ DEBUG CODE EXTRACTION: Found spaces in markdown for {source_url}")
+                    safe_logfire_info(f"📝 DEBUG: Markdown sample with spaces: {code_sample[:200]}")
 
         # Get dynamic minimum length based on document context
 
         # Check markdown first to see if it has code blocks
@@ -1277,6 +1286,11 @@ class CodeExtractionService:
         """Decode common HTML entities and clean HTML tags from code."""
         import re
 
+        # DEBUG: Log if we're processing HTML with spaces
+        if '<span' in text and (' / ' in text or ' - ' in text):
+            safe_logfire_info(f"🔍 DEBUG _decode_html_entities: Processing HTML with spaces")
+            safe_logfire_info(f"📝 DEBUG: Input text sample (first 300 chars): {text[:300]}")
+
         # First, handle span tags that wrap individual tokens
         # Check if spans are being used for syntax highlighting by detecting
         # programming punctuation in/around spans (not just adjacent spans)
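The span handling referenced in these comments is outside the hunk; as an illustration of the general technique (not the actual _decode_html_entities implementation), stripping the tags in place, with no separator, keeps the surrounding characters adjacent and avoids injecting spaces:

import re

def strip_span_tags(text: str) -> str:
    # Illustrative sketch: remove <span ...> and </span> tags without adding
    # any whitespace, so code tokens stay intact.
    return re.sub(r'</?span[^>]*>', '', text)

print(strip_span_tags("<span class=\"s\">'</span><span>next</span><span>/</span><span>headers</span>'"))
# prints: 'next/headers'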
@@ -1387,6 +1401,10 @@ class CodeExtractionService:
 
         text = "\n".join(cleaned_lines)
 
+        # DEBUG: Log output if we processed HTML with spaces
+        if '<span' in text[:300] or (' / ' in text[:300] and 'import' in text[:300]):
+            safe_logfire_info(f"✅ DEBUG _decode_html_entities: Output text sample (first 300 chars): {text[:300]}")
+
         return text
 
     def _clean_code_content(self, code: str, language: str = "") -> str:
@@ -5,6 +5,10 @@ Handles post-processing of content from Crawl4AI to fix known issues.
 """
 import re
 
+from ....config.logfire_config import get_logger
+
+logger = get_logger(__name__)
+
 
 def fix_code_span_spaces(markdown: str) -> str:
     """
@@ -32,7 +36,13 @@ def fix_code_span_spaces(markdown: str) -> str:
|
||||
|
||||
def fix_code_block(match):
|
||||
language = match.group(1) or ''
|
||||
code = match.group(2)
|
||||
code_before = match.group(2)
|
||||
code = code_before
|
||||
|
||||
# DEBUG: Log the code block before any fixes
|
||||
if ' / ' in code or ' - ' in code:
|
||||
logger.info(f"🔍 DEBUG: Found code block with spaces to fix (lang={language})")
|
||||
logger.info(f"📝 DEBUG: Code BEFORE fixes (first 200 chars): {code[:200]}")
|
||||
|
||||
# Fix import/require paths: 'next / headers' -> 'next/headers'
|
||||
code = re.sub(r"'([^']*?)\s+/\s+([^']*?)'", r"'\1/\2'", code)
|
||||
@@ -54,9 +64,18 @@ def fix_code_span_spaces(markdown: str) -> str:
|
||||
code = re.sub(r"'([a-z]+)\s+-\s+([a-z]+)'", r"'\1-\2'", code)
|
||||
code = re.sub(r'"([a-z]+)\s+-\s+([a-z]+)"', r'"\1-\2"', code)
|
||||
|
||||
# DEBUG: Log the code block after fixes if it changed
|
||||
if code != code_before:
|
||||
logger.info(f"✅ DEBUG: Code AFTER fixes (first 200 chars): {code[:200]}")
|
||||
|
||||
return f'```{language}\n{code}\n```'
|
||||
|
||||
# Process all code blocks
|
||||
markdown = re.sub(code_block_pattern, fix_code_block, markdown, flags=re.DOTALL)
|
||||
fixed_markdown = re.sub(code_block_pattern, fix_code_block, markdown, flags=re.DOTALL)
|
||||
|
||||
return markdown
|
||||
# DEBUG: Log summary if changes were made
|
||||
if fixed_markdown != markdown:
|
||||
changes = len(markdown) - len(fixed_markdown)
|
||||
logger.info(f"✨ DEBUG: Content fixer made changes (size diff: {changes} chars)")
|
||||
|
||||
return fixed_markdown
|
||||
|
||||
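As a quick standalone check of the substitutions in this hunk (the same patterns applied outside the fix_code_span_spaces wiring; the sample strings are made up):

import re

code = "import { headers } from 'next / headers'\nconst pkg = 'react - dom'"
code = re.sub(r"'([^']*?)\s+/\s+([^']*?)'", r"'\1/\2'", code)   # 'next / headers' -> 'next/headers'
code = re.sub(r"'([a-z]+)\s+-\s+([a-z]+)'", r"'\1-\2'", code)   # 'react - dom' -> 'react-dom'
print(code)
# import { headers } from 'next/headers'
# const pkg = 'react-dom'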
@@ -305,8 +305,22 @@ class SinglePageCrawlStrategy:
                 processed_pages=1
             )
+
+            # DEBUG: Log markdown from Crawl4AI before fixing
+            if result.markdown and ('```' in result.markdown):
+                logger.info(f"🌐 DEBUG: Crawl4AI returned markdown for {original_url}")
+                # Find first code block to sample
+                code_start = result.markdown.find('```')
+                if code_start >= 0:
+                    code_sample = result.markdown[code_start:code_start+300]
+                    logger.info(f"📄 DEBUG: Raw markdown sample from Crawl4AI: {code_sample}")
 
             # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds
             cleaned_markdown = fix_code_span_spaces(result.markdown)
 
+            # DEBUG: Log after fixing
+            if cleaned_markdown != result.markdown:
+                logger.info(f"🔧 DEBUG: Markdown was modified by content_fixer for {original_url}")
+
             return [{'url': original_url, 'markdown': cleaned_markdown, 'html': result.html}]
         else:
             logger.error(f"Failed to crawl {url}: {result.error_message}")