From 4a33319949d1916855672391c9a7c0109cdf9d02 Mon Sep 17 00:00:00 2001 From: leex279 Date: Thu, 6 Nov 2025 09:01:37 +0100 Subject: [PATCH] Add comprehensive debug logging for space injection issue Added debug logs at key points in the pipeline: - Crawl4AI markdown output (before content_fixer) - content_fixer input/output (before/after fixes) - Code extraction markdown input (what's stored in DB) - HTML entity decoding input/output (span removal) This will help trace where spaces are being added/preserved in the content. --- .../crawling/code_extraction_service.py | 18 +++++++++++++ .../crawling/helpers/content_fixer.py | 25 ++++++++++++++++--- .../crawling/strategies/single_page.py | 14 +++++++++++ 3 files changed, 54 insertions(+), 3 deletions(-) diff --git a/python/src/server/services/crawling/code_extraction_service.py b/python/src/server/services/crawling/code_extraction_service.py index b7fc8f17..b613d351 100644 --- a/python/src/server/services/crawling/code_extraction_service.py +++ b/python/src/server/services/crawling/code_extraction_service.py @@ -302,6 +302,15 @@ class CodeExtractionService: f"Document content check | url={source_url} | has_html={bool(html_content)} | has_markdown={bool(md)} | html_len={len(html_content) if html_content else 0} | md_len={len(md) if md else 0}" ) + # DEBUG: Check for spaces in markdown code blocks + if md and "```" in md: + code_start = md.find('```') + if code_start >= 0: + code_sample = md[code_start:code_start+400] + if ' / ' in code_sample or ' - ' in code_sample: + safe_logfire_info(f"⚠️ DEBUG CODE EXTRACTION: Found spaces in markdown for {source_url}") + safe_logfire_info(f"📝 DEBUG: Markdown sample with spaces: {code_sample[:200]}") + # Get dynamic minimum length based on document context # Check markdown first to see if it has code blocks @@ -1277,6 +1286,11 @@ class CodeExtractionService: """Decode common HTML entities and clean HTML tags from code.""" import re + # DEBUG: Log if we're processing HTML with spaces + if ' str: diff --git a/python/src/server/services/crawling/helpers/content_fixer.py b/python/src/server/services/crawling/helpers/content_fixer.py index 399bfff0..c70d2c29 100644 --- a/python/src/server/services/crawling/helpers/content_fixer.py +++ b/python/src/server/services/crawling/helpers/content_fixer.py @@ -5,6 +5,10 @@ Handles post-processing of content from Crawl4AI to fix known issues. """ import re +from ....config.logfire_config import get_logger + +logger = get_logger(__name__) + def fix_code_span_spaces(markdown: str) -> str: """ @@ -32,7 +36,13 @@ def fix_code_span_spaces(markdown: str) -> str: def fix_code_block(match): language = match.group(1) or '' - code = match.group(2) + code_before = match.group(2) + code = code_before + + # DEBUG: Log the code block before any fixes + if ' / ' in code or ' - ' in code: + logger.info(f"🔍 DEBUG: Found code block with spaces to fix (lang={language})") + logger.info(f"📝 DEBUG: Code BEFORE fixes (first 200 chars): {code[:200]}") # Fix import/require paths: 'next / headers' -> 'next/headers' code = re.sub(r"'([^']*?)\s+/\s+([^']*?)'", r"'\1/\2'", code) @@ -54,9 +64,18 @@ def fix_code_span_spaces(markdown: str) -> str: code = re.sub(r"'([a-z]+)\s+-\s+([a-z]+)'", r"'\1-\2'", code) code = re.sub(r'"([a-z]+)\s+-\s+([a-z]+)"', r'"\1-\2"', code) + # DEBUG: Log the code block after fixes if it changed + if code != code_before: + logger.info(f"✅ DEBUG: Code AFTER fixes (first 200 chars): {code[:200]}") + return f'```{language}\n{code}\n```' # Process all code blocks - markdown = re.sub(code_block_pattern, fix_code_block, markdown, flags=re.DOTALL) + fixed_markdown = re.sub(code_block_pattern, fix_code_block, markdown, flags=re.DOTALL) - return markdown + # DEBUG: Log summary if changes were made + if fixed_markdown != markdown: + changes = len(markdown) - len(fixed_markdown) + logger.info(f"✨ DEBUG: Content fixer made changes (size diff: {changes} chars)") + + return fixed_markdown diff --git a/python/src/server/services/crawling/strategies/single_page.py b/python/src/server/services/crawling/strategies/single_page.py index 6bba8cfc..6ee92871 100644 --- a/python/src/server/services/crawling/strategies/single_page.py +++ b/python/src/server/services/crawling/strategies/single_page.py @@ -305,8 +305,22 @@ class SinglePageCrawlStrategy: processed_pages=1 ) + # DEBUG: Log markdown from Crawl4AI before fixing + if result.markdown and ('```' in result.markdown): + logger.info(f"🌐 DEBUG: Crawl4AI returned markdown for {original_url}") + # Find first code block to sample + code_start = result.markdown.find('```') + if code_start >= 0: + code_sample = result.markdown[code_start:code_start+300] + logger.info(f"📄 DEBUG: Raw markdown sample from Crawl4AI: {code_sample}") + # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds cleaned_markdown = fix_code_span_spaces(result.markdown) + + # DEBUG: Log after fixing + if cleaned_markdown != result.markdown: + logger.info(f"🔧 DEBUG: Markdown was modified by content_fixer for {original_url}") + return [{'url': original_url, 'markdown': cleaned_markdown, 'html': result.html}] else: logger.error(f"Failed to crawl {url}: {result.error_message}")