Add comprehensive debug logging for the space-injection issue

Added debug logs at key points in the pipeline:
- Crawl4AI markdown output (before content_fixer)
- content_fixer input/output (before/after fixes)
- Code extraction markdown input (what's stored in DB)
- HTML entity decoding input/output (span removal)

This will help trace where spaces are being added or preserved in the content.
This commit is contained in:
leex279
2025-11-06 09:01:37 +01:00
parent af46373409
commit 4a33319949
3 changed files with 54 additions and 3 deletions

View File

@@ -302,6 +302,15 @@ class CodeExtractionService:
f"Document content check | url={source_url} | has_html={bool(html_content)} | has_markdown={bool(md)} | html_len={len(html_content) if html_content else 0} | md_len={len(md) if md else 0}" f"Document content check | url={source_url} | has_html={bool(html_content)} | has_markdown={bool(md)} | html_len={len(html_content) if html_content else 0} | md_len={len(md) if md else 0}"
) )
# DEBUG: Check for spaces in markdown code blocks
if md and "```" in md:
code_start = md.find('```')
if code_start >= 0:
code_sample = md[code_start:code_start+400]
if ' / ' in code_sample or ' - ' in code_sample:
safe_logfire_info(f"⚠️ DEBUG CODE EXTRACTION: Found spaces in markdown for {source_url}")
safe_logfire_info(f"📝 DEBUG: Markdown sample with spaces: {code_sample[:200]}")
# Get dynamic minimum length based on document context # Get dynamic minimum length based on document context
# Check markdown first to see if it has code blocks # Check markdown first to see if it has code blocks
@@ -1277,6 +1286,11 @@ class CodeExtractionService:
"""Decode common HTML entities and clean HTML tags from code.""" """Decode common HTML entities and clean HTML tags from code."""
import re import re
# DEBUG: Log if we're processing HTML with spaces
if '<span' in text and (' / ' in text or ' - ' in text):
safe_logfire_info(f"🔍 DEBUG _decode_html_entities: Processing HTML with spaces")
safe_logfire_info(f"📝 DEBUG: Input text sample (first 300 chars): {text[:300]}")
# First, handle span tags that wrap individual tokens # First, handle span tags that wrap individual tokens
# Check if spans are being used for syntax highlighting by detecting # Check if spans are being used for syntax highlighting by detecting
# programming punctuation in/around spans (not just adjacent spans) # programming punctuation in/around spans (not just adjacent spans)
@@ -1387,6 +1401,10 @@ class CodeExtractionService:
text = "\n".join(cleaned_lines) text = "\n".join(cleaned_lines)
# DEBUG: Log output if we processed HTML with spaces
if '<span' in text[:300] or (' / ' in text[:300] and 'import' in text[:300]):
safe_logfire_info(f"✅ DEBUG _decode_html_entities: Output text sample (first 300 chars): {text[:300]}")
return text return text
def _clean_code_content(self, code: str, language: str = "") -> str: def _clean_code_content(self, code: str, language: str = "") -> str:

View File

@@ -5,6 +5,10 @@ Handles post-processing of content from Crawl4AI to fix known issues.
""" """
import re import re
from ....config.logfire_config import get_logger
logger = get_logger(__name__)
def fix_code_span_spaces(markdown: str) -> str: def fix_code_span_spaces(markdown: str) -> str:
""" """
@@ -32,7 +36,13 @@ def fix_code_span_spaces(markdown: str) -> str:
def fix_code_block(match): def fix_code_block(match):
language = match.group(1) or '' language = match.group(1) or ''
code = match.group(2) code_before = match.group(2)
code = code_before
# DEBUG: Log the code block before any fixes
if ' / ' in code or ' - ' in code:
logger.info(f"🔍 DEBUG: Found code block with spaces to fix (lang={language})")
logger.info(f"📝 DEBUG: Code BEFORE fixes (first 200 chars): {code[:200]}")
# Fix import/require paths: 'next / headers' -> 'next/headers' # Fix import/require paths: 'next / headers' -> 'next/headers'
code = re.sub(r"'([^']*?)\s+/\s+([^']*?)'", r"'\1/\2'", code) code = re.sub(r"'([^']*?)\s+/\s+([^']*?)'", r"'\1/\2'", code)
@@ -54,9 +64,18 @@ def fix_code_span_spaces(markdown: str) -> str:
code = re.sub(r"'([a-z]+)\s+-\s+([a-z]+)'", r"'\1-\2'", code) code = re.sub(r"'([a-z]+)\s+-\s+([a-z]+)'", r"'\1-\2'", code)
code = re.sub(r'"([a-z]+)\s+-\s+([a-z]+)"', r'"\1-\2"', code) code = re.sub(r'"([a-z]+)\s+-\s+([a-z]+)"', r'"\1-\2"', code)
# DEBUG: Log the code block after fixes if it changed
if code != code_before:
logger.info(f"✅ DEBUG: Code AFTER fixes (first 200 chars): {code[:200]}")
return f'```{language}\n{code}\n```' return f'```{language}\n{code}\n```'
# Process all code blocks # Process all code blocks
markdown = re.sub(code_block_pattern, fix_code_block, markdown, flags=re.DOTALL) fixed_markdown = re.sub(code_block_pattern, fix_code_block, markdown, flags=re.DOTALL)
return markdown # DEBUG: Log summary if changes were made
if fixed_markdown != markdown:
changes = len(markdown) - len(fixed_markdown)
logger.info(f"✨ DEBUG: Content fixer made changes (size diff: {changes} chars)")
return fixed_markdown

View File

@@ -305,8 +305,22 @@ class SinglePageCrawlStrategy:
processed_pages=1 processed_pages=1
) )
# DEBUG: Log markdown from Crawl4AI before fixing
if result.markdown and ('```' in result.markdown):
logger.info(f"🌐 DEBUG: Crawl4AI returned markdown for {original_url}")
# Find first code block to sample
code_start = result.markdown.find('```')
if code_start >= 0:
code_sample = result.markdown[code_start:code_start+300]
logger.info(f"📄 DEBUG: Raw markdown sample from Crawl4AI: {code_sample}")
# Fix spaces in code paths that Crawl4AI/BeautifulSoup adds # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds
cleaned_markdown = fix_code_span_spaces(result.markdown) cleaned_markdown = fix_code_span_spaces(result.markdown)
# DEBUG: Log after fixing
if cleaned_markdown != result.markdown:
logger.info(f"🔧 DEBUG: Markdown was modified by content_fixer for {original_url}")
return [{'url': original_url, 'markdown': cleaned_markdown, 'html': result.html}] return [{'url': original_url, 'markdown': cleaned_markdown, 'html': result.html}]
else: else:
logger.error(f"Failed to crawl {url}: {result.error_message}") logger.error(f"Failed to crawl {url}: {result.error_message}")