Mirror of https://github.com/coleam00/Archon.git, synced 2025-12-23 18:29:18 -05:00
Add comprehensive debug logging for space injection issue
Added debug logs at key points in the pipeline:
- Crawl4AI markdown output (before content_fixer)
- content_fixer input/output (before/after fixes)
- Code extraction markdown input (what's stored in DB)
- HTML entity decoding input/output (span removal)

This will help trace where spaces are being added/preserved in the content.
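For context on the symptom being traced: when HTML that uses <span> tags for syntax highlighting is flattened to text with a whitespace separator, a token such as 'next/headers' comes out as 'next / headers'. Below is a minimal sketch of that effect, assuming beautifulsoup4 is available; the exact Crawl4AI/BeautifulSoup code path is not shown in this commit.

# Illustration only: a space separator between syntax-highlighting spans
# injects spaces into code tokens. Not the actual Crawl4AI extraction code.
from bs4 import BeautifulSoup

html = "<code><span>'</span><span>next</span><span>/</span><span>headers</span><span>'</span></code>"
soup = BeautifulSoup(html, "html.parser")
print(soup.get_text(" "))  # prints: ' next / headers '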
@@ -302,6 +302,15 @@ class CodeExtractionService:
             f"Document content check | url={source_url} | has_html={bool(html_content)} | has_markdown={bool(md)} | html_len={len(html_content) if html_content else 0} | md_len={len(md) if md else 0}"
         )
+
+        # DEBUG: Check for spaces in markdown code blocks
+        if md and "```" in md:
+            code_start = md.find('```')
+            if code_start >= 0:
+                code_sample = md[code_start:code_start+400]
+                if ' / ' in code_sample or ' - ' in code_sample:
+                    safe_logfire_info(f"⚠️ DEBUG CODE EXTRACTION: Found spaces in markdown for {source_url}")
+                    safe_logfire_info(f"📝 DEBUG: Markdown sample with spaces: {code_sample[:200]}")
 
         # Get dynamic minimum length based on document context
 
         # Check markdown first to see if it has code blocks
@@ -1277,6 +1286,11 @@ class CodeExtractionService:
         """Decode common HTML entities and clean HTML tags from code."""
         import re
 
+        # DEBUG: Log if we're processing HTML with spaces
+        if '<span' in text and (' / ' in text or ' - ' in text):
+            safe_logfire_info(f"🔍 DEBUG _decode_html_entities: Processing HTML with spaces")
+            safe_logfire_info(f"📝 DEBUG: Input text sample (first 300 chars): {text[:300]}")
+
         # First, handle span tags that wrap individual tokens
         # Check if spans are being used for syntax highlighting by detecting
         # programming punctuation in/around spans (not just adjacent spans)
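The span handling referenced in these comments is outside the hunk; as an illustration of the general technique (not the actual _decode_html_entities implementation), stripping the tags in place, with no separator, keeps the surrounding characters adjacent and avoids injecting spaces:

import re

def strip_span_tags(text: str) -> str:
    # Illustrative sketch: remove <span ...> and </span> tags without adding
    # any whitespace, so code tokens stay intact.
    return re.sub(r'</?span[^>]*>', '', text)

print(strip_span_tags("<span class=\"s\">'</span><span>next</span><span>/</span><span>headers</span>'"))
# prints: 'next/headers'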
@@ -1387,6 +1401,10 @@ class CodeExtractionService:
 
         text = "\n".join(cleaned_lines)
 
+        # DEBUG: Log output if we processed HTML with spaces
+        if '<span' in text[:300] or (' / ' in text[:300] and 'import' in text[:300]):
+            safe_logfire_info(f"✅ DEBUG _decode_html_entities: Output text sample (first 300 chars): {text[:300]}")
+
         return text
 
     def _clean_code_content(self, code: str, language: str = "") -> str:
@@ -5,6 +5,10 @@ Handles post-processing of content from Crawl4AI to fix known issues.
 """
 import re
 
+from ....config.logfire_config import get_logger
+
+logger = get_logger(__name__)
+
 
 def fix_code_span_spaces(markdown: str) -> str:
     """
@@ -32,7 +36,13 @@ def fix_code_span_spaces(markdown: str) -> str:
|
||||
|
||||
def fix_code_block(match):
|
||||
language = match.group(1) or ''
|
||||
code = match.group(2)
|
||||
code_before = match.group(2)
|
||||
code = code_before
|
||||
|
||||
# DEBUG: Log the code block before any fixes
|
||||
if ' / ' in code or ' - ' in code:
|
||||
logger.info(f"🔍 DEBUG: Found code block with spaces to fix (lang={language})")
|
||||
logger.info(f"📝 DEBUG: Code BEFORE fixes (first 200 chars): {code[:200]}")
|
||||
|
||||
# Fix import/require paths: 'next / headers' -> 'next/headers'
|
||||
code = re.sub(r"'([^']*?)\s+/\s+([^']*?)'", r"'\1/\2'", code)
|
||||
@@ -54,9 +64,18 @@ def fix_code_span_spaces(markdown: str) -> str:
|
||||
code = re.sub(r"'([a-z]+)\s+-\s+([a-z]+)'", r"'\1-\2'", code)
|
||||
code = re.sub(r'"([a-z]+)\s+-\s+([a-z]+)"', r'"\1-\2"', code)
|
||||
|
||||
# DEBUG: Log the code block after fixes if it changed
|
||||
if code != code_before:
|
||||
logger.info(f"✅ DEBUG: Code AFTER fixes (first 200 chars): {code[:200]}")
|
||||
|
||||
return f'```{language}\n{code}\n```'
|
||||
|
||||
# Process all code blocks
|
||||
markdown = re.sub(code_block_pattern, fix_code_block, markdown, flags=re.DOTALL)
|
||||
fixed_markdown = re.sub(code_block_pattern, fix_code_block, markdown, flags=re.DOTALL)
|
||||
|
||||
return markdown
|
||||
# DEBUG: Log summary if changes were made
|
||||
if fixed_markdown != markdown:
|
||||
changes = len(markdown) - len(fixed_markdown)
|
||||
logger.info(f"✨ DEBUG: Content fixer made changes (size diff: {changes} chars)")
|
||||
|
||||
return fixed_markdown
|
||||
|
||||
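As a quick standalone check of the substitutions in this hunk (the same patterns applied outside the fix_code_span_spaces wiring; the sample strings are made up):

import re

code = "import { headers } from 'next / headers'\nconst pkg = 'react - dom'"
code = re.sub(r"'([^']*?)\s+/\s+([^']*?)'", r"'\1/\2'", code)   # 'next / headers' -> 'next/headers'
code = re.sub(r"'([a-z]+)\s+-\s+([a-z]+)'", r"'\1-\2'", code)   # 'react - dom' -> 'react-dom'
print(code)
# import { headers } from 'next/headers'
# const pkg = 'react-dom'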
@@ -305,8 +305,22 @@ class SinglePageCrawlStrategy:
                 processed_pages=1
             )
+
+            # DEBUG: Log markdown from Crawl4AI before fixing
+            if result.markdown and ('```' in result.markdown):
+                logger.info(f"🌐 DEBUG: Crawl4AI returned markdown for {original_url}")
+                # Find first code block to sample
+                code_start = result.markdown.find('```')
+                if code_start >= 0:
+                    code_sample = result.markdown[code_start:code_start+300]
+                    logger.info(f"📄 DEBUG: Raw markdown sample from Crawl4AI: {code_sample}")
 
             # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds
             cleaned_markdown = fix_code_span_spaces(result.markdown)
 
+            # DEBUG: Log after fixing
+            if cleaned_markdown != result.markdown:
+                logger.info(f"🔧 DEBUG: Markdown was modified by content_fixer for {original_url}")
+
             return [{'url': original_url, 'markdown': cleaned_markdown, 'html': result.html}]
         else:
             logger.error(f"Failed to crawl {url}: {result.error_message}")