Fix Crawl4AI/BeautifulSoup space injection in markdown

Added post-processing to fix spaces that Crawl4AI/BeautifulSoup adds when
extracting text from nested HTML elements. BeautifulSoup's get_text() adds
spaces between nested tags, corrupting code paths in the stored markdown.

Problem: HTML like <span>'next<span>/</span>headers'</span>
gets extracted as: 'next / headers' (spaces added by BeautifulSoup)

Solution:
- Created content_fixer.py helper with fix_code_span_spaces() function
- Applied to all crawling strategies (single_page, batch, recursive)
- Fixes import paths: 'next / headers' -> 'next/headers'
- Fixes @ paths: '@/ lib / auth' -> '@/lib/auth'
- Fixes hyphenated: 'server - only' -> 'server-only'

This fix runs BEFORE content is stored in the database, preventing the
corruption at the source.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
leex279
2025-11-06 08:47:27 +01:00
parent 476fa57061
commit af46373409
4 changed files with 76 additions and 3 deletions

View File

@@ -0,0 +1,62 @@
"""
Content fixing utilities for crawled content.
Handles post-processing of content from Crawl4AI to fix known issues.
"""
import re
def fix_code_span_spaces(markdown: str) -> str:
    """
    Fix spaces inside code spans that Crawl4AI/BeautifulSoup adds when extracting text.
    BeautifulSoup's get_text() adds spaces between nested elements, which corrupts
    code paths and imports like 'next/headers' becoming 'next / headers'.
    Example fixes:
    - 'next / headers' -> 'next/headers'
    - '@/ lib / auth' -> '@/lib/auth'
    - 'server - only' -> 'server-only'
    Args:
        markdown: Markdown content with potential space issues in code blocks
    Returns:
        Cleaned markdown with spaces removed from code paths
    """
    if not markdown:
        return markdown

    # Pattern to match fenced code blocks with optional language specification
    code_block_pattern = r'```(\w+)?\n(.*?)\n```'

    # Quoted-path repairs: spaces around '/' inside single- or double-quoted
    # strings. Applied repeatedly until a fixed point so paths with any number
    # of slashes ('a / b / c / d / ...') are fully cleaned, not just the first
    # few occurrences. Each substitution only removes whitespace, so the string
    # strictly shrinks on every change and the loop is guaranteed to terminate.
    slash_fixes = (
        (r"'([^']*?)\s+/\s+([^']*?)'", r"'\1/\2'"),
        (r'"([^"]*?)\s+/\s+([^"]*?)"', r'"\1/\2"'),
        (r"'([^']*?)/\s+([^']*?)'", r"'\1/\2'"),
        (r'"([^"]*?)/\s+([^"]*?)"', r'"\1/\2"'),
        (r"'([^']*?)\s+/([^']*?)'", r"'\1/\2'"),
        (r'"([^"]*?)\s+/([^"]*?)"', r'"\1/\2"'),
    )

    def fix_code_block(match: "re.Match[str]") -> str:
        """Clean one fenced code block, preserving its language tag."""
        language = match.group(1) or ''
        code = match.group(2)

        # Fix import/require paths until no further change occurs
        # (fixed point instead of an arbitrary iteration cap).
        while True:
            previous = code
            for pattern, repl in slash_fixes:
                code = re.sub(pattern, repl, code)
            if code == previous:
                break

        # Fix @ paths: '@/ lib' -> '@/lib'
        code = re.sub(r"'@\s*/\s+", r"'@/", code)
        code = re.sub(r'"@\s*/\s+', r'"@/', code)

        # Fix server-only and other hyphenated imports: 'server - only' -> 'server-only'
        code = re.sub(r"'([a-z]+)\s+-\s+([a-z]+)'", r"'\1-\2'", code)
        code = re.sub(r'"([a-z]+)\s+-\s+([a-z]+)"', r'"\1-\2"', code)

        return f'```{language}\n{code}\n```'

    # Process all code blocks; DOTALL lets '.' span the multi-line block body
    markdown = re.sub(code_block_pattern, fix_code_block, markdown, flags=re.DOTALL)
    return markdown

View File

@@ -12,6 +12,7 @@ from crawl4ai import CacheMode, CrawlerRunConfig, MemoryAdaptiveDispatcher
from ....config.logfire_config import get_logger
from ...credential_service import credential_service
from ..helpers.content_fixer import fix_code_span_spaces
logger = get_logger(__name__)
@@ -255,9 +256,12 @@ class BatchCrawlStrategy:
if fallback_text:
title = fallback_text
# Fix spaces in code paths that Crawl4AI/BeautifulSoup adds
cleaned_markdown = fix_code_span_spaces(result.markdown.fit_markdown)
successful_results.append({
"url": original_url,
"markdown": result.markdown.fit_markdown,
"markdown": cleaned_markdown,
"html": result.html, # Use raw HTML
"title": title,
})

View File

@@ -13,6 +13,7 @@ from crawl4ai import CacheMode, CrawlerRunConfig, MemoryAdaptiveDispatcher
from ....config.logfire_config import get_logger
from ...credential_service import credential_service
from ..helpers.content_fixer import fix_code_span_spaces
from ..helpers.url_handler import URLHandler
logger = get_logger(__name__)
@@ -289,9 +290,12 @@ class RecursiveCrawlStrategy:
if extracted_title:
title = extracted_title
# Fix spaces in code paths that Crawl4AI/BeautifulSoup adds
cleaned_markdown = fix_code_span_spaces(result.markdown.fit_markdown)
results_all.append({
"url": original_url,
"markdown": result.markdown.fit_markdown,
"markdown": cleaned_markdown,
"html": result.html, # Always use raw HTML for code extraction
"title": title,
})

View File

@@ -11,6 +11,7 @@ from typing import Any
from crawl4ai import CacheMode, CrawlerRunConfig
from ....config.logfire_config import get_logger
from ..helpers.content_fixer import fix_code_span_spaces
logger = get_logger(__name__)
@@ -304,7 +305,9 @@ class SinglePageCrawlStrategy:
processed_pages=1
)
return [{'url': original_url, 'markdown': result.markdown, 'html': result.html}]
# Fix spaces in code paths that Crawl4AI/BeautifulSoup adds
cleaned_markdown = fix_code_span_spaces(result.markdown)
return [{'url': original_url, 'markdown': cleaned_markdown, 'html': result.html}]
else:
logger.error(f"Failed to crawl {url}: {result.error_message}")
return []