From af463734099c77580e663c6e6e67ea639ec6b7f5 Mon Sep 17 00:00:00 2001 From: leex279 Date: Thu, 6 Nov 2025 08:47:27 +0100 Subject: [PATCH] Fix Crawl4AI/BeautifulSoup space injection in markdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added post-processing to fix spaces that Crawl4AI/BeautifulSoup adds when extracting text from nested HTML elements. BeautifulSoup's get_text() adds spaces between nested tags, corrupting code paths in the stored markdown. Problem: HTML like 'next/headers' Gets extracted as: 'next / headers' (spaces added by BeautifulSoup) Solution: - Created content_fixer.py helper with fix_code_span_spaces() function - Applied to all crawling strategies (single_page, batch, recursive) - Fixes import paths: 'next / headers' -> 'next/headers' - Fixes @ paths: '@/ lib / auth' -> '@/lib/auth' - Fixes hyphenated: 'server - only' -> 'server-only' This fix runs BEFORE content is stored in the database, preventing the corruption at the source. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../crawling/helpers/content_fixer.py | 62 +++++++++++++++++++ .../services/crawling/strategies/batch.py | 6 +- .../services/crawling/strategies/recursive.py | 6 +- .../crawling/strategies/single_page.py | 5 +- 4 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 python/src/server/services/crawling/helpers/content_fixer.py diff --git a/python/src/server/services/crawling/helpers/content_fixer.py b/python/src/server/services/crawling/helpers/content_fixer.py new file mode 100644 index 00000000..399bfff0 --- /dev/null +++ b/python/src/server/services/crawling/helpers/content_fixer.py @@ -0,0 +1,62 @@ +""" +Content fixing utilities for crawled content. + +Handles post-processing of content from Crawl4AI to fix known issues. 
"""
Content fixing utilities for crawled content.

Handles post-processing of content from Crawl4AI to fix known issues.
"""
import re

# Fenced code block: optional language tag, then the body up to the closing
# fence. DOTALL lets the non-greedy body span multiple lines.
_CODE_BLOCK_RE = re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL)

# (pattern, replacement) pairs applied to code-block bodies only.
# All patterns operate strictly inside single- or double-quoted strings, so
# surrounding code and prose are never rewritten. Compiled once at import
# time instead of being re-built on every call.
_QUOTED_PATH_FIXES: list[tuple[re.Pattern[str], str]] = [
    # "'a / b'" -> "'a/b'" (spaces on both sides of the slash)
    (re.compile(r"'([^']*?)\s+/\s+([^']*?)'"), r"'\1/\2'"),
    (re.compile(r'"([^"]*?)\s+/\s+([^"]*?)"'), r'"\1/\2"'),
    # "'a/ b'" -> "'a/b'" and "'a /b'" -> "'a/b'" (one-sided spaces,
    # which appear after the two-sided fix collapses a chained path)
    (re.compile(r"'([^']*?)/\s+([^']*?)'"), r"'\1/\2'"),
    (re.compile(r'"([^"]*?)/\s+([^"]*?)"'), r'"\1/\2"'),
    (re.compile(r"'([^']*?)\s+/([^']*?)'"), r"'\1/\2'"),
    (re.compile(r'"([^"]*?)\s+/([^"]*?)"'), r'"\1/\2"'),
    # "'@/ lib'" -> "'@/lib'" (alias-prefixed imports)
    (re.compile(r"'@\s*/\s+"), r"'@/"),
    (re.compile(r'"@\s*/\s+'), r'"@/'),
    # "'server - only'" -> "'server-only'" (hyphenated package names)
    (re.compile(r"'([a-z]+)\s+-\s+([a-z]+)'"), r"'\1-\2'"),
    (re.compile(r'"([a-z]+)\s+-\s+([a-z]+)"'), r'"\1-\2"'),
]

# Hard cap on fixpoint passes per code block; the substitutions only ever
# delete whitespace so they converge, but the cap guarantees termination
# even on pathological input.
_MAX_PASSES = 10


def fix_code_span_spaces(markdown: str) -> str:
    """
    Fix spaces inside code spans that Crawl4AI/BeautifulSoup adds when extracting text.

    BeautifulSoup's get_text() adds spaces between nested elements, which corrupts
    code paths and imports like 'next/headers' becoming 'next / headers'.

    Example fixes:
        - 'next / headers'  -> 'next/headers'
        - '@/ lib / auth'   -> '@/lib/auth'
        - 'server - only'   -> 'server-only'

    Only the bodies of fenced code blocks (``` ... ```) are rewritten; prose
    outside code fences is left untouched.

    Args:
        markdown: Markdown content with potential space issues in code blocks.

    Returns:
        Cleaned markdown with spaces removed from quoted code paths. Empty or
        falsy input is returned unchanged.
    """
    if not markdown:
        return markdown

    def _clean_block(match: re.Match) -> str:
        # Preserve the fence's language tag verbatim ('' when absent).
        language = match.group(1) or ''
        code = match.group(2)
        # Apply every fix until the text stabilises so chained paths like
        # 'lib / utils / helper' collapse fully, instead of guessing a
        # fixed number of passes.
        for _ in range(_MAX_PASSES):
            before = code
            for pattern, replacement in _QUOTED_PATH_FIXES:
                code = pattern.sub(replacement, code)
            if code == before:
                break
        return f'```{language}\n{code}\n```'

    return _CODE_BLOCK_RE.sub(_clean_block, markdown)
a/python/src/server/services/crawling/strategies/batch.py b/python/src/server/services/crawling/strategies/batch.py index 16aea020..e8e69bfc 100644 --- a/python/src/server/services/crawling/strategies/batch.py +++ b/python/src/server/services/crawling/strategies/batch.py @@ -12,6 +12,7 @@ from crawl4ai import CacheMode, CrawlerRunConfig, MemoryAdaptiveDispatcher from ....config.logfire_config import get_logger from ...credential_service import credential_service +from ..helpers.content_fixer import fix_code_span_spaces logger = get_logger(__name__) @@ -255,9 +256,12 @@ class BatchCrawlStrategy: if fallback_text: title = fallback_text + # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds + cleaned_markdown = fix_code_span_spaces(result.markdown.fit_markdown) + successful_results.append({ "url": original_url, - "markdown": result.markdown.fit_markdown, + "markdown": cleaned_markdown, "html": result.html, # Use raw HTML "title": title, }) diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py index 3cdee750..a6373126 100644 --- a/python/src/server/services/crawling/strategies/recursive.py +++ b/python/src/server/services/crawling/strategies/recursive.py @@ -13,6 +13,7 @@ from crawl4ai import CacheMode, CrawlerRunConfig, MemoryAdaptiveDispatcher from ....config.logfire_config import get_logger from ...credential_service import credential_service +from ..helpers.content_fixer import fix_code_span_spaces from ..helpers.url_handler import URLHandler logger = get_logger(__name__) @@ -289,9 +290,12 @@ class RecursiveCrawlStrategy: if extracted_title: title = extracted_title + # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds + cleaned_markdown = fix_code_span_spaces(result.markdown.fit_markdown) + results_all.append({ "url": original_url, - "markdown": result.markdown.fit_markdown, + "markdown": cleaned_markdown, "html": result.html, # Always use raw HTML for code extraction 
"title": title, }) diff --git a/python/src/server/services/crawling/strategies/single_page.py b/python/src/server/services/crawling/strategies/single_page.py index 96ea5bb5..6bba8cfc 100644 --- a/python/src/server/services/crawling/strategies/single_page.py +++ b/python/src/server/services/crawling/strategies/single_page.py @@ -11,6 +11,7 @@ from typing import Any from crawl4ai import CacheMode, CrawlerRunConfig from ....config.logfire_config import get_logger +from ..helpers.content_fixer import fix_code_span_spaces logger = get_logger(__name__) @@ -304,7 +305,9 @@ class SinglePageCrawlStrategy: processed_pages=1 ) - return [{'url': original_url, 'markdown': result.markdown, 'html': result.html}] + # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds + cleaned_markdown = fix_code_span_spaces(result.markdown) + return [{'url': original_url, 'markdown': cleaned_markdown, 'html': result.html}] else: logger.error(f"Failed to crawl {url}: {result.error_message}") return []