From af463734099c77580e663c6e6e67ea639ec6b7f5 Mon Sep 17 00:00:00 2001 From: leex279 Date: Thu, 6 Nov 2025 08:47:27 +0100 Subject: [PATCH] Fix Crawl4AI/BeautifulSoup space injection in markdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added post-processing to fix spaces that Crawl4AI/BeautifulSoup adds when extracting text from nested HTML elements. BeautifulSoup's get_text() adds spaces between nested tags, corrupting code paths in the stored markdown. Problem: HTML like 'next/headers' Gets extracted as: 'next / headers' (spaces added by BeautifulSoup) Solution: - Created content_fixer.py helper with fix_code_span_spaces() function - Applied to all crawling strategies (single_page, batch, recursive) - Fixes import paths: 'next / headers' -> 'next/headers' - Fixes @ paths: '@/ lib / auth' -> '@/lib/auth' - Fixes hyphenated: 'server - only' -> 'server-only' This fix runs BEFORE content is stored in the database, preventing the corruption at the source. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../crawling/helpers/content_fixer.py | 62 +++++++++++++++++++ .../services/crawling/strategies/batch.py | 6 +- .../services/crawling/strategies/recursive.py | 6 +- .../crawling/strategies/single_page.py | 5 +- 4 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 python/src/server/services/crawling/helpers/content_fixer.py diff --git a/python/src/server/services/crawling/helpers/content_fixer.py b/python/src/server/services/crawling/helpers/content_fixer.py new file mode 100644 index 00000000..399bfff0 --- /dev/null +++ b/python/src/server/services/crawling/helpers/content_fixer.py @@ -0,0 +1,62 @@ +""" +Content fixing utilities for crawled content. + +Handles post-processing of content from Crawl4AI to fix known issues. 
"""
Content fixing utilities for crawled content.

Handles post-processing of content from Crawl4AI to fix known issues.
"""
import re

# Fenced code block: optional language tag, then the body up to the closing
# fence. DOTALL lets the non-greedy body span multiple lines.
_CODE_BLOCK_RE = re.compile(r'```(\w+)?\n(.*?)\n```', re.DOTALL)

# (pattern, replacement) pairs applied to code-block bodies only.
# All patterns operate strictly inside single- or double-quoted strings, so
# surrounding code and prose are never rewritten. Compiled once at import
# time instead of being re-built on every call.
_QUOTED_PATH_FIXES: list[tuple[re.Pattern[str], str]] = [
    # "'a / b'" -> "'a/b'" (spaces on both sides of the slash)
    (re.compile(r"'([^']*?)\s+/\s+([^']*?)'"), r"'\1/\2'"),
    (re.compile(r'"([^"]*?)\s+/\s+([^"]*?)"'), r'"\1/\2"'),
    # "'a/ b'" -> "'a/b'" and "'a /b'" -> "'a/b'" (one-sided spaces,
    # which appear after the two-sided fix collapses a chained path)
    (re.compile(r"'([^']*?)/\s+([^']*?)'"), r"'\1/\2'"),
    (re.compile(r'"([^"]*?)/\s+([^"]*?)"'), r'"\1/\2"'),
    (re.compile(r"'([^']*?)\s+/([^']*?)'"), r"'\1/\2'"),
    (re.compile(r'"([^"]*?)\s+/([^"]*?)"'), r'"\1/\2"'),
    # "'@/ lib'" -> "'@/lib'" (alias-prefixed imports)
    (re.compile(r"'@\s*/\s+"), r"'@/"),
    (re.compile(r'"@\s*/\s+'), r'"@/'),
    # "'server - only'" -> "'server-only'" (hyphenated package names)
    (re.compile(r"'([a-z]+)\s+-\s+([a-z]+)'"), r"'\1-\2'"),
    (re.compile(r'"([a-z]+)\s+-\s+([a-z]+)"'), r'"\1-\2"'),
]

# Hard cap on fixpoint passes per code block; the substitutions only ever
# delete whitespace so they converge, but the cap guarantees termination
# even on pathological input.
_MAX_PASSES = 10


def fix_code_span_spaces(markdown: str) -> str:
    """
    Fix spaces inside code spans that Crawl4AI/BeautifulSoup adds when extracting text.

    BeautifulSoup's get_text() adds spaces between nested elements, which corrupts
    code paths and imports like 'next/headers' becoming 'next / headers'.

    Example fixes:
        - 'next / headers'  -> 'next/headers'
        - '@/ lib / auth'   -> '@/lib/auth'
        - 'server - only'   -> 'server-only'

    Only the bodies of fenced code blocks (``` ... ```) are rewritten; prose
    outside code fences is left untouched.

    Args:
        markdown: Markdown content with potential space issues in code blocks.

    Returns:
        Cleaned markdown with spaces removed from quoted code paths. Empty or
        falsy input is returned unchanged.
    """
    if not markdown:
        return markdown

    def _clean_block(match: re.Match) -> str:
        # Preserve the fence's language tag verbatim ('' when absent).
        language = match.group(1) or ''
        code = match.group(2)
        # Apply every fix until the text stabilises so chained paths like
        # 'lib / utils / helper' collapse fully, instead of guessing a
        # fixed number of passes.
        for _ in range(_MAX_PASSES):
            before = code
            for pattern, replacement in _QUOTED_PATH_FIXES:
                code = pattern.sub(replacement, code)
            if code == before:
                break
        return f'```{language}\n{code}\n```'

    return _CODE_BLOCK_RE.sub(_clean_block, markdown)
a/python/src/server/services/crawling/strategies/batch.py b/python/src/server/services/crawling/strategies/batch.py index 16aea020..e8e69bfc 100644 --- a/python/src/server/services/crawling/strategies/batch.py +++ b/python/src/server/services/crawling/strategies/batch.py @@ -12,6 +12,7 @@ from crawl4ai import CacheMode, CrawlerRunConfig, MemoryAdaptiveDispatcher from ....config.logfire_config import get_logger from ...credential_service import credential_service +from ..helpers.content_fixer import fix_code_span_spaces logger = get_logger(__name__) @@ -255,9 +256,12 @@ class BatchCrawlStrategy: if fallback_text: title = fallback_text + # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds + cleaned_markdown = fix_code_span_spaces(result.markdown.fit_markdown) + successful_results.append({ "url": original_url, - "markdown": result.markdown.fit_markdown, + "markdown": cleaned_markdown, "html": result.html, # Use raw HTML "title": title, }) diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py index 3cdee750..a6373126 100644 --- a/python/src/server/services/crawling/strategies/recursive.py +++ b/python/src/server/services/crawling/strategies/recursive.py @@ -13,6 +13,7 @@ from crawl4ai import CacheMode, CrawlerRunConfig, MemoryAdaptiveDispatcher from ....config.logfire_config import get_logger from ...credential_service import credential_service +from ..helpers.content_fixer import fix_code_span_spaces from ..helpers.url_handler import URLHandler logger = get_logger(__name__) @@ -289,9 +290,12 @@ class RecursiveCrawlStrategy: if extracted_title: title = extracted_title + # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds + cleaned_markdown = fix_code_span_spaces(result.markdown.fit_markdown) + results_all.append({ "url": original_url, - "markdown": result.markdown.fit_markdown, + "markdown": cleaned_markdown, "html": result.html, # Always use raw HTML for code extraction 
"title": title, }) diff --git a/python/src/server/services/crawling/strategies/single_page.py b/python/src/server/services/crawling/strategies/single_page.py index 96ea5bb5..6bba8cfc 100644 --- a/python/src/server/services/crawling/strategies/single_page.py +++ b/python/src/server/services/crawling/strategies/single_page.py @@ -11,6 +11,7 @@ from typing import Any from crawl4ai import CacheMode, CrawlerRunConfig from ....config.logfire_config import get_logger +from ..helpers.content_fixer import fix_code_span_spaces logger = get_logger(__name__) @@ -304,7 +305,9 @@ class SinglePageCrawlStrategy: processed_pages=1 ) - return [{'url': original_url, 'markdown': result.markdown, 'html': result.html}] + # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds + cleaned_markdown = fix_code_span_spaces(result.markdown) + return [{'url': original_url, 'markdown': cleaned_markdown, 'html': result.html}] else: logger.error(f"Failed to crawl {url}: {result.error_message}") return []