Fix Crawl4AI/BeautifulSoup space injection in markdown

Added post-processing to fix spaces that Crawl4AI/BeautifulSoup adds when
extracting text from nested HTML elements. BeautifulSoup's get_text() adds
spaces between nested tags, corrupting code paths in the stored markdown.

Problem: HTML like <span>'next<span>/</span>headers'</span>
gets extracted as: 'next / headers' (spaces added by BeautifulSoup)

Solution:
- Created content_fixer.py helper with fix_code_span_spaces() function
- Applied to all crawling strategies (single_page, batch, recursive)
- Fixes import paths: 'next / headers' -> 'next/headers'
- Fixes @ paths: '@/ lib / auth' -> '@/lib/auth'
- Fixes hyphenated: 'server - only' -> 'server-only'

This fix runs BEFORE content is stored in the database, preventing the
corruption at the source.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
leex279
2025-11-06 08:47:27 +01:00
parent 476fa57061
commit af46373409
4 changed files with 76 additions and 3 deletions

View File

@@ -0,0 +1,62 @@
"""
Content fixing utilities for crawled content.
Handles post-processing of content from Crawl4AI to fix known issues.
"""
import re
def fix_code_span_spaces(markdown: str) -> str:
    """
    Fix spaces inside code spans that Crawl4AI/BeautifulSoup adds when extracting text.
    BeautifulSoup's get_text() adds spaces between nested elements, which corrupts
    code paths and imports like 'next/headers' becoming 'next / headers'.
    Example fixes:
    - 'next / headers' -> 'next/headers'
    - '@/ lib / auth' -> '@/lib/auth'
    - 'server - only' -> 'server-only'
    Args:
        markdown: Markdown content with potential space issues in code blocks
    Returns:
        Cleaned markdown with spaces removed from code paths
    """
    if not markdown:
        return markdown

    # Pattern to match fenced code blocks with optional language specification
    code_block_pattern = r'```(\w+)?\n(.*?)\n```'

    # Quoted-path repairs: spaces around '/' inside single- or double-quoted
    # strings. Applied repeatedly until a fixed point so paths with any number
    # of slashes ('a / b / c / d / ...') are fully cleaned, not just the first
    # few occurrences. Each substitution only removes whitespace, so the string
    # strictly shrinks on every change and the loop is guaranteed to terminate.
    slash_fixes = (
        (r"'([^']*?)\s+/\s+([^']*?)'", r"'\1/\2'"),
        (r'"([^"]*?)\s+/\s+([^"]*?)"', r'"\1/\2"'),
        (r"'([^']*?)/\s+([^']*?)'", r"'\1/\2'"),
        (r'"([^"]*?)/\s+([^"]*?)"', r'"\1/\2"'),
        (r"'([^']*?)\s+/([^']*?)'", r"'\1/\2'"),
        (r'"([^"]*?)\s+/([^"]*?)"', r'"\1/\2"'),
    )

    def fix_code_block(match: "re.Match[str]") -> str:
        """Clean one fenced code block, preserving its language tag."""
        language = match.group(1) or ''
        code = match.group(2)

        # Fix import/require paths until no further change occurs
        # (fixed point instead of an arbitrary iteration cap).
        while True:
            previous = code
            for pattern, repl in slash_fixes:
                code = re.sub(pattern, repl, code)
            if code == previous:
                break

        # Fix @ paths: '@/ lib' -> '@/lib'
        code = re.sub(r"'@\s*/\s+", r"'@/", code)
        code = re.sub(r'"@\s*/\s+', r'"@/', code)

        # Fix server-only and other hyphenated imports: 'server - only' -> 'server-only'
        code = re.sub(r"'([a-z]+)\s+-\s+([a-z]+)'", r"'\1-\2'", code)
        code = re.sub(r'"([a-z]+)\s+-\s+([a-z]+)"', r'"\1-\2"', code)

        return f'```{language}\n{code}\n```'

    # Process all code blocks; DOTALL lets '.' span the multi-line block body
    markdown = re.sub(code_block_pattern, fix_code_block, markdown, flags=re.DOTALL)
    return markdown

View File

@@ -12,6 +12,7 @@ from crawl4ai import CacheMode, CrawlerRunConfig, MemoryAdaptiveDispatcher
from ....config.logfire_config import get_logger
from ...credential_service import credential_service
from ..helpers.content_fixer import fix_code_span_spaces
logger = get_logger(__name__)
@@ -255,9 +256,12 @@ class BatchCrawlStrategy:
if fallback_text:
title = fallback_text
# Fix spaces in code paths that Crawl4AI/BeautifulSoup adds
cleaned_markdown = fix_code_span_spaces(result.markdown.fit_markdown)
successful_results.append({
"url": original_url,
"markdown": result.markdown.fit_markdown,
"markdown": cleaned_markdown,
"html": result.html, # Use raw HTML
"title": title,
})

View File

@@ -13,6 +13,7 @@ from crawl4ai import CacheMode, CrawlerRunConfig, MemoryAdaptiveDispatcher
from ....config.logfire_config import get_logger
from ...credential_service import credential_service
from ..helpers.content_fixer import fix_code_span_spaces
from ..helpers.url_handler import URLHandler
logger = get_logger(__name__)
@@ -289,9 +290,12 @@ class RecursiveCrawlStrategy:
if extracted_title:
title = extracted_title
# Fix spaces in code paths that Crawl4AI/BeautifulSoup adds
cleaned_markdown = fix_code_span_spaces(result.markdown.fit_markdown)
results_all.append({
"url": original_url,
"markdown": result.markdown.fit_markdown,
"markdown": cleaned_markdown,
"html": result.html, # Always use raw HTML for code extraction
"title": title,
})

View File

@@ -11,6 +11,7 @@ from typing import Any
from crawl4ai import CacheMode, CrawlerRunConfig
from ....config.logfire_config import get_logger
from ..helpers.content_fixer import fix_code_span_spaces
logger = get_logger(__name__)
@@ -304,7 +305,9 @@ class SinglePageCrawlStrategy:
processed_pages=1
)
return [{'url': original_url, 'markdown': result.markdown, 'html': result.html}]
# Fix spaces in code paths that Crawl4AI/BeautifulSoup adds
cleaned_markdown = fix_code_span_spaces(result.markdown)
return [{'url': original_url, 'markdown': cleaned_markdown, 'html': result.html}]
else:
logger.error(f"Failed to crawl {url}: {result.error_message}")
return []