Mirror of https://github.com/coleam00/Archon.git, synced 2025-12-24 02:39:17 -05:00
Fix Crawl4AI/BeautifulSoup space injection in markdown
Added post-processing to fix spaces that Crawl4AI/BeautifulSoup adds when extracting text from nested HTML elements. BeautifulSoup's get_text() adds spaces between nested tags, corrupting code paths in the stored markdown.

Problem: HTML like <span>'next<span>/</span>headers'</span> gets extracted as 'next / headers' (spaces added by BeautifulSoup).

Solution:
- Created a content_fixer.py helper with a fix_code_span_spaces() function
- Applied it to all crawling strategies (single_page, batch, recursive)
- Fixes import paths: 'next / headers' -> 'next/headers'
- Fixes @ paths: '@/ lib / auth' -> '@/lib/auth'
- Fixes hyphenated imports: 'server - only' -> 'server-only'

This fix runs BEFORE content is stored in the database, preventing the corruption at the source.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
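For context, a minimal reproduction of the upstream behavior (a sketch assuming a space separator is passed to get_text(), which is what introduces the injected spaces):

```python
from bs4 import BeautifulSoup

html = "<span>'next<span>/</span>headers'</span>"
soup = BeautifulSoup(html, "html.parser")

# The three text nodes "'next", "/", "headers'" are joined with the
# separator, corrupting the import path in the extracted text.
print(soup.get_text(" ", strip=True))  # 'next / headers'
```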
python/src/server/services/crawling/helpers/content_fixer.py (new file, 62 lines)
@@ -0,0 +1,62 @@
"""
Content fixing utilities for crawled content.

Handles post-processing of content from Crawl4AI to fix known issues.
"""

import re


def fix_code_span_spaces(markdown: str) -> str:
    """
    Fix spaces inside code spans that Crawl4AI/BeautifulSoup adds when extracting text.

    BeautifulSoup's get_text() adds spaces between nested elements, which corrupts
    code paths and imports, e.g. 'next/headers' becoming 'next / headers'.

    Example fixes:
    - 'next / headers' -> 'next/headers'
    - '@/ lib / auth' -> '@/lib/auth'
    - 'server - only' -> 'server-only'

    Args:
        markdown: Markdown content with potential space issues in code blocks

    Returns:
        Cleaned markdown with spaces removed from code paths
    """
    if not markdown:
        return markdown

    # Pattern to match fenced code blocks with an optional language specification
    code_block_pattern = r'```(\w+)?\n(.*?)\n```'

    def fix_code_block(match):
        language = match.group(1) or ''
        code = match.group(2)

        # Fix import/require paths: 'next / headers' -> 'next/headers'
        code = re.sub(r"'([^']*?)\s+/\s+([^']*?)'", r"'\1/\2'", code)
        code = re.sub(r'"([^"]*?)\s+/\s+([^"]*?)"', r'"\1/\2"', code)

        # Fix multiple slashes in paths: 'lib / utils / helper' -> 'lib/utils/helper'
        # Repeat to handle chains of separators
        for _ in range(5):  # Max 5 slashes in a path
            code = re.sub(r"'([^']*?)/\s+([^']*?)'", r"'\1/\2'", code)
            code = re.sub(r'"([^"]*?)/\s+([^"]*?)"', r'"\1/\2"', code)
            code = re.sub(r"'([^']*?)\s+/([^']*?)'", r"'\1/\2'", code)
            code = re.sub(r'"([^"]*?)\s+/([^"]*?)"', r'"\1/\2"', code)

        # Fix @ paths: '@/ lib' -> '@/lib'
        code = re.sub(r"'@\s*/\s+", r"'@/", code)
        code = re.sub(r'"@\s*/\s+', r'"@/', code)

        # Fix server-only and other hyphenated imports: 'server - only' -> 'server-only'
        code = re.sub(r"'([a-z]+)\s+-\s+([a-z]+)'", r"'\1-\2'", code)
        code = re.sub(r'"([a-z]+)\s+-\s+([a-z]+)"', r'"\1-\2"', code)

        return f'```{language}\n{code}\n```'

    # Process all fenced code blocks in the document
    markdown = re.sub(code_block_pattern, fix_code_block, markdown, flags=re.DOTALL)

    return markdown
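As a quick sanity check (hypothetical input, run ad hoc against the helper above; the import path is assumed for the example), the function performs the rewrites its docstring promises:

```python
from content_fixer import fix_code_span_spaces  # import path assumed for this example

sample = (
    "```ts\n"
    "import { headers } from 'next / headers'\n"
    "import { auth } from '@/ lib / auth'\n"
    "import 'server - only'\n"
    "```"
)
print(fix_code_span_spaces(sample))
# ```ts
# import { headers } from 'next/headers'
# import { auth } from '@/lib/auth'
# import 'server-only'
# ```
```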
@@ -12,6 +12,7 @@ from crawl4ai import CacheMode, CrawlerRunConfig, MemoryAdaptiveDispatcher
 
 from ....config.logfire_config import get_logger
 from ...credential_service import credential_service
+from ..helpers.content_fixer import fix_code_span_spaces
 
 logger = get_logger(__name__)
@@ -255,9 +256,12 @@ class BatchCrawlStrategy:
                     if fallback_text:
                         title = fallback_text
 
+                    # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds
+                    cleaned_markdown = fix_code_span_spaces(result.markdown.fit_markdown)
+
                     successful_results.append({
                         "url": original_url,
-                        "markdown": result.markdown.fit_markdown,
+                        "markdown": cleaned_markdown,
                         "html": result.html,  # Use raw HTML
                         "title": title,
                     })
@@ -13,6 +13,7 @@ from crawl4ai import CacheMode, CrawlerRunConfig, MemoryAdaptiveDispatcher
 
 from ....config.logfire_config import get_logger
 from ...credential_service import credential_service
+from ..helpers.content_fixer import fix_code_span_spaces
 from ..helpers.url_handler import URLHandler
 
 logger = get_logger(__name__)
@@ -289,9 +290,12 @@ class RecursiveCrawlStrategy:
                     if extracted_title:
                         title = extracted_title
 
+                    # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds
+                    cleaned_markdown = fix_code_span_spaces(result.markdown.fit_markdown)
+
                     results_all.append({
                         "url": original_url,
-                        "markdown": result.markdown.fit_markdown,
+                        "markdown": cleaned_markdown,
                         "html": result.html,  # Always use raw HTML for code extraction
                         "title": title,
                     })
@@ -11,6 +11,7 @@ from typing import Any
 from crawl4ai import CacheMode, CrawlerRunConfig
 
 from ....config.logfire_config import get_logger
+from ..helpers.content_fixer import fix_code_span_spaces
 
 logger = get_logger(__name__)
@@ -304,7 +305,9 @@ class SinglePageCrawlStrategy:
                     processed_pages=1
                 )
 
-                return [{'url': original_url, 'markdown': result.markdown, 'html': result.html}]
+                # Fix spaces in code paths that Crawl4AI/BeautifulSoup adds
+                cleaned_markdown = fix_code_span_spaces(result.markdown)
+                return [{'url': original_url, 'markdown': cleaned_markdown, 'html': result.html}]
             else:
                 logger.error(f"Failed to crawl {url}: {result.error_message}")
                 return []