Fix crawler concurrency configuration to prevent memory crashes

Consolidate the concurrent crawling limits into a single database setting
instead of a hardcoded special case for documentation sites.

Changes:
- Remove hardcoded 20 concurrent limit for documentation sites
- Let strategies use CRAWL_MAX_CONCURRENT from database (default: 10)
- Apply consistent concurrency across all site types
- Improve code formatting and consistency

This fixes Playwright browser crashes caused by excessive concurrent
pages on documentation sites and provides a single configuration point
for tuning crawler performance.
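
As a reference point, here is a minimal sketch (not the commit's exact code) of how a strategy can resolve the limit from the stored rag_strategy settings rather than a hardcoded constant, roughly mirroring the settings load visible in the diff below. The helper name resolve_max_concurrent and the override parameter are illustrative only.

import logging
from typing import Optional

logger = logging.getLogger(__name__)


async def resolve_max_concurrent(credential_service, override: Optional[int] = None) -> int:
    """Resolve the concurrent-crawl limit from the rag_strategy settings.

    An explicit override wins; otherwise CRAWL_MAX_CONCURRENT is read from
    the database-backed settings, defaulting to 10 when missing or unreadable.
    """
    if override is not None:
        return override
    try:
        settings = await credential_service.get_credentials_by_category("rag_strategy")
        return int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
    except Exception:
        logger.warning("Could not load rag_strategy settings, falling back to 10 concurrent pages")
        return 10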
Author: Rasmus Widing
Date:   2025-08-15 15:45:04 +03:00
Parent: ad1b8bf70f
Commit: aab0721f0c
2 changed files with 290 additions and 218 deletions


@@ -3,6 +3,7 @@ Batch Crawling Strategy
Handles batch crawling of multiple URLs in parallel.
"""
import asyncio
from typing import List, Dict, Any, Optional, Callable
@@ -15,18 +16,18 @@ logger = get_logger(__name__)
class BatchCrawlStrategy:
"""Strategy for crawling multiple URLs in batch."""
def __init__(self, crawler, markdown_generator):
"""
Initialize batch crawl strategy.
Args:
crawler (AsyncWebCrawler): The Crawl4AI crawler instance for web crawling operations
markdown_generator (DefaultMarkdownGenerator): The markdown generator instance for converting HTML to markdown
"""
self.crawler = crawler
self.markdown_generator = markdown_generator
async def crawl_batch_with_progress(
self,
urls: List[str],
@@ -35,11 +36,11 @@ class BatchCrawlStrategy:
max_concurrent: int = None,
progress_callback: Optional[Callable] = None,
start_progress: int = 15,
end_progress: int = 60
end_progress: int = 60,
) -> List[Dict[str, Any]]:
"""
Batch crawl multiple URLs in parallel with progress reporting.
Args:
urls: List of URLs to crawl
transform_url_func: Function to transform URLs (e.g., GitHub URLs)
@@ -48,16 +49,16 @@ class BatchCrawlStrategy:
progress_callback: Optional callback for progress updates
start_progress: Starting progress percentage
end_progress: Ending progress percentage
Returns:
List of crawl results
"""
if not self.crawler:
logger.error("No crawler instance available for batch crawling")
if progress_callback:
await progress_callback('error', 0, 'Crawler not available')
await progress_callback("error", 0, "Crawler not available")
return []
# Load settings from database first
try:
settings = await credential_service.get_credentials_by_category("rag_strategy")
@@ -74,10 +75,10 @@ class BatchCrawlStrategy:
memory_threshold = 80.0
check_interval = 0.5
settings = {} # Empty dict for defaults
# Check if any URLs are documentation sites
has_doc_sites = any(is_documentation_site_func(url) for url in urls)
if has_doc_sites:
logger.info("Detected documentation sites in batch, using enhanced configuration")
# Use generic documentation selectors for batch crawling
@@ -85,7 +86,7 @@ class BatchCrawlStrategy:
cache_mode=CacheMode.BYPASS,
stream=True, # Enable streaming for faster parallel processing
markdown_generator=self.markdown_generator,
wait_for='body', # Simple selector for batch
wait_for="body", # Simple selector for batch
wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")),
delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "1.0")),
@@ -93,7 +94,7 @@ class BatchCrawlStrategy:
scan_full_page=True, # Trigger lazy loading
exclude_all_images=False,
remove_overlay_elements=True,
process_iframes=True
process_iframes=True,
)
else:
# Configuration for regular batch crawling
@@ -104,27 +105,27 @@ class BatchCrawlStrategy:
wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "45000")),
delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "0.5")),
scan_full_page=True
scan_full_page=True,
)
dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=memory_threshold,
check_interval=check_interval,
max_session_permit=max_concurrent
max_session_permit=max_concurrent,
)
async def report_progress(percentage: int, message: str):
"""Helper to report progress if callback is available"""
if progress_callback:
await progress_callback('crawling', percentage, message)
await progress_callback("crawling", percentage, message)
total_urls = len(urls)
await report_progress(start_progress, f'Starting to crawl {total_urls} URLs...')
await report_progress(start_progress, f"Starting to crawl {total_urls} URLs...")
# Use configured batch size
successful_results = []
processed = 0
# Transform all URLs at the beginning
url_mapping = {} # Map transformed URLs back to original
transformed_urls = []
@@ -132,20 +133,29 @@ class BatchCrawlStrategy:
transformed = transform_url_func(url)
transformed_urls.append(transformed)
url_mapping[transformed] = url
for i in range(0, total_urls, batch_size):
batch_urls = transformed_urls[i:i + batch_size]
batch_urls = transformed_urls[i : i + batch_size]
batch_start = i
batch_end = min(i + batch_size, total_urls)
# Report batch start with smooth progress
progress_percentage = start_progress + int((i / total_urls) * (end_progress - start_progress))
await report_progress(progress_percentage, f'Processing batch {batch_start+1}-{batch_end} of {total_urls} URLs...')
progress_percentage = start_progress + int(
(i / total_urls) * (end_progress - start_progress)
)
await report_progress(
progress_percentage,
f"Processing batch {batch_start + 1}-{batch_end} of {total_urls} URLs...",
)
# Crawl this batch using arun_many with streaming
logger.info(f"Starting parallel crawl of batch {batch_start+1}-{batch_end} ({len(batch_urls)} URLs)")
batch_results = await self.crawler.arun_many(urls=batch_urls, config=crawl_config, dispatcher=dispatcher)
logger.info(
f"Starting parallel crawl of batch {batch_start + 1}-{batch_end} ({len(batch_urls)} URLs)"
)
batch_results = await self.crawler.arun_many(
urls=batch_urls, config=crawl_config, dispatcher=dispatcher
)
# Handle streaming results
j = 0
async for result in batch_results:
@@ -154,19 +164,31 @@ class BatchCrawlStrategy:
# Map back to original URL
original_url = url_mapping.get(result.url, result.url)
successful_results.append({
'url': original_url,
'markdown': result.markdown,
'html': result.html # Use raw HTML
"url": original_url,
"markdown": result.markdown,
"html": result.html, # Use raw HTML
})
else:
logger.warning(f"Failed to crawl {result.url}: {getattr(result, 'error_message', 'Unknown error')}")
logger.warning(
f"Failed to crawl {result.url}: {getattr(result, 'error_message', 'Unknown error')}"
)
# Report individual URL progress with smooth increments
progress_percentage = start_progress + int((processed / total_urls) * (end_progress - start_progress))
progress_percentage = start_progress + int(
(processed / total_urls) * (end_progress - start_progress)
)
# Report more frequently for smoother progress
if processed % 5 == 0 or processed == total_urls: # Report every 5 URLs or at the end
await report_progress(progress_percentage, f'Crawled {processed}/{total_urls} pages ({len(successful_results)} successful)')
if (
processed % 5 == 0 or processed == total_urls
): # Report every 5 URLs or at the end
await report_progress(
progress_percentage,
f"Crawled {processed}/{total_urls} pages ({len(successful_results)} successful)",
)
j += 1
await report_progress(end_progress, f'Batch crawling completed: {len(successful_results)}/{total_urls} pages successful')
return successful_results
await report_progress(
end_progress,
f"Batch crawling completed: {len(successful_results)}/{total_urls} pages successful",
)
return successful_results
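
For context, a hedged usage sketch of the batch strategy after this change. The callback signature (status, percentage, message) matches the awaits in the diff; the parameter names transform_url_func and is_documentation_site_func follow the docstring and body above, while other elided parameters (such as batch_size) are assumed to have defaults. The URLs and lambdas are placeholders, and crawler / markdown_generator stand for pre-built Crawl4AI objects (AsyncWebCrawler, DefaultMarkdownGenerator).

async def log_progress(status: str, percentage: int, message: str) -> None:
    # Same (status, percentage, message) shape the strategy awaits above.
    print(f"[{status}] {percentage}%: {message}")


async def run_batch(crawler, markdown_generator) -> None:
    # BatchCrawlStrategy is imported from the strategies module in this diff (path omitted here).
    strategy = BatchCrawlStrategy(crawler, markdown_generator)
    results = await strategy.crawl_batch_with_progress(
        urls=[
            "https://example.com/docs/intro",
            "https://example.com/docs/api",
        ],
        transform_url_func=lambda url: url,  # placeholder: no URL rewriting
        is_documentation_site_func=lambda url: "/docs/" in url,  # placeholder heuristic
        progress_callback=log_progress,
        # max_concurrent left unset so the CRAWL_MAX_CONCURRENT setting (default 10) applies
    )
    print(f"Crawled {len(results)} pages")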