Mirror of https://github.com/coleam00/Archon.git (synced 2026-01-06 14:48:00 -05:00)
Fix crawler concurrency configuration to prevent memory crashes
Consolidate the concurrent crawling limits into a single database setting instead of a hardcoded special case for documentation sites.

Changes:
- Remove the hardcoded limit of 20 concurrent pages for documentation sites
- Let strategies use CRAWL_MAX_CONCURRENT from the database (default: 10)
- Apply consistent concurrency across all site types
- Improve code formatting and consistency

This fixes Playwright browser crashes caused by excessive concurrent pages on documentation sites and provides a single configuration point for tuning crawler performance.
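For illustration, a minimal sketch of how the consolidated limit can be resolved and applied. The "rag_strategy" category, the CRAWL_MAX_CONCURRENT key, and the default of 10 come from this commit and the diff below; the helper function itself is hypothetical and not part of the change.

# Sketch only: resolve the shared concurrency limit from the database-backed
# "rag_strategy" settings, falling back to the documented default of 10.
# resolve_max_concurrent is a hypothetical helper, not code from this commit.
async def resolve_max_concurrent(credential_service) -> int:
    settings = await credential_service.get_credentials_by_category("rag_strategy")
    return int(settings.get("CRAWL_MAX_CONCURRENT", "10"))

The resolved value is what ends up as max_session_permit on the MemoryAdaptiveDispatcher in the diff below, so documentation sites and regular sites now share the same concurrency ceiling.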
@@ -3,6 +3,7 @@ Batch Crawling Strategy
Handles batch crawling of multiple URLs in parallel.
"""

import asyncio
from typing import List, Dict, Any, Optional, Callable

@@ -15,18 +16,18 @@ logger = get_logger(__name__)

class BatchCrawlStrategy:
    """Strategy for crawling multiple URLs in batch."""

    def __init__(self, crawler, markdown_generator):
        """
        Initialize batch crawl strategy.

        Args:
            crawler (AsyncWebCrawler): The Crawl4AI crawler instance for web crawling operations
            markdown_generator (DefaultMarkdownGenerator): The markdown generator instance for converting HTML to markdown
        """
        self.crawler = crawler
        self.markdown_generator = markdown_generator

    async def crawl_batch_with_progress(
        self,
        urls: List[str],
@@ -35,11 +36,11 @@ class BatchCrawlStrategy:
        max_concurrent: int = None,
        progress_callback: Optional[Callable] = None,
        start_progress: int = 15,
-        end_progress: int = 60
+        end_progress: int = 60,
    ) -> List[Dict[str, Any]]:
        """
        Batch crawl multiple URLs in parallel with progress reporting.

        Args:
            urls: List of URLs to crawl
            transform_url_func: Function to transform URLs (e.g., GitHub URLs)
@@ -48,16 +49,16 @@ class BatchCrawlStrategy:
            progress_callback: Optional callback for progress updates
            start_progress: Starting progress percentage
            end_progress: Ending progress percentage

        Returns:
            List of crawl results
        """
        if not self.crawler:
            logger.error("No crawler instance available for batch crawling")
            if progress_callback:
-                await progress_callback('error', 0, 'Crawler not available')
+                await progress_callback("error", 0, "Crawler not available")
            return []

        # Load settings from database first
        try:
            settings = await credential_service.get_credentials_by_category("rag_strategy")
@@ -74,10 +75,10 @@ class BatchCrawlStrategy:
            memory_threshold = 80.0
            check_interval = 0.5
            settings = {}  # Empty dict for defaults

        # Check if any URLs are documentation sites
        has_doc_sites = any(is_documentation_site_func(url) for url in urls)

        if has_doc_sites:
            logger.info("Detected documentation sites in batch, using enhanced configuration")
            # Use generic documentation selectors for batch crawling
@@ -85,7 +86,7 @@ class BatchCrawlStrategy:
                cache_mode=CacheMode.BYPASS,
                stream=True,  # Enable streaming for faster parallel processing
                markdown_generator=self.markdown_generator,
-                wait_for='body',  # Simple selector for batch
+                wait_for="body",  # Simple selector for batch
                wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")),
                delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "1.0")),
@@ -93,7 +94,7 @@ class BatchCrawlStrategy:
                scan_full_page=True,  # Trigger lazy loading
                exclude_all_images=False,
                remove_overlay_elements=True,
-                process_iframes=True
+                process_iframes=True,
            )
        else:
            # Configuration for regular batch crawling
@@ -104,27 +105,27 @@ class BatchCrawlStrategy:
                wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "45000")),
                delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "0.5")),
-                scan_full_page=True
+                scan_full_page=True,
            )

        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=memory_threshold,
            check_interval=check_interval,
-            max_session_permit=max_concurrent
+            max_session_permit=max_concurrent,
        )

        async def report_progress(percentage: int, message: str):
            """Helper to report progress if callback is available"""
            if progress_callback:
-                await progress_callback('crawling', percentage, message)
+                await progress_callback("crawling", percentage, message)

        total_urls = len(urls)
-        await report_progress(start_progress, f'Starting to crawl {total_urls} URLs...')
+        await report_progress(start_progress, f"Starting to crawl {total_urls} URLs...")

        # Use configured batch size
        successful_results = []
        processed = 0

        # Transform all URLs at the beginning
        url_mapping = {}  # Map transformed URLs back to original
        transformed_urls = []
@@ -132,20 +133,29 @@ class BatchCrawlStrategy:
            transformed = transform_url_func(url)
            transformed_urls.append(transformed)
            url_mapping[transformed] = url

        for i in range(0, total_urls, batch_size):
-            batch_urls = transformed_urls[i:i + batch_size]
+            batch_urls = transformed_urls[i : i + batch_size]
            batch_start = i
            batch_end = min(i + batch_size, total_urls)

            # Report batch start with smooth progress
-            progress_percentage = start_progress + int((i / total_urls) * (end_progress - start_progress))
-            await report_progress(progress_percentage, f'Processing batch {batch_start+1}-{batch_end} of {total_urls} URLs...')
+            progress_percentage = start_progress + int(
+                (i / total_urls) * (end_progress - start_progress)
+            )
+            await report_progress(
+                progress_percentage,
+                f"Processing batch {batch_start + 1}-{batch_end} of {total_urls} URLs...",
+            )

            # Crawl this batch using arun_many with streaming
-            logger.info(f"Starting parallel crawl of batch {batch_start+1}-{batch_end} ({len(batch_urls)} URLs)")
-            batch_results = await self.crawler.arun_many(urls=batch_urls, config=crawl_config, dispatcher=dispatcher)
+            logger.info(
+                f"Starting parallel crawl of batch {batch_start + 1}-{batch_end} ({len(batch_urls)} URLs)"
+            )
+            batch_results = await self.crawler.arun_many(
+                urls=batch_urls, config=crawl_config, dispatcher=dispatcher
+            )

            # Handle streaming results
            j = 0
            async for result in batch_results:
@@ -154,19 +164,31 @@ class BatchCrawlStrategy:
                    # Map back to original URL
                    original_url = url_mapping.get(result.url, result.url)
                    successful_results.append({
-                        'url': original_url,
-                        'markdown': result.markdown,
-                        'html': result.html  # Use raw HTML
+                        "url": original_url,
+                        "markdown": result.markdown,
+                        "html": result.html,  # Use raw HTML
                    })
                else:
-                    logger.warning(f"Failed to crawl {result.url}: {getattr(result, 'error_message', 'Unknown error')}")
+                    logger.warning(
+                        f"Failed to crawl {result.url}: {getattr(result, 'error_message', 'Unknown error')}"
+                    )

                # Report individual URL progress with smooth increments
-                progress_percentage = start_progress + int((processed / total_urls) * (end_progress - start_progress))
+                progress_percentage = start_progress + int(
+                    (processed / total_urls) * (end_progress - start_progress)
+                )
                # Report more frequently for smoother progress
-                if processed % 5 == 0 or processed == total_urls:  # Report every 5 URLs or at the end
-                    await report_progress(progress_percentage, f'Crawled {processed}/{total_urls} pages ({len(successful_results)} successful)')
+                if (
+                    processed % 5 == 0 or processed == total_urls
+                ):  # Report every 5 URLs or at the end
+                    await report_progress(
+                        progress_percentage,
+                        f"Crawled {processed}/{total_urls} pages ({len(successful_results)} successful)",
+                    )
                j += 1

-        await report_progress(end_progress, f'Batch crawling completed: {len(successful_results)}/{total_urls} pages successful')
-        return successful_results
+        await report_progress(
+            end_progress,
+            f"Batch crawling completed: {len(successful_results)}/{total_urls} pages successful",
+        )
+        return successful_results
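For context, a hedged usage sketch of the strategy shown above. The constructor arguments, the progress-callback shape, and the result keys ("url", "markdown", "html") follow the diff; the callback body, the example URLs, and the wrapper function are assumptions, and the parameters elided by the hunk context (transform_url_func and friends) are simply omitted.

# Sketch only: drive BatchCrawlStrategy with a simple progress callback.
# crawler and markdown_generator are assumed to be Crawl4AI AsyncWebCrawler and
# DefaultMarkdownGenerator instances, as named in the __init__ docstring above;
# the module path for importing BatchCrawlStrategy is not shown in the diff.
async def crawl_some_docs(crawler, markdown_generator):
    async def progress(status: str, percentage: int, message: str) -> None:
        # Matches the (status, percentage, message) calls made via progress_callback.
        print(f"[{status}] {percentage}% {message}")

    strategy = BatchCrawlStrategy(crawler, markdown_generator)
    results = await strategy.crawl_batch_with_progress(
        urls=["https://example.com/docs/a", "https://example.com/docs/b"],
        progress_callback=progress,
        # transform_url_func and the other parameters hidden by the hunk context
        # are omitted here; supply them as the real call site requires.
    )
    for page in results:
        # Each result carries the original URL, generated markdown, and raw HTML.
        print(page["url"], len(page["markdown"]))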