# archon/python/src/server/services/crawling/strategies/batch.py
"""
Batch Crawling Strategy
Handles batch crawling of multiple URLs in parallel.
"""
from typing import List, Dict, Any, Optional, Callable
from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
from ....config.logfire_config import get_logger
from ...credential_service import credential_service
logger = get_logger(__name__)


class BatchCrawlStrategy:
    """Strategy for crawling multiple URLs in batch."""

    def __init__(self, crawler, markdown_generator):
        """
        Initialize batch crawl strategy.

        Args:
            crawler (AsyncWebCrawler): The Crawl4AI crawler instance for web crawling operations
            markdown_generator (DefaultMarkdownGenerator): The markdown generator instance for converting HTML to markdown
        """
        self.crawler = crawler
        self.markdown_generator = markdown_generator

    async def crawl_batch_with_progress(
        self,
        urls: List[str],
        transform_url_func: Callable[[str], str],
        is_documentation_site_func: Callable[[str], bool],
        max_concurrent: Optional[int] = None,
        progress_callback: Optional[Callable] = None,
        start_progress: int = 15,
        end_progress: int = 60,
    ) -> List[Dict[str, Any]]:
        """
        Batch crawl multiple URLs in parallel with progress reporting.

        Args:
            urls: List of URLs to crawl
            transform_url_func: Function to transform URLs (e.g., GitHub URLs)
            is_documentation_site_func: Function to check whether a URL is a documentation site
            max_concurrent: Maximum number of concurrent crawls
            progress_callback: Optional callback for progress updates
            start_progress: Starting progress percentage
            end_progress: Ending progress percentage

        Returns:
            List of crawl results
        """
        if not self.crawler:
            logger.error("No crawler instance available for batch crawling")
            if progress_callback:
                await progress_callback("error", 0, "Crawler not available")
            return []

        # Load settings from database - fail fast on configuration errors
        try:
            settings = await credential_service.get_credentials_by_category("rag_strategy")
            batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
            if max_concurrent is None:
                max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
            memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
            check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
        except (ValueError, KeyError, TypeError) as e:
            # Critical configuration errors should fail fast in alpha
            logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
            raise ValueError(f"Failed to load crawler configuration: {e}") from e
        except Exception as e:
            # For non-critical errors (e.g., network issues), use defaults but log prominently
            logger.error(
                f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True
            )
            batch_size = 50
            if max_concurrent is None:
                max_concurrent = 10  # Safe default to prevent memory issues
            memory_threshold = 80.0
            check_interval = 0.5
            settings = {}  # Empty dict for defaults

        # Check if any URLs are documentation sites
        has_doc_sites = any(is_documentation_site_func(url) for url in urls)

        if has_doc_sites:
            logger.info("Detected documentation sites in batch, using enhanced configuration")
            # Use generic documentation selectors for batch crawling
            crawl_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                stream=True,  # Enable streaming for faster parallel processing
                markdown_generator=self.markdown_generator,
                wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")),
                delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "1.0")),
                wait_for_images=False,  # Skip images for faster crawling
                scan_full_page=True,  # Trigger lazy loading
                exclude_all_images=False,
                remove_overlay_elements=True,
                process_iframes=True,
            )
        else:
            # Configuration for regular batch crawling
            crawl_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                stream=True,  # Enable streaming
                markdown_generator=self.markdown_generator,
                wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "45000")),
                delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "0.5")),
                scan_full_page=True,
            )
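
        # MemoryAdaptiveDispatcher throttles concurrent crawl sessions when
        # system memory usage crosses the configured threshold (re-checked
        # every check_interval seconds), so large batches degrade gracefully
        # instead of exhausting RAM.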
        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=memory_threshold,
            check_interval=check_interval,
            max_session_permit=max_concurrent,
        )

        async def report_progress(percentage: int, message: str):
            """Helper to report progress if callback is available"""
            if progress_callback:
                await progress_callback("crawling", percentage, message)

        total_urls = len(urls)
        await report_progress(start_progress, f"Starting to crawl {total_urls} URLs...")

        successful_results = []
        processed = 0

        # Transform all URLs up front, keeping a map from each transformed URL
        # back to the original so results are reported under the URL the
        # caller supplied.
        url_mapping = {}
        transformed_urls = []
        for url in urls:
            transformed = transform_url_func(url)
            transformed_urls.append(transformed)
            url_mapping[transformed] = url

        # Process the URLs in batches of the configured size
        for i in range(0, total_urls, batch_size):
            batch_urls = transformed_urls[i : i + batch_size]
            batch_start = i
            batch_end = min(i + batch_size, total_urls)

            # Report batch start with smooth progress
            progress_percentage = start_progress + int(
                (i / total_urls) * (end_progress - start_progress)
            )
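            # e.g., with start_progress=15 and end_progress=60, being halfway
            # through the URLs maps to 15 + int(0.5 * 45) = 37 percent.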
            await report_progress(
                progress_percentage,
                f"Processing batch {batch_start + 1}-{batch_end} of {total_urls} URLs...",
            )

            # Crawl this batch using arun_many with streaming
            logger.info(
                f"Starting parallel crawl of batch {batch_start + 1}-{batch_end} ({len(batch_urls)} URLs)"
            )
            batch_results = await self.crawler.arun_many(
                urls=batch_urls, config=crawl_config, dispatcher=dispatcher
            )
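            # With stream=True, arun_many returns an async iterator that
            # yields each result as soon as its page finishes, rather than
            # waiting for the whole batch.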
            # Handle streaming results
            async for result in batch_results:
                processed += 1
                if result.success and result.markdown:
                    # Map back to the original URL
                    original_url = url_mapping.get(result.url, result.url)
                    successful_results.append({
                        "url": original_url,
                        "markdown": result.markdown,
                        "html": result.html,  # Use raw HTML
                    })
                else:
                    logger.warning(
                        f"Failed to crawl {result.url}: {getattr(result, 'error_message', 'Unknown error')}"
                    )

                # Report individual URL progress with smooth increments,
                # every 5 URLs or at the end
                progress_percentage = start_progress + int(
                    (processed / total_urls) * (end_progress - start_progress)
                )
                if processed % 5 == 0 or processed == total_urls:
                    await report_progress(
                        progress_percentage,
                        f"Crawled {processed}/{total_urls} pages ({len(successful_results)} successful)",
                    )

        await report_progress(
            end_progress,
            f"Batch crawling completed: {len(successful_results)}/{total_urls} pages successful",
        )
        return successful_results
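

# A minimal usage sketch, not part of the original module. It assumes
# crawl4ai exports AsyncWebCrawler and DefaultMarkdownGenerator at the top
# level; the two helper callables below (no-op transform, "/docs" heuristic)
# are illustrative placeholders, not Archon's real implementations. Running
# it directly also requires the Archon package context, since this module
# uses relative imports.
if __name__ == "__main__":
    import asyncio

    from crawl4ai import AsyncWebCrawler, DefaultMarkdownGenerator

    async def _demo():
        async with AsyncWebCrawler() as crawler:
            strategy = BatchCrawlStrategy(crawler, DefaultMarkdownGenerator())
            results = await strategy.crawl_batch_with_progress(
                urls=["https://example.com/docs"],
                transform_url_func=lambda url: url,  # no-op transform (assumption)
                is_documentation_site_func=lambda url: "/docs" in url,  # naive heuristic
            )
            print(f"Crawled {len(results)} page(s) successfully")

    asyncio.run(_demo())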