"""
|
|
Batch Crawling Strategy
|
|
|
|
Handles batch crawling of multiple URLs in parallel.
|
|
"""
|
|
|
|
from typing import List, Dict, Any, Optional, Callable
|
|
|
|
from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
|
|
from ....config.logfire_config import get_logger
|
|
from ...credential_service import credential_service
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
class BatchCrawlStrategy:
    """Strategy for crawling multiple URLs in batch."""

    def __init__(self, crawler, markdown_generator):
        """
        Initialize batch crawl strategy.

        Args:
            crawler (AsyncWebCrawler): The Crawl4AI crawler instance for web crawling operations
            markdown_generator (DefaultMarkdownGenerator): The markdown generator instance for converting HTML to markdown
        """
        self.crawler = crawler
        self.markdown_generator = markdown_generator
    async def crawl_batch_with_progress(
        self,
        urls: List[str],
        transform_url_func: Callable[[str], str],
        is_documentation_site_func: Callable[[str], bool],
        max_concurrent: Optional[int] = None,
        progress_callback: Optional[Callable] = None,
        start_progress: int = 15,
        end_progress: int = 60,
    ) -> List[Dict[str, Any]]:
        """
        Batch crawl multiple URLs in parallel with progress reporting.

        Args:
            urls: List of URLs to crawl
            transform_url_func: Function to transform URLs (e.g., GitHub URLs)
            is_documentation_site_func: Function to check if a URL is a documentation site
            max_concurrent: Maximum concurrent crawls (falls back to the CRAWL_MAX_CONCURRENT setting when None)
            progress_callback: Optional async callback(status, percentage, message) for progress updates
            start_progress: Starting progress percentage
            end_progress: Ending progress percentage

        Returns:
            List of dicts with "url", "markdown", and "html" for each successfully crawled page
        """
        if not self.crawler:
            logger.error("No crawler instance available for batch crawling")
            if progress_callback:
                await progress_callback("error", 0, "Crawler not available")
            return []

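        # Settings come back from the credential service as strings, so each value is
        # parsed to int/float below; malformed values fail fast, while an unreachable
        # credential service falls back to the hard-coded defaults.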
        # Load settings from database - fail fast on configuration errors
        try:
            settings = await credential_service.get_credentials_by_category("rag_strategy")
            batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
            if max_concurrent is None:
                max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
            memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
            check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
        except (ValueError, KeyError, TypeError) as e:
            # Critical configuration errors should fail fast in alpha
            logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
            raise ValueError(f"Failed to load crawler configuration: {e}") from e
        except Exception as e:
            # For non-critical errors (e.g., network issues), use defaults but log prominently
            logger.error(
                f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True
            )
            batch_size = 50
            if max_concurrent is None:
                max_concurrent = 10  # Safe default to prevent memory issues
            memory_threshold = 80.0
            check_interval = 0.5
            settings = {}  # Empty dict for defaults

        # Check if any URLs are documentation sites
        has_doc_sites = any(is_documentation_site_func(url) for url in urls)

        if has_doc_sites:
            logger.info("Detected documentation sites in batch, using enhanced configuration")
            # Use generic documentation selectors for batch crawling
            crawl_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                stream=True,  # Enable streaming for faster parallel processing
                markdown_generator=self.markdown_generator,
                wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")),
                delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "1.0")),
                wait_for_images=False,  # Skip images for faster crawling
                scan_full_page=True,  # Trigger lazy loading
                exclude_all_images=False,
                remove_overlay_elements=True,
                process_iframes=True,
            )
        else:
            # Configuration for regular batch crawling
            crawl_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                stream=True,  # Enable streaming
                markdown_generator=self.markdown_generator,
                wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "45000")),
                delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "0.5")),
                scan_full_page=True,
            )

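        # The dispatcher caps parallelism at max_session_permit and backs off when
        # system memory usage crosses memory_threshold_percent, re-checking every
        # check_interval seconds (behavior per crawl4ai's MemoryAdaptiveDispatcher).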
        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=memory_threshold,
            check_interval=check_interval,
            max_session_permit=max_concurrent,
        )

        async def report_progress(percentage: int, message: str):
            """Helper to report progress if callback is available"""
            if progress_callback:
                await progress_callback("crawling", percentage, message)

        total_urls = len(urls)
        await report_progress(start_progress, f"Starting to crawl {total_urls} URLs...")

        # Use configured batch size
        successful_results = []
        processed = 0

        # Transform all URLs at the beginning
        url_mapping = {}  # Map transformed URLs back to original
        transformed_urls = []
        for url in urls:
            transformed = transform_url_func(url)
            transformed_urls.append(transformed)
            url_mapping[transformed] = url

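        # Progress is interpolated linearly: when a fraction f of URLs has been handled,
        # the reported percentage is start_progress + f * (end_progress - start_progress),
        # keeping this stage inside its allotted [start_progress, end_progress] window.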
        for i in range(0, total_urls, batch_size):
            batch_urls = transformed_urls[i : i + batch_size]
            batch_start = i
            batch_end = min(i + batch_size, total_urls)

            # Report batch start with smooth progress
            progress_percentage = start_progress + int(
                (i / total_urls) * (end_progress - start_progress)
            )
            await report_progress(
                progress_percentage,
                f"Processing batch {batch_start + 1}-{batch_end} of {total_urls} URLs...",
            )

            # Crawl this batch using arun_many with streaming
            logger.info(
                f"Starting parallel crawl of batch {batch_start + 1}-{batch_end} ({len(batch_urls)} URLs)"
            )
            batch_results = await self.crawler.arun_many(
                urls=batch_urls, config=crawl_config, dispatcher=dispatcher
            )

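            # With stream=True, arun_many yields results as an async generator, so each
            # page is handled as soon as it finishes rather than after the whole batch.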
            # Handle streaming results
            j = 0
            async for result in batch_results:
                processed += 1
                if result.success and result.markdown:
                    # Map back to original URL
                    original_url = url_mapping.get(result.url, result.url)
                    successful_results.append({
                        "url": original_url,
                        "markdown": result.markdown,
                        "html": result.html,  # Use raw HTML
                    })
                else:
                    logger.warning(
                        f"Failed to crawl {result.url}: {getattr(result, 'error_message', 'Unknown error')}"
                    )

                # Report individual URL progress with smooth increments
                progress_percentage = start_progress + int(
                    (processed / total_urls) * (end_progress - start_progress)
                )
                # Report more frequently for smoother progress
                if (
                    processed % 5 == 0 or processed == total_urls
                ):  # Report every 5 URLs or at the end
                    await report_progress(
                        progress_percentage,
                        f"Crawled {processed}/{total_urls} pages ({len(successful_results)} successful)",
                    )
                j += 1

        await report_progress(
            end_progress,
            f"Batch crawling completed: {len(successful_results)}/{total_urls} pages successful",
        )
        return successful_results
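
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, left as comments because this module uses
# package-relative imports). It assumes crawl4ai's documented AsyncWebCrawler /
# DefaultMarkdownGenerator API; the URL-transform and documentation-site
# callables below are trivial stand-ins for what the crawling service normally
# provides.
#
# import asyncio
# from crawl4ai import AsyncWebCrawler, DefaultMarkdownGenerator
#
# async def _demo() -> None:
#     async with AsyncWebCrawler() as crawler:
#         strategy = BatchCrawlStrategy(crawler, DefaultMarkdownGenerator())
#
#         async def on_progress(status: str, percentage: int, message: str) -> None:
#             print(f"[{status}] {percentage}% - {message}")
#
#         results = await strategy.crawl_batch_with_progress(
#             urls=["https://example.com/docs/a", "https://example.com/docs/b"],
#             transform_url_func=lambda url: url,            # no URL rewriting
#             is_documentation_site_func=lambda url: False,  # treat all as regular pages
#             progress_callback=on_progress,
#         )
#         print(f"Crawled {len(results)} pages successfully")
#
# asyncio.run(_demo())
# ---------------------------------------------------------------------------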