"""
|
|
Batch Crawling Strategy
|
|
|
|
Handles batch crawling of multiple URLs in parallel.
|
|
"""
|
|
|
|
from typing import List, Dict, Any, Optional, Callable
|
|
|
|
from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
|
|
from ....config.logfire_config import get_logger
|
|
from ...credential_service import credential_service
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
class BatchCrawlStrategy:
    """Strategy for crawling multiple URLs in batch."""

    def __init__(self, crawler, markdown_generator):
        """
        Initialize batch crawl strategy.

        Args:
            crawler (AsyncWebCrawler): The Crawl4AI crawler instance for web crawling operations
            markdown_generator (DefaultMarkdownGenerator): The markdown generator instance for converting HTML to markdown
        """
        self.crawler = crawler
        self.markdown_generator = markdown_generator
    async def crawl_batch_with_progress(
        self,
        urls: List[str],
        transform_url_func: Callable[[str], str],
        is_documentation_site_func: Callable[[str], bool],
        max_concurrent: Optional[int] = None,
        progress_callback: Optional[Callable] = None,
        start_progress: int = 15,
        end_progress: int = 60,
    ) -> List[Dict[str, Any]]:
        """
        Batch crawl multiple URLs in parallel with progress reporting.

        Args:
            urls: List of URLs to crawl
            transform_url_func: Function to transform URLs (e.g., GitHub URLs)
            is_documentation_site_func: Function to check if a URL is a documentation site
            max_concurrent: Maximum concurrent crawls (falls back to the CRAWL_MAX_CONCURRENT setting when None)
            progress_callback: Optional async callback(status, percentage, message) for progress updates
            start_progress: Starting progress percentage
            end_progress: Ending progress percentage

        Returns:
            List of dicts with "url", "markdown", and "html" for each successfully crawled page
        """
        if not self.crawler:
            logger.error("No crawler instance available for batch crawling")
            if progress_callback:
                await progress_callback("error", 0, "Crawler not available")
            return []

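        # Settings come back from the credential service as strings, so each value is
        # parsed to int/float below; malformed values fail fast, while an unreachable
        # credential service falls back to the hard-coded defaults.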
        # Load settings from database - fail fast on configuration errors
        try:
            settings = await credential_service.get_credentials_by_category("rag_strategy")
            batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
            if max_concurrent is None:
                max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
            memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
            check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
        except (ValueError, KeyError, TypeError) as e:
            # Critical configuration errors should fail fast in alpha
            logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
            raise ValueError(f"Failed to load crawler configuration: {e}") from e
        except Exception as e:
            # For non-critical errors (e.g., network issues), use defaults but log prominently
            logger.error(
                f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True
            )
            batch_size = 50
            if max_concurrent is None:
                max_concurrent = 10  # Safe default to prevent memory issues
            memory_threshold = 80.0
            check_interval = 0.5
            settings = {}  # Empty dict for defaults

        # Check if any URLs are documentation sites
        has_doc_sites = any(is_documentation_site_func(url) for url in urls)

        if has_doc_sites:
            logger.info("Detected documentation sites in batch, using enhanced configuration")
            # Use generic documentation selectors for batch crawling
            crawl_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                stream=True,  # Enable streaming for faster parallel processing
                markdown_generator=self.markdown_generator,
                wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")),
                delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "1.0")),
                wait_for_images=False,  # Skip images for faster crawling
                scan_full_page=True,  # Trigger lazy loading
                exclude_all_images=False,
                remove_overlay_elements=True,
                process_iframes=True,
            )
        else:
            # Configuration for regular batch crawling
            crawl_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                stream=True,  # Enable streaming
                markdown_generator=self.markdown_generator,
                wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "45000")),
                delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "0.5")),
                scan_full_page=True,
            )

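        # The dispatcher caps parallelism at max_session_permit and backs off when
        # system memory usage crosses memory_threshold_percent, re-checking every
        # check_interval seconds (behavior per crawl4ai's MemoryAdaptiveDispatcher).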
        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=memory_threshold,
            check_interval=check_interval,
            max_session_permit=max_concurrent,
        )

        async def report_progress(percentage: int, message: str):
            """Helper to report progress if callback is available"""
            if progress_callback:
                await progress_callback("crawling", percentage, message)

        total_urls = len(urls)
        await report_progress(start_progress, f"Starting to crawl {total_urls} URLs...")

        # Use configured batch size
        successful_results = []
        processed = 0

        # Transform all URLs at the beginning
        url_mapping = {}  # Map transformed URLs back to original
        transformed_urls = []
        for url in urls:
            transformed = transform_url_func(url)
            transformed_urls.append(transformed)
            url_mapping[transformed] = url

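        # Progress is interpolated linearly: when a fraction f of URLs has been handled,
        # the reported percentage is start_progress + f * (end_progress - start_progress),
        # keeping this stage inside its allotted [start_progress, end_progress] window.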
        for i in range(0, total_urls, batch_size):
            batch_urls = transformed_urls[i : i + batch_size]
            batch_start = i
            batch_end = min(i + batch_size, total_urls)

            # Report batch start with smooth progress
            progress_percentage = start_progress + int(
                (i / total_urls) * (end_progress - start_progress)
            )
            await report_progress(
                progress_percentage,
                f"Processing batch {batch_start + 1}-{batch_end} of {total_urls} URLs...",
            )

            # Crawl this batch using arun_many with streaming
            logger.info(
                f"Starting parallel crawl of batch {batch_start + 1}-{batch_end} ({len(batch_urls)} URLs)"
            )
            batch_results = await self.crawler.arun_many(
                urls=batch_urls, config=crawl_config, dispatcher=dispatcher
            )

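            # With stream=True, arun_many yields results as an async generator, so each
            # page is handled as soon as it finishes rather than after the whole batch.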
            # Handle streaming results
            j = 0
            async for result in batch_results:
                processed += 1
                if result.success and result.markdown:
                    # Map back to original URL
                    original_url = url_mapping.get(result.url, result.url)
                    successful_results.append({
                        "url": original_url,
                        "markdown": result.markdown,
                        "html": result.html,  # Use raw HTML
                    })
                else:
                    logger.warning(
                        f"Failed to crawl {result.url}: {getattr(result, 'error_message', 'Unknown error')}"
                    )

                # Report individual URL progress with smooth increments
                progress_percentage = start_progress + int(
                    (processed / total_urls) * (end_progress - start_progress)
                )
                # Report more frequently for smoother progress
                if (
                    processed % 5 == 0 or processed == total_urls
                ):  # Report every 5 URLs or at the end
                    await report_progress(
                        progress_percentage,
                        f"Crawled {processed}/{total_urls} pages ({len(successful_results)} successful)",
                    )
                j += 1

        await report_progress(
            end_progress,
            f"Batch crawling completed: {len(successful_results)}/{total_urls} pages successful",
        )
        return successful_results
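
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, left as comments because this module uses
# package-relative imports). It assumes crawl4ai's documented AsyncWebCrawler /
# DefaultMarkdownGenerator API; the URL-transform and documentation-site
# callables below are trivial stand-ins for what the crawling service normally
# provides.
#
# import asyncio
# from crawl4ai import AsyncWebCrawler, DefaultMarkdownGenerator
#
# async def _demo() -> None:
#     async with AsyncWebCrawler() as crawler:
#         strategy = BatchCrawlStrategy(crawler, DefaultMarkdownGenerator())
#
#         async def on_progress(status: str, percentage: int, message: str) -> None:
#             print(f"[{status}] {percentage}% - {message}")
#
#         results = await strategy.crawl_batch_with_progress(
#             urls=["https://example.com/docs/a", "https://example.com/docs/b"],
#             transform_url_func=lambda url: url,            # no URL rewriting
#             is_documentation_site_func=lambda url: False,  # treat all as regular pages
#             progress_callback=on_progress,
#         )
#         print(f"Crawled {len(results)} pages successfully")
#
# asyncio.run(_demo())
# ---------------------------------------------------------------------------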