Mirror of https://github.com/coleam00/Archon.git (synced 2026-01-02 04:39:29 -05:00)
Updates to the threading service and crawling from Rasmus's PRs
This commit is contained in:
@@ -4,7 +4,6 @@ Batch Crawling Strategy
|
||||
Handles batch crawling of multiple URLs in parallel.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import List, Dict, Any, Optional, Callable
|
||||
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
|
||||
@@ -70,10 +69,12 @@ class BatchCrawlStrategy:
|
||||
except (ValueError, KeyError, TypeError) as e:
|
||||
# Critical configuration errors should fail fast in alpha
|
||||
logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
|
||||
raise ValueError(f"Failed to load crawler configuration: {e}")
|
||||
raise ValueError(f"Failed to load crawler configuration: {e}") from e
|
||||
except Exception as e:
|
||||
# For non-critical errors (e.g., network issues), use defaults but log prominently
|
||||
logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
|
||||
logger.error(
|
||||
f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True
|
||||
)
|
||||
batch_size = 50
|
||||
if max_concurrent is None:
|
||||
max_concurrent = 10 # Safe default to prevent memory issues
|
||||
@@ -91,7 +92,6 @@ class BatchCrawlStrategy:
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
stream=True, # Enable streaming for faster parallel processing
|
||||
markdown_generator=self.markdown_generator,
|
||||
wait_for="body", # Simple selector for batch
|
||||
wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
|
||||
page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")),
|
||||
delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "1.0")),
|
||||
@@ -196,4 +196,4 @@ class BatchCrawlStrategy:
|
||||
end_progress,
|
||||
f"Batch crawling completed: {len(successful_results)}/{total_urls} pages successful",
|
||||
)
|
||||
return successful_results
|
||||
return successful_results
|
||||
Reference in New Issue
Block a user