updates to the threading service and crawling from Rasmus's PRs

sean-eskerium
2025-08-20 16:19:15 -04:00
parent 58bda51ef5
commit c22bf07dd3
3 changed files with 162 additions and 95 deletions


@@ -4,7 +4,6 @@ Batch Crawling Strategy
 Handles batch crawling of multiple URLs in parallel.
 """
 import asyncio
 from typing import List, Dict, Any, Optional, Callable
 from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
@@ -70,10 +69,12 @@ class BatchCrawlStrategy:
except (ValueError, KeyError, TypeError) as e:
# Critical configuration errors should fail fast in alpha
logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
raise ValueError(f"Failed to load crawler configuration: {e}")
raise ValueError(f"Failed to load crawler configuration: {e}") from e
except Exception as e:
# For non-critical errors (e.g., network issues), use defaults but log prominently
logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
logger.error(
f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True
)
batch_size = 50
if max_concurrent is None:
max_concurrent = 10 # Safe default to prevent memory issues
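
The switch to `raise ... from e` chains the original exception as `__cause__`, so the traceback shows both the low-level settings failure and the higher-level configuration error. A minimal sketch of the fail-fast vs. fallback pattern this hunk implements, with a hypothetical `load_settings()` standing in for the repository's database call:

import logging

logger = logging.getLogger(__name__)

def load_crawler_config(load_settings):
    """Sketch of the two-tier error handling (load_settings is hypothetical)."""
    try:
        settings = load_settings()
        batch_size = int(settings["CRAWL_BATCH_SIZE"])
        max_concurrent = int(settings["CRAWL_MAX_CONCURRENT"])
    except (ValueError, KeyError, TypeError) as e:
        # Malformed settings indicate a configuration bug: fail fast,
        # chaining the original error so neither traceback is lost.
        raise ValueError(f"Failed to load crawler configuration: {e}") from e
    except Exception as e:
        # Transient failures (e.g., database unreachable): fall back to
        # conservative defaults but log prominently.
        logger.error("Failed to load crawl settings: %s, using defaults", e, exc_info=True)
        batch_size, max_concurrent = 50, 10
    return batch_size, max_concurrent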
@@ -91,7 +92,6 @@ class BatchCrawlStrategy:
cache_mode=CacheMode.BYPASS,
stream=True, # Enable streaming for faster parallel processing
markdown_generator=self.markdown_generator,
wait_for="body", # Simple selector for batch
wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")),
delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "1.0")),
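
Dropping `wait_for="body"` leaves page readiness to `wait_until` alone, avoiding a redundant selector wait on every page. A rough sketch of how a config like this can drive a streaming batch crawl with crawl4ai's `MemoryAdaptiveDispatcher` (parameter values are illustrative, and the streaming API shape may vary across crawl4ai versions):

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher

async def crawl_batch(urls):
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,    # always fetch fresh content
        stream=True,                    # yield results as each page finishes
        wait_until="domcontentloaded",  # readiness handled here, no wait_for selector
        page_timeout=30000,
        delay_before_return_html=1.0,
    )
    dispatcher = MemoryAdaptiveDispatcher(max_session_permit=10)  # cap concurrency
    successful = []
    async with AsyncWebCrawler() as crawler:
        # With stream=True, arun_many yields results as they complete
        # instead of returning one list at the end.
        async for result in await crawler.arun_many(urls, config=config, dispatcher=dispatcher):
            if result.success:
                successful.append(result)
    return successful

# asyncio.run(crawl_batch(["https://example.com"]))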
@@ -196,4 +196,4 @@ class BatchCrawlStrategy:
end_progress,
f"Batch crawling completed: {len(successful_results)}/{total_urls} pages successful",
)
return successful_results
return successful_results
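
The closing hunk reports completion through what appears to be a progress callback, but the call site itself isn't shown, so the following signature is an assumption. A minimal sketch of the shape such a callback could take:

from typing import Awaitable, Callable

# Assumed signature: a completion percentage plus a human-readable status message.
ProgressCallback = Callable[[int, str], Awaitable[None]]

async def log_progress(progress: int, message: str) -> None:
    # Stand-in reporter; the strategy presumably awaits a callback like this.
    print(f"[{progress}%] {message}")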