updates to the threading service and crawling from Rasmus's PRs

This commit is contained in:
sean-eskerium
2025-08-20 16:19:15 -04:00
parent 58bda51ef5
commit c22bf07dd3
3 changed files with 162 additions and 95 deletions

View File

@@ -4,7 +4,6 @@ Batch Crawling Strategy
 Handles batch crawling of multiple URLs in parallel.
 """
-import asyncio
 from typing import List, Dict, Any, Optional, Callable
 from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
@@ -70,10 +69,12 @@ class BatchCrawlStrategy:
         except (ValueError, KeyError, TypeError) as e:
             # Critical configuration errors should fail fast in alpha
             logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
-            raise ValueError(f"Failed to load crawler configuration: {e}")
+            raise ValueError(f"Failed to load crawler configuration: {e}") from e
         except Exception as e:
             # For non-critical errors (e.g., network issues), use defaults but log prominently
-            logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
+            logger.error(
+                f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True
+            )
             batch_size = 50
             if max_concurrent is None:
                 max_concurrent = 10  # Safe default to prevent memory issues
@@ -91,7 +92,6 @@ class BatchCrawlStrategy:
             cache_mode=CacheMode.BYPASS,
             stream=True,  # Enable streaming for faster parallel processing
             markdown_generator=self.markdown_generator,
-            wait_for="body",  # Simple selector for batch
             wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
             page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")),
             delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "1.0")),
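
Both crawl strategies now chain the re-raised configuration error to its cause with `raise ... from e` and reserve the default fallback for non-configuration failures. A minimal sketch of that pattern outside the diff, with a hypothetical `fetch_settings` coroutine standing in for the database-backed settings loader used by the real code:

import logging

logger = logging.getLogger(__name__)

async def load_crawl_settings(fetch_settings, max_concurrent=None):
    """Fail fast on malformed configuration, fall back to defaults on transient errors.

    `fetch_settings` is a stand-in for the database-backed settings loader the real
    strategies use; its name and signature are assumptions for this sketch.
    """
    try:
        settings = await fetch_settings()
        batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
        if max_concurrent is None:
            max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
    except (ValueError, KeyError, TypeError) as e:
        # Malformed configuration should surface immediately; "from e" keeps the
        # original traceback chained to the re-raised error.
        logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
        raise ValueError(f"Failed to load crawler configuration: {e}") from e
    except Exception as e:
        # Transient failures (e.g. the settings store is unreachable) fall back to
        # conservative defaults but are logged prominently.
        logger.error(f"Failed to load crawl settings: {e}, using defaults", exc_info=True)
        settings, batch_size = {}, 50
        if max_concurrent is None:
            max_concurrent = 10  # Safe default to prevent memory issues
    return settings, batch_size, max_concurrent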

View File

@@ -3,7 +3,7 @@ Recursive Crawling Strategy
 Handles recursive crawling of websites by following internal links.
 """
-import asyncio
 from typing import List, Dict, Any, Optional, Callable
 from urllib.parse import urldefrag
@@ -39,7 +39,7 @@ class RecursiveCrawlStrategy:
         max_concurrent: int = None,
         progress_callback: Optional[Callable] = None,
         start_progress: int = 10,
-        end_progress: int = 60
+        end_progress: int = 60,
     ) -> List[Dict[str, Any]]:
         """
         Recursively crawl internal links from start URLs up to a maximum depth with progress reporting.
@@ -60,7 +60,7 @@ class RecursiveCrawlStrategy:
         if not self.crawler:
             logger.error("No crawler instance available for recursive crawling")
             if progress_callback:
-                await progress_callback('error', 0, 'Crawler not available')
+                await progress_callback("error", 0, "Crawler not available")
             return []

         # Load settings from database - fail fast on configuration errors
@@ -74,10 +74,12 @@ class RecursiveCrawlStrategy:
         except (ValueError, KeyError, TypeError) as e:
             # Critical configuration errors should fail fast in alpha
             logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
-            raise ValueError(f"Failed to load crawler configuration: {e}")
+            raise ValueError(f"Failed to load crawler configuration: {e}") from e
         except Exception as e:
             # For non-critical errors (e.g., network issues), use defaults but log prominently
-            logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
+            logger.error(
+                f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True
+            )
             batch_size = 50
             if max_concurrent is None:
                 max_concurrent = 10  # Safe default to prevent memory issues
@@ -89,12 +91,13 @@ class RecursiveCrawlStrategy:
         has_doc_sites = any(is_documentation_site_func(url) for url in start_urls)

         if has_doc_sites:
-            logger.info("Detected documentation sites for recursive crawl, using enhanced configuration")
+            logger.info(
+                "Detected documentation sites for recursive crawl, using enhanced configuration"
+            )
             run_config = CrawlerRunConfig(
                 cache_mode=CacheMode.BYPASS,
                 stream=True,  # Enable streaming for faster parallel processing
                 markdown_generator=self.markdown_generator,
-                wait_for='body',
                 wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
                 page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")),
                 delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "1.0")),
@@ -102,7 +105,7 @@ class RecursiveCrawlStrategy:
                 scan_full_page=True,  # Trigger lazy loading
                 exclude_all_images=False,
                 remove_overlay_elements=True,
-                process_iframes=True
+                process_iframes=True,
             )
         else:
             # Configuration for regular recursive crawling
@@ -113,25 +116,21 @@ class RecursiveCrawlStrategy:
                 wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
                 page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "45000")),
                 delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "0.5")),
-                scan_full_page=True
+                scan_full_page=True,
             )

         dispatcher = MemoryAdaptiveDispatcher(
             memory_threshold_percent=memory_threshold,
             check_interval=check_interval,
-            max_session_permit=max_concurrent
+            max_session_permit=max_concurrent,
         )

         async def report_progress(percentage: int, message: str, **kwargs):
             """Helper to report progress if callback is available"""
             if progress_callback:
                 # Add step information for multi-progress tracking
-                step_info = {
-                    'currentStep': message,
-                    'stepMessage': message,
-                    **kwargs
-                }
-                await progress_callback('crawling', percentage, message, **step_info)
+                step_info = {"currentStep": message, "stepMessage": message, **kwargs}
+                await progress_callback("crawling", percentage, message, **step_info)

         visited = set()
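
The report_progress helper only adds step metadata and forwards to whatever progress_callback the caller supplies. A sketch of a compatible callback, with the signature inferred from the calls in this diff (the argument names are assumptions):

async def progress_callback(status: str, percentage: int, message: str, **step_info) -> None:
    # status is "crawling" or "error" in this strategy; step_info may carry
    # currentStep, stepMessage, totalPages, processedPages, and similar fields.
    print(f"[{status}] {percentage}% {message} {step_info}")
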
@@ -143,34 +142,49 @@ class RecursiveCrawlStrategy:
         total_processed = 0

         for depth in range(max_depth):
-            urls_to_crawl = [normalize_url(url) for url in current_urls if normalize_url(url) not in visited]
+            urls_to_crawl = [
+                normalize_url(url) for url in current_urls if normalize_url(url) not in visited
+            ]
             if not urls_to_crawl:
                 break

             # Calculate progress for this depth level
-            depth_start = start_progress + int((depth / max_depth) * (end_progress - start_progress) * 0.8)
-            depth_end = start_progress + int(((depth + 1) / max_depth) * (end_progress - start_progress) * 0.8)
+            depth_start = start_progress + int(
+                (depth / max_depth) * (end_progress - start_progress) * 0.8
+            )
+            depth_end = start_progress + int(
+                ((depth + 1) / max_depth) * (end_progress - start_progress) * 0.8
+            )

-            await report_progress(depth_start, f'Crawling depth {depth + 1}/{max_depth}: {len(urls_to_crawl)} URLs to process')
+            await report_progress(
+                depth_start,
+                f"Crawling depth {depth + 1}/{max_depth}: {len(urls_to_crawl)} URLs to process",
+            )

             # Use configured batch size for recursive crawling
             next_level_urls = set()
             depth_successful = 0

             for batch_idx in range(0, len(urls_to_crawl), batch_size):
-                batch_urls = urls_to_crawl[batch_idx:batch_idx + batch_size]
+                batch_urls = urls_to_crawl[batch_idx : batch_idx + batch_size]
                 batch_end_idx = min(batch_idx + batch_size, len(urls_to_crawl))

                 # Calculate progress for this batch within the depth
-                batch_progress = depth_start + int((batch_idx / len(urls_to_crawl)) * (depth_end - depth_start))
-                await report_progress(batch_progress,
-                    f'Depth {depth + 1}: crawling URLs {batch_idx + 1}-{batch_end_idx} of {len(urls_to_crawl)}',
-                    totalPages=total_processed + batch_idx,
-                    processedPages=len(results_all))
+                batch_progress = depth_start + int(
+                    (batch_idx / len(urls_to_crawl)) * (depth_end - depth_start)
+                )
+                await report_progress(
+                    batch_progress,
+                    f"Depth {depth + 1}: crawling URLs {batch_idx + 1}-{batch_end_idx} of {len(urls_to_crawl)}",
+                    totalPages=total_processed + batch_idx,
+                    processedPages=len(results_all),
+                )

                 # Use arun_many for native parallel crawling with streaming
                 logger.info(f"Starting parallel crawl of {len(batch_urls)} URLs with arun_many")
-                batch_results = await self.crawler.arun_many(urls=batch_urls, config=run_config, dispatcher=dispatcher)
+                batch_results = await self.crawler.arun_many(
+                    urls=batch_urls, config=run_config, dispatcher=dispatcher
+                )

                 # Handle streaming results from arun_many
                 i = 0
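
The reformatted depth/progress arithmetic is unchanged in behavior: 80% of the start_progress..end_progress window is split evenly across depths, and each batch interpolates within its depth window. A worked example with the signature defaults (the helper name is mine):

def depth_window(depth: int, max_depth: int, start_progress: int = 10, end_progress: int = 60):
    """Mirror of the interpolation above; 80% of the window is divided across depths."""
    span = (end_progress - start_progress) * 0.8
    depth_start = start_progress + int((depth / max_depth) * span)
    depth_end = start_progress + int(((depth + 1) / max_depth) * span)
    return depth_start, depth_end

# With max_depth=3 and the defaults the depths map to (10, 23), (23, 36), (36, 50);
# the remaining 50-60 range is left for the completion report at end_progress.
for d in range(3):
    print(d, depth_window(d, 3))
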
@@ -188,9 +202,9 @@ class RecursiveCrawlStrategy:
                     if result.success and result.markdown:
                         results_all.append({
-                            'url': original_url,
-                            'markdown': result.markdown,
-                            'html': result.html  # Always use raw HTML for code extraction
+                            "url": original_url,
+                            "markdown": result.markdown,
+                            "html": result.html,  # Always use raw HTML for code extraction
                         })
                         depth_successful += 1
@@ -198,28 +212,41 @@ class RecursiveCrawlStrategy:
                         for link in result.links.get("internal", []):
                             next_url = normalize_url(link["href"])
                             # Skip binary files and already visited URLs
-                            if next_url not in visited and not self.url_handler.is_binary_file(next_url):
+                            if next_url not in visited and not self.url_handler.is_binary_file(
+                                next_url
+                            ):
                                 next_level_urls.add(next_url)
                             elif self.url_handler.is_binary_file(next_url):
                                 logger.debug(f"Skipping binary file from crawl queue: {next_url}")
                     else:
-                        logger.warning(f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}")
+                        logger.warning(
+                            f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}"
+                        )

                     # Report progress every few URLs
                     current_idx = batch_idx + i + 1
                     if current_idx % 5 == 0 or current_idx == len(urls_to_crawl):
-                        current_progress = depth_start + int((current_idx / len(urls_to_crawl)) * (depth_end - depth_start))
-                        await report_progress(current_progress,
-                            f'Depth {depth + 1}: processed {current_idx}/{len(urls_to_crawl)} URLs ({depth_successful} successful)',
-                            totalPages=total_processed,
-                            processedPages=len(results_all))
+                        current_progress = depth_start + int(
+                            (current_idx / len(urls_to_crawl)) * (depth_end - depth_start)
+                        )
+                        await report_progress(
+                            current_progress,
+                            f"Depth {depth + 1}: processed {current_idx}/{len(urls_to_crawl)} URLs ({depth_successful} successful)",
+                            totalPages=total_processed,
+                            processedPages=len(results_all),
+                        )

                     i += 1

             current_urls = next_level_urls

             # Report completion of this depth
-            await report_progress(depth_end,
-                f'Depth {depth + 1} completed: {depth_successful} pages crawled, {len(next_level_urls)} URLs found for next depth')
+            await report_progress(
+                depth_end,
+                f"Depth {depth + 1} completed: {depth_successful} pages crawled, {len(next_level_urls)} URLs found for next depth",
+            )

-        await report_progress(end_progress, f'Recursive crawling completed: {len(results_all)} total pages crawled across {max_depth} depth levels')
+        await report_progress(
+            end_progress,
+            f"Recursive crawling completed: {len(results_all)} total pages crawled across {max_depth} depth levels",
+        )

         return results_all

View File

@@ -93,18 +93,19 @@ class RateLimiter:
         self._clean_old_entries(now)

         # Check if we can make the request
-        while not self._can_make_request(estimated_tokens):
+        if not self._can_make_request(estimated_tokens):
             wait_time = self._calculate_wait_time(estimated_tokens)
             if wait_time > 0:
                 logfire_logger.info(
-                    f"Rate limiting: waiting {wait_time:.1f}s (tokens={estimated_tokens}, current_usage={self._get_current_usage()})"
+                    f"Rate limiting: waiting {wait_time:.1f}s",
+                    extra={
+                        "tokens": estimated_tokens,
+                        "current_usage": self._get_current_usage(),
+                    }
                 )
                 await asyncio.sleep(wait_time)
-                # Clean old entries after waiting
-                now = time.time()
-                self._clean_old_entries(now)
-            else:
-                return False
+                return await self.acquire(estimated_tokens)
+            return False

         # Record the request
         self.request_times.append(now)
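
The acquire() change swaps the wait-and-loop for a single sleep followed by a recursive re-check, and refuses immediately when no amount of waiting would free capacity. A trimmed sketch of the new control flow; the private helpers are stubbed because their bodies are not part of this diff, and the True return on the success path is an assumption:

import asyncio
import time

class RateLimiterSketch:
    """Illustrates only the new acquire() flow; helper internals are placeholders."""

    def __init__(self) -> None:
        self.request_times: list[float] = []

    async def acquire(self, estimated_tokens: int = 0) -> bool:
        now = time.time()
        self._clean_old_entries(now)
        if not self._can_make_request(estimated_tokens):
            wait_time = self._calculate_wait_time(estimated_tokens)
            if wait_time > 0:
                # Sleep once, then re-enter acquire() so the rate window is
                # re-evaluated against freshly cleaned entries.
                await asyncio.sleep(wait_time)
                return await self.acquire(estimated_tokens)
            return False  # Waiting would not help, refuse the request
        self.request_times.append(now)  # Record the request
        return True

    # Placeholder helpers standing in for the real implementations:
    def _clean_old_entries(self, now: float) -> None: ...
    def _can_make_request(self, tokens: int) -> bool: return True
    def _calculate_wait_time(self, tokens: int) -> float: return 0.0

One design note: each retry recurses only after a full wait, so the method no longer spins when a zero wait is calculated, though sustained pressure can still stack waits across recursive calls.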
@@ -199,13 +200,21 @@ class MemoryAdaptiveDispatcher:
                 # Reduce workers when memory is high
                 workers = max(1, base // 2)
                 logfire_logger.warning(
-                    f"High memory usage detected, reducing workers (memory_percent={metrics.memory_percent}, workers={workers})"
+                    "High memory usage detected, reducing workers",
+                    extra={
+                        "memory_percent": metrics.memory_percent,
+                        "workers": workers,
+                    }
                 )
             elif metrics.cpu_percent > self.config.cpu_threshold * 100:
                 # Reduce workers when CPU is high
                 workers = max(1, base // 2)
                 logfire_logger.warning(
-                    f"High CPU usage detected, reducing workers (cpu_percent={metrics.cpu_percent}, workers={workers})"
+                    "High CPU usage detected, reducing workers",
+                    extra={
+                        "cpu_percent": metrics.cpu_percent,
+                        "workers": workers,
+                    }
                 )
             elif metrics.memory_percent < 50 and metrics.cpu_percent < 50:
                 # Increase workers when resources are available
@@ -235,7 +244,14 @@ class MemoryAdaptiveDispatcher:
         semaphore = asyncio.Semaphore(optimal_workers)

         logfire_logger.info(
-            f"Starting adaptive processing (items_count={len(items)}, workers={optimal_workers}, mode={mode}, memory_percent={self.last_metrics.memory_percent}, cpu_percent={self.last_metrics.cpu_percent})"
+            "Starting adaptive processing",
+            extra={
+                "items_count": len(items),
+                "workers": optimal_workers,
+                "mode": mode,
+                "memory_percent": self.last_metrics.memory_percent,
+                "cpu_percent": self.last_metrics.cpu_percent,
+            }
         )

         # Track active workers
@@ -310,7 +326,8 @@ class MemoryAdaptiveDispatcher:
                     del active_workers[worker_id]
                 logfire_logger.error(
-                    f"Processing failed for item {index} (error={str(e)}, item_index={index})"
+                    f"Processing failed for item {index}",
+                    extra={"error": str(e), "item_index": index}
                 )
                 return None
@@ -325,7 +342,13 @@ class MemoryAdaptiveDispatcher:
         success_rate = len(successful_results) / len(items) * 100
         logfire_logger.info(
-            f"Adaptive processing completed (total_items={len(items)}, successful={len(successful_results)}, success_rate={success_rate:.1f}%, workers_used={optimal_workers})"
+            "Adaptive processing completed",
+            extra={
+                "total_items": len(items),
+                "successful": len(successful_results),
+                "success_rate": f"{success_rate:.1f}%",
+                "workers_used": optimal_workers,
+            }
         )

         return successful_results
@@ -343,7 +366,8 @@ class WebSocketSafeProcessor:
         await websocket.accept()
         self.active_connections.append(websocket)
         logfire_logger.info(
-            f"WebSocket client connected (total_connections={len(self.active_connections)})"
+            "WebSocket client connected",
+            extra={"total_connections": len(self.active_connections)}
         )

     def disconnect(self, websocket: WebSocket):
@@ -351,7 +375,8 @@ class WebSocketSafeProcessor:
         if websocket in self.active_connections:
             self.active_connections.remove(websocket)
         logfire_logger.info(
-            f"WebSocket client disconnected (remaining_connections={len(self.active_connections)})"
+            "WebSocket client disconnected",
+            extra={"remaining_connections": len(self.active_connections)}
         )

     async def broadcast_progress(self, message: dict[str, Any]):
@@ -462,7 +487,7 @@ class ThreadingService:
         self._running = True
         self._health_check_task = asyncio.create_task(self._health_check_loop())

-        logfire_logger.info(f"Threading service started (config={self.config.__dict__})")
+        logfire_logger.info("Threading service started", extra={"config": self.config.__dict__})

     async def stop(self):
         """Stop the threading service"""
@@ -498,7 +523,8 @@ class ThreadingService:
         finally:
             duration = time.time() - start_time
             logfire_logger.debug(
-                f"Rate limited operation completed (duration={duration}, tokens={estimated_tokens})"
+                "Rate limited operation completed",
+                extra={"duration": duration, "tokens": estimated_tokens},
             )

     async def run_cpu_intensive(self, func: Callable, *args, **kwargs) -> Any:
@@ -550,30 +576,44 @@ class ThreadingService:
                 # Log system metrics
                 logfire_logger.info(
-                    f"System health check (memory_percent={metrics.memory_percent}, cpu_percent={metrics.cpu_percent}, available_memory_gb={metrics.available_memory_gb}, active_threads={metrics.active_threads}, active_websockets={len(self.websocket_processor.active_connections)})"
+                    "System health check",
+                    extra={
+                        "memory_percent": metrics.memory_percent,
+                        "cpu_percent": metrics.cpu_percent,
+                        "available_memory_gb": metrics.available_memory_gb,
+                        "active_threads": metrics.active_threads,
+                        "active_websockets": len(self.websocket_processor.active_connections),
+                    }
                 )

                 # Alert on critical thresholds
                 if metrics.memory_percent > 90:
                     logfire_logger.warning(
-                        f"Critical memory usage (memory_percent={metrics.memory_percent})"
+                        "Critical memory usage",
+                        extra={"memory_percent": metrics.memory_percent}
                     )
                     # Force garbage collection
                     gc.collect()

                 if metrics.cpu_percent > 95:
-                    logfire_logger.warning(f"Critical CPU usage (cpu_percent={metrics.cpu_percent})")
+                    logfire_logger.warning(
+                        "Critical CPU usage", extra={"cpu_percent": metrics.cpu_percent}
+                    )

                 # Check for memory leaks (too many threads)
                 if metrics.active_threads > self.config.max_workers * 3:
                     logfire_logger.warning(
-                        f"High thread count detected (active_threads={metrics.active_threads}, max_expected={self.config.max_workers * 3})"
+                        "High thread count detected",
+                        extra={
+                            "active_threads": metrics.active_threads,
+                            "max_expected": self.config.max_workers * 3,
+                        }
                     )

                 await asyncio.sleep(self.config.health_check_interval)

             except Exception as e:
-                logfire_logger.error(f"Health check failed (error={str(e)})")
+                logfire_logger.error("Health check failed", extra={"error": str(e)})
                 await asyncio.sleep(self.config.health_check_interval)
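
Across this file the change is the same: interpolated f-string log messages become a constant message plus an extra dict, so the values travel as structured attributes instead of being baked into the text. A minimal before/after sketch using the standard-library logger (the module's logfire_logger is assumed to accept the same extra keyword):

import logging

logfire_logger = logging.getLogger("threading_service")  # stand-in for the module's logger

def log_health(memory_percent: float, cpu_percent: float) -> None:
    # Before: values interpolated into the message; backends can only search free text.
    logfire_logger.info(
        f"System health check (memory_percent={memory_percent}, cpu_percent={cpu_percent})"
    )
    # After: constant message, with the values attached as structured fields that a
    # backend such as Logfire can index and filter on.
    logfire_logger.info(
        "System health check",
        extra={"memory_percent": memory_percent, "cpu_percent": cpu_percent},
    )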