updates to the threading service and crawling from Rasmus's PRs

sean-eskerium
2025-08-20 16:19:15 -04:00
parent 58bda51ef5
commit c22bf07dd3
3 changed files with 162 additions and 95 deletions


@@ -4,7 +4,6 @@ Batch Crawling Strategy
 Handles batch crawling of multiple URLs in parallel.
 """
 import asyncio
 from typing import List, Dict, Any, Optional, Callable
 from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
@@ -70,10 +69,12 @@ class BatchCrawlStrategy:
except (ValueError, KeyError, TypeError) as e:
# Critical configuration errors should fail fast in alpha
logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
raise ValueError(f"Failed to load crawler configuration: {e}")
raise ValueError(f"Failed to load crawler configuration: {e}") from e
except Exception as e:
# For non-critical errors (e.g., network issues), use defaults but log prominently
logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
logger.error(
f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True
)
batch_size = 50
if max_concurrent is None:
max_concurrent = 10 # Safe default to prevent memory issues
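
The switch to `raise ... from e` chains the original exception as `__cause__`, so the traceback shows both the low-level settings failure and the higher-level configuration error. A minimal sketch of the fail-fast vs. fallback pattern this hunk implements, with a hypothetical `load_settings()` standing in for the repository's database call:

import logging

logger = logging.getLogger(__name__)

def load_crawler_config(load_settings):
    """Sketch of the two-tier error handling (load_settings is hypothetical)."""
    try:
        settings = load_settings()
        batch_size = int(settings["CRAWL_BATCH_SIZE"])
        max_concurrent = int(settings["CRAWL_MAX_CONCURRENT"])
    except (ValueError, KeyError, TypeError) as e:
        # Malformed settings indicate a configuration bug: fail fast,
        # chaining the original error so neither traceback is lost.
        raise ValueError(f"Failed to load crawler configuration: {e}") from e
    except Exception as e:
        # Transient failures (e.g., database unreachable): fall back to
        # conservative defaults but log prominently.
        logger.error("Failed to load crawl settings: %s, using defaults", e, exc_info=True)
        batch_size, max_concurrent = 50, 10
    return batch_size, max_concurrent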
@@ -91,7 +92,6 @@ class BatchCrawlStrategy:
cache_mode=CacheMode.BYPASS,
stream=True, # Enable streaming for faster parallel processing
markdown_generator=self.markdown_generator,
wait_for="body", # Simple selector for batch
wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")),
delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "1.0")),
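
Dropping `wait_for="body"` leaves page readiness to `wait_until` alone, avoiding a redundant selector wait on every page. A rough sketch of how a config like this can drive a streaming batch crawl with crawl4ai's `MemoryAdaptiveDispatcher` (parameter values are illustrative, and the streaming API shape may vary across crawl4ai versions):

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher

async def crawl_batch(urls):
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,    # always fetch fresh content
        stream=True,                    # yield results as each page finishes
        wait_until="domcontentloaded",  # readiness handled here, no wait_for selector
        page_timeout=30000,
        delay_before_return_html=1.0,
    )
    dispatcher = MemoryAdaptiveDispatcher(max_session_permit=10)  # cap concurrency
    successful = []
    async with AsyncWebCrawler() as crawler:
        # With stream=True, arun_many yields results as they complete
        # instead of returning one list at the end.
        async for result in await crawler.arun_many(urls, config=config, dispatcher=dispatcher):
            if result.success:
                successful.append(result)
    return successful

# asyncio.run(crawl_batch(["https://example.com"]))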
@@ -196,4 +196,4 @@ class BatchCrawlStrategy:
end_progress,
f"Batch crawling completed: {len(successful_results)}/{total_urls} pages successful",
)
return successful_results
return successful_results
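
The closing hunk reports completion through what appears to be a progress callback, but the call site itself isn't shown, so the following signature is an assumption. A minimal sketch of the shape such a callback could take:

from typing import Awaitable, Callable

# Assumed signature: a completion percentage plus a human-readable status message.
ProgressCallback = Callable[[int, str], Awaitable[None]]

async def log_progress(progress: int, message: str) -> None:
    # Stand-in reporter; the strategy presumably awaits a callback like this.
    print(f"[{progress}%] {message}")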