diff --git a/.env.example b/.env.example
index f2cf73e5..981f1da2 100644
--- a/.env.example
+++ b/.env.example
@@ -33,4 +33,9 @@ EMBEDDING_DIMENSIONS=1536
 # - OPENAI_API_KEY (encrypted)
 # - MODEL_CHOICE
 # - TRANSPORT settings
-# - RAG strategy flags (USE_CONTEXTUAL_EMBEDDINGS, USE_HYBRID_SEARCH, etc.)
\ No newline at end of file
+# - RAG strategy flags (USE_CONTEXTUAL_EMBEDDINGS, USE_HYBRID_SEARCH, etc.)
+# - Crawler settings:
+#   * CRAWL_MAX_CONCURRENT (default: 10) - Max concurrent pages per crawl operation
+#   * CRAWL_BATCH_SIZE (default: 50) - URLs processed per batch
+#   * MEMORY_THRESHOLD_PERCENT (default: 80) - Memory % before throttling
+#   * DISPATCHER_CHECK_INTERVAL (default: 0.5) - Memory check interval in seconds
\ No newline at end of file
diff --git a/python/src/server/services/crawling/strategies/batch.py b/python/src/server/services/crawling/strategies/batch.py
index a17124db..d97b0bc4 100644
--- a/python/src/server/services/crawling/strategies/batch.py
+++ b/python/src/server/services/crawling/strategies/batch.py
@@ -59,7 +59,7 @@ class BatchCrawlStrategy:
             await progress_callback("error", 0, "Crawler not available")
             return []
 
-        # Load settings from database first
+        # Load settings from database - fail fast on configuration errors
         try:
             settings = await credential_service.get_credentials_by_category("rag_strategy")
             batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
@@ -67,11 +67,16 @@
             max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
             memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
             check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
+        except (ValueError, KeyError, TypeError) as e:
+            # Critical configuration errors should fail fast in alpha
+            logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
+            raise ValueError(f"Failed to load crawler configuration: {e}")
         except Exception as e:
-            logger.warning(f"Failed to load crawl settings: {e}, using defaults")
+            # For non-critical errors (e.g., network issues), use defaults but log prominently
+            logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True)
             batch_size = 50
             if max_concurrent is None:
-                max_concurrent = 10
+                max_concurrent = 10  # Safe default to prevent memory issues
             memory_threshold = 80.0
             check_interval = 0.5
             settings = {}  # Empty dict for defaults
diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py
index 675c97f0..8b9cf93e 100644
--- a/python/src/server/services/crawling/strategies/recursive.py
+++ b/python/src/server/services/crawling/strategies/recursive.py
@@ -61,7 +61,7 @@ class RecursiveCrawlStrategy:
             await progress_callback('error', 0, 'Crawler not available')
             return []
 
-        # Load settings from database
+        # Load settings from database - fail fast on configuration errors
         try:
             settings = await credential_service.get_credentials_by_category("rag_strategy")
             batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
@@ -69,11 +69,16 @@
             max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
             memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
             check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
+        except (ValueError, KeyError, TypeError) as e:
+            # Critical configuration errors should fail fast in alpha
+            logger.error(f"Invalid crawl settings format: {e}", exc_info=True)
+            raise ValueError(f"Failed to load crawler configuration: {e}")
{e}") except Exception as e: - logger.warning(f"Failed to load crawl settings: {e}, using defaults") + # For non-critical errors (e.g., network issues), use defaults but log prominently + logger.error(f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True) batch_size = 50 if max_concurrent is None: - max_concurrent = 10 + max_concurrent = 10 # Safe default to prevent memory issues memory_threshold = 80.0 check_interval = 0.5 settings = {} # Empty dict for defaults