From 7ea4d99a27ae7c76bed1c14af2c178a23472d093 Mon Sep 17 00:00:00 2001
From: leex279
Date: Mon, 22 Sep 2025 13:44:10 +0200
Subject: [PATCH] fix: Improve domain filter robustness in crawling service

Backend fixes for crawling stability:
- Add comment clarifying DomainFilter doesn't need init params
- Improve base URL selection in recursive strategy:
  - Check start_urls length before indexing
  - Use an appropriate base URL for domain checks
  - Fall back to original_url when start_urls is empty
- Add error handling for domain filter:
  - Wrap is_url_allowed in a try/except block
  - Log exceptions and conservatively skip URLs on error
  - Prevent domain filter exceptions from crashing the crawler
- Better handling of relative URL resolution

These changes ensure more robust crawling, especially when:
- the start_urls array is empty
- the domain filter encounters unexpected URLs
- relative links need proper base URL resolution
---
 .../services/crawling/crawling_service.py     |  1 +
 .../services/crawling/strategies/recursive.py | 26 ++++++++++++++++---
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py
index 9a84fe6d..137ee58a 100644
--- a/python/src/server/services/crawling/crawling_service.py
+++ b/python/src/server/services/crawling/crawling_service.py
@@ -79,6 +79,7 @@ class CrawlingService:
         self.site_config = SiteConfig()
         self.markdown_generator = self.site_config.get_markdown_generator()
         self.link_pruning_markdown_generator = self.site_config.get_link_pruning_markdown_generator()
+        # DomainFilter doesn't need initialization params - it uses the config passed to is_url_allowed
         self.domain_filter = DomainFilter()
 
         # Initialize strategies
diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py
index 6a19ae45..775ce419 100644
--- a/python/src/server/services/crawling/strategies/recursive.py
+++ b/python/src/server/services/crawling/strategies/recursive.py
@@ -296,9 +296,29 @@ class RecursiveCrawlStrategy:
                     if next_url not in visited and not is_binary:
                         # Apply domain filtering if configured
                         if self.domain_filter and crawl_config:
-                            base_url = start_urls[0] if start_urls else original_url
-                            if not self.domain_filter.is_url_allowed(next_url, base_url, crawl_config):
-                                logger.debug(f"Filtering URL based on domain rules: {next_url}")
+                            # Pick the base URL for domain checks: prefer start_urls[0],
+                            # fall back to original_url when start_urls is empty
+                            base_url = original_url
+                            if len(start_urls) > 0:
+                                # If we have start_urls, use the first one
+                                base_url = start_urls[0]
+                            else:
+                                # Resolve next_url against original_url so relative
+                                # links yield an absolute URL for the check
+                                try:
+                                    from urllib.parse import urljoin
+                                    base_url = urljoin(original_url, next_url)
+                                except Exception:
+                                    base_url = original_url
+
+                            # Wrap the filter check in try/except to prevent crashes
+                            try:
+                                if not self.domain_filter.is_url_allowed(next_url, base_url, crawl_config):
+                                    logger.debug(f"Filtering URL based on domain rules: {next_url}")
+                                    continue
+                            except Exception as e:
+                                # Log the error and conservatively skip the URL
+                                logger.warning(f"Error checking domain filter for {next_url}: {str(e)}. Skipping URL.")
                                 continue
 
                         if next_url not in next_level_urls:
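
Reviewer note (not part of the patch): the sketch below isolates the control flow this diff introduces, base URL selection plus the defensive wrapper around is_url_allowed, so it can be run standalone. FlakyFilter, should_crawl, and the sample URLs are hypothetical stand-ins; only the (url, base_url, config) call shape of DomainFilter.is_url_allowed is taken from the diff.

    # Minimal standalone sketch of the patched filtering flow.
    # FlakyFilter and the sample URLs are hypothetical; only the
    # control flow mirrors the change to recursive.py above.
    import logging
    from urllib.parse import urljoin, urlparse

    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    class FlakyFilter:
        """Stand-in for DomainFilter: raises on URLs it cannot evaluate."""
        def is_url_allowed(self, url: str, base_url: str, config: dict) -> bool:
            parsed = urlparse(url)
            if not parsed.scheme or not parsed.netloc:
                raise ValueError(f"cannot evaluate non-absolute URL: {url!r}")
            return parsed.netloc == urlparse(base_url).netloc

    def should_crawl(domain_filter, start_urls, original_url, next_url, crawl_config) -> bool:
        # Base URL selection as in the patch: prefer start_urls[0],
        # otherwise resolve next_url against original_url.
        if len(start_urls) > 0:
            base_url = start_urls[0]
        else:
            try:
                base_url = urljoin(original_url, next_url)
            except Exception:
                base_url = original_url

        # The defensive wrapper: a filter exception skips the URL
        # instead of crashing the crawl loop.
        try:
            if not domain_filter.is_url_allowed(next_url, base_url, crawl_config):
                logger.debug(f"Filtering URL based on domain rules: {next_url}")
                return False
            return True
        except Exception as e:
            logger.warning(f"Error checking domain filter for {next_url}: {e}. Skipping URL.")
            return False

    f = FlakyFilter()
    print(should_crawl(f, ["https://example.com"], "https://example.com", "https://example.com/a", {}))  # True
    print(should_crawl(f, ["https://example.com"], "https://example.com", "https://other.org/b", {}))    # False (filtered)
    print(should_crawl(f, ["https://example.com"], "https://example.com", "mailto:x@y.z", {}))           # False (exception path)

The third call illustrates the point of the wrapper: a URL the filter cannot evaluate now produces a warning and is skipped, where the pre-patch code would have let the exception escape the crawl loop.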