Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-30 21:49:30 -05:00)
fix: Improve domain filter robustness in crawling service
Backend fixes for crawling stability:

- Add comment clarifying DomainFilter doesn't need init params
- Improve base URL selection in recursive strategy:
  - Check start_urls length before indexing
  - Use appropriate base URL for domain checks
  - Fallback to original_url when start_urls is empty
- Add error handling for domain filter:
  - Wrap is_url_allowed in try/except block
  - Log exceptions and conservatively skip URLs on error
  - Prevents domain filter exceptions from crashing crawler
- Better handling of relative URL resolution

These changes ensure more robust crawling, especially when:

- start_urls array is empty
- Domain filter encounters unexpected URLs
- Relative links need proper base URL resolution
@@ -79,6 +79,7 @@ class CrawlingService:
         self.site_config = SiteConfig()
         self.markdown_generator = self.site_config.get_markdown_generator()
         self.link_pruning_markdown_generator = self.site_config.get_link_pruning_markdown_generator()
+        # DomainFilter doesn't need initialization params - it uses config passed to is_url_allowed
         self.domain_filter = DomainFilter()
 
         # Initialize strategies
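For context, here is a minimal sketch of the interface that comment assumes: a stateless DomainFilter whose is_url_allowed receives the candidate URL, a base URL, and the per-crawl config on every call. The class body and the config keys (allowed_domains, blocked_domains) are hypothetical illustrations, not Archon's actual implementation.

    # Hypothetical sketch of the DomainFilter interface assumed above.
    # Names and config shape are illustrative; the real class may differ.
    from urllib.parse import urlparse

    class DomainFilter:
        """Stateless filter: all policy comes from the config passed per call."""

        def is_url_allowed(self, url: str, base_url: str, crawl_config: dict) -> bool:
            host = urlparse(url).netloc.lower()
            base_host = urlparse(base_url).netloc.lower()

            # Deny list (hypothetical config key) takes precedence.
            if host in {h.lower() for h in crawl_config.get("blocked_domains", [])}:
                return False

            # Allow list (hypothetical config key): if present, it is exhaustive.
            allowed = {h.lower() for h in crawl_config.get("allowed_domains", [])}
            if allowed:
                return host in allowed

            # Default policy: stay on the same host as the base URL.
            return host == base_host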
@@ -296,9 +296,29 @@ class RecursiveCrawlStrategy:
             if next_url not in visited and not is_binary:
                 # Apply domain filtering if configured
                 if self.domain_filter and crawl_config:
-                    base_url = start_urls[0] if start_urls else original_url
-                    if not self.domain_filter.is_url_allowed(next_url, base_url, crawl_config):
-                        logger.debug(f"Filtering URL based on domain rules: {next_url}")
-                        continue
+                    # Use next_url's origin for domain checks, fallback to original_url
+                    # This ensures we're checking against the appropriate domain
+                    base_url = original_url
+                    if len(start_urls) > 0:
+                        # If we have start_urls, use the first one
+                        base_url = start_urls[0]
+                    else:
+                        # Try to use the current page's URL as base
+                        # This handles relative links better
+                        try:
+                            from urllib.parse import urljoin
+                            base_url = urljoin(original_url, next_url)
+                        except Exception:
+                            base_url = original_url
+
+                    # Wrap filter check in try/except to prevent crashes
+                    try:
+                        if not self.domain_filter.is_url_allowed(next_url, base_url, crawl_config):
+                            logger.debug(f"Filtering URL based on domain rules: {next_url}")
+                            continue
+                    except Exception as e:
+                        # Log error and conservatively skip the URL
+                        logger.warning(f"Error checking domain filter for {next_url}: {str(e)}. Skipping URL.")
+                        continue
 
                 if next_url not in next_level_urls:
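The else branch of the new base-URL selection leans on urllib.parse.urljoin, whose stdlib behavior is worth spelling out: a relative next_url is resolved against original_url, while an absolute next_url is returned unchanged and so becomes its own base for the domain check. A quick standalone check with hypothetical URLs:

    # Standalone demonstration of the urljoin fallback used in the else branch.
    from urllib.parse import urljoin

    original_url = "https://docs.example.com/guide/intro.html"

    # Absolute candidate: urljoin ignores the base and returns it as-is.
    print(urljoin(original_url, "https://other.example.org/page"))
    # -> https://other.example.org/page

    # Relative candidate: resolved against the current page, so the domain
    # check stays anchored to the site actually being crawled.
    print(urljoin(original_url, "../api/reference.html"))
    # -> https://docs.example.com/api/reference.html

This is why the diff's comment says the fallback "handles relative links better": a relative link is checked against the page it came from rather than against an empty or unrelated base.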