Mirror of https://github.com/coleam00/Archon.git, synced 2025-12-30 21:49:30 -05:00
Fix crawler attempting to navigate to binary files
- Add is_binary_file() method to URLHandler to detect 40+ binary file extensions
- Update RecursiveCrawlStrategy to filter binary URLs before they enter the crawl queue
- Add comprehensive unit tests for binary file detection
- Prevent net::ERR_ABORTED errors when the crawler encounters ZIP, PDF, and similar files

This fixes an issue where the crawler treated binary file URLs (such as .zip downloads) as navigable web pages, causing errors in crawl4ai.
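The unit tests mentioned in the commit message are not part of the hunks below. As a rough illustration only, a minimal pytest-style sketch of what such coverage could look like follows; the import path, test file layout, and example URLs are assumptions, not the project's actual tests.

# Hypothetical test sketch; the real tests added by this commit are not shown in the diff.
import pytest

from helpers.url_handler import URLHandler  # assumed import path


@pytest.mark.parametrize("url,expected", [
    ("https://example.com/downloads/release.zip", True),   # archive
    ("https://example.com/installers/setup.exe", True),    # executable
    ("https://example.com/docs/getting-started", False),   # regular page, no extension
    ("https://example.com/index.html", False),             # HTML page
])
def test_is_binary_file(url, expected):
    assert URLHandler.is_binary_file(url) == expected
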
@@ -48,6 +48,54 @@ class URLHandler:
             logger.warning(f"Error checking if URL is text file: {e}")
             return False
 
+    @staticmethod
+    def is_binary_file(url: str) -> bool:
+        """
+        Check if a URL points to a binary file that shouldn't be crawled.
+
+        Args:
+            url: URL to check
+
+        Returns:
+            True if URL is a binary file, False otherwise
+        """
+        try:
+            # Remove query parameters and fragments for cleaner extension checking
+            parsed = urlparse(url)
+            path = parsed.path.lower()
+
+            # Comprehensive list of binary and non-HTML file extensions
+            binary_extensions = {
+                # Archives
+                '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
+                # Executables and installers
+                '.exe', '.dmg', '.pkg', '.deb', '.rpm', '.msi', '.app', '.appimage',
+                # Documents (non-HTML)
+                '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods',
+                # Images
+                '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp', '.tiff',
+                # Audio/Video
+                '.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.wav', '.flac',
+                # Data files
+                '.csv', '.sql', '.db', '.sqlite',
+                # Binary data
+                '.iso', '.img', '.bin', '.dat',
+                # Development files (usually not meant to be crawled as pages)
+                '.wasm', '.pyc', '.jar', '.war', '.class', '.dll', '.so', '.dylib'
+            }
+
+            # Check if the path ends with any binary extension
+            for ext in binary_extensions:
+                if path.endswith(ext):
+                    logger.debug(f"Skipping binary file: {url} (matched extension: {ext})")
+                    return True
+
+            return False
+        except Exception as e:
+            logger.warning(f"Error checking if URL is binary file: {e}")
+            # In case of error, don't skip the URL (safer to attempt crawl than miss content)
+            return False
+
     @staticmethod
     def transform_github_url(url: str) -> str:
         """
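
Because is_binary_file is a @staticmethod that only inspects the URL string, it can be exercised in isolation. A quick illustrative sketch follows; the import path and example URLs are assumptions, not taken from the repository.

# Illustrative only; the package prefix is assumed from the later
# "from ..helpers.url_handler import URLHandler" import.
from helpers.url_handler import URLHandler

print(URLHandler.is_binary_file("https://example.com/releases/archon.zip"))   # True: .zip extension
print(URLHandler.is_binary_file("https://example.com/guide.pdf?download=1"))  # True: query string is ignored
print(URLHandler.is_binary_file("https://example.com/docs/quickstart"))       # False: no binary extension
print(URLHandler.is_binary_file("https://example.com/changelog.html#v2"))     # False: fragment is ignored, .html is crawlable

Note the fail-open behavior in the method itself: if URL parsing raises, it returns False so the URL is still attempted rather than silently dropped.
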
@@ -10,6 +10,7 @@ from urllib.parse import urldefrag
 from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
 from ....config.logfire_config import get_logger
 from ...credential_service import credential_service
+from ..helpers.url_handler import URLHandler
 
 logger = get_logger(__name__)
 
@@ -27,6 +28,7 @@ class RecursiveCrawlStrategy:
         """
         self.crawler = crawler
         self.markdown_generator = markdown_generator
+        self.url_handler = URLHandler()
 
     async def crawl_recursive_with_progress(
         self,
@@ -190,8 +192,11 @@ class RecursiveCrawlStrategy:
                     # Find internal links for next depth
                     for link in result.links.get("internal", []):
                         next_url = normalize_url(link["href"])
-                        if next_url not in visited:
+                        # Skip binary files and already visited URLs
+                        if next_url not in visited and not self.url_handler.is_binary_file(next_url):
                             next_level_urls.add(next_url)
+                        elif self.url_handler.is_binary_file(next_url):
+                            logger.debug(f"Skipping binary file from crawl queue: {next_url}")
                 else:
                     logger.warning(f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}")
 
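
The effect of the new filter on the next-depth queue can be sketched in isolation. This is a simplified illustration, not the strategy's actual code: normalize_url is reduced to a fragment-stripping stub, the URLHandler import path is assumed, and the link data is made up.

# Simplified sketch of the queue-filtering step; names mirror the diff above,
# but the data and the normalize_url stub are hypothetical.
from urllib.parse import urldefrag

from helpers.url_handler import URLHandler  # assumed import path


def normalize_url(url: str) -> str:
    # Minimal stand-in for the strategy's normalizer: drop the fragment.
    return urldefrag(url).url


url_handler = URLHandler()
visited = {"https://example.com/docs/"}
links = [
    {"href": "https://example.com/docs/"},              # already visited -> skipped
    {"href": "https://example.com/docs/install.html"},  # crawlable page  -> queued
    {"href": "https://example.com/downloads/cli.zip"},  # binary file     -> skipped
]

next_level_urls = set()
for link in links:
    next_url = normalize_url(link["href"])
    if next_url not in visited and not url_handler.is_binary_file(next_url):
        next_level_urls.add(next_url)

print(next_level_urls)  # {'https://example.com/docs/install.html'}

Keeping the .zip URL out of next_level_urls means crawl4ai never navigates to it, which is where the net::ERR_ABORTED errors described in the commit message came from.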