Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-30 21:49:30 -05:00)
Merge pull request #218 from coleam00/fix/filter-binary-files-from-crawl
Fix crawler attempting to navigate to binary files
helpers/url_handler.py:

@@ -48,6 +48,54 @@ class URLHandler:
             logger.warning(f"Error checking if URL is text file: {e}")
             return False
 
+    @staticmethod
+    def is_binary_file(url: str) -> bool:
+        """
+        Check if a URL points to a binary file that shouldn't be crawled.
+
+        Args:
+            url: URL to check
+
+        Returns:
+            True if URL is a binary file, False otherwise
+        """
+        try:
+            # Remove query parameters and fragments for cleaner extension checking
+            parsed = urlparse(url)
+            path = parsed.path.lower()
+
+            # Comprehensive list of binary and non-HTML file extensions
+            binary_extensions = {
+                # Archives
+                '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
+                # Executables and installers
+                '.exe', '.dmg', '.pkg', '.deb', '.rpm', '.msi', '.app', '.appimage',
+                # Documents (non-HTML)
+                '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods',
+                # Images
+                '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp', '.tiff',
+                # Audio/Video
+                '.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.wav', '.flac',
+                # Data files
+                '.csv', '.sql', '.db', '.sqlite',
+                # Binary data
+                '.iso', '.img', '.bin', '.dat',
+                # Development files (usually not meant to be crawled as pages)
+                '.wasm', '.pyc', '.jar', '.war', '.class', '.dll', '.so', '.dylib'
+            }
+
+            # Check if the path ends with any binary extension
+            for ext in binary_extensions:
+                if path.endswith(ext):
+                    logger.debug(f"Skipping binary file: {url} (matched extension: {ext})")
+                    return True
+
+            return False
+        except Exception as e:
+            logger.warning(f"Error checking if URL is binary file: {e}")
+            # In case of error, don't skip the URL (safer to attempt crawl than miss content)
+            return False
+
     @staticmethod
     def transform_github_url(url: str) -> str:
         """
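The new check is pure string matching on the parsed path, so it can be sanity-checked in isolation. A minimal sketch (the import path is an assumption inferred from the `..helpers.url_handler` import in the diff below; `is_binary_file` itself depends only on `urlparse` and a module-level `logger`):

    from src.server.services.crawling.helpers.url_handler import URLHandler  # path assumed

    for url in [
        "https://example.com/docs/guide.html",          # ordinary page -> crawl
        "https://example.com/releases/app-1.2.0.zip",   # archive -> skip
        "https://example.com/paper.pdf?download=true",  # query stripped, .pdf still matches
        "https://example.com/logo.svg#icon",            # fragment stripped, .svg still matches
    ]:
        print(url, "->", "skip" if URLHandler.is_binary_file(url) else "crawl")

Because the match runs on `parsed.path` only, a query string or fragment cannot hide a binary extension, and an extensionless URL such as `https://example.com/zip` is still crawled.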
Recursive crawl strategy module (class RecursiveCrawlStrategy):

@@ -10,6 +10,7 @@ from urllib.parse import urldefrag
 from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
 from ....config.logfire_config import get_logger
 from ...credential_service import credential_service
+from ..helpers.url_handler import URLHandler
 
 logger = get_logger(__name__)
@@ -27,6 +28,7 @@ class RecursiveCrawlStrategy:
         """
         self.crawler = crawler
         self.markdown_generator = markdown_generator
+        self.url_handler = URLHandler()
 
     async def crawl_recursive_with_progress(
         self,
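A small design note on the hunk above: `is_binary_file` is a `@staticmethod`, so storing a `URLHandler()` instance on the strategy is a convenience rather than a requirement; both call styles below are equivalent:

    handler = URLHandler()
    handler.is_binary_file("https://example.com/a.zip")     # via the instance, as the strategy does
    URLHandler.is_binary_file("https://example.com/a.zip")  # via the class, same result

Keeping the instance on `self` does leave room for URLHandler to grow per-instance state later without touching call sites.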
@@ -195,8 +197,11 @@ class RecursiveCrawlStrategy:
                     # Find internal links for next depth
                     for link in result.links.get("internal", []):
                         next_url = normalize_url(link["href"])
-                        if next_url not in visited:
+                        # Skip binary files and already visited URLs
+                        if next_url not in visited and not self.url_handler.is_binary_file(next_url):
                             next_level_urls.add(next_url)
+                        elif self.url_handler.is_binary_file(next_url):
+                            logger.debug(f"Skipping binary file from crawl queue: {next_url}")
 
                 else:
                     logger.warning(f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}")