Merge pull request #218 from coleam00/fix/filter-binary-files-from-crawl

Fix crawler attempting to navigate to binary files
Authored by Wirasm on 2025-08-16 00:39:17 +03:00, committed by GitHub.
3 changed files with 179 additions and 1 deletion


@@ -48,6 +48,54 @@ class URLHandler:
             logger.warning(f"Error checking if URL is text file: {e}")
             return False
 
+    @staticmethod
+    def is_binary_file(url: str) -> bool:
+        """
+        Check if a URL points to a binary file that shouldn't be crawled.
+
+        Args:
+            url: URL to check
+
+        Returns:
+            True if URL is a binary file, False otherwise
+        """
+        try:
+            # Remove query parameters and fragments for cleaner extension checking
+            parsed = urlparse(url)
+            path = parsed.path.lower()
+
+            # Comprehensive list of binary and non-HTML file extensions
+            binary_extensions = {
+                # Archives
+                '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
+                # Executables and installers
+                '.exe', '.dmg', '.pkg', '.deb', '.rpm', '.msi', '.app', '.appimage',
+                # Documents (non-HTML)
+                '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods',
+                # Images
+                '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp', '.tiff',
+                # Audio/Video
+                '.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.wav', '.flac',
+                # Data files
+                '.csv', '.sql', '.db', '.sqlite',
+                # Binary data
+                '.iso', '.img', '.bin', '.dat',
+                # Development files (usually not meant to be crawled as pages)
+                '.wasm', '.pyc', '.jar', '.war', '.class', '.dll', '.so', '.dylib'
+            }
+
+            # Check if the path ends with any binary extension
+            for ext in binary_extensions:
+                if path.endswith(ext):
+                    logger.debug(f"Skipping binary file: {url} (matched extension: {ext})")
+                    return True
+
+            return False
+        except Exception as e:
+            logger.warning(f"Error checking if URL is binary file: {e}")
+            # In case of error, don't skip the URL (safer to attempt crawl than miss content)
+            return False
+
     @staticmethod
     def transform_github_url(url: str) -> str:
         """


@@ -10,6 +10,7 @@ from urllib.parse import urldefrag
 from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
 from ....config.logfire_config import get_logger
 from ...credential_service import credential_service
+from ..helpers.url_handler import URLHandler
 
 logger = get_logger(__name__)
 
@@ -27,6 +28,7 @@ class RecursiveCrawlStrategy:
         """
         self.crawler = crawler
         self.markdown_generator = markdown_generator
+        self.url_handler = URLHandler()
 
     async def crawl_recursive_with_progress(
         self,
@@ -195,8 +197,11 @@ class RecursiveCrawlStrategy:
                 # Find internal links for next depth
                 for link in result.links.get("internal", []):
                     next_url = normalize_url(link["href"])
-                    if next_url not in visited:
+                    # Skip binary files and already visited URLs
+                    if next_url not in visited and not self.url_handler.is_binary_file(next_url):
                         next_level_urls.add(next_url)
+                    elif self.url_handler.is_binary_file(next_url):
+                        logger.debug(f"Skipping binary file from crawl queue: {next_url}")
 
             else:
                 logger.warning(f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}")