Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-30 21:49:30 -05:00)
Merge pull request #218 from coleam00/fix/filter-binary-files-from-crawl
Fix crawler attempting to navigate to binary files
helpers/url_handler.py:

@@ -48,6 +48,54 @@ class URLHandler:
             logger.warning(f"Error checking if URL is text file: {e}")
             return False
 
+    @staticmethod
+    def is_binary_file(url: str) -> bool:
+        """
+        Check if a URL points to a binary file that shouldn't be crawled.
+
+        Args:
+            url: URL to check
+
+        Returns:
+            True if URL is a binary file, False otherwise
+        """
+        try:
+            # Remove query parameters and fragments for cleaner extension checking
+            parsed = urlparse(url)
+            path = parsed.path.lower()
+
+            # Comprehensive list of binary and non-HTML file extensions
+            binary_extensions = {
+                # Archives
+                '.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz', '.tgz',
+                # Executables and installers
+                '.exe', '.dmg', '.pkg', '.deb', '.rpm', '.msi', '.app', '.appimage',
+                # Documents (non-HTML)
+                '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods',
+                # Images
+                '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp', '.tiff',
+                # Audio/Video
+                '.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.wav', '.flac',
+                # Data files
+                '.csv', '.sql', '.db', '.sqlite',
+                # Binary data
+                '.iso', '.img', '.bin', '.dat',
+                # Development files (usually not meant to be crawled as pages)
+                '.wasm', '.pyc', '.jar', '.war', '.class', '.dll', '.so', '.dylib'
+            }
+
+            # Check if the path ends with any binary extension
+            for ext in binary_extensions:
+                if path.endswith(ext):
+                    logger.debug(f"Skipping binary file: {url} (matched extension: {ext})")
+                    return True
+
+            return False
+        except Exception as e:
+            logger.warning(f"Error checking if URL is binary file: {e}")
+            # In case of error, don't skip the URL (safer to attempt crawl than miss content)
+            return False
+
     @staticmethod
     def transform_github_url(url: str) -> str:
         """
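The new check is pure string matching on the parsed path, so it can be sanity-checked in isolation. A minimal sketch (the import path is an assumption inferred from the `..helpers.url_handler` import in the diff below; `is_binary_file` itself depends only on `urlparse` and a module-level `logger`):

    from src.server.services.crawling.helpers.url_handler import URLHandler  # path assumed

    for url in [
        "https://example.com/docs/guide.html",          # ordinary page -> crawl
        "https://example.com/releases/app-1.2.0.zip",   # archive -> skip
        "https://example.com/paper.pdf?download=true",  # query stripped, .pdf still matches
        "https://example.com/logo.svg#icon",            # fragment stripped, .svg still matches
    ]:
        print(url, "->", "skip" if URLHandler.is_binary_file(url) else "crawl")

Because the match runs on `parsed.path` only, a query string or fragment cannot hide a binary extension, and an extensionless URL such as `https://example.com/zip` is still crawled.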
Recursive crawl strategy module (class RecursiveCrawlStrategy):

@@ -10,6 +10,7 @@ from urllib.parse import urldefrag
 from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
 from ....config.logfire_config import get_logger
 from ...credential_service import credential_service
+from ..helpers.url_handler import URLHandler
 
 logger = get_logger(__name__)
@@ -27,6 +28,7 @@ class RecursiveCrawlStrategy:
         """
         self.crawler = crawler
         self.markdown_generator = markdown_generator
+        self.url_handler = URLHandler()
 
     async def crawl_recursive_with_progress(
         self,
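A small design note on the hunk above: `is_binary_file` is a `@staticmethod`, so storing a `URLHandler()` instance on the strategy is a convenience rather than a requirement; both call styles below are equivalent:

    handler = URLHandler()
    handler.is_binary_file("https://example.com/a.zip")     # via the instance, as the strategy does
    URLHandler.is_binary_file("https://example.com/a.zip")  # via the class, same result

Keeping the instance on `self` does leave room for URLHandler to grow per-instance state later without touching call sites.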
@@ -195,8 +197,11 @@ class RecursiveCrawlStrategy:
                     # Find internal links for next depth
                     for link in result.links.get("internal", []):
                         next_url = normalize_url(link["href"])
-                        if next_url not in visited:
+                        # Skip binary files and already visited URLs
+                        if next_url not in visited and not self.url_handler.is_binary_file(next_url):
                             next_level_urls.add(next_url)
+                        elif self.url_handler.is_binary_file(next_url):
+                            logger.debug(f"Skipping binary file from crawl queue: {next_url}")
 
                 else:
                     logger.warning(f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}")