Mirror of https://github.com/coleam00/Archon.git, synced 2026-01-02 04:39:29 -05:00
Fix crawler attempting to navigate to binary files
- Add is_binary_file() method to URLHandler to detect 40+ binary extensions
- Update RecursiveCrawlStrategy to filter binary URLs before the crawl queue
- Add comprehensive unit tests for binary file detection
- Prevents net::ERR_ABORTED errors when the crawler encounters ZIP, PDF, etc.

This fixes the issue where the crawler was treating binary file URLs (like .zip downloads) as navigable web pages, causing errors in crawl4ai.
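The body of is_binary_file() is not part of this diff; a minimal sketch of what an extension-based check could look like follows. The extension set and method internals are assumptions for illustration only (only the URLHandler class and method name come from the commit):

```python
# Hypothetical sketch of URLHandler.is_binary_file(); the real method in the
# repo reportedly covers 40+ extensions and may handle more edge cases.
from urllib.parse import urlparse

BINARY_EXTENSIONS = {
    ".zip", ".tar", ".gz", ".rar", ".7z",      # archives
    ".pdf", ".doc", ".docx", ".xls", ".xlsx",  # documents
    ".png", ".jpg", ".jpeg", ".gif", ".webp",  # images
    ".mp3", ".mp4", ".avi", ".mov",            # media
    ".exe", ".dmg", ".iso", ".bin",            # executables / disk images
}

class URLHandler:
    def is_binary_file(self, url: str) -> bool:
        """Return True if the URL path ends in a known binary file extension."""
        path = urlparse(url).path.lower()
        return any(path.endswith(ext) for ext in BINARY_EXTENSIONS)

# Example: URLHandler().is_binary_file("https://example.com/release.zip") -> True
```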
@@ -10,6 +10,7 @@ from urllib.parse import urldefrag
 from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
 from ....config.logfire_config import get_logger
 from ...credential_service import credential_service
+from ..helpers.url_handler import URLHandler

 logger = get_logger(__name__)

@@ -27,6 +28,7 @@ class RecursiveCrawlStrategy:
         """
         self.crawler = crawler
         self.markdown_generator = markdown_generator
+        self.url_handler = URLHandler()

     async def crawl_recursive_with_progress(
         self,
@@ -190,8 +192,11 @@ class RecursiveCrawlStrategy:
                 # Find internal links for next depth
                 for link in result.links.get("internal", []):
                     next_url = normalize_url(link["href"])
-                    if next_url not in visited:
+                    # Skip binary files and already visited URLs
+                    if next_url not in visited and not self.url_handler.is_binary_file(next_url):
                         next_level_urls.add(next_url)
+                    elif self.url_handler.is_binary_file(next_url):
+                        logger.debug(f"Skipping binary file from crawl queue: {next_url}")
             else:
                 logger.warning(f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}")
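The commit message also mentions unit tests for binary file detection, but they are not shown on this page. A hedged sketch of what such tests might look like is below; the import path, test names, and cases are assumptions, not the repo's actual test module:

```python
# Hypothetical pytest-style checks for the binary-file filter; the real tests
# in the Archon repo may use a different module path and a larger case list.
import pytest

from url_handler import URLHandler  # assumed import path for this sketch


@pytest.mark.parametrize(
    "url,expected",
    [
        ("https://example.com/downloads/archive.zip", True),
        ("https://example.com/docs/manual.pdf", True),
        ("https://example.com/docs/getting-started", False),
        ("https://example.com/index.html", False),
    ],
)
def test_is_binary_file(url, expected):
    # Binary URLs should be flagged so the crawl queue never navigates to them.
    assert URLHandler().is_binary_file(url) == expected
```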