Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-30 21:49:30 -05:00)
fix: Improve domain filter robustness in crawling service
Backend fixes for crawling stability:

- Add comment clarifying DomainFilter doesn't need init params
- Improve base URL selection in recursive strategy:
  - Check start_urls length before indexing
  - Use appropriate base URL for domain checks
  - Fallback to original_url when start_urls is empty
- Add error handling for domain filter:
  - Wrap is_url_allowed in try/except block
  - Log exceptions and conservatively skip URLs on error
  - Prevents domain filter exceptions from crashing crawler
- Better handling of relative URL resolution

These changes ensure more robust crawling, especially when:

- start_urls array is empty
- Domain filter encounters unexpected URLs
- Relative links need proper base URL resolution
@@ -79,6 +79,7 @@ class CrawlingService:
         self.site_config = SiteConfig()
         self.markdown_generator = self.site_config.get_markdown_generator()
         self.link_pruning_markdown_generator = self.site_config.get_link_pruning_markdown_generator()
+        # DomainFilter doesn't need initialization params - it uses config passed to is_url_allowed
         self.domain_filter = DomainFilter()
 
         # Initialize strategies
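For context, here is a minimal sketch of the interface that comment assumes: a stateless DomainFilter whose is_url_allowed receives the candidate URL, a base URL, and the per-crawl config on every call. The class body and the config keys (allowed_domains, blocked_domains) are hypothetical illustrations, not Archon's actual implementation.

    # Hypothetical sketch of the DomainFilter interface assumed above.
    # Names and config shape are illustrative; the real class may differ.
    from urllib.parse import urlparse

    class DomainFilter:
        """Stateless filter: all policy comes from the config passed per call."""

        def is_url_allowed(self, url: str, base_url: str, crawl_config: dict) -> bool:
            host = urlparse(url).netloc.lower()
            base_host = urlparse(base_url).netloc.lower()

            # Deny list (hypothetical config key) takes precedence.
            if host in {h.lower() for h in crawl_config.get("blocked_domains", [])}:
                return False

            # Allow list (hypothetical config key): if present, it is exhaustive.
            allowed = {h.lower() for h in crawl_config.get("allowed_domains", [])}
            if allowed:
                return host in allowed

            # Default policy: stay on the same host as the base URL.
            return host == base_host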
@@ -296,9 +296,29 @@ class RecursiveCrawlStrategy:
             if next_url not in visited and not is_binary:
                 # Apply domain filtering if configured
                 if self.domain_filter and crawl_config:
-                    base_url = start_urls[0] if start_urls else original_url
-                    if not self.domain_filter.is_url_allowed(next_url, base_url, crawl_config):
-                        logger.debug(f"Filtering URL based on domain rules: {next_url}")
-                        continue
+                    # Use next_url's origin for domain checks, fallback to original_url
+                    # This ensures we're checking against the appropriate domain
+                    base_url = original_url
+                    if len(start_urls) > 0:
+                        # If we have start_urls, use the first one
+                        base_url = start_urls[0]
+                    else:
+                        # Try to use the current page's URL as base
+                        # This handles relative links better
+                        try:
+                            from urllib.parse import urljoin
+                            base_url = urljoin(original_url, next_url)
+                        except Exception:
+                            base_url = original_url
+
+                    # Wrap filter check in try/except to prevent crashes
+                    try:
+                        if not self.domain_filter.is_url_allowed(next_url, base_url, crawl_config):
+                            logger.debug(f"Filtering URL based on domain rules: {next_url}")
+                            continue
+                    except Exception as e:
+                        # Log error and conservatively skip the URL
+                        logger.warning(f"Error checking domain filter for {next_url}: {str(e)}. Skipping URL.")
+                        continue
 
                 if next_url not in next_level_urls:
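The else branch of the new base-URL selection leans on urllib.parse.urljoin, whose stdlib behavior is worth spelling out: a relative next_url is resolved against original_url, while an absolute next_url is returned unchanged and so becomes its own base for the domain check. A quick standalone check with hypothetical URLs:

    # Standalone demonstration of the urljoin fallback used in the else branch.
    from urllib.parse import urljoin

    original_url = "https://docs.example.com/guide/intro.html"

    # Absolute candidate: urljoin ignores the base and returns it as-is.
    print(urljoin(original_url, "https://other.example.org/page"))
    # -> https://other.example.org/page

    # Relative candidate: resolved against the current page, so the domain
    # check stays anchored to the site actually being crawled.
    print(urljoin(original_url, "../api/reference.html"))
    # -> https://docs.example.com/api/reference.html

This is why the diff's comment says the fallback "handles relative links better": a relative link is checked against the page it came from rather than against an empty or unrelated base.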