Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-30 21:49:30 -05:00)
fix: Address CodeRabbit feedback for discovery service
- Preserve URL case in robots.txt parsing by only lowercasing the "sitemap:" prefix check
- Add support for relative sitemap paths in robots.txt using urljoin()
- Fix HTML meta tag parsing to use a case-insensitive regex instead of lowercasing content
- Add URL scheme validation for discovered sitemaps (http/https only)
- Fix discovery target domain filtering to use the discovered URL's domain instead of the input URL
- Clean up whitespace and improve dict comprehension usage

These changes improve discovery reliability and prevent URL corruption while maintaining backward compatibility with existing discovery behavior.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
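For context, here is a minimal, self-contained sketch of the robots.txt handling the bullets above describe. It is illustrative only (the function name and sample values are made up, not taken from the diff); the actual logic lives in DiscoveryService, in the @@ -315,12 +315,17 @@ hunk further down.

```python
from urllib.parse import urljoin, urlparse

def parse_sitemap_directives(robots_txt: str, base_url: str) -> list[str]:
    """Collect sitemap URLs from robots.txt without corrupting their case."""
    sitemaps: list[str] = []
    for raw_line in robots_txt.splitlines():
        line = raw_line.strip()
        # Only the "sitemap:" prefix check is case-insensitive; the URL itself keeps its case.
        if not line.lower().startswith("sitemap:"):
            continue
        value = line.split(":", 1)[1].strip()
        if not value:
            continue
        # Absolute URLs pass through unchanged; relative paths resolve against the site base.
        candidate = value if value.lower().startswith(("http://", "https://")) else urljoin(base_url, value)
        # Keep only http/https results.
        if urlparse(candidate).scheme in ("http", "https"):
            sitemaps.append(candidate)
    return sitemaps

print(parse_sitemap_directives("Sitemap: /SiteMap.XML", "https://example.com"))
# ['https://example.com/SiteMap.XML']
```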
@@ -20,11 +20,11 @@ from ..credential_service import credential_service
 # Import operations
 from .discovery_service import DiscoveryService
 from .document_storage_operations import DocumentStorageOperations
+from .page_storage_operations import PageStorageOperations
 from .helpers.site_config import SiteConfig

 # Import helpers
 from .helpers.url_handler import URLHandler
-from .page_storage_operations import PageStorageOperations
 from .progress_mapper import ProgressMapper
 from .strategies.batch import BatchCrawlStrategy
 from .strategies.recursive import RecursiveCrawlStrategy
@@ -413,18 +413,18 @@ class CrawlingService:
 total_pages=total_urls_to_crawl,
 processed_pages=0
 )

 # Crawl only the discovered file with discovery context
 discovered_url = discovered_urls[0]
 safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_url}")

 # Mark this as a discovery target for domain filtering
 discovery_request = request.copy()
 discovery_request["is_discovery_target"] = True
-discovery_request["original_domain"] = self.url_handler.get_base_url(url)
+discovery_request["original_domain"] = self.url_handler.get_base_url(discovered_url)

 crawl_results, crawl_type = await self._crawl_by_url_type(discovered_url, discovery_request)

 else:
 # No discovery - crawl the main URL normally
 total_urls_to_crawl = 1
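The `get_base_url` helper belongs to the service's own `URLHandler` and is not shown in this diff. As a rough stand-in (an assumption, not the actual implementation), the scheme-plus-host of the discovered URL can be derived with the standard library:

```python
from urllib.parse import urlparse

def base_of(url: str) -> str:
    # Hypothetical stand-in for url_handler.get_base_url(): scheme + host only.
    parts = urlparse(url)
    return f"{parts.scheme}://{parts.netloc}"

# Domain filtering now keys off the discovered file's URL, not the user-supplied one:
print(base_of("https://docs.example.com/sitemap.xml"))  # https://docs.example.com
```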
@@ -433,7 +433,7 @@ class CrawlingService:
 total_pages=total_urls_to_crawl,
 processed_pages=0
 )

 # Crawl the main URL
 safe_logfire_info(f"No discovery file found, crawling main URL: {url}")
 crawl_results, crawl_type = await self._crawl_by_url_type(url, request)
@@ -608,7 +608,7 @@ class CrawlingService:
 logger.error("Code extraction failed, continuing crawl without code examples", exc_info=True)
 safe_logfire_error(f"Code extraction failed | error={e}")
 code_examples_count = 0

 # Report code extraction failure to progress tracker
 if self.progress_tracker:
 await self.progress_tracker.update(
@@ -708,11 +708,11 @@ class CrawlingService:
 def _is_same_domain(self, url: str, base_domain: str) -> bool:
 """
 Check if a URL belongs to the same domain as the base domain.

 Args:
 url: URL to check
 base_domain: Base domain URL to compare against

 Returns:
 True if the URL is from the same domain
 """
@@ -842,7 +842,7 @@ class CrawlingService:
 if extracted_links_with_text:
 # Build mapping of URL -> link text for title fallback
-url_to_link_text = {link: text for link, text in extracted_links_with_text}
+url_to_link_text = dict(extracted_links_with_text)
 extracted_links = [link for link, _ in extracted_links_with_text]

 # For discovery targets, respect max_depth for same-domain links
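For reference, passing a list of (link, text) pairs straight to dict() produces the same mapping as the comprehension it replaces:

```python
pairs = [("https://example.com/a", "Page A"), ("https://example.com/b", "Page B")]
assert dict(pairs) == {link: text for link, text in pairs}
```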
@@ -124,10 +124,10 @@ class DiscoveryService:
 """
 Main discovery orchestrator - selects ONE best file across all categories.
 All files contain similar AI/crawling guidance, so we only need the best one.

 Args:
 base_url: Base URL to discover files for

 Returns:
 Single best URL found, or None if no files discovered
 """
@@ -182,7 +182,7 @@ class DiscoveryService:
 def _discover_best_sitemap(self, base_url: str) -> str | None:
 """
 Discover the best available sitemap using priority-based selection.

 Priority order:
 1. Sitemaps from robots.txt (highest priority - explicitly declared)
 2. Standard locations (sitemap_index.xml > sitemap-index.xml > sitemap.xml)
@@ -228,7 +228,7 @@ class DiscoveryService:
 def _discover_best_llms_file(self, base_url: str) -> str | None:
 """
 Discover the best available llms file using priority-based selection.

 Priority order:
 1. Standard locations (llms-full.txt > llms.txt > llms.md > llms.mdx > llms.markdown)
 2. Common subdirectory variations (static, public, docs, assets)
@@ -270,7 +270,7 @@ class DiscoveryService:
 return robots_url
 except Exception:
 logger.exception(f"Error discovering robots file for {base_url}")

 return None

 def _check_url_exists(self, url: str) -> bool:
@@ -315,12 +315,17 @@ class DiscoveryService:
 content = self._read_response_with_limit(resp, robots_url)

 # Parse robots.txt content for sitemap directives
-for line in content.splitlines():
-line = line.strip().lower()
-if line.startswith("sitemap:"):
-sitemap_url = line.split(":", 1)[1].strip()
-# Validate URL format before adding
-if sitemap_url and (sitemap_url.startswith('http://') or sitemap_url.startswith('https://')):
+for raw_line in content.splitlines():
+line = raw_line.strip()
+if line.lower().startswith("sitemap:"):
+sitemap_value = line.split(":", 1)[1].strip()
+if sitemap_value:
+# Allow absolute and relative sitemap values
+if sitemap_value.lower().startswith(("http://", "https://")):
+sitemap_url = sitemap_value
+else:
+# Resolve relative path against base_url
+sitemap_url = urljoin(base_url, sitemap_value)
 sitemaps.append(sitemap_url)
 logger.info(f"Found sitemap in robots.txt: {sitemap_url}")
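As a quick illustration of the urljoin() behavior the new branch relies on (the domain and paths below are made up):

```python
from urllib.parse import urljoin

base_url = "https://example.com"
# Absolute sitemap values are kept as-is by the startswith() branch;
# relative ones are resolved against the base URL:
print(urljoin(base_url, "/sitemap_index.xml"))   # https://example.com/sitemap_index.xml
print(urljoin(base_url, "sitemaps/pages.xml"))   # https://example.com/sitemaps/pages.xml
```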
@@ -341,10 +346,10 @@ class DiscoveryService:
 def _check_standard_patterns(self, base_url: str) -> dict[str, list[str]]:
 """
 Check common file locations for discovery targets.

 Args:
 base_url: Base URL to check standard locations for

 Returns:
 Dictionary with file types and discovered URLs
 """
@@ -395,10 +400,10 @@ class DiscoveryService:
 def _parse_html_meta_tags(self, base_url: str) -> list[str]:
 """
 Extract sitemap references from HTML meta tags.

 Args:
 base_url: Base URL to check HTML for meta tags

 Returns:
 List of sitemap URLs found in HTML meta tags
 """
@@ -415,28 +420,36 @@ class DiscoveryService:
 # Read response with size limit
 content = self._read_response_with_limit(resp, base_url)
-content = content.lower()

 # Look for sitemap meta tags or link elements
 import re
+from urllib.parse import urlparse

-# Check for <link rel="sitemap" href="...">
-sitemap_link_pattern = r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']'
-matches = re.findall(sitemap_link_pattern, content)
+# Check for <link rel="sitemap" href="..."> (case-insensitive)
+sitemap_link_pattern = re.compile(
+r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']',
+re.IGNORECASE
+)
+matches = sitemap_link_pattern.findall(content)

 for match in matches:
 sitemap_url = urljoin(base_url, match)
-sitemaps.append(sitemap_url)
-logger.info(f"Found sitemap in HTML link tag: {sitemap_url}")
+if urlparse(sitemap_url).scheme in ("http", "https"):
+sitemaps.append(sitemap_url)
+logger.info(f"Found sitemap in HTML link tag: {sitemap_url}")

-# Check for <meta name="sitemap" content="...">
-sitemap_meta_pattern = r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']'
-matches = re.findall(sitemap_meta_pattern, content)
+# Check for <meta name="sitemap" content="..."> (case-insensitive)
+sitemap_meta_pattern = re.compile(
+r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']',
+re.IGNORECASE
+)
+matches = sitemap_meta_pattern.findall(content)

 for match in matches:
 sitemap_url = urljoin(base_url, match)
-sitemaps.append(sitemap_url)
-logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}")
+if urlparse(sitemap_url).scheme in ("http", "https"):
+sitemaps.append(sitemap_url)
+logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}")

 finally:
 resp.close()
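A small standalone demonstration of why the compiled, case-insensitive pattern matters: mixed-case markup now matches without lowercasing the whole page, so the extracted href keeps its original case. The HTML snippet and domain below are made up.

```python
import re
from urllib.parse import urljoin, urlparse

html = '<LINK REL="Sitemap" HREF="/SiteMap.xml">'  # mixed-case markup, case-sensitive path
pattern = re.compile(
    r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']',
    re.IGNORECASE,
)
for href in pattern.findall(html):
    candidate = urljoin("https://example.com", href)
    # Only accept http/https results, mirroring the scheme validation above.
    if urlparse(candidate).scheme in ("http", "https"):
        print(candidate)  # https://example.com/SiteMap.xml (case preserved)
```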
@@ -454,10 +467,10 @@ class DiscoveryService:
 def _check_well_known_directory(self, base_url: str) -> list[str]:
 """
 Check .well-known/* files for discovery targets.

 Args:
 base_url: Base URL to check .well-known directory for

 Returns:
 List of URLs found in .well-known directory
 """
@@ -490,10 +503,10 @@ class DiscoveryService:
 def _try_common_variations(self, base_url: str) -> dict[str, list[str]]:
 """
 Try pattern variations for discovery targets.

 Args:
 base_url: Base URL to try variations for

 Returns:
 Dictionary with file types and discovered variation URLs
 """