Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-24 02:39:17 -05:00)
fix: Discovery now respects the given URL path; fix method signature mismatches
Two critical fixes for the automatic discovery feature:

1. Discovery Service path handling:
   - Changed from always using the root domain (/) to respecting the given URL path
   - e.g., for 'supabase.com/docs', discovery now checks 'supabase.com/docs/robots.txt'
   - Previously it incorrectly checked 'supabase.com/robots.txt'
   - Fixed all urljoin calls to use relative paths instead of absolute paths

2. Method signature mismatches:
   - Removed the start_progress and end_progress parameters from crawl_batch_with_progress
   - Removed the same parameters from crawl_recursive_with_progress
   - Fixed all calls to these methods to match the strategy implementations

These fixes ensure discovery works correctly for subdirectory URLs and prevent TypeError crashes during crawling.
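The path-handling change hinges on how urllib.parse.urljoin resolves absolute versus relative references. A minimal sketch of that behavior (the supabase.com/docs URL is just the example from the message above; note that urljoin only keeps the path segment when the base URL ends with a trailing slash):

from urllib.parse import urljoin

base = "https://supabase.com/docs/"

# Absolute reference: the leading "/" drops the /docs path (old behavior)
print(urljoin(base, "/robots.txt"))      # -> https://supabase.com/robots.txt

# Relative reference: resolved against /docs/ (new behavior)
print(urljoin(base, "robots.txt"))       # -> https://supabase.com/docs/robots.txt
print(urljoin(base, "static/llms.txt"))  # -> https://supabase.com/docs/static/llms.txt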
@@ -202,8 +202,6 @@ class CrawlingService:
         urls: list[str],
         max_concurrent: int | None = None,
         progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
-        start_progress: int = 15,
-        end_progress: int = 60,
     ) -> list[dict[str, Any]]:
         """Batch crawl multiple URLs in parallel."""
         return await self.batch_strategy.crawl_batch_with_progress(
@@ -212,8 +210,6 @@ class CrawlingService:
             self.site_config.is_documentation_site,
             max_concurrent,
             progress_callback,
-            start_progress,
-            end_progress,
             self._check_cancellation,  # Pass cancellation check
         )
 
@@ -223,8 +219,6 @@ class CrawlingService:
         max_depth: int = 3,
         max_concurrent: int | None = None,
         progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
-        start_progress: int = 10,
-        end_progress: int = 60,
     ) -> list[dict[str, Any]]:
         """Recursively crawl internal links from start URLs."""
         return await self.recursive_strategy.crawl_recursive_with_progress(
@@ -234,8 +228,6 @@ class CrawlingService:
             max_depth,
             max_concurrent,
             progress_callback,
-            start_progress,
-            end_progress,
             self._check_cancellation,  # Pass cancellation check
         )
 
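With start_progress and end_progress gone from the wrapper signatures, callers pass only the remaining parameters. A hedged usage sketch (the service instance and the report_progress callback are placeholders, not code from this repository):

async def run_batch(service, urls: list[str]) -> None:
    # Placeholder callback matching Callable[[str, int, str], Awaitable[None]]
    async def report_progress(stage: str, pct: int, message: str) -> None:
        print(f"[{stage}] {pct}% {message}")

    results = await service.crawl_batch_with_progress(
        urls,
        max_concurrent=5,
        progress_callback=report_progress,
    )
    print(f"Crawled {len(results)} pages")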
@@ -799,8 +791,6 @@ class CrawlingService:
                     max_depth=max_depth - 1,  # Reduce depth since we're already 1 level deep
                     max_concurrent=request.get('max_concurrent'),
                     progress_callback=await self._create_crawl_progress_callback("crawling"),
-                    start_progress=10,
-                    end_progress=20,
                 )
             else:
                 # Depth limit reached, just crawl the immediate links without following further
@@ -809,8 +799,6 @@ class CrawlingService:
                     extracted_links,
                     max_concurrent=request.get('max_concurrent'),
                     progress_callback=await self._create_crawl_progress_callback("crawling"),
-                    start_progress=10,
-                    end_progress=20,
                 )
         else:
             # Use normal batch crawling for non-discovery targets
 
@@ -64,24 +64,24 @@ class DiscoveryService:
 
         # Check files in global priority order
         for filename in self.DISCOVERY_PRIORITY:
-            # Try root location first
-            file_url = urljoin(base_url, f"/{filename}")
+            # Try location relative to the given URL
+            file_url = urljoin(base_url, filename)
             if self._check_url_exists(file_url):
                 logger.info(f"Discovery found best file: {file_url}")
                 return file_url
 
             # For llms files, also try common subdirectories
             if filename.startswith('llms'):
                 for subdir in ["static", "public", "docs", "assets", "doc", "api"]:
-                    subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(subdir_url):
                         logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                         return subdir_url
 
             # For sitemap files, also try common subdirectories
             if filename.endswith('.xml') and not filename.startswith('.well-known'):
                 for subdir in ["sitemaps", "sitemap", "xml", "feed"]:
-                    subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(subdir_url):
                         logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                         return subdir_url
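Every branch above depends on _check_url_exists, which is outside these hunks. A plausible minimal implementation, offered only as an assumption about what such a helper might do, is a lightweight HEAD request:

import requests

def check_url_exists(url: str, timeout: float = 10) -> bool:
    # Assumed helper: treat a 200 response to a HEAD request as "exists"
    try:
        resp = requests.head(url, timeout=timeout, allow_redirects=True)
        return resp.status_code == 200
    except requests.RequestException:
        return False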
@@ -119,7 +119,7 @@ class DiscoveryService:
 
         # Priority 2: Check standard locations in priority order
         for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
-            sitemap_url = urljoin(base_url, f"/{filename}")
+            sitemap_url = urljoin(base_url, filename)
             if self._check_url_exists(sitemap_url):
                 return sitemap_url
 
@@ -127,7 +127,7 @@ class DiscoveryService:
         subdirs = ["sitemaps", "sitemap", "xml", "feed"]
         for subdir in subdirs:
             for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
-                sitemap_url = urljoin(base_url, f"/{subdir}/{filename}")
+                sitemap_url = urljoin(base_url, f"{subdir}/{filename}")
                 if self._check_url_exists(sitemap_url):
                     return sitemap_url
 
@@ -137,7 +137,7 @@ class DiscoveryService:
             return html_sitemaps[0]  # Use first sitemap from HTML
 
         # Priority 5: Check .well-known directory
-        well_known_sitemap = urljoin(base_url, "/.well-known/sitemap.xml")
+        well_known_sitemap = urljoin(base_url, ".well-known/sitemap.xml")
         if self._check_url_exists(well_known_sitemap):
             return well_known_sitemap
 
@@ -158,7 +158,7 @@ class DiscoveryService:
         try:
             # Priority 1: Check standard root locations in priority order
             for filename in self.DISCOVERY_TARGETS["llms_files"]:
-                llms_url = urljoin(base_url, f"/{filename}")
+                llms_url = urljoin(base_url, filename)
                 if self._check_url_exists(llms_url):
                     return llms_url
 
@@ -166,13 +166,13 @@ class DiscoveryService:
             subdirs = ["static", "public", "docs", "assets", "doc", "api"]
             for subdir in subdirs:
                 for filename in self.DISCOVERY_TARGETS["llms_files"]:
-                    llms_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    llms_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(llms_url):
                         return llms_url
 
             # Priority 3: Check .well-known directory variants
             for well_known_file in [".well-known/ai.txt", ".well-known/llms.txt"]:
-                well_known_url = urljoin(base_url, f"/{well_known_file}")
+                well_known_url = urljoin(base_url, well_known_file)
                 if self._check_url_exists(well_known_url):
                     return well_known_url
 
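The hunks reference DISCOVERY_TARGETS by key, but the mapping itself is defined outside this diff. A sketch of its likely shape (only the key names are confirmed here; the filename lists are assumptions):

# Assumed structure; only the keys appear in the diff above
DISCOVERY_TARGETS: dict[str, list[str]] = {
    "sitemap_files": ["sitemap.xml", "sitemap_index.xml"],
    "llms_files": ["llms.txt", "llms-full.txt"],
    "well_known_files": [".well-known/ai.txt", ".well-known/llms.txt"],
}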
@@ -186,7 +186,7 @@ class DiscoveryService:
         Discover robots.txt file (always single file at root).
         """
         try:
-            robots_url = urljoin(base_url, "/robots.txt")
+            robots_url = urljoin(base_url, "robots.txt")
             if self._check_url_exists(robots_url):
                 return robots_url
         except Exception:
@@ -210,17 +210,18 @@ class DiscoveryService:
     def _parse_robots_txt(self, base_url: str) -> list[str]:
         """
         Extract sitemap URLs from robots.txt.
 
         Args:
             base_url: Base URL to check robots.txt for
 
         Returns:
             List of sitemap URLs found in robots.txt
         """
         sitemaps: list[str] = []
 
         try:
-            robots_url = urljoin(base_url, "/robots.txt")
+            # Use robots.txt relative to the given URL, not always root
+            robots_url = urljoin(base_url, "robots.txt")
             logger.info(f"Checking robots.txt at {robots_url}")
 
             resp = requests.get(robots_url, timeout=30)
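The remainder of _parse_robots_txt falls outside this hunk; a self-contained sketch of how Sitemap: directives could be pulled out of the fetched text (an illustration, not the repository's exact parsing logic):

def extract_sitemaps_from_robots(robots_text: str) -> list[str]:
    # Illustrative helper: collect "Sitemap: <url>" directives, case-insensitively
    sitemaps: list[str] = []
    for line in robots_text.splitlines():
        if line.strip().lower().startswith("sitemap:"):
            url = line.split(":", 1)[1].strip()
            if url:
                sitemaps.append(url)
    return sitemaps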
@@ -272,7 +273,7 @@ class DiscoveryService:
 
         for target_type, filename in all_targets:
             try:
-                file_url = urljoin(base_url, f"/{filename}")
+                file_url = urljoin(base_url, filename)
                 resp = requests.get(file_url, timeout=30, allow_redirects=True)
 
                 if resp.status_code == 200:
@@ -361,7 +362,7 @@ class DiscoveryService:
         try:
             for filename in self.DISCOVERY_TARGETS["well_known_files"]:
                 try:
-                    file_url = urljoin(base_url, f"/{filename}")
+                    file_url = urljoin(base_url, filename)
                     resp = requests.get(file_url, timeout=30, allow_redirects=True)
 
                     if resp.status_code == 200:
@@ -401,7 +402,7 @@ class DiscoveryService:
         for subdir in subdirs:
             for llms_file in self.DISCOVERY_TARGETS["llms_files"]:
                 try:
-                    file_url = urljoin(base_url, f"/{subdir}/{llms_file}")
+                    file_url = urljoin(base_url, f"{subdir}/{llms_file}")
                     resp = requests.get(file_url, timeout=30, allow_redirects=True)
 
                     if resp.status_code == 200:
@@ -423,7 +424,7 @@ class DiscoveryService:
 
         for sitemap_path in sitemap_paths:
             try:
-                file_url = urljoin(base_url, f"/{sitemap_path}")
+                file_url = urljoin(base_url, sitemap_path)
                 resp = requests.get(file_url, timeout=30, allow_redirects=True)
 
                 if resp.status_code == 200: