fix: respect the given URL path in discovery and fix method signature mismatches

Two critical fixes for the automatic discovery feature:

1. Discovery Service path handling:
   - Changed from always using the root domain (/) to respecting the given URL path (see the urljoin example below)
   - e.g., for 'supabase.com/docs', now checks 'supabase.com/docs/robots.txt'
   - Previously it incorrectly checked 'supabase.com/robots.txt'
   - Fixed all urljoin calls to use relative paths instead of absolute paths

2. Method signature mismatches:
   - Removed the start_progress and end_progress parameters from crawl_batch_with_progress
   - Removed the same parameters from crawl_recursive_with_progress
   - Fixed all calls to these methods to match the strategy implementations (see the sketch after the first file's diff)

These fixes ensure discovery works correctly for subdirectory URLs and prevent TypeError crashes during crawling.
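
For reference, the urllib.parse.urljoin behavior behind fix 1 (this is standard-library behavior; the trailing slash on the base URL is assumed to be handled by the service's own URL normalization, since a base like 'https://supabase.com/docs' without the slash would still resolve against the root):

from urllib.parse import urljoin

base_url = "https://supabase.com/docs/"  # assumes the service normalizes the trailing slash

# Old behavior (absolute path): the leading "/" discards the /docs prefix
print(urljoin(base_url, "/robots.txt"))           # https://supabase.com/robots.txt

# New behavior (relative path): resolved against the given URL path
print(urljoin(base_url, "robots.txt"))            # https://supabase.com/docs/robots.txt
print(urljoin(base_url, "sitemaps/sitemap.xml"))  # https://supabase.com/docs/sitemaps/sitemap.xml
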
leex279 committed 2025-09-20 13:06:41 +02:00
parent 0a2c43f6b4
commit 7f74aea476
2 changed files with 21 additions and 32 deletions


@@ -202,8 +202,6 @@ class CrawlingService:
         urls: list[str],
         max_concurrent: int | None = None,
         progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
-        start_progress: int = 15,
-        end_progress: int = 60,
     ) -> list[dict[str, Any]]:
         """Batch crawl multiple URLs in parallel."""
         return await self.batch_strategy.crawl_batch_with_progress(
@@ -212,8 +210,6 @@ class CrawlingService:
             self.site_config.is_documentation_site,
             max_concurrent,
             progress_callback,
-            start_progress,
-            end_progress,
             self._check_cancellation,  # Pass cancellation check
         )
@@ -223,8 +219,6 @@ class CrawlingService:
         max_depth: int = 3,
         max_concurrent: int | None = None,
         progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
-        start_progress: int = 10,
-        end_progress: int = 60,
     ) -> list[dict[str, Any]]:
         """Recursively crawl internal links from start URLs."""
         return await self.recursive_strategy.crawl_recursive_with_progress(
@@ -234,8 +228,6 @@ class CrawlingService:
             max_depth,
             max_concurrent,
             progress_callback,
-            start_progress,
-            end_progress,
             self._check_cancellation,  # Pass cancellation check
         )
@@ -799,8 +791,6 @@ class CrawlingService:
                     max_depth=max_depth - 1,  # Reduce depth since we're already 1 level deep
                     max_concurrent=request.get('max_concurrent'),
                     progress_callback=await self._create_crawl_progress_callback("crawling"),
-                    start_progress=10,
-                    end_progress=20,
                 )
             else:
                 # Depth limit reached, just crawl the immediate links without following further
@@ -809,8 +799,6 @@ class CrawlingService:
                     extracted_links,
                     max_concurrent=request.get('max_concurrent'),
                     progress_callback=await self._create_crawl_progress_callback("crawling"),
-                    start_progress=10,
-                    end_progress=20,
                 )
             else:
                 # Use normal batch crawling for non-discovery targets

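The TypeError mentioned in the commit message is the classic symptom of forwarding arguments a callee does not accept; the diff above removes start_progress/end_progress from the forwarded calls. A minimal, standalone sketch of that failure mode (the parameter list is a simplified, hypothetical stand-in, not the real strategy API):

import asyncio

# Hypothetical stand-in for a strategy method after the progress-range
# parameters were dropped (names and order are illustrative only).
async def crawl_batch_with_progress(urls, max_concurrent=None, progress_callback=None,
                                    cancellation_check=None):
    return [{"url": u, "success": True} for u in urls]

async def main():
    # Before the fix: the service wrapper still forwarded start_progress/end_progress,
    # so the call carries two extra positional arguments -> TypeError.
    try:
        await crawl_batch_with_progress(["https://example.com/docs"], 10, None, 15, 60, None)
    except TypeError as exc:
        print(f"signature mismatch: {exc}")

    # After the fix: the forwarded arguments match the strategy implementation.
    print(await crawl_batch_with_progress(["https://example.com/docs"], 10, None, None))

asyncio.run(main())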

@@ -64,24 +64,24 @@ class DiscoveryService:
         # Check files in global priority order
         for filename in self.DISCOVERY_PRIORITY:
-            # Try root location first
-            file_url = urljoin(base_url, f"/{filename}")
+            # Try location relative to the given URL
+            file_url = urljoin(base_url, filename)
             if self._check_url_exists(file_url):
                 logger.info(f"Discovery found best file: {file_url}")
                 return file_url
             # For llms files, also try common subdirectories
             if filename.startswith('llms'):
                 for subdir in ["static", "public", "docs", "assets", "doc", "api"]:
-                    subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(subdir_url):
                         logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                         return subdir_url
             # For sitemap files, also try common subdirectories
             if filename.endswith('.xml') and not filename.startswith('.well-known'):
                 for subdir in ["sitemaps", "sitemap", "xml", "feed"]:
-                    subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(subdir_url):
                         logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                         return subdir_url
@@ -119,7 +119,7 @@ class DiscoveryService:
             # Priority 2: Check standard locations in priority order
             for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
-                sitemap_url = urljoin(base_url, f"/{filename}")
+                sitemap_url = urljoin(base_url, filename)
                 if self._check_url_exists(sitemap_url):
                     return sitemap_url
@@ -127,7 +127,7 @@ class DiscoveryService:
             subdirs = ["sitemaps", "sitemap", "xml", "feed"]
             for subdir in subdirs:
                 for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
-                    sitemap_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    sitemap_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(sitemap_url):
                         return sitemap_url
@@ -137,7 +137,7 @@ class DiscoveryService:
                 return html_sitemaps[0]  # Use first sitemap from HTML
             # Priority 5: Check .well-known directory
-            well_known_sitemap = urljoin(base_url, "/.well-known/sitemap.xml")
+            well_known_sitemap = urljoin(base_url, ".well-known/sitemap.xml")
             if self._check_url_exists(well_known_sitemap):
                 return well_known_sitemap
@@ -158,7 +158,7 @@ class DiscoveryService:
         try:
             # Priority 1: Check standard root locations in priority order
             for filename in self.DISCOVERY_TARGETS["llms_files"]:
-                llms_url = urljoin(base_url, f"/{filename}")
+                llms_url = urljoin(base_url, filename)
                 if self._check_url_exists(llms_url):
                     return llms_url
@@ -166,13 +166,13 @@ class DiscoveryService:
             subdirs = ["static", "public", "docs", "assets", "doc", "api"]
             for subdir in subdirs:
                 for filename in self.DISCOVERY_TARGETS["llms_files"]:
-                    llms_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    llms_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(llms_url):
                         return llms_url
             # Priority 3: Check .well-known directory variants
             for well_known_file in [".well-known/ai.txt", ".well-known/llms.txt"]:
-                well_known_url = urljoin(base_url, f"/{well_known_file}")
+                well_known_url = urljoin(base_url, well_known_file)
                 if self._check_url_exists(well_known_url):
                     return well_known_url
@@ -186,7 +186,7 @@ class DiscoveryService:
        Discover robots.txt file (always single file at root).
        """
        try:
-            robots_url = urljoin(base_url, "/robots.txt")
+            robots_url = urljoin(base_url, "robots.txt")
            if self._check_url_exists(robots_url):
                return robots_url
        except Exception:
@@ -210,17 +210,18 @@ class DiscoveryService:
     def _parse_robots_txt(self, base_url: str) -> list[str]:
         """
         Extract sitemap URLs from robots.txt.
         Args:
             base_url: Base URL to check robots.txt for
         Returns:
             List of sitemap URLs found in robots.txt
         """
         sitemaps: list[str] = []
         try:
-            robots_url = urljoin(base_url, "/robots.txt")
+            # Use robots.txt relative to the given URL, not always root
+            robots_url = urljoin(base_url, "robots.txt")
             logger.info(f"Checking robots.txt at {robots_url}")
             resp = requests.get(robots_url, timeout=30)
@@ -272,7 +273,7 @@ class DiscoveryService:
         for target_type, filename in all_targets:
             try:
-                file_url = urljoin(base_url, f"/{filename}")
+                file_url = urljoin(base_url, filename)
                 resp = requests.get(file_url, timeout=30, allow_redirects=True)
                 if resp.status_code == 200:
@@ -361,7 +362,7 @@ class DiscoveryService:
         try:
             for filename in self.DISCOVERY_TARGETS["well_known_files"]:
                 try:
-                    file_url = urljoin(base_url, f"/{filename}")
+                    file_url = urljoin(base_url, filename)
                     resp = requests.get(file_url, timeout=30, allow_redirects=True)
                     if resp.status_code == 200:
@@ -401,7 +402,7 @@ class DiscoveryService:
         for subdir in subdirs:
             for llms_file in self.DISCOVERY_TARGETS["llms_files"]:
                 try:
-                    file_url = urljoin(base_url, f"/{subdir}/{llms_file}")
+                    file_url = urljoin(base_url, f"{subdir}/{llms_file}")
                     resp = requests.get(file_url, timeout=30, allow_redirects=True)
                     if resp.status_code == 200:
@@ -423,7 +424,7 @@ class DiscoveryService:
         for sitemap_path in sitemap_paths:
             try:
-                file_url = urljoin(base_url, f"/{sitemap_path}")
+                file_url = urljoin(base_url, sitemap_path)
                 resp = requests.get(file_url, timeout=30, allow_redirects=True)
                 if resp.status_code == 200: