Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-24 02:39:17 -05:00)
feat: Prioritize same-directory discovery for llms.txt and sitemaps
Improve discovery logic to check the same directory as the base URL first before falling back to root-level and subdirectories. This ensures files like https://supabase.com/docs/llms.txt are found when crawling https://supabase.com/docs.

Changes:
- Check same directory as base_url first (e.g., /docs/llms.txt for /docs URL)
- Fall back to root-level urljoin behavior
- Include base directory name in subdirectory checks (e.g., /docs subdirectory)
- Maintain priority order: same-dir > root > subdirectories
- Log discovery location for better debugging

This addresses cases where documentation directories contain their own llms.txt or sitemap files that should take precedence over root-level files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
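For context on the root-level fallback the message mentions: urllib's urljoin treats a base URL without a trailing slash as a document and resolves relative names against its parent directory, so a root-only lookup can never produce /docs/llms.txt for a /docs base. A minimal illustration using the example URL from the commit message (not part of the commit itself):

from urllib.parse import urljoin, urlparse

base_url = "https://supabase.com/docs"

# Old behavior: urljoin drops the last path segment when there is no trailing slash.
print(urljoin(base_url, "llms.txt"))   # -> https://supabase.com/llms.txt

# Same-directory candidate built from the parsed path, as the new Priority 1 check does.
parsed = urlparse(base_url)
base_dir = parsed.path.rstrip("/")
print(f"{parsed.scheme}://{parsed.netloc}{base_dir}/llms.txt")   # -> https://supabase.com/docs/llms.txt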
@@ -143,23 +143,57 @@ class DiscoveryService:
         # Check files in global priority order
         for filename in self.DISCOVERY_PRIORITY:
             # Try location relative to the given URL
+            from urllib.parse import urlparse
+
+            # Get the directory path of the base URL
+            parsed = urlparse(base_url)
+            base_path = parsed.path.rstrip('/')
+            # Extract directory (remove filename if present)
+            if '.' in base_path.split('/')[-1]:
+                base_dir = '/'.join(base_path.split('/')[:-1])
+            else:
+                base_dir = base_path
+
+            # Priority 1: Check same directory as base_url (e.g., /docs/llms.txt for /docs URL)
+            if base_dir and base_dir != '/':
+                same_dir_url = f"{parsed.scheme}://{parsed.netloc}{base_dir}/{filename}"
+                if self._check_url_exists(same_dir_url):
+                    logger.info(f"Discovery found best file in same directory: {same_dir_url}")
+                    return same_dir_url
+
+            # Priority 2: Check root-level (standard urljoin behavior)
             file_url = urljoin(base_url, filename)
             if self._check_url_exists(file_url):
-                logger.info(f"Discovery found best file: {file_url}")
+                logger.info(f"Discovery found best file at root: {file_url}")
                 return file_url

-            # For llms files, also try common subdirectories
+            # Priority 3: For llms files, check common subdirectories (including base directory name)
             if filename.startswith('llms'):
-                for subdir in ["static", "public", "docs", "assets", "doc", "api"]:
+                # Extract base directory name to check it first
+                subdirs = []
+                if base_dir and base_dir != '/':
+                    base_dir_name = base_dir.split('/')[-1]
+                    if base_dir_name:
+                        subdirs.append(base_dir_name)
+                subdirs.extend(["docs", "static", "public", "assets", "doc", "api"])
+
+                for subdir in subdirs:
                     subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(subdir_url):
                         logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                         return subdir_url

-            # For sitemap files, also try common subdirectories
+            # Priority 4: For sitemap files, check common subdirectories (including base directory name)
             if filename.endswith('.xml') and not filename.startswith('.well-known'):
-                for subdir in ["sitemaps", "sitemap", "xml", "feed"]:
+                # Extract base directory name to check it first
+                subdirs = []
+                if base_dir and base_dir != '/':
+                    base_dir_name = base_dir.split('/')[-1]
+                    if base_dir_name:
+                        subdirs.append(base_dir_name)
+                subdirs.extend(["docs", "sitemaps", "sitemap", "xml", "feed"])
+
+                for subdir in subdirs:
                     subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(subdir_url):
                         logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                         return subdir_url
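Taken together, the hunk checks each filename in DISCOVERY_PRIORITY against the same directory first, then the site root, then common subdirectories. A minimal standalone sketch of that ordering, where discovery_candidates is a hypothetical helper (not the service's code) and the subdirectory list is trimmed for brevity:

from urllib.parse import urljoin, urlparse

def discovery_candidates(base_url: str, filename: str) -> list[str]:
    """Candidate URLs in the priority order the commit describes."""
    parsed = urlparse(base_url)
    base_path = parsed.path.rstrip("/")
    # Drop a trailing filename (a last segment containing a dot) to get the directory.
    last = base_path.split("/")[-1]
    base_dir = "/".join(base_path.split("/")[:-1]) if "." in last else base_path

    candidates = []
    if base_dir and base_dir != "/":
        # 1. Same directory as the base URL
        candidates.append(f"{parsed.scheme}://{parsed.netloc}{base_dir}/{filename}")
    # 2. Site root (standard urljoin behavior)
    candidates.append(urljoin(base_url, filename))
    # 3. Common subdirectories (trimmed list for illustration)
    for subdir in ["docs", "static", "public"]:
        candidates.append(urljoin(base_url, f"{subdir}/{filename}"))
    return candidates

for url in discovery_candidates("https://supabase.com/docs", "llms.txt"):
    print(url)
# https://supabase.com/docs/llms.txt     (same directory, checked first)
# https://supabase.com/llms.txt          (root level)
# https://supabase.com/docs/llms.txt     (the "docs" subdirectory guess; duplicates the first here)
# https://supabase.com/static/llms.txt
# https://supabase.com/public/llms.txt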