archon/python/src/server/services/crawling/discovery_service.py
leex279 cdf4323534 feat: Implement llms.txt link following with discovery priority fix
Implements llms.txt link following: llms.txt files linked from a discovered
llms.txt are now crawled as well, provided they live on the same domain or a
subdomain. Also includes critical bug fixes for discovery priority and
variant detection.

Backend Core Functionality:
- Add _is_same_domain_or_subdomain method for subdomain matching (sketched below)
- Fix is_llms_variant to detect .txt files in /llms/ directories
- Implement llms.txt link extraction and following logic
- Add two-phase discovery: prioritize ALL llms.txt before sitemaps
- Enhance progress reporting with discovery metadata
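
A minimal sketch of the kind of check the new subdomain helper performs. The
actual implementation lives in crawling_service.py and may differ; the
function below is illustrative only, with the name taken from the bullet above.

```python
from urllib.parse import urlparse


def is_same_domain_or_subdomain(url: str, base_url: str) -> bool:
    """Illustrative: True if url's host equals base_url's host or is a subdomain of it."""
    host = urlparse(url).netloc.split(":")[0].lower()
    base_host = urlparse(base_url).netloc.split(":")[0].lower()
    if not host or not base_host:
        return False
    return host == base_host or host.endswith("." + base_host)


# Links found in https://supabase.com/docs/llms.txt:
assert is_same_domain_or_subdomain("https://supabase.com/docs/llms/guides.txt", "https://supabase.com")
assert is_same_domain_or_subdomain("https://docs.supabase.com/llms.txt", "https://supabase.com")
assert not is_same_domain_or_subdomain("https://example.com/llms.txt", "https://supabase.com")
```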

Critical Bug Fixes:
- Discovery priority: Fixed sitemap.xml being found before llms.txt
- is_llms_variant: Now matches /llms/guides.txt, /llms/swift.txt, etc. (illustrated below)
- These were blocking bugs preventing link following from working
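
A rough illustration of the broadened variant check. The real logic lives in
url_handler.py and its exact rules may differ; this sketch only shows the
shape of the fix described above.

```python
from urllib.parse import urlparse


def is_llms_variant(url: str) -> bool:
    """Illustrative: llms.txt-style filenames, plus any .txt file under an /llms/ directory."""
    path = urlparse(url).path.lower()
    filename = path.rsplit("/", 1)[-1]
    if filename in ("llms.txt", "llms-full.txt", "llms.md", "llms.mdx", "llms.markdown"):
        return True
    # The fix: /llms/guides.txt, /llms/swift.txt, etc. now count as variants
    return "/llms/" in path and filename.endswith(".txt")


assert is_llms_variant("https://supabase.com/docs/llms/guides.txt")
assert is_llms_variant("https://supabase.com/docs/llms/swift.txt")
assert not is_llms_variant("https://supabase.com/docs/changelog.txt")
```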

Frontend UI:
- Add discovery and linked files display to CrawlingProgress component
- Update progress types to include discoveredFile and linkedFiles fields (payload sketch below)
- Add new crawl types: llms_txt_with_linked_files, discovery_*
- Add "discovery" to ProgressStatus enum and active statuses
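
For orientation, a hedged sketch (kept in Python to match the backend code
below) of the extra progress metadata the UI consumes. The field names come
from the bullets above; the overall shape and the LinkedFile entry are
assumptions, not the actual TypeScript definitions.

```python
from typing import TypedDict


class LinkedFile(TypedDict, total=False):
    url: str
    status: str  # assumed shape; e.g. "crawled" or "skipped"


class DiscoveryProgress(TypedDict, total=False):
    status: str                    # now includes "discovery"
    crawl_type: str                # e.g. "llms_txt_with_linked_files" or a "discovery_*" type
    discoveredFile: str            # file selected by the discovery phase
    linkedFiles: list[LinkedFile]  # llms.txt files followed from the main file
```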

Testing:
- 8 subdomain matching unit tests (test_crawling_service_subdomain.py)
- 7 integration tests for link following (test_llms_txt_link_following.py)
- All 15 tests passing
- Validated against real Supabase llms.txt structure (1 main + 8 linked)

Files Modified:
Backend:
- crawling_service.py: Core link following logic (lines 744-788, 862-920)
- url_handler.py: Fixed variant detection (lines 633-665)
- discovery_service.py: Two-phase discovery (lines 137-214)
- 2 new comprehensive test files

Frontend:
- progress/types/progress.ts: Updated types with new fields
- progress/components/CrawlingProgress.tsx: Added UI sections

Real-world testing: Crawling supabase.com/docs now discovers
/docs/llms.txt and automatically follows 8 linked llms.txt files,
indexing complete documentation from all files.
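
A compressed, self-contained sketch of that flow. The production path goes
through DiscoveryService and CrawlingService with progress reporting, size
limits, and the helpers sketched above; the function name and the
link-extraction regex here are simplified assumptions.

```python
import re
from urllib.parse import urljoin, urlparse

import requests


def follow_llms_links(llms_url: str, timeout: int = 30) -> dict[str, str]:
    """Fetch an llms.txt file plus every same-domain/subdomain llms .txt file it links to."""
    results: dict[str, str] = {}
    main = requests.get(llms_url, timeout=timeout)
    main.raise_for_status()
    results[llms_url] = main.text

    base_host = urlparse(llms_url).netloc.lower()
    # Markdown-style links such as [Guides](/docs/llms/guides.txt) or (https://.../llms/swift.txt)
    for link in re.findall(r"\(([^()\s]+\.txt)\)", main.text):
        target = urljoin(llms_url, link)
        host = urlparse(target).netloc.lower()
        same_site = host == base_host or host.endswith("." + base_host)
        if not same_site or "llms" not in urlparse(target).path.lower() or target in results:
            continue
        resp = requests.get(target, timeout=timeout)
        if resp.status_code == 200:
            results[target] = resp.text
    return results


# Example: follow_llms_links("https://supabase.com/docs/llms.txt")
# returns the main file plus the linked llms.txt files it references.
```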

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-17 22:05:15 +02:00


"""
Discovery Service for Automatic File Detection

Handles automatic discovery and parsing of llms.txt, sitemap.xml, and related files
to enhance crawling capabilities with priority-based discovery methods.
"""

from urllib.parse import urljoin

import requests

from ...config.logfire_config import get_logger

logger = get_logger(__name__)


class DiscoveryService:
    """Service for discovering related files automatically during crawls."""

    # Maximum response size to prevent memory exhaustion (10MB default)
    MAX_RESPONSE_SIZE = 10 * 1024 * 1024  # 10 MB

    # Global priority order - select ONE best file from all categories
    # All these files contain similar AI/crawling guidance content
    DISCOVERY_PRIORITY = [
        # LLMs files (highest priority - most comprehensive AI guidance)
        "llms-full.txt",
        "llms.txt",
        "llms.md",
        "llms.mdx",
        "llms.markdown",
        # Sitemap files (structural crawling guidance)
        "sitemap_index.xml",
        "sitemap-index.xml",
        "sitemap.xml",
        # Robots file (basic crawling rules)
        "robots.txt",
        # Well-known variants (alternative locations)
        ".well-known/ai.txt",
        ".well-known/llms.txt",
        ".well-known/sitemap.xml"
    ]

    # Categorized discovery targets for helper methods
    # Maintains the same order and values as DISCOVERY_PRIORITY
    DISCOVERY_TARGETS = {
        "llms_files": [
            "llms-full.txt",
            "llms.txt",
            "llms.md",
            "llms.mdx",
            "llms.markdown",
        ],
        "sitemap_files": [
            "sitemap_index.xml",
            "sitemap-index.xml",
            "sitemap.xml",
        ],
        "robots_files": [
            "robots.txt",
        ],
        "well_known_files": [
            ".well-known/ai.txt",
            ".well-known/llms.txt",
            ".well-known/sitemap.xml",
        ],
    }

    def _read_response_with_limit(self, response: requests.Response, url: str, max_size: int | None = None) -> str:
        """
        Read response content with size limit to prevent memory exhaustion.

        Args:
            response: The response object to read from
            url: URL being read (for logging)
            max_size: Maximum bytes to read (defaults to MAX_RESPONSE_SIZE)

        Returns:
            Response text content

        Raises:
            ValueError: If response exceeds size limit
        """
        if max_size is None:
            max_size = self.MAX_RESPONSE_SIZE

        try:
            chunks = []
            total_size = 0

            # Read response in chunks to enforce size limit
            for chunk in response.iter_content(chunk_size=8192, decode_unicode=False):
                if chunk:
                    total_size += len(chunk)
                    if total_size > max_size:
                        response.close()
                        size_mb = max_size / (1024 * 1024)
                        logger.warning(
                            f"Response size exceeded limit of {size_mb:.1f}MB for {url}, "
                            f"received {total_size / (1024 * 1024):.1f}MB"
                        )
                        raise ValueError(f"Response size exceeds {size_mb:.1f}MB limit")
                    chunks.append(chunk)

            # Decode the complete response
            content_bytes = b''.join(chunks)

            # Try to decode with the response encoding or fall back to utf-8
            encoding = response.encoding or 'utf-8'
            try:
                return content_bytes.decode(encoding)
            except UnicodeDecodeError:
                # Fallback to utf-8 with error replacement
                return content_bytes.decode('utf-8', errors='replace')
        except Exception:
            # Ensure response is closed on any error
            response.close()
            raise

    def discover_files(self, base_url: str) -> str | None:
        """
        Main discovery orchestrator - selects ONE best file across all categories.
        All files contain similar AI/crawling guidance, so we only need the best one.

        Args:
            base_url: Base URL to discover files for

        Returns:
            Single best URL found, or None if no files discovered
        """
        try:
            logger.info(f"Starting single-file discovery for {base_url}")

            # Check files in global priority order
            # IMPORTANT: Check root-level llms files BEFORE same-directory sitemaps
            # This ensures llms.txt at root is preferred over /docs/sitemap.xml
            from urllib.parse import urlparse

            # Get the directory path of the base URL
            parsed = urlparse(base_url)
            base_path = parsed.path.rstrip('/')

            # Extract directory (remove filename if present)
            if '.' in base_path.split('/')[-1]:
                base_dir = '/'.join(base_path.split('/')[:-1])
            else:
                base_dir = base_path
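            # e.g. base_url "https://supabase.com/docs/llms.txt" -> base_dir "/docs"
            #      base_url "https://supabase.com/docs/guides"   -> base_dir "/docs/guides"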

            # Phase 1: Check llms files at ALL priority levels before checking sitemaps
            for filename in self.DISCOVERY_PRIORITY:
                if not filename.startswith('llms') and not filename.startswith('.well-known/llms') and not filename.startswith('.well-known/ai'):
                    continue  # Skip non-llms files in this phase

                # Priority 1a: Check same directory for llms files
                if base_dir and base_dir != '/':
                    same_dir_url = f"{parsed.scheme}://{parsed.netloc}{base_dir}/{filename}"
                    if self._check_url_exists(same_dir_url):
                        logger.info(f"Discovery found best file in same directory: {same_dir_url}")
                        return same_dir_url

                # Priority 1b: Check root-level for llms files
                file_url = urljoin(base_url, filename)
                if self._check_url_exists(file_url):
                    logger.info(f"Discovery found best file at root: {file_url}")
                    return file_url

                # Priority 1c: Check subdirectories for llms files
                subdirs = []
                if base_dir and base_dir != '/':
                    base_dir_name = base_dir.split('/')[-1]
                    if base_dir_name:
                        subdirs.append(base_dir_name)
                subdirs.extend(["docs", "static", "public", "assets", "doc", "api"])
                for subdir in subdirs:
                    subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                    if self._check_url_exists(subdir_url):
                        logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                        return subdir_url

            # Phase 2: Check sitemaps and robots.txt (only if no llms files found)
            for filename in self.DISCOVERY_PRIORITY:
                if filename.startswith('llms') or filename.startswith('.well-known/llms') or filename.startswith('.well-known/ai'):
                    continue  # Skip llms files, already checked

                # Priority 2a: Check same directory
                if base_dir and base_dir != '/':
                    same_dir_url = f"{parsed.scheme}://{parsed.netloc}{base_dir}/{filename}"
                    if self._check_url_exists(same_dir_url):
                        logger.info(f"Discovery found best file in same directory: {same_dir_url}")
                        return same_dir_url

                # Priority 2b: Check root-level
                file_url = urljoin(base_url, filename)
                if self._check_url_exists(file_url):
                    logger.info(f"Discovery found best file at root: {file_url}")
                    return file_url

                # Priority 2c: For sitemap files, check common subdirectories
                if filename.endswith('.xml') and not filename.startswith('.well-known'):
                    subdirs = []
                    if base_dir and base_dir != '/':
                        base_dir_name = base_dir.split('/')[-1]
                        if base_dir_name:
                            subdirs.append(base_dir_name)
                    subdirs.extend(["docs", "sitemaps", "sitemap", "xml", "feed"])
                    for subdir in subdirs:
                        subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                        if self._check_url_exists(subdir_url):
                            logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                            return subdir_url

            # Check HTML meta tags for sitemap references as final fallback
            html_sitemaps = self._parse_html_meta_tags(base_url)
            if html_sitemaps:
                best_file = html_sitemaps[0]
                logger.info(f"Discovery found best file from HTML meta tags: {best_file}")
                return best_file

            logger.info(f"Discovery completed for {base_url}: no files found")
            return None
        except Exception:
            logger.exception(f"Unexpected error during discovery for {base_url}")
            return None

    def _discover_best_sitemap(self, base_url: str) -> str | None:
        """
        Discover the best available sitemap using priority-based selection.

        Priority order:
        1. Sitemaps from robots.txt (highest priority - explicitly declared)
        2. Standard locations (sitemap_index.xml > sitemap-index.xml > sitemap.xml)
        3. Common subdirectory variations
        4. HTML meta tag references
        5. .well-known directory
        """
        try:
            # Priority 1: Check robots.txt for sitemap declarations
            robots_sitemaps = self._parse_robots_txt(base_url)
            if robots_sitemaps:
                return robots_sitemaps[0]  # Use first sitemap from robots.txt

            # Priority 2: Check standard locations in priority order
            for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
                sitemap_url = urljoin(base_url, filename)
                if self._check_url_exists(sitemap_url):
                    return sitemap_url

            # Priority 3: Check common subdirectory variations
            subdirs = ["sitemaps", "sitemap", "xml", "feed"]
            for subdir in subdirs:
                for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
                    sitemap_url = urljoin(base_url, f"{subdir}/{filename}")
                    if self._check_url_exists(sitemap_url):
                        return sitemap_url

            # Priority 4: Check HTML meta tag references
            html_sitemaps = self._parse_html_meta_tags(base_url)
            if html_sitemaps:
                return html_sitemaps[0]  # Use first sitemap from HTML

            # Priority 5: Check .well-known directory
            well_known_sitemap = urljoin(base_url, ".well-known/sitemap.xml")
            if self._check_url_exists(well_known_sitemap):
                return well_known_sitemap
        except Exception:
            logger.exception(f"Error discovering best sitemap for {base_url}")

        return None

    def _discover_best_llms_file(self, base_url: str) -> str | None:
        """
        Discover the best available llms file using priority-based selection.

        Priority order:
        1. Standard locations (llms-full.txt > llms.txt > llms.md > llms.mdx > llms.markdown)
        2. Common subdirectory variations (static, public, docs, assets)
        3. .well-known directory variants
        """
        try:
            # Priority 1: Check standard root locations in priority order
            for filename in self.DISCOVERY_TARGETS["llms_files"]:
                llms_url = urljoin(base_url, filename)
                if self._check_url_exists(llms_url):
                    return llms_url

            # Priority 2: Check common subdirectory variations
            subdirs = ["static", "public", "docs", "assets", "doc", "api"]
            for subdir in subdirs:
                for filename in self.DISCOVERY_TARGETS["llms_files"]:
                    llms_url = urljoin(base_url, f"{subdir}/{filename}")
                    if self._check_url_exists(llms_url):
                        return llms_url

            # Priority 3: Check .well-known directory variants
            for well_known_file in [".well-known/ai.txt", ".well-known/llms.txt"]:
                well_known_url = urljoin(base_url, well_known_file)
                if self._check_url_exists(well_known_url):
                    return well_known_url
        except Exception:
            logger.exception(f"Error discovering best llms file for {base_url}")

        return None

    def _discover_robots_file(self, base_url: str) -> str | None:
        """
        Discover robots.txt file (always single file at root).
        """
        try:
            robots_url = urljoin(base_url, "robots.txt")
            if self._check_url_exists(robots_url):
                return robots_url
        except Exception:
            logger.exception(f"Error discovering robots file for {base_url}")

        return None

    def _check_url_exists(self, url: str) -> bool:
        """
        Check if a URL exists and returns a successful response.
        """
        try:
            resp = requests.get(url, timeout=5, allow_redirects=True, verify=True)
            success = resp.status_code == 200
            logger.debug(f"URL check: {url} -> {resp.status_code} ({'exists' if success else 'not found'})")
            resp.close()
            return success
        except Exception as e:
            logger.debug(f"URL check failed: {url} -> {e}")
            return False

    def _parse_robots_txt(self, base_url: str) -> list[str]:
        """
        Extract sitemap URLs from robots.txt.

        Args:
            base_url: Base URL to check robots.txt for

        Returns:
            List of sitemap URLs found in robots.txt
        """
        sitemaps: list[str] = []

        try:
            # Use robots.txt relative to the given URL, not always root
            robots_url = urljoin(base_url, "robots.txt")
            logger.info(f"Checking robots.txt at {robots_url}")

            resp = requests.get(robots_url, timeout=30, stream=True, verify=True)
            try:
                if resp.status_code != 200:
                    logger.info(f"No robots.txt found: HTTP {resp.status_code}")
                    return sitemaps

                # Read response with size limit
                content = self._read_response_with_limit(resp, robots_url)

                # Parse robots.txt content for sitemap directives
                for raw_line in content.splitlines():
                    line = raw_line.strip()
                    if line.lower().startswith("sitemap:"):
                        sitemap_value = line.split(":", 1)[1].strip()
                        if sitemap_value:
                            # Allow absolute and relative sitemap values
                            if sitemap_value.lower().startswith(("http://", "https://")):
                                sitemap_url = sitemap_value
                            else:
                                # Resolve relative path against base_url
                                sitemap_url = urljoin(base_url, sitemap_value)
                            sitemaps.append(sitemap_url)
                            logger.info(f"Found sitemap in robots.txt: {sitemap_url}")
            finally:
                # Ensure response is always closed
                resp.close()
        except requests.exceptions.RequestException:
            logger.exception(f"Network error fetching robots.txt from {base_url}")
        except ValueError as e:
            # Size limit exceeded
            logger.warning(f"robots.txt too large at {base_url}: {e}")
        except Exception:
            logger.exception(f"Unexpected error parsing robots.txt from {base_url}")

        return sitemaps

    def _check_standard_patterns(self, base_url: str) -> dict[str, list[str]]:
        """
        Check common file locations for discovery targets.

        Args:
            base_url: Base URL to check standard locations for

        Returns:
            Dictionary with file types and discovered URLs
        """
        discovered: dict[str, list[str]] = {
            "sitemaps": [],
            "llms_files": [],
            "robots_files": []
        }

        try:
            # Check all discovery targets at standard locations
            all_targets = []
            for target_type, files in self.DISCOVERY_TARGETS.items():
                if target_type != "well_known_files":  # Skip well-known, handled separately
                    for filename in files:
                        all_targets.append((target_type, filename))

            for target_type, filename in all_targets:
                try:
                    file_url = urljoin(base_url, filename)
                    resp = requests.get(file_url, timeout=30, allow_redirects=True, stream=True, verify=True)
                    try:
                        if resp.status_code == 200:
                            # Map target type to discovery category
                            if target_type == "sitemap_files":
                                discovered["sitemaps"].append(file_url)
                            elif target_type == "llms_files":
                                discovered["llms_files"].append(file_url)
                            elif target_type == "robots_files":
                                discovered["robots_files"].append(file_url)
                            logger.info(f"Found {target_type} file: {file_url}")
                    finally:
                        resp.close()
                except requests.exceptions.RequestException:
                    logger.debug(f"File not found or network error: {filename}")
                except Exception:
                    logger.exception(f"Unexpected error checking {filename}")
        except Exception:
            logger.exception(f"Unexpected error in standard pattern checking for {base_url}")

        return discovered

    def _parse_html_meta_tags(self, base_url: str) -> list[str]:
        """
        Extract sitemap references from HTML meta tags.

        Args:
            base_url: Base URL to check HTML for meta tags

        Returns:
            List of sitemap URLs found in HTML meta tags
        """
        sitemaps: list[str] = []

        try:
            logger.info(f"Checking HTML meta tags for sitemaps at {base_url}")
            resp = requests.get(base_url, timeout=30, stream=True, verify=True)
            try:
                if resp.status_code != 200:
                    logger.debug(f"Could not fetch HTML for meta tag parsing: HTTP {resp.status_code}")
                    return sitemaps

                # Read response with size limit
                content = self._read_response_with_limit(resp, base_url)

                # Look for sitemap meta tags or link elements
                import re
                from urllib.parse import urlparse

                # Check for <link rel="sitemap" href="..."> (case-insensitive)
                sitemap_link_pattern = re.compile(
                    r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']',
                    re.IGNORECASE
                )
                matches = sitemap_link_pattern.findall(content)
                for match in matches:
                    sitemap_url = urljoin(base_url, match)
                    if urlparse(sitemap_url).scheme in ("http", "https"):
                        sitemaps.append(sitemap_url)
                        logger.info(f"Found sitemap in HTML link tag: {sitemap_url}")

                # Check for <meta name="sitemap" content="..."> (case-insensitive)
                sitemap_meta_pattern = re.compile(
                    r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']',
                    re.IGNORECASE
                )
                matches = sitemap_meta_pattern.findall(content)
                for match in matches:
                    sitemap_url = urljoin(base_url, match)
                    if urlparse(sitemap_url).scheme in ("http", "https"):
                        sitemaps.append(sitemap_url)
                        logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}")
            finally:
                resp.close()
        except requests.exceptions.RequestException:
            logger.exception(f"Network error fetching HTML from {base_url}")
        except ValueError as e:
            # Size limit exceeded
            logger.warning(f"HTML response too large at {base_url}: {e}")
        except Exception:
            logger.exception(f"Unexpected error parsing HTML meta tags from {base_url}")

        return sitemaps

    def _check_well_known_directory(self, base_url: str) -> list[str]:
        """
        Check .well-known/* files for discovery targets.

        Args:
            base_url: Base URL to check .well-known directory for

        Returns:
            List of URLs found in .well-known directory
        """
        well_known_files: list[str] = []

        try:
            for filename in self.DISCOVERY_TARGETS["well_known_files"]:
                try:
                    file_url = urljoin(base_url, filename)
                    resp = requests.get(file_url, timeout=30, allow_redirects=True, stream=True, verify=True)
                    try:
                        if resp.status_code == 200:
                            well_known_files.append(file_url)
                            logger.info(f"Found .well-known file: {file_url}")
                    finally:
                        resp.close()
                except requests.exceptions.RequestException:
                    logger.debug(f"Well-known file not found or network error: {filename}")
                except Exception:
                    logger.exception(f"Unexpected error checking well-known file: {filename}")
        except Exception:
            logger.exception(f"Unexpected error checking .well-known directory for {base_url}")

        return well_known_files

    def _try_common_variations(self, base_url: str) -> dict[str, list[str]]:
        """
        Try pattern variations for discovery targets.

        Args:
            base_url: Base URL to try variations for

        Returns:
            Dictionary with file types and discovered variation URLs
        """
        discovered: dict[str, list[str]] = {
            "sitemaps": [],
            "llms_files": []
        }

        try:
            # Common subdirectories to check
            subdirs = ["public", "static", "assets", "docs", "doc", "api"]

            # Try llms.txt variants in subdirectories
            for subdir in subdirs:
                for llms_file in self.DISCOVERY_TARGETS["llms_files"]:
                    try:
                        file_url = urljoin(base_url, f"{subdir}/{llms_file}")
                        resp = requests.get(file_url, timeout=30, allow_redirects=True, stream=True, verify=True)
                        try:
                            if resp.status_code == 200:
                                discovered["llms_files"].append(file_url)
                                logger.info(f"Found llms file variant: {file_url}")
                        finally:
                            resp.close()
                    except requests.exceptions.RequestException:
                        logger.debug(f"Variant not found: {subdir}/{llms_file}")
                    except Exception:
                        logger.exception(f"Error checking variant: {subdir}/{llms_file}")

            # Try sitemap variants with different paths
            sitemap_paths = [
                "sitemaps/sitemap.xml",
                "sitemap/sitemap.xml",
                "xml/sitemap.xml",
                "feed/sitemap.xml"
            ]

            for sitemap_path in sitemap_paths:
                try:
                    file_url = urljoin(base_url, sitemap_path)
                    resp = requests.get(file_url, timeout=30, allow_redirects=True, stream=True, verify=True)
                    try:
                        if resp.status_code == 200:
                            discovered["sitemaps"].append(file_url)
                            logger.info(f"Found sitemap variant: {file_url}")
                    finally:
                        resp.close()
                except requests.exceptions.RequestException:
                    logger.debug(f"Sitemap variant not found: {sitemap_path}")
                except Exception:
                    logger.exception(f"Error checking sitemap variant: {sitemap_path}")
        except Exception:
            logger.exception(f"Unexpected error trying common variations for {base_url}")

        return discovered
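
For context, a minimal usage sketch of the service above; the import path is
inferred from the file location and the call site is hypothetical (the real
caller is the crawling orchestration code).

```python
from src.server.services.crawling.discovery_service import DiscoveryService  # path assumed

service = DiscoveryService()
best_file = service.discover_files("https://supabase.com/docs/")

if best_file:
    print(f"Discovery selected: {best_file}")  # e.g. https://supabase.com/docs/llms.txt
else:
    print("No llms.txt, sitemap, or robots.txt found for this URL")
```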