Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-24 02:39:17 -05:00)
Implements complete llms.txt link following functionality that crawls linked llms.txt files on the same domain/subdomain, along with critical bug fixes for discovery priority and variant detection.

Backend Core Functionality:
- Add _is_same_domain_or_subdomain method for subdomain matching
- Fix is_llms_variant to detect .txt files in /llms/ directories
- Implement llms.txt link extraction and following logic
- Add two-phase discovery: prioritize ALL llms.txt before sitemaps
- Enhanced progress reporting with discovery metadata

Critical Bug Fixes:
- Discovery priority: Fixed sitemap.xml being found before llms.txt
- is_llms_variant: Now matches /llms/guides.txt, /llms/swift.txt, etc.
- These were blocking bugs preventing link following from working

Frontend UI:
- Add discovery and linked files display to CrawlingProgress component
- Update progress types to include discoveredFile, linkedFiles fields
- Add new crawl types: llms_txt_with_linked_files, discovery_*
- Add "discovery" to ProgressStatus enum and active statuses

Testing:
- 8 subdomain matching unit tests (test_crawling_service_subdomain.py)
- 7 integration tests for link following (test_llms_txt_link_following.py)
- All 15 tests passing
- Validated against real Supabase llms.txt structure (1 main + 8 linked)

Files Modified:

Backend:
- crawling_service.py: Core link following logic (lines 744-788, 862-920)
- url_handler.py: Fixed variant detection (lines 633-665)
- discovery_service.py: Two-phase discovery (lines 137-214)
- 2 new comprehensive test files

Frontend:
- progress/types/progress.ts: Updated types with new fields
- progress/components/CrawlingProgress.tsx: Added UI sections

Real-world testing: Crawling supabase.com/docs now discovers /docs/llms.txt and automatically follows 8 linked llms.txt files, indexing complete documentation from all files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
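The subdomain matching and variant detection described above live in crawling_service.py and url_handler.py, neither of which is shown on this page. As a rough illustration only, a minimal sketch of what those two checks could look like follows; the names `_is_same_domain_or_subdomain` and `is_llms_variant` come from the commit message, but the bodies are assumptions, not the actual Archon implementation.

```python
# Illustrative sketch only - the real logic in crawling_service.py and
# url_handler.py may handle more edge cases (ports, IDNs, query strings).
from urllib.parse import urlparse


def _is_same_domain_or_subdomain(base_url: str, candidate_url: str) -> bool:
    """Return True if candidate_url is on the same domain as, or a subdomain of, base_url."""
    base_host = urlparse(base_url).netloc.lower().split(":")[0]
    cand_host = urlparse(candidate_url).netloc.lower().split(":")[0]
    return cand_host == base_host or cand_host.endswith("." + base_host)


def is_llms_variant(url: str) -> bool:
    """Return True for llms.txt-style files, including .txt files under an /llms/ directory."""
    path = urlparse(url).path.lower()
    name = path.rsplit("/", 1)[-1]
    is_llms_named = name.startswith("llms") and name.endswith((".txt", ".md", ".mdx", ".markdown"))
    in_llms_dir = "/llms/" in path and path.endswith(".txt")  # e.g. /llms/guides.txt, /llms/swift.txt
    return is_llms_named or in_llms_dir
```

Under this sketch, a link such as /llms/guides.txt on the crawled domain would count as an llms variant, which is the behavior the bug-fix bullets above describe.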
620 lines
24 KiB
Python
"""
|
|
Discovery Service for Automatic File Detection
|
|
|
|
Handles automatic discovery and parsing of llms.txt, sitemap.xml, and related files
|
|
to enhance crawling capabilities with priority-based discovery methods.
|
|
"""
|
|
|
|
from urllib.parse import urljoin
|
|
|
|
import requests
|
|
|
|
from ...config.logfire_config import get_logger
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|


class DiscoveryService:
    """Service for discovering related files automatically during crawls."""

    # Maximum response size to prevent memory exhaustion (10MB default)
    MAX_RESPONSE_SIZE = 10 * 1024 * 1024  # 10 MB

    # Global priority order - select ONE best file from all categories
    # All these files contain similar AI/crawling guidance content
    DISCOVERY_PRIORITY = [
        # LLMs files (highest priority - most comprehensive AI guidance)
        "llms-full.txt",
        "llms.txt",
        "llms.md",
        "llms.mdx",
        "llms.markdown",

        # Sitemap files (structural crawling guidance)
        "sitemap_index.xml",
        "sitemap-index.xml",
        "sitemap.xml",

        # Robots file (basic crawling rules)
        "robots.txt",

        # Well-known variants (alternative locations)
        ".well-known/ai.txt",
        ".well-known/llms.txt",
        ".well-known/sitemap.xml",
    ]

    # Categorized discovery targets for helper methods
    # Maintains the same order and values as DISCOVERY_PRIORITY
    DISCOVERY_TARGETS = {
        "llms_files": [
            "llms-full.txt",
            "llms.txt",
            "llms.md",
            "llms.mdx",
            "llms.markdown",
        ],
        "sitemap_files": [
            "sitemap_index.xml",
            "sitemap-index.xml",
            "sitemap.xml",
        ],
        "robots_files": [
            "robots.txt",
        ],
        "well_known_files": [
            ".well-known/ai.txt",
            ".well-known/llms.txt",
            ".well-known/sitemap.xml",
        ],
    }

    def _read_response_with_limit(self, response: requests.Response, url: str, max_size: int | None = None) -> str:
        """
        Read response content with size limit to prevent memory exhaustion.

        Args:
            response: The response object to read from
            url: URL being read (for logging)
            max_size: Maximum bytes to read (defaults to MAX_RESPONSE_SIZE)

        Returns:
            Response text content

        Raises:
            ValueError: If response exceeds size limit
        """
        if max_size is None:
            max_size = self.MAX_RESPONSE_SIZE

        try:
            chunks = []
            total_size = 0

            # Read response in chunks to enforce size limit
            for chunk in response.iter_content(chunk_size=8192, decode_unicode=False):
                if chunk:
                    total_size += len(chunk)
                    if total_size > max_size:
                        response.close()
                        size_mb = max_size / (1024 * 1024)
                        logger.warning(
                            f"Response size exceeded limit of {size_mb:.1f}MB for {url}, "
                            f"received {total_size / (1024 * 1024):.1f}MB"
                        )
                        raise ValueError(f"Response size exceeds {size_mb:.1f}MB limit")
                    chunks.append(chunk)

            # Decode the complete response
            content_bytes = b''.join(chunks)
            # Try to decode with the response encoding or fall back to utf-8
            encoding = response.encoding or 'utf-8'
            try:
                return content_bytes.decode(encoding)
            except UnicodeDecodeError:
                # Fallback to utf-8 with error replacement
                return content_bytes.decode('utf-8', errors='replace')

        except Exception:
            # Ensure response is closed on any error
            response.close()
            raise

    def discover_files(self, base_url: str) -> str | None:
        """
        Main discovery orchestrator - selects ONE best file across all categories.
        All files contain similar AI/crawling guidance, so we only need the best one.

        Args:
            base_url: Base URL to discover files for

        Returns:
            Single best URL found, or None if no files discovered
        """
        try:
            logger.info(f"Starting single-file discovery for {base_url}")

            # Check files in global priority order
            # IMPORTANT: Check root-level llms files BEFORE same-directory sitemaps
            # This ensures llms.txt at root is preferred over /docs/sitemap.xml
            from urllib.parse import urlparse

            # Get the directory path of the base URL
            parsed = urlparse(base_url)
            base_path = parsed.path.rstrip('/')
            # Extract directory (remove filename if present)
            if '.' in base_path.split('/')[-1]:
                base_dir = '/'.join(base_path.split('/')[:-1])
            else:
                base_dir = base_path

            # Phase 1: Check llms files at ALL priority levels before checking sitemaps
            for filename in self.DISCOVERY_PRIORITY:
                if not filename.startswith('llms') and not filename.startswith('.well-known/llms') and not filename.startswith('.well-known/ai'):
                    continue  # Skip non-llms files in this phase

                # Priority 1a: Check same directory for llms files
                if base_dir and base_dir != '/':
                    same_dir_url = f"{parsed.scheme}://{parsed.netloc}{base_dir}/{filename}"
                    if self._check_url_exists(same_dir_url):
                        logger.info(f"Discovery found best file in same directory: {same_dir_url}")
                        return same_dir_url

                # Priority 1b: Check root-level for llms files
                file_url = urljoin(base_url, filename)
                if self._check_url_exists(file_url):
                    logger.info(f"Discovery found best file at root: {file_url}")
                    return file_url

                # Priority 1c: Check subdirectories for llms files
                subdirs = []
                if base_dir and base_dir != '/':
                    base_dir_name = base_dir.split('/')[-1]
                    if base_dir_name:
                        subdirs.append(base_dir_name)
                subdirs.extend(["docs", "static", "public", "assets", "doc", "api"])

                for subdir in subdirs:
                    subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                    if self._check_url_exists(subdir_url):
                        logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                        return subdir_url

            # Phase 2: Check sitemaps and robots.txt (only if no llms files found)
            for filename in self.DISCOVERY_PRIORITY:
                if filename.startswith('llms') or filename.startswith('.well-known/llms') or filename.startswith('.well-known/ai'):
                    continue  # Skip llms files, already checked

                # Priority 2a: Check same directory
                if base_dir and base_dir != '/':
                    same_dir_url = f"{parsed.scheme}://{parsed.netloc}{base_dir}/{filename}"
                    if self._check_url_exists(same_dir_url):
                        logger.info(f"Discovery found best file in same directory: {same_dir_url}")
                        return same_dir_url

                # Priority 2b: Check root-level
                file_url = urljoin(base_url, filename)
                if self._check_url_exists(file_url):
                    logger.info(f"Discovery found best file at root: {file_url}")
                    return file_url

                # Priority 2c: For sitemap files, check common subdirectories
                if filename.endswith('.xml') and not filename.startswith('.well-known'):
                    subdirs = []
                    if base_dir and base_dir != '/':
                        base_dir_name = base_dir.split('/')[-1]
                        if base_dir_name:
                            subdirs.append(base_dir_name)
                    subdirs.extend(["docs", "sitemaps", "sitemap", "xml", "feed"])

                    for subdir in subdirs:
                        subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                        if self._check_url_exists(subdir_url):
                            logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                            return subdir_url

            # Check HTML meta tags for sitemap references as final fallback
            html_sitemaps = self._parse_html_meta_tags(base_url)
            if html_sitemaps:
                best_file = html_sitemaps[0]
                logger.info(f"Discovery found best file from HTML meta tags: {best_file}")
                return best_file

            logger.info(f"Discovery completed for {base_url}: no files found")
            return None

        except Exception:
            logger.exception(f"Unexpected error during discovery for {base_url}")
            return None

    def _discover_best_sitemap(self, base_url: str) -> str | None:
        """
        Discover the best available sitemap using priority-based selection.

        Priority order:
        1. Sitemaps from robots.txt (highest priority - explicitly declared)
        2. Standard locations (sitemap_index.xml > sitemap-index.xml > sitemap.xml)
        3. Common subdirectory variations
        4. HTML meta tag references
        5. .well-known directory
        """
        try:
            # Priority 1: Check robots.txt for sitemap declarations
            robots_sitemaps = self._parse_robots_txt(base_url)
            if robots_sitemaps:
                return robots_sitemaps[0]  # Use first sitemap from robots.txt

            # Priority 2: Check standard locations in priority order
            for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
                sitemap_url = urljoin(base_url, filename)
                if self._check_url_exists(sitemap_url):
                    return sitemap_url

            # Priority 3: Check common subdirectory variations
            subdirs = ["sitemaps", "sitemap", "xml", "feed"]
            for subdir in subdirs:
                for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
                    sitemap_url = urljoin(base_url, f"{subdir}/{filename}")
                    if self._check_url_exists(sitemap_url):
                        return sitemap_url

            # Priority 4: Check HTML meta tag references
            html_sitemaps = self._parse_html_meta_tags(base_url)
            if html_sitemaps:
                return html_sitemaps[0]  # Use first sitemap from HTML

            # Priority 5: Check .well-known directory
            well_known_sitemap = urljoin(base_url, ".well-known/sitemap.xml")
            if self._check_url_exists(well_known_sitemap):
                return well_known_sitemap

        except Exception:
            logger.exception(f"Error discovering best sitemap for {base_url}")

        return None

    def _discover_best_llms_file(self, base_url: str) -> str | None:
        """
        Discover the best available llms file using priority-based selection.

        Priority order:
        1. Standard locations (llms-full.txt > llms.txt > llms.md > llms.mdx > llms.markdown)
        2. Common subdirectory variations (static, public, docs, assets)
        3. .well-known directory variants
        """
        try:
            # Priority 1: Check standard root locations in priority order
            for filename in self.DISCOVERY_TARGETS["llms_files"]:
                llms_url = urljoin(base_url, filename)
                if self._check_url_exists(llms_url):
                    return llms_url

            # Priority 2: Check common subdirectory variations
            subdirs = ["static", "public", "docs", "assets", "doc", "api"]
            for subdir in subdirs:
                for filename in self.DISCOVERY_TARGETS["llms_files"]:
                    llms_url = urljoin(base_url, f"{subdir}/{filename}")
                    if self._check_url_exists(llms_url):
                        return llms_url

            # Priority 3: Check .well-known directory variants
            for well_known_file in [".well-known/ai.txt", ".well-known/llms.txt"]:
                well_known_url = urljoin(base_url, well_known_file)
                if self._check_url_exists(well_known_url):
                    return well_known_url

        except Exception:
            logger.exception(f"Error discovering best llms file for {base_url}")

        return None

    def _discover_robots_file(self, base_url: str) -> str | None:
        """
        Discover robots.txt file (always single file at root).
        """
        try:
            robots_url = urljoin(base_url, "robots.txt")
            if self._check_url_exists(robots_url):
                return robots_url
        except Exception:
            logger.exception(f"Error discovering robots file for {base_url}")

        return None

    def _check_url_exists(self, url: str) -> bool:
        """
        Check if a URL exists and returns a successful response.
        """
        try:
            resp = requests.get(url, timeout=5, allow_redirects=True, verify=True)
            success = resp.status_code == 200
            logger.debug(f"URL check: {url} -> {resp.status_code} ({'exists' if success else 'not found'})")
            resp.close()
            return success
        except Exception as e:
            logger.debug(f"URL check failed: {url} -> {e}")
            return False

    def _parse_robots_txt(self, base_url: str) -> list[str]:
        """
        Extract sitemap URLs from robots.txt.

        Args:
            base_url: Base URL to check robots.txt for

        Returns:
            List of sitemap URLs found in robots.txt
        """
        sitemaps: list[str] = []

        try:
            # Use robots.txt relative to the given URL, not always root
            robots_url = urljoin(base_url, "robots.txt")
            logger.info(f"Checking robots.txt at {robots_url}")

            resp = requests.get(robots_url, timeout=30, stream=True, verify=True)

            try:
                if resp.status_code != 200:
                    logger.info(f"No robots.txt found: HTTP {resp.status_code}")
                    return sitemaps

                # Read response with size limit
                content = self._read_response_with_limit(resp, robots_url)

                # Parse robots.txt content for sitemap directives
                for raw_line in content.splitlines():
                    line = raw_line.strip()
                    if line.lower().startswith("sitemap:"):
                        sitemap_value = line.split(":", 1)[1].strip()
                        if sitemap_value:
                            # Allow absolute and relative sitemap values
                            if sitemap_value.lower().startswith(("http://", "https://")):
                                sitemap_url = sitemap_value
                            else:
                                # Resolve relative path against base_url
                                sitemap_url = urljoin(base_url, sitemap_value)
                            sitemaps.append(sitemap_url)
                            logger.info(f"Found sitemap in robots.txt: {sitemap_url}")

            finally:
                # Ensure response is always closed
                resp.close()

        except requests.exceptions.RequestException:
            logger.exception(f"Network error fetching robots.txt from {base_url}")
        except ValueError as e:
            # Size limit exceeded
            logger.warning(f"robots.txt too large at {base_url}: {e}")
        except Exception:
            logger.exception(f"Unexpected error parsing robots.txt from {base_url}")

        return sitemaps

    def _check_standard_patterns(self, base_url: str) -> dict[str, list[str]]:
        """
        Check common file locations for discovery targets.

        Args:
            base_url: Base URL to check standard locations for

        Returns:
            Dictionary with file types and discovered URLs
        """
        discovered: dict[str, list[str]] = {
            "sitemaps": [],
            "llms_files": [],
            "robots_files": []
        }

        try:
            # Check all discovery targets at standard locations
            all_targets = []
            for target_type, files in self.DISCOVERY_TARGETS.items():
                if target_type != "well_known_files":  # Skip well-known, handled separately
                    for filename in files:
                        all_targets.append((target_type, filename))

            for target_type, filename in all_targets:
                try:
                    file_url = urljoin(base_url, filename)
                    resp = requests.get(file_url, timeout=30, allow_redirects=True, stream=True, verify=True)

                    try:
                        if resp.status_code == 200:
                            # Map target type to discovery category
                            if target_type == "sitemap_files":
                                discovered["sitemaps"].append(file_url)
                            elif target_type == "llms_files":
                                discovered["llms_files"].append(file_url)
                            elif target_type == "robots_files":
                                discovered["robots_files"].append(file_url)

                            logger.info(f"Found {target_type} file: {file_url}")

                    finally:
                        resp.close()

                except requests.exceptions.RequestException:
                    logger.debug(f"File not found or network error: {filename}")
                except Exception:
                    logger.exception(f"Unexpected error checking {filename}")

        except Exception:
            logger.exception(f"Unexpected error in standard pattern checking for {base_url}")

        return discovered

    def _parse_html_meta_tags(self, base_url: str) -> list[str]:
        """
        Extract sitemap references from HTML meta tags.

        Args:
            base_url: Base URL to check HTML for meta tags

        Returns:
            List of sitemap URLs found in HTML meta tags
        """
        sitemaps: list[str] = []

        try:
            logger.info(f"Checking HTML meta tags for sitemaps at {base_url}")
            resp = requests.get(base_url, timeout=30, stream=True, verify=True)

            try:
                if resp.status_code != 200:
                    logger.debug(f"Could not fetch HTML for meta tag parsing: HTTP {resp.status_code}")
                    return sitemaps

                # Read response with size limit
                content = self._read_response_with_limit(resp, base_url)

                # Look for sitemap meta tags or link elements
                import re
                from urllib.parse import urlparse

                # Check for <link rel="sitemap" href="..."> (case-insensitive)
                sitemap_link_pattern = re.compile(
                    r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']',
                    re.IGNORECASE
                )
                matches = sitemap_link_pattern.findall(content)

                for match in matches:
                    sitemap_url = urljoin(base_url, match)
                    if urlparse(sitemap_url).scheme in ("http", "https"):
                        sitemaps.append(sitemap_url)
                        logger.info(f"Found sitemap in HTML link tag: {sitemap_url}")

                # Check for <meta name="sitemap" content="..."> (case-insensitive)
                sitemap_meta_pattern = re.compile(
                    r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']',
                    re.IGNORECASE
                )
                matches = sitemap_meta_pattern.findall(content)

                for match in matches:
                    sitemap_url = urljoin(base_url, match)
                    if urlparse(sitemap_url).scheme in ("http", "https"):
                        sitemaps.append(sitemap_url)
                        logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}")

            finally:
                resp.close()

        except requests.exceptions.RequestException:
            logger.exception(f"Network error fetching HTML from {base_url}")
        except ValueError as e:
            # Size limit exceeded
            logger.warning(f"HTML response too large at {base_url}: {e}")
        except Exception:
            logger.exception(f"Unexpected error parsing HTML meta tags from {base_url}")

        return sitemaps

    def _check_well_known_directory(self, base_url: str) -> list[str]:
        """
        Check .well-known/* files for discovery targets.

        Args:
            base_url: Base URL to check .well-known directory for

        Returns:
            List of URLs found in .well-known directory
        """
        well_known_files: list[str] = []

        try:
            for filename in self.DISCOVERY_TARGETS["well_known_files"]:
                try:
                    file_url = urljoin(base_url, filename)
                    resp = requests.get(file_url, timeout=30, allow_redirects=True, stream=True, verify=True)

                    try:
                        if resp.status_code == 200:
                            well_known_files.append(file_url)
                            logger.info(f"Found .well-known file: {file_url}")

                    finally:
                        resp.close()

                except requests.exceptions.RequestException:
                    logger.debug(f"Well-known file not found or network error: {filename}")
                except Exception:
                    logger.exception(f"Unexpected error checking well-known file: {filename}")

        except Exception:
            logger.exception(f"Unexpected error checking .well-known directory for {base_url}")

        return well_known_files

    def _try_common_variations(self, base_url: str) -> dict[str, list[str]]:
        """
        Try pattern variations for discovery targets.

        Args:
            base_url: Base URL to try variations for

        Returns:
            Dictionary with file types and discovered variation URLs
        """
        discovered: dict[str, list[str]] = {
            "sitemaps": [],
            "llms_files": []
        }

        try:
            # Common subdirectories to check
            subdirs = ["public", "static", "assets", "docs", "doc", "api"]

            # Try llms.txt variants in subdirectories
            for subdir in subdirs:
                for llms_file in self.DISCOVERY_TARGETS["llms_files"]:
                    try:
                        file_url = urljoin(base_url, f"{subdir}/{llms_file}")
                        resp = requests.get(file_url, timeout=30, allow_redirects=True, stream=True, verify=True)

                        try:
                            if resp.status_code == 200:
                                discovered["llms_files"].append(file_url)
                                logger.info(f"Found llms file variant: {file_url}")

                        finally:
                            resp.close()

                    except requests.exceptions.RequestException:
                        logger.debug(f"Variant not found: {subdir}/{llms_file}")
                    except Exception:
                        logger.exception(f"Error checking variant: {subdir}/{llms_file}")

            # Try sitemap variants with different paths
            sitemap_paths = [
                "sitemaps/sitemap.xml",
                "sitemap/sitemap.xml",
                "xml/sitemap.xml",
                "feed/sitemap.xml"
            ]

            for sitemap_path in sitemap_paths:
                try:
                    file_url = urljoin(base_url, sitemap_path)
                    resp = requests.get(file_url, timeout=30, allow_redirects=True, stream=True, verify=True)

                    try:
                        if resp.status_code == 200:
                            discovered["sitemaps"].append(file_url)
                            logger.info(f"Found sitemap variant: {file_url}")

                    finally:
                        resp.close()

                except requests.exceptions.RequestException:
                    logger.debug(f"Sitemap variant not found: {sitemap_path}")
                except Exception:
                    logger.exception(f"Error checking sitemap variant: {sitemap_path}")

        except Exception:
            logger.exception(f"Unexpected error trying common variations for {base_url}")

        return discovered
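
# ---------------------------------------------------------------------------
# Illustrative usage note (not part of the original module): discover_files()
# is the orchestrator shown above, returning the single best guidance file for
# a site, with llms.txt variants preferred over sitemaps and robots.txt. The
# URL below is only a placeholder example.
#
#     service = DiscoveryService()
#     best = service.discover_files("https://supabase.com/docs")
#     if best:
#         print(f"Best discovery file: {best}")
#     else:
#         print("No llms/sitemap/robots file discovered")
# ---------------------------------------------------------------------------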