Implement priority-based automatic discovery of llms.txt and sitemap.xml files

- Add DiscoveryService with single-file priority selection
  - Priority: llms-full.txt > llms.txt > llms.md > llms.mdx > sitemap.xml > robots.txt
  - All of these files carry similar AI/crawling guidance, so only the best one is needed
  - Sitemap declarations in robots.txt take highest priority (checked before the list above)
  - Falls back to common subdirectories for llms files

- Enhance URLHandler with discovery helper methods
  - Add is_robots_txt, is_llms_variant, is_well_known_file, get_base_url methods
  - Follow existing patterns with proper error handling

- Integrate discovery into CrawlingService orchestration
  - When discovery finds a file: crawl ONLY the discovered file (not the main URL)
  - When nothing is discovered: crawl the main URL normally
  - Fixes the issue where both the main URL and the discovered file were crawled

- Add discovery stage to progress mapping
  - New "discovery" stage in progress flow
  - Clear progress messages for discovered files

- Comprehensive test coverage
  - Tests for priority-based selection logic
  - Tests for robots.txt priority and fallback behavior
  - Updated existing tests for new return formats

Enables efficient crawling by selecting the single best guidance file instead
of crawling redundant content from multiple similar files.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
leex279
2025-09-08 09:03:15 +02:00
parent 012d2c58ed
commit 1a55d93a4e
6 changed files with 1193 additions and 42 deletions

View File

@@ -17,6 +17,7 @@ from ...utils.progress.progress_tracker import ProgressTracker
# Import strategies
# Import operations
from .discovery_service import DiscoveryService
from .document_storage_operations import DocumentStorageOperations
from .helpers.site_config import SiteConfig
@@ -83,6 +84,7 @@ class CrawlingService:
# Initialize operations
self.doc_storage_ops = DocumentStorageOperations(self.supabase_client)
self.discovery_service = DiscoveryService()
# Track progress state across all stages to prevent UI resets
self.progress_state = {"progressId": self.progress_id} if self.progress_id else {}
@@ -132,7 +134,7 @@ class CrawlingService:
f"total_pages={kwargs.get('total_pages', 'N/A')} | processed_pages={kwargs.get('processed_pages', 'N/A')} | "
f"kwargs_keys={list(kwargs.keys())}"
)
# Update progress via tracker (stores in memory for HTTP polling)
await self.progress_tracker.update(
status=base_status,
@@ -332,16 +334,68 @@ class CrawlingService:
# Check for cancellation before proceeding
self._check_cancellation()
# Discovery phase - find the single best related file
discovered_urls = []
if request.get("auto_discovery", True): # Default enabled
await update_mapped_progress(
"discovery", 25, f"Discovering best related file for {url}", current_url=url
)
try:
discovered_file = self.discovery_service.discover_files(url)
# Add the single best discovered file to crawl list
if discovered_file:
safe_logfire_info(f"Discovery found file: {discovered_file}")
# Filter through is_binary_file() check like existing code
if not self.url_handler.is_binary_file(discovered_file):
discovered_urls.append(discovered_file)
safe_logfire_info(f"Adding discovered file to crawl: {discovered_file}")
else:
safe_logfire_info(f"Skipping binary file: {discovered_file}")
else:
safe_logfire_info(f"Discovery found no files for {url}")
file_count = len(discovered_urls)
safe_logfire_info(f"Discovery selected {file_count} best file to crawl")
await update_mapped_progress(
"discovery", 100, f"Discovery completed: selected {file_count} best file", current_url=url
)
except Exception as e:
safe_logfire_error(f"Discovery phase failed: {e}")
# Continue with regular crawl even if discovery fails
await update_mapped_progress(
"discovery", 100, "Discovery phase failed, continuing with regular crawl", current_url=url
)
# Analyzing stage - determine what to crawl
if discovered_urls:
# Discovery found a file - crawl ONLY the discovered file, not the main URL
total_urls_to_crawl = len(discovered_urls)
await update_mapped_progress(
"analyzing", 50, f"Analyzing discovered file: {discovered_urls[0]}",
total_pages=total_urls_to_crawl,
processed_pages=0
)
# Crawl only the discovered file
safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_urls[0]}")
crawl_results, crawl_type = await self._crawl_by_url_type(discovered_urls[0], request)
else:
# No discovery - crawl the main URL normally
total_urls_to_crawl = 1
await update_mapped_progress(
"analyzing", 50, f"Analyzing URL type for {url}",
total_pages=total_urls_to_crawl,
processed_pages=0
)
# Crawl the main URL
safe_logfire_info(f"No discovery file found, crawling main URL: {url}")
crawl_results, crawl_type = await self._crawl_by_url_type(url, request)
# Update progress tracker with crawl type
if self.progress_tracker and crawl_type:
await self.progress_tracker.update(
@@ -415,7 +469,7 @@ class CrawlingService:
if request.get("extract_code_examples", True) and actual_chunks_stored > 0:
# Check for cancellation before starting code extraction
self._check_cancellation()
await update_mapped_progress("code_extraction", 0, "Starting code extraction...")
# Create progress callback for code extraction
@@ -424,7 +478,7 @@ class CrawlingService:
# Use ProgressMapper to ensure progress never goes backwards
raw_progress = data.get("progress", data.get("percentage", 0))
mapped_progress = self.progress_mapper.map_progress("code_extraction", raw_progress)
# Update progress state via tracker
await self.progress_tracker.update(
status=data.get("status", "code_extraction"),
@@ -445,7 +499,7 @@ class CrawlingService:
# Check for cancellation after code extraction
self._check_cancellation()
# Send heartbeat after code extraction
await send_heartbeat_if_needed()
@@ -571,7 +625,7 @@ class CrawlingService:
crawl_type = None
if self.url_handler.is_txt(url) or self.url_handler.is_markdown(url):
# Handle text files
crawl_type = "llms-txt" if "llms" in url.lower() else "text_file"
if self.progress_tracker:
await self.progress_tracker.update(
@@ -593,7 +647,7 @@ class CrawlingService:
if self.url_handler.is_link_collection_file(url, content):
# Extract links from the content
extracted_links = self.url_handler.extract_markdown_links(content, url)
# Filter out self-referential links to avoid redundant crawling
if extracted_links:
original_count = len(extracted_links)
@@ -604,7 +658,7 @@ class CrawlingService:
self_filtered_count = original_count - len(extracted_links)
if self_filtered_count > 0:
logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")
# Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
if extracted_links:
original_count = len(extracted_links)
@@ -612,7 +666,7 @@ class CrawlingService:
filtered_count = original_count - len(extracted_links)
if filtered_count > 0:
logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")
if extracted_links:
# Crawl the extracted links using batch crawling
logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")
@@ -623,11 +677,11 @@ class CrawlingService:
start_progress=10,
end_progress=20,
)
# Combine original text file results with batch results
crawl_results.extend(batch_results)
crawl_type = "link_collection_with_crawled_links"
logger.info(f"Link collection crawling completed: {len(crawl_results)} total results (1 text file + {len(batch_results)} extracted links)")
else:
logger.info(f"No valid links found in link collection file: {url}")

View File

@@ -0,0 +1,441 @@
"""
Discovery Service for Automatic File Detection
Handles automatic discovery and parsing of llms.txt, sitemap.xml, and related files
to enhance crawling capabilities with priority-based discovery methods.
"""
from urllib.parse import urljoin
import requests
from ...config.logfire_config import get_logger
logger = get_logger(__name__)
class DiscoveryService:
"""Service for discovering related files automatically during crawls."""
# Global priority order - select ONE best file from all categories
# All these files contain similar AI/crawling guidance content
DISCOVERY_PRIORITY = [
# LLMs files (highest priority - most comprehensive AI guidance)
"llms-full.txt",
"llms.txt",
"llms.md",
"llms.mdx",
"llms.markdown",
# Sitemap files (structural crawling guidance)
"sitemap_index.xml",
"sitemap-index.xml",
"sitemap.xml",
# Robots file (basic crawling rules)
"robots.txt",
# Well-known variants (alternative locations)
".well-known/ai.txt",
".well-known/llms.txt",
".well-known/sitemap.xml"
]
def discover_files(self, base_url: str) -> str | None:
"""
Main discovery orchestrator - selects ONE best file across all categories.
All files contain similar AI/crawling guidance, so we only need the best one.
Args:
base_url: Base URL to discover files for
Returns:
Single best URL found, or None if no files discovered
"""
try:
logger.info(f"Starting single-file discovery for {base_url}")
# First check robots.txt for explicit sitemap declarations (special case)
robots_sitemaps = self._parse_robots_txt(base_url)
if robots_sitemaps:
best_file = robots_sitemaps[0] # Use first sitemap from robots.txt
logger.info(f"Discovery found best file from robots.txt: {best_file}")
return best_file
# Check files in global priority order
for filename in self.DISCOVERY_PRIORITY:
# Try root location first
file_url = urljoin(base_url, f"/{filename}")
if self._check_url_exists(file_url):
logger.info(f"Discovery found best file: {file_url}")
return file_url
# For llms files, also try common subdirectories
if filename.startswith('llms'):
for subdir in ["static", "public", "docs", "assets", "doc", "api"]:
subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
if self._check_url_exists(subdir_url):
logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
return subdir_url
# For sitemap files, also try common subdirectories
if filename.endswith('.xml') and not filename.startswith('.well-known'):
for subdir in ["sitemaps", "sitemap", "xml", "feed"]:
subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
if self._check_url_exists(subdir_url):
logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
return subdir_url
# Check HTML meta tags for sitemap references as final fallback
html_sitemaps = self._parse_html_meta_tags(base_url)
if html_sitemaps:
best_file = html_sitemaps[0]
logger.info(f"Discovery found best file from HTML meta tags: {best_file}")
return best_file
logger.info(f"Discovery completed for {base_url}: no files found")
return None
except Exception:
logger.exception(f"Unexpected error during discovery for {base_url}")
return None
def _discover_best_sitemap(self, base_url: str) -> str | None:
"""
Discover the best available sitemap using priority-based selection.
Priority order:
1. Sitemaps from robots.txt (highest priority - explicitly declared)
2. Standard locations (sitemap_index.xml > sitemap-index.xml > sitemap.xml)
3. Common subdirectory variations
4. HTML meta tag references
5. .well-known directory
"""
try:
# Priority 1: Check robots.txt for sitemap declarations
robots_sitemaps = self._parse_robots_txt(base_url)
if robots_sitemaps:
return robots_sitemaps[0] # Use first sitemap from robots.txt
# Priority 2: Check standard locations in priority order
for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
sitemap_url = urljoin(base_url, f"/{filename}")
if self._check_url_exists(sitemap_url):
return sitemap_url
# Priority 3: Check common subdirectory variations
subdirs = ["sitemaps", "sitemap", "xml", "feed"]
for subdir in subdirs:
for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
sitemap_url = urljoin(base_url, f"/{subdir}/{filename}")
if self._check_url_exists(sitemap_url):
return sitemap_url
# Priority 4: Check HTML meta tag references
html_sitemaps = self._parse_html_meta_tags(base_url)
if html_sitemaps:
return html_sitemaps[0] # Use first sitemap from HTML
# Priority 5: Check .well-known directory
well_known_sitemap = urljoin(base_url, "/.well-known/sitemap.xml")
if self._check_url_exists(well_known_sitemap):
return well_known_sitemap
except Exception:
logger.exception(f"Error discovering best sitemap for {base_url}")
return None
def _discover_best_llms_file(self, base_url: str) -> str | None:
"""
Discover the best available llms file using priority-based selection.
Priority order:
1. Standard locations (llms-full.txt > llms.txt > llms.md > llms.mdx > llms.markdown)
2. Common subdirectory variations (static, public, docs, assets)
3. .well-known directory variants
"""
try:
# Priority 1: Check standard root locations in priority order
for filename in self.DISCOVERY_TARGETS["llms_files"]:
llms_url = urljoin(base_url, f"/{filename}")
if self._check_url_exists(llms_url):
return llms_url
# Priority 2: Check common subdirectory variations
subdirs = ["static", "public", "docs", "assets", "doc", "api"]
for subdir in subdirs:
for filename in self.DISCOVERY_TARGETS["llms_files"]:
llms_url = urljoin(base_url, f"/{subdir}/{filename}")
if self._check_url_exists(llms_url):
return llms_url
# Priority 3: Check .well-known directory variants
for well_known_file in [".well-known/ai.txt", ".well-known/llms.txt"]:
well_known_url = urljoin(base_url, f"/{well_known_file}")
if self._check_url_exists(well_known_url):
return well_known_url
except Exception:
logger.exception(f"Error discovering best llms file for {base_url}")
return None
def _discover_robots_file(self, base_url: str) -> str | None:
"""
Discover robots.txt file (always single file at root).
"""
try:
robots_url = urljoin(base_url, "/robots.txt")
if self._check_url_exists(robots_url):
return robots_url
except Exception:
logger.exception(f"Error discovering robots file for {base_url}")
return None
def _check_url_exists(self, url: str) -> bool:
"""
Check if a URL exists and returns a successful response.
"""
try:
resp = requests.get(url, timeout=5, allow_redirects=True)
success = resp.status_code == 200
logger.debug(f"URL check: {url} -> {resp.status_code} ({'exists' if success else 'not found'})")
return success
except Exception as e:
logger.debug(f"URL check failed: {url} -> {e}")
return False
def _parse_robots_txt(self, base_url: str) -> list[str]:
"""
Extract sitemap URLs from robots.txt.
Args:
base_url: Base URL to check robots.txt for
Returns:
List of sitemap URLs found in robots.txt
"""
sitemaps: list[str] = []
try:
robots_url = urljoin(base_url, "/robots.txt")
logger.info(f"Checking robots.txt at {robots_url}")
resp = requests.get(robots_url, timeout=30)
if resp.status_code != 200:
logger.info(f"No robots.txt found: HTTP {resp.status_code}")
return sitemaps
# Parse robots.txt content for sitemap directives
for line in resp.text.splitlines():
line = line.strip().lower()
if line.startswith("sitemap:"):
sitemap_url = line.split(":", 1)[1].strip()
# Validate URL format before adding
if sitemap_url and (sitemap_url.startswith('http://') or sitemap_url.startswith('https://')):
sitemaps.append(sitemap_url)
logger.info(f"Found sitemap in robots.txt: {sitemap_url}")
except requests.exceptions.RequestException:
logger.exception(f"Network error fetching robots.txt from {base_url}")
except Exception:
logger.exception(f"Unexpected error parsing robots.txt from {base_url}")
return sitemaps
def _check_standard_patterns(self, base_url: str) -> dict[str, list[str]]:
"""
Check common file locations for discovery targets.
Args:
base_url: Base URL to check standard locations for
Returns:
Dictionary with file types and discovered URLs
"""
discovered: dict[str, list[str]] = {
"sitemaps": [],
"llms_files": [],
"robots_files": []
}
try:
# Check all discovery targets at standard locations
all_targets = []
for target_type, files in self.DISCOVERY_TARGETS.items():
if target_type != "well_known_files": # Skip well-known, handled separately
for filename in files:
all_targets.append((target_type, filename))
for target_type, filename in all_targets:
try:
file_url = urljoin(base_url, f"/{filename}")
resp = requests.get(file_url, timeout=30, allow_redirects=True)
if resp.status_code == 200:
# Map target type to discovery category
if target_type == "sitemap_files":
discovered["sitemaps"].append(file_url)
elif target_type == "llms_files":
discovered["llms_files"].append(file_url)
elif target_type == "robots_files":
discovered["robots_files"].append(file_url)
logger.info(f"Found {target_type} file: {file_url}")
except requests.exceptions.RequestException:
logger.debug(f"File not found or network error: {filename}")
except Exception:
logger.exception(f"Unexpected error checking {filename}")
except Exception:
logger.exception(f"Unexpected error in standard pattern checking for {base_url}")
return discovered
def _parse_html_meta_tags(self, base_url: str) -> list[str]:
"""
Extract sitemap references from HTML meta tags.
Args:
base_url: Base URL to check HTML for meta tags
Returns:
List of sitemap URLs found in HTML meta tags
"""
sitemaps: list[str] = []
try:
logger.info(f"Checking HTML meta tags for sitemaps at {base_url}")
resp = requests.get(base_url, timeout=30)
if resp.status_code != 200:
logger.debug(f"Could not fetch HTML for meta tag parsing: HTTP {resp.status_code}")
return sitemaps
content = resp.text.lower()
# Look for sitemap meta tags or link elements
import re
# Check for <link rel="sitemap" href="...">
sitemap_link_pattern = r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']'
matches = re.findall(sitemap_link_pattern, content)
for match in matches:
sitemap_url = urljoin(base_url, match)
sitemaps.append(sitemap_url)
logger.info(f"Found sitemap in HTML link tag: {sitemap_url}")
# Check for <meta name="sitemap" content="...">
sitemap_meta_pattern = r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']'
matches = re.findall(sitemap_meta_pattern, content)
for match in matches:
sitemap_url = urljoin(base_url, match)
sitemaps.append(sitemap_url)
logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}")
except requests.exceptions.RequestException:
logger.exception(f"Network error fetching HTML from {base_url}")
except Exception:
logger.exception(f"Unexpected error parsing HTML meta tags from {base_url}")
return sitemaps
def _check_well_known_directory(self, base_url: str) -> list[str]:
"""
Check .well-known/* files for discovery targets.
Args:
base_url: Base URL to check .well-known directory for
Returns:
List of URLs found in .well-known directory
"""
well_known_files: list[str] = []
try:
for filename in self.DISCOVERY_TARGETS["well_known_files"]:
try:
file_url = urljoin(base_url, f"/{filename}")
resp = requests.get(file_url, timeout=30, allow_redirects=True)
if resp.status_code == 200:
well_known_files.append(file_url)
logger.info(f"Found .well-known file: {file_url}")
except requests.exceptions.RequestException:
logger.debug(f"Well-known file not found or network error: {filename}")
except Exception:
logger.exception(f"Unexpected error checking well-known file: {filename}")
except Exception:
logger.exception(f"Unexpected error checking .well-known directory for {base_url}")
return well_known_files
def _try_common_variations(self, base_url: str) -> dict[str, list[str]]:
"""
Try pattern variations for discovery targets.
Args:
base_url: Base URL to try variations for
Returns:
Dictionary with file types and discovered variation URLs
"""
discovered: dict[str, list[str]] = {
"sitemaps": [],
"llms_files": []
}
try:
# Common subdirectories to check
subdirs = ["public", "static", "assets", "docs", "doc", "api"]
# Try llms.txt variants in subdirectories
for subdir in subdirs:
for llms_file in self.DISCOVERY_TARGETS["llms_files"]:
try:
file_url = urljoin(base_url, f"/{subdir}/{llms_file}")
resp = requests.get(file_url, timeout=30, allow_redirects=True)
if resp.status_code == 200:
discovered["llms_files"].append(file_url)
logger.info(f"Found llms file variant: {file_url}")
except requests.exceptions.RequestException:
logger.debug(f"Variant not found: {subdir}/{llms_file}")
except Exception:
logger.exception(f"Error checking variant: {subdir}/{llms_file}")
# Try sitemap variants with different paths
sitemap_paths = [
"sitemaps/sitemap.xml",
"sitemap/sitemap.xml",
"xml/sitemap.xml",
"feed/sitemap.xml"
]
for sitemap_path in sitemap_paths:
try:
file_url = urljoin(base_url, f"/{sitemap_path}")
resp = requests.get(file_url, timeout=30, allow_redirects=True)
if resp.status_code == 200:
discovered["sitemaps"].append(file_url)
logger.info(f"Found sitemap variant: {file_url}")
except requests.exceptions.RequestException:
logger.debug(f"Sitemap variant not found: {sitemap_path}")
except Exception:
logger.exception(f"Error checking sitemap variant: {sitemap_path}")
except Exception:
logger.exception(f"Unexpected error trying common variations for {base_url}")
return discovered
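
A small usage sketch of the service above; the example.com URL is a placeholder, and the import path follows the test module further down.

from src.server.services.crawling.discovery_service import DiscoveryService

service = DiscoveryService()
best_file = service.discover_files("https://example.com")
if best_file:
    print(f"Best guidance file: {best_file}")  # e.g. https://example.com/llms.txt
else:
    print("No guidance file discovered; crawl the site directly")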

View File

@@ -6,8 +6,7 @@ Handles URL transformations and validations.
import hashlib
import re
from urllib.parse import urljoin, urlparse
from ....config.logfire_config import get_logger
@@ -33,8 +32,8 @@ class URLHandler:
except Exception as e:
logger.warning(f"Error checking if URL is sitemap: {e}")
return False
@staticmethod
def is_markdown(url: str) -> bool:
"""
Check if a URL points to a markdown file (.md, .mdx, .markdown).
@@ -274,9 +273,9 @@ class URLHandler:
# Fallback: use a hash of the error message + url to still get something unique
fallback = f"error_{redacted}_{str(e)}"
return hashlib.sha256(fallback.encode("utf-8")).hexdigest()[:16]
@staticmethod
def extract_markdown_links(content: str, base_url: str | None = None) -> list[str]:
"""
Extract markdown-style links from text content.
@@ -290,10 +289,10 @@ class URLHandler:
try:
if not content:
return []
# Ultimate URL pattern with comprehensive format support:
# 1) [text](url) - markdown links
# 2) <https://...> - autolinks
# 3) https://... - bare URLs with protocol
# 4) //example.com - protocol-relative URLs
# 5) www.example.com - scheme-less www URLs
@@ -348,7 +347,7 @@ class URLHandler:
# Only include HTTP/HTTPS URLs
if url.startswith(('http://', 'https://')):
urls.append(url)
# Remove duplicates while preserving order
seen = set()
unique_urls = []
@@ -356,16 +355,16 @@ class URLHandler:
if url not in seen:
seen.add(url)
unique_urls.append(url)
logger.info(f"Extracted {len(unique_urls)} unique links from content")
return unique_urls
except Exception as e:
logger.error(f"Error extracting markdown links: {e}", exc_info=True)
return []
@staticmethod
def is_link_collection_file(url: str, content: str | None = None) -> bool:
"""
Check if a URL/file appears to be a link collection file like llms.txt.
@@ -380,7 +379,7 @@ class URLHandler:
# Extract filename from URL
parsed = urlparse(url)
filename = parsed.path.split('/')[-1].lower()
# Check for specific link collection filenames
# Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links
link_collection_patterns = [
@@ -391,12 +390,12 @@ class URLHandler:
'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx',
'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown',
]
# Direct filename match
if filename in link_collection_patterns:
logger.info(f"Detected link collection file by filename: {filename}")
return True
# Pattern-based detection for variations, but exclude "full" variants
# Only match files that are likely link collections, not complete content files
if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):
@@ -407,7 +406,7 @@ class URLHandler:
if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns):
logger.info(f"Detected potential link collection file: {filename}")
return True
# Content-based detection if content is provided
if content:
# Never treat "full" variants as link collections to preserve single-page behavior
@@ -417,19 +416,19 @@ class URLHandler:
# Reuse extractor to avoid regex divergence and maintain consistency
extracted_links = URLHandler.extract_markdown_links(content, url)
total_links = len(extracted_links)
# Calculate link density (links per 100 characters)
content_length = len(content.strip())
if content_length > 0:
link_density = (total_links * 100) / content_length
# If more than 2% of content is links, likely a link collection
if link_density > 2.0 and total_links > 3:
logger.info(f"Detected link collection by content analysis: {total_links} links, density {link_density:.2f}%")
return True
return False
except Exception as e:
logger.warning(f"Error checking if file is link collection: {e}", exc_info=True)
return False
@@ -583,3 +582,92 @@ class URLHandler:
logger.warning(f"Error extracting display name for {url}: {e}, using URL")
# Fallback: return truncated URL
return url[:50] + "..." if len(url) > 50 else url
@staticmethod
def is_robots_txt(url: str) -> bool:
"""
Check if a URL is a robots.txt file with error handling.
Args:
url: URL to check
Returns:
True if URL is a robots.txt file, False otherwise
"""
try:
parsed = urlparse(url)
# Normalize to lowercase and ignore query/fragment
path = parsed.path.lower()
# Only detect robots.txt at root level
return path == '/robots.txt'
except Exception as e:
logger.warning(f"Error checking if URL is robots.txt: {e}", exc_info=True)
return False
@staticmethod
def is_llms_variant(url: str) -> bool:
"""
Check if a URL is a llms.txt/llms.md variant with error handling.
Args:
url: URL to check
Returns:
True if URL is a llms file variant, False otherwise
"""
try:
parsed = urlparse(url)
# Normalize to lowercase and ignore query/fragment
path = parsed.path.lower()
filename = path.split('/')[-1] if '/' in path else path
# Check for llms file variants
llms_variants = ['llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown']
return filename in llms_variants
except Exception as e:
logger.warning(f"Error checking if URL is llms variant: {e}", exc_info=True)
return False
@staticmethod
def is_well_known_file(url: str) -> bool:
"""
Check if a URL is a .well-known/* file with error handling.
Args:
url: URL to check
Returns:
True if URL is a .well-known file, False otherwise
"""
try:
parsed = urlparse(url)
# Normalize to lowercase and ignore query/fragment
path = parsed.path.lower()
# Only detect .well-known files at root level
return path.startswith('/.well-known/') and path.count('/.well-known/') == 1
except Exception as e:
logger.warning(f"Error checking if URL is well-known file: {e}", exc_info=True)
return False
@staticmethod
def get_base_url(url: str) -> str:
"""
Extract base domain URL for discovery with error handling.
Args:
url: URL to extract base from
Returns:
Base URL (scheme + netloc) or original URL if extraction fails
"""
try:
parsed = urlparse(url)
# Ensure we have scheme and netloc
if parsed.scheme and parsed.netloc:
return f"{parsed.scheme}://{parsed.netloc}"
else:
logger.warning(f"URL missing scheme or netloc: {url}")
return url
except Exception as e:
logger.warning(f"Error extracting base URL from {url}: {e}", exc_info=True)
return url
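
Illustrative calls to the new helpers, with expected results matching the tests further down (assuming URLHandler is imported as in those tests):

handler = URLHandler()
handler.is_robots_txt("https://example.com/robots.txt")                # True (root level only)
handler.is_llms_variant("https://example.com/docs/llms.md")            # True (any path depth)
handler.is_well_known_file("https://example.com/.well-known/ai.txt")   # True
handler.get_base_url("https://api.example.com/v1/users?page=2")        # "https://api.example.com"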

View File

@@ -15,7 +15,8 @@ class ProgressMapper:
"starting": (0, 1),
"initializing": (0, 1),
"analyzing": (1, 2), # URL analysis is very quick
"crawling": (2, 5), # Crawling pages is relatively fast
"discovery": (2, 3), # File discovery is quick
"crawling": (3, 5), # Crawling pages is relatively fast
"processing": (5, 8), # Content processing/chunking is quick
"source_creation": (8, 10), # DB operations are fast
"document_storage": (10, 30), # Embeddings + batch processing - significant but not longest

View File

@@ -0,0 +1,449 @@
"""Unit tests for DiscoveryService class."""
import pytest
from unittest.mock import patch, Mock
from src.server.services.crawling.discovery_service import DiscoveryService
class TestDiscoveryService:
"""Test suite for DiscoveryService class."""
@patch('requests.get')
def test_discover_files_basic(self, mock_get):
"""Test main discovery method returns single best file."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock robots.txt response (no sitemaps)
robots_response = Mock()
robots_response.status_code = 200
robots_response.text = "User-agent: *\nDisallow: /admin/"
# Mock file existence - llms-full.txt doesn't exist, but llms.txt does
def mock_get_side_effect(url, **kwargs):
response = Mock()
if url.endswith('robots.txt'):
return robots_response
elif url.endswith('llms-full.txt'):
response.status_code = 404 # Highest priority doesn't exist
elif url.endswith('llms.txt'):
response.status_code = 200 # Second priority exists
else:
response.status_code = 404
return response
mock_get.side_effect = mock_get_side_effect
result = service.discover_files(base_url)
# Should return single URL string (not dict, not list)
assert isinstance(result, str)
assert result == 'https://example.com/llms.txt'
@patch('requests.get')
def test_discover_files_no_files_found(self, mock_get):
"""Test discovery when no files are found."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock all HTTP requests to return 404
mock_response = Mock()
mock_response.status_code = 404
mock_get.return_value = mock_response
result = service.discover_files(base_url)
# Should return None when no files found
assert result is None
@patch('requests.get')
def test_discover_files_priority_order(self, mock_get):
"""Test that discovery follows the correct priority order."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock robots.txt response (no sitemaps declared)
robots_response = Mock()
robots_response.status_code = 200
robots_response.text = "User-agent: *\nDisallow: /admin/"
# Mock file existence - both sitemap.xml and llms.txt exist, but llms.txt has higher priority
def mock_get_side_effect(url, **kwargs):
response = Mock()
if url.endswith('robots.txt'):
return robots_response
elif url.endswith('llms.txt') or url.endswith('sitemap.xml'):
response.status_code = 200 # Both exist
else:
response.status_code = 404
return response
mock_get.side_effect = mock_get_side_effect
result = service.discover_files(base_url)
# Should return llms.txt since it has higher priority than sitemap.xml
assert result == 'https://example.com/llms.txt'
@patch('requests.get')
def test_discover_files_robots_sitemap_priority(self, mock_get):
"""Test that robots.txt sitemap declarations have highest priority."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock robots.txt response WITH sitemap declaration
robots_response = Mock()
robots_response.status_code = 200
robots_response.text = "User-agent: *\nSitemap: https://example.com/declared-sitemap.xml"
# Mock other files also exist
def mock_get_side_effect(url, **kwargs):
response = Mock()
if url.endswith('robots.txt'):
return robots_response
elif 'llms' in url or 'sitemap' in url:
response.status_code = 200
else:
response.status_code = 404
return response
mock_get.side_effect = mock_get_side_effect
result = service.discover_files(base_url)
# Should return the sitemap declared in robots.txt (highest priority)
assert result == 'https://example.com/declared-sitemap.xml'
@patch('requests.get')
def test_discover_best_sitemap_robots_priority(self, mock_get):
"""Test sitemap discovery prioritizes robots.txt declarations."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock robots.txt with sitemap
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = "Sitemap: https://example.com/robots-sitemap.xml"
mock_get.return_value = mock_response
result = service._discover_best_sitemap(base_url)
# Should return the sitemap from robots.txt (highest priority)
assert result == "https://example.com/robots-sitemap.xml"
@patch('requests.get')
def test_discover_best_llms_file_priority_order(self, mock_get):
"""Test llms file discovery follows priority order."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock HTTP responses - only llms.txt exists, not llms-full.txt
def mock_get_side_effect(url, **kwargs):
response = Mock()
if url.endswith('llms-full.txt'):
response.status_code = 404 # Higher priority file doesn't exist
elif url.endswith('llms.txt'):
response.status_code = 200 # Standard file exists
else:
response.status_code = 404
return response
mock_get.side_effect = mock_get_side_effect
result = service._discover_best_llms_file(base_url)
# Should find llms.txt since llms-full.txt doesn't exist
assert result == "https://example.com/llms.txt"
@patch('requests.get')
def test_discover_best_llms_file_subdirectory_fallback(self, mock_get):
"""Test llms file discovery falls back to subdirectories."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock HTTP responses - no root files, but static/llms.txt exists
def mock_get_side_effect(url, **kwargs):
response = Mock()
if '/static/llms.txt' in url:
response.status_code = 200 # Found in subdirectory
else:
response.status_code = 404
return response
mock_get.side_effect = mock_get_side_effect
result = service._discover_best_llms_file(base_url)
# Should find the file in static subdirectory
assert result == "https://example.com/static/llms.txt"
@patch('requests.get')
def test_check_url_exists(self, mock_get):
"""Test URL existence checking."""
service = DiscoveryService()
# Test successful response
mock_response = Mock()
mock_response.status_code = 200
mock_get.return_value = mock_response
assert service._check_url_exists("https://example.com/exists") is True
# Test 404 response
mock_response.status_code = 404
assert service._check_url_exists("https://example.com/not-found") is False
# Test network error
mock_get.side_effect = Exception("Network error")
assert service._check_url_exists("https://example.com/error") is False
@patch('requests.get')
def test_parse_robots_txt_with_sitemap(self, mock_get):
"""Test robots.txt parsing with sitemap directives."""
service = DiscoveryService()
# Mock successful robots.txt response
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = """User-agent: *
Disallow: /admin/
Sitemap: https://example.com/sitemap.xml
Sitemap: https://example.com/sitemap-news.xml"""
mock_get.return_value = mock_response
result = service._parse_robots_txt("https://example.com")
assert len(result) == 2
assert "https://example.com/sitemap.xml" in result
assert "https://example.com/sitemap-news.xml" in result
mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30)
@patch('requests.get')
def test_parse_robots_txt_no_sitemap(self, mock_get):
"""Test robots.txt parsing without sitemap directives."""
service = DiscoveryService()
# Mock robots.txt without sitemaps
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = """User-agent: *
Disallow: /admin/
Allow: /public/"""
mock_get.return_value = mock_response
result = service._parse_robots_txt("https://example.com")
assert len(result) == 0
mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30)
@patch('requests.get')
def test_parse_robots_txt_not_found(self, mock_get):
"""Test robots.txt parsing when file is not found."""
service = DiscoveryService()
# Mock 404 response
mock_response = Mock()
mock_response.status_code = 404
mock_get.return_value = mock_response
result = service._parse_robots_txt("https://example.com")
assert len(result) == 0
mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30)
@patch('requests.get')
def test_check_standard_patterns(self, mock_get):
"""Test standard file pattern checking."""
service = DiscoveryService()
# Mock responses for different files
def mock_response_side_effect(url, **kwargs):
mock_response = Mock()
if 'llms.txt' in url:
mock_response.status_code = 200
elif 'sitemap.xml' in url:
mock_response.status_code = 200
else:
mock_response.status_code = 404
return mock_response
mock_get.side_effect = mock_response_side_effect
result = service._check_standard_patterns("https://example.com")
assert 'sitemaps' in result
assert 'llms_files' in result
assert 'robots_files' in result
# Should find the files that returned 200
assert any('llms.txt' in url for url in result['llms_files'])
assert any('sitemap.xml' in url for url in result['sitemaps'])
@patch('requests.get')
def test_parse_html_meta_tags(self, mock_get):
"""Test HTML meta tag parsing for sitemaps."""
service = DiscoveryService()
# Mock HTML with sitemap references
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = """
<html>
<head>
<link rel="sitemap" href="/sitemap.xml">
<meta name="sitemap" content="https://example.com/sitemap-meta.xml">
</head>
<body>Content here</body>
</html>
"""
mock_get.return_value = mock_response
result = service._parse_html_meta_tags("https://example.com")
# Should find sitemaps from both link and meta tags
assert len(result) >= 1
assert any('sitemap' in url.lower() for url in result)
mock_get.assert_called_once_with("https://example.com", timeout=30)
@patch('requests.get')
def test_parse_html_meta_tags_not_found(self, mock_get):
"""Test HTML meta tag parsing when page not found."""
service = DiscoveryService()
# Mock 404 response
mock_response = Mock()
mock_response.status_code = 404
mock_get.return_value = mock_response
result = service._parse_html_meta_tags("https://example.com")
assert len(result) == 0
mock_get.assert_called_once_with("https://example.com", timeout=30)
@patch('requests.get')
def test_check_well_known_directory(self, mock_get):
"""Test .well-known directory file checking."""
service = DiscoveryService()
# Mock responses - some files exist, some don't
def mock_response_side_effect(url, **kwargs):
mock_response = Mock()
if 'ai.txt' in url:
mock_response.status_code = 200
else:
mock_response.status_code = 404
return mock_response
mock_get.side_effect = mock_response_side_effect
result = service._check_well_known_directory("https://example.com")
# Should find the ai.txt file
assert len(result) >= 1
assert any('ai.txt' in url for url in result)
@patch('requests.get')
def test_try_common_variations(self, mock_get):
"""Test pattern variations for discovery targets."""
service = DiscoveryService()
# Mock responses for variations
def mock_response_side_effect(url, **kwargs):
mock_response = Mock()
if 'docs/llms.txt' in url or 'sitemaps/sitemap.xml' in url:
mock_response.status_code = 200
else:
mock_response.status_code = 404
return mock_response
mock_get.side_effect = mock_response_side_effect
result = service._try_common_variations("https://example.com")
assert 'sitemaps' in result
assert 'llms_files' in result
# Should find at least one variation
assert len(result['llms_files']) >= 1 or len(result['sitemaps']) >= 1
@patch('requests.get')
def test_network_error_handling(self, mock_get):
"""Test error scenarios with network failures."""
service = DiscoveryService()
# Mock network error
mock_get.side_effect = Exception("Network error")
# Should not raise exception, but return empty results
result = service._parse_robots_txt("https://example.com")
assert result == []
result = service._check_standard_patterns("https://example.com")
assert isinstance(result, dict)
result = service._parse_html_meta_tags("https://example.com")
assert result == []
result = service._check_well_known_directory("https://example.com")
assert result == []
result = service._try_common_variations("https://example.com")
assert isinstance(result, dict)
def test_discover_files_with_exceptions(self):
"""Test main discovery method handles exceptions gracefully."""
service = DiscoveryService()
# Mock methods to raise exceptions
with patch.object(service, '_parse_robots_txt', side_effect=Exception("Test error")):
with patch.object(service, '_check_standard_patterns', side_effect=Exception("Test error")):
with patch.object(service, '_parse_html_meta_tags', side_effect=Exception("Test error")):
with patch.object(service, '_check_well_known_directory', side_effect=Exception("Test error")):
with patch.object(service, '_try_common_variations', side_effect=Exception("Test error")):
result = service.discover_files("https://example.com")
# Should return None when every discovery helper fails, matching the str | None return type
assert result is None
@patch('requests.get')
def test_robots_txt_with_malformed_content(self, mock_get):
"""Test robots.txt parsing with malformed content."""
service = DiscoveryService()
# Mock malformed robots.txt content
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = """User-agent: *
Disallow: /admin/
Sitemap:
Sitemap: not-a-valid-url
Sitemap: https://example.com/valid-sitemap.xml"""
mock_get.return_value = mock_response
result = service._parse_robots_txt("https://example.com")
# Should only include the valid sitemap URL
assert len(result) == 1
assert "https://example.com/valid-sitemap.xml" in result
def test_discovery_targets_constant(self):
"""Test that discovery targets constant is properly defined."""
service = DiscoveryService()
assert hasattr(service, 'DISCOVERY_TARGETS')
targets = service.DISCOVERY_TARGETS
# Verify required target types exist
assert 'llms_files' in targets
assert 'sitemap_files' in targets
assert 'robots_files' in targets
assert 'well_known_files' in targets
# Verify they contain expected files
assert 'llms.txt' in targets['llms_files']
assert 'sitemap.xml' in targets['sitemap_files']
assert 'robots.txt' in targets['robots_files']
assert '.well-known/ai.txt' in targets['well_known_files']

View File

@@ -122,4 +122,122 @@ class TestURLHandler:
# Should not transform non-GitHub URLs
other = "https://example.com/file"
assert handler.transform_github_url(other) == other
def test_is_robots_txt(self):
"""Test robots.txt detection."""
handler = URLHandler()
# Standard robots.txt URLs
assert handler.is_robots_txt("https://example.com/robots.txt") is True
assert handler.is_robots_txt("http://example.com/robots.txt") is True
assert handler.is_robots_txt("https://sub.example.com/robots.txt") is True
# Case sensitivity
assert handler.is_robots_txt("https://example.com/ROBOTS.TXT") is True
assert handler.is_robots_txt("https://example.com/Robots.Txt") is True
# With query parameters (should still be detected)
assert handler.is_robots_txt("https://example.com/robots.txt?v=1") is True
assert handler.is_robots_txt("https://example.com/robots.txt#section") is True
# Not robots.txt files
assert handler.is_robots_txt("https://example.com/robots") is False
assert handler.is_robots_txt("https://example.com/robots.html") is False
assert handler.is_robots_txt("https://example.com/some-robots.txt") is False
assert handler.is_robots_txt("https://example.com/path/robots.txt") is False
assert handler.is_robots_txt("https://example.com/") is False
# Edge case: malformed URL should not crash
assert handler.is_robots_txt("not-a-url") is False
def test_is_llms_variant(self):
"""Test llms file variant detection."""
handler = URLHandler()
# All llms variants
assert handler.is_llms_variant("https://example.com/llms.txt") is True
assert handler.is_llms_variant("https://example.com/llms.md") is True
assert handler.is_llms_variant("https://example.com/llms.mdx") is True
assert handler.is_llms_variant("https://example.com/llms.markdown") is True
# Case sensitivity
assert handler.is_llms_variant("https://example.com/LLMS.TXT") is True
assert handler.is_llms_variant("https://example.com/Llms.Md") is True
# With paths (should still detect)
assert handler.is_llms_variant("https://example.com/docs/llms.txt") is True
assert handler.is_llms_variant("https://example.com/public/llms.md") is True
# With query parameters
assert handler.is_llms_variant("https://example.com/llms.txt?version=1") is True
assert handler.is_llms_variant("https://example.com/llms.md#section") is True
# Not llms files
assert handler.is_llms_variant("https://example.com/llms") is False
assert handler.is_llms_variant("https://example.com/llms.html") is False
assert handler.is_llms_variant("https://example.com/my-llms.txt") is False
assert handler.is_llms_variant("https://example.com/llms-guide.txt") is False
assert handler.is_llms_variant("https://example.com/readme.txt") is False
# Edge case: malformed URL should not crash
assert handler.is_llms_variant("not-a-url") is False
def test_is_well_known_file(self):
"""Test .well-known file detection."""
handler = URLHandler()
# Standard .well-known files
assert handler.is_well_known_file("https://example.com/.well-known/ai.txt") is True
assert handler.is_well_known_file("https://example.com/.well-known/security.txt") is True
assert handler.is_well_known_file("https://example.com/.well-known/change-password") is True
# Case sensitivity (path should be case sensitive)
assert handler.is_well_known_file("https://example.com/.WELL-KNOWN/ai.txt") is True
assert handler.is_well_known_file("https://example.com/.Well-Known/ai.txt") is True
# With query parameters
assert handler.is_well_known_file("https://example.com/.well-known/ai.txt?v=1") is True
assert handler.is_well_known_file("https://example.com/.well-known/ai.txt#top") is True
# Not .well-known files
assert handler.is_well_known_file("https://example.com/well-known/ai.txt") is False
assert handler.is_well_known_file("https://example.com/.wellknown/ai.txt") is False
assert handler.is_well_known_file("https://example.com/docs/.well-known/ai.txt") is False
assert handler.is_well_known_file("https://example.com/ai.txt") is False
assert handler.is_well_known_file("https://example.com/") is False
# Edge case: malformed URL should not crash
assert handler.is_well_known_file("not-a-url") is False
def test_get_base_url(self):
"""Test base URL extraction."""
handler = URLHandler()
# Standard URLs
assert handler.get_base_url("https://example.com") == "https://example.com"
assert handler.get_base_url("https://example.com/") == "https://example.com"
assert handler.get_base_url("https://example.com/path/to/page") == "https://example.com"
assert handler.get_base_url("https://example.com/path/to/page?query=1") == "https://example.com"
assert handler.get_base_url("https://example.com/path/to/page#fragment") == "https://example.com"
# HTTP vs HTTPS
assert handler.get_base_url("http://example.com/path") == "http://example.com"
assert handler.get_base_url("https://example.com/path") == "https://example.com"
# Subdomains and ports
assert handler.get_base_url("https://api.example.com/v1/users") == "https://api.example.com"
assert handler.get_base_url("https://example.com:8080/api") == "https://example.com:8080"
assert handler.get_base_url("http://localhost:3000/dev") == "http://localhost:3000"
# Complex cases
assert handler.get_base_url("https://user:pass@example.com/path") == "https://user:pass@example.com"
# Edge cases - malformed URLs should return original
assert handler.get_base_url("not-a-url") == "not-a-url"
assert handler.get_base_url("") == ""
assert handler.get_base_url("ftp://example.com/file") == "ftp://example.com"
# Missing scheme or netloc
assert handler.get_base_url("//example.com/path") == "//example.com/path" # Should return original
assert handler.get_base_url("/path/to/resource") == "/path/to/resource" # Should return original