Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-24 02:39:17 -05:00)
Implement priority-based automatic discovery of llms.txt and sitemap.xml files
- Add DiscoveryService with single-file priority selection
  - Priority: llms-full.txt > llms.txt > llms.md > llms.mdx > sitemap.xml > robots.txt
  - All of these files contain similar AI/crawling guidance, so only the best one is needed
  - robots.txt sitemap declarations have the highest priority
  - Fallback to subdirectories for llms files
- Enhance URLHandler with discovery helper methods
  - Add is_robots_txt, is_llms_variant, is_well_known_file, get_base_url methods
  - Follow existing patterns with proper error handling
- Integrate discovery into CrawlingService orchestration
  - When discovery finds a file: crawl ONLY the discovered file (not the main URL)
  - When nothing is discovered: crawl the main URL normally
  - Fixes the issue where both the main URL and the discovered file were crawled
- Add discovery stage to progress mapping
  - New "discovery" stage in the progress flow
  - Clear progress messages for discovered files
- Comprehensive test coverage
  - Tests for priority-based selection logic
  - Tests for robots.txt priority and fallback behavior
  - Updated existing tests for new return formats

Makes crawling more efficient by selecting the single best guidance file instead of crawling redundant content from multiple similar files.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
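The selection logic described above boils down to "return the first existing file from a fixed priority list." A minimal, self-contained sketch of that idea (not the project code; the PRIORITY list and exists() helper below are illustrative assumptions):

from urllib.parse import urljoin

import requests

# Illustrative priority list - the real one in the diff also includes .well-known/ variants.
PRIORITY = ["llms-full.txt", "llms.txt", "llms.md", "llms.mdx", "sitemap.xml", "robots.txt"]

def exists(url: str) -> bool:
    # Treat an HTTP 200 as "the file is present"; anything else means "skip it".
    try:
        return requests.get(url, timeout=5, allow_redirects=True).status_code == 200
    except requests.RequestException:
        return False

def pick_best_guidance_file(base_url: str) -> str | None:
    # Walk the priority list and stop at the first hit, so only one file gets crawled.
    for name in PRIORITY:
        candidate = urljoin(base_url, f"/{name}")
        if exists(candidate):
            return candidate
    return None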
@@ -17,6 +17,7 @@ from ...utils.progress.progress_tracker import ProgressTracker

# Import strategies
# Import operations
from .discovery_service import DiscoveryService
from .document_storage_operations import DocumentStorageOperations
from .helpers.site_config import SiteConfig

@@ -83,6 +84,7 @@ class CrawlingService:

        # Initialize operations
        self.doc_storage_ops = DocumentStorageOperations(self.supabase_client)
        self.discovery_service = DiscoveryService()

        # Track progress state across all stages to prevent UI resets
        self.progress_state = {"progressId": self.progress_id} if self.progress_id else {}

@@ -132,7 +134,7 @@ class CrawlingService:
            f"total_pages={kwargs.get('total_pages', 'N/A')} | processed_pages={kwargs.get('processed_pages', 'N/A')} | "
            f"kwargs_keys={list(kwargs.keys())}"
        )

        # Update progress via tracker (stores in memory for HTTP polling)
        await self.progress_tracker.update(
            status=base_status,

@@ -332,16 +334,68 @@ class CrawlingService:
        # Check for cancellation before proceeding
        self._check_cancellation()

        # Analyzing stage - report initial page count (at least 1)
        await update_mapped_progress(
            "analyzing", 50, f"Analyzing URL type for {url}",
            total_pages=1, # We know we have at least the start URL
            processed_pages=0
        )

        # Discovery phase - find the single best related file
        discovered_urls = []
        if request.get("auto_discovery", True): # Default enabled
            await update_mapped_progress(
                "discovery", 25, f"Discovering best related file for {url}", current_url=url
            )
            try:
                discovered_file = self.discovery_service.discover_files(url)

                # Add the single best discovered file to crawl list
                if discovered_file:
                    safe_logfire_info(f"Discovery found file: {discovered_file}")
                    # Filter through is_binary_file() check like existing code
                    if not self.url_handler.is_binary_file(discovered_file):
                        discovered_urls.append(discovered_file)
                        safe_logfire_info(f"Adding discovered file to crawl: {discovered_file}")
                    else:
                        safe_logfire_info(f"Skipping binary file: {discovered_file}")
                else:
                    safe_logfire_info(f"Discovery found no files for {url}")

                file_count = len(discovered_urls)
                safe_logfire_info(f"Discovery selected {file_count} best file to crawl")

                await update_mapped_progress(
                    "discovery", 100, f"Discovery completed: selected {file_count} best file", current_url=url
                )

            except Exception as e:
                safe_logfire_error(f"Discovery phase failed: {e}")
                # Continue with regular crawl even if discovery fails
                await update_mapped_progress(
                    "discovery", 100, "Discovery phase failed, continuing with regular crawl", current_url=url
                )

        # Analyzing stage - determine what to crawl
        if discovered_urls:
            # Discovery found a file - crawl ONLY the discovered file, not the main URL
            total_urls_to_crawl = len(discovered_urls)
            await update_mapped_progress(
                "analyzing", 50, f"Analyzing discovered file: {discovered_urls[0]}",
                total_pages=total_urls_to_crawl,
                processed_pages=0
            )

            # Crawl only the discovered file
            safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_urls[0]}")
            crawl_results, crawl_type = await self._crawl_by_url_type(discovered_urls[0], request)

        else:
            # No discovery - crawl the main URL normally
            total_urls_to_crawl = 1
            await update_mapped_progress(
                "analyzing", 50, f"Analyzing URL type for {url}",
                total_pages=total_urls_to_crawl,
                processed_pages=0
            )

            # Crawl the main URL
            safe_logfire_info(f"No discovery file found, crawling main URL: {url}")
            crawl_results, crawl_type = await self._crawl_by_url_type(url, request)

        # Detect URL type and perform crawl
        crawl_results, crawl_type = await self._crawl_by_url_type(url, request)

        # Update progress tracker with crawl type
        if self.progress_tracker and crawl_type:
            await self.progress_tracker.update(

@@ -415,7 +469,7 @@ class CrawlingService:
        if request.get("extract_code_examples", True) and actual_chunks_stored > 0:
            # Check for cancellation before starting code extraction
            self._check_cancellation()

            await update_mapped_progress("code_extraction", 0, "Starting code extraction...")

            # Create progress callback for code extraction

@@ -424,7 +478,7 @@ class CrawlingService:
                # Use ProgressMapper to ensure progress never goes backwards
                raw_progress = data.get("progress", data.get("percentage", 0))
                mapped_progress = self.progress_mapper.map_progress("code_extraction", raw_progress)

                # Update progress state via tracker
                await self.progress_tracker.update(
                    status=data.get("status", "code_extraction"),

@@ -445,7 +499,7 @@ class CrawlingService:

            # Check for cancellation after code extraction
            self._check_cancellation()

            # Send heartbeat after code extraction
            await send_heartbeat_if_needed()

@@ -571,7 +625,7 @@ class CrawlingService:
        crawl_type = None

        if self.url_handler.is_txt(url) or self.url_handler.is_markdown(url):
            # Handle text files
            crawl_type = "llms-txt" if "llms" in url.lower() else "text_file"
            if self.progress_tracker:
                await self.progress_tracker.update(

@@ -593,7 +647,7 @@ class CrawlingService:
            if self.url_handler.is_link_collection_file(url, content):
                # Extract links from the content
                extracted_links = self.url_handler.extract_markdown_links(content, url)

                # Filter out self-referential links to avoid redundant crawling
                if extracted_links:
                    original_count = len(extracted_links)

@@ -604,7 +658,7 @@ class CrawlingService:
                    self_filtered_count = original_count - len(extracted_links)
                    if self_filtered_count > 0:
                        logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")

                # Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
                if extracted_links:
                    original_count = len(extracted_links)

@@ -612,7 +666,7 @@ class CrawlingService:
                    filtered_count = original_count - len(extracted_links)
                    if filtered_count > 0:
                        logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")

                if extracted_links:
                    # Crawl the extracted links using batch crawling
                    logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")

@@ -623,11 +677,11 @@ class CrawlingService:
                        start_progress=10,
                        end_progress=20,
                    )

                    # Combine original text file results with batch results
                    crawl_results.extend(batch_results)
                    crawl_type = "link_collection_with_crawled_links"

                    logger.info(f"Link collection crawling completed: {len(crawl_results)} total results (1 text file + {len(batch_results)} extracted links)")
                else:
                    logger.info(f"No valid links found in link collection file: {url}")
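Condensed, hypothetical sketch of the control flow the hunk above implements (progress updates and the binary-file filter are omitted; the plain-dict request and the standalone function signature are assumptions, not the real CrawlingService method):

async def choose_and_crawl(url: str, request: dict, discovery_service, crawl_by_url_type):
    discovered = None
    if request.get("auto_discovery", True):   # discovery is enabled by default
        discovered = discovery_service.discover_files(url)
    target = discovered or url                # a discovered file replaces the main URL
    return await crawl_by_url_type(target, request)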
python/src/server/services/crawling/discovery_service.py (new file, 441 lines)
@@ -0,0 +1,441 @@
"""
Discovery Service for Automatic File Detection

Handles automatic discovery and parsing of llms.txt, sitemap.xml, and related files
to enhance crawling capabilities with priority-based discovery methods.
"""

from urllib.parse import urljoin

import requests

from ...config.logfire_config import get_logger

logger = get_logger(__name__)


class DiscoveryService:
    """Service for discovering related files automatically during crawls."""

    # Global priority order - select ONE best file from all categories
    # All these files contain similar AI/crawling guidance content
    DISCOVERY_PRIORITY = [
        # LLMs files (highest priority - most comprehensive AI guidance)
        "llms-full.txt",
        "llms.txt",
        "llms.md",
        "llms.mdx",
        "llms.markdown",

        # Sitemap files (structural crawling guidance)
        "sitemap_index.xml",
        "sitemap-index.xml",
        "sitemap.xml",

        # Robots file (basic crawling rules)
        "robots.txt",

        # Well-known variants (alternative locations)
        ".well-known/ai.txt",
        ".well-known/llms.txt",
        ".well-known/sitemap.xml"
    ]

    def discover_files(self, base_url: str) -> str | None:
        """
        Main discovery orchestrator - selects ONE best file across all categories.
        All files contain similar AI/crawling guidance, so we only need the best one.

        Args:
            base_url: Base URL to discover files for

        Returns:
            Single best URL found, or None if no files discovered
        """
        try:
            logger.info(f"Starting single-file discovery for {base_url}")

            # First check robots.txt for explicit sitemap declarations (special case)
            robots_sitemaps = self._parse_robots_txt(base_url)
            if robots_sitemaps:
                best_file = robots_sitemaps[0]  # Use first sitemap from robots.txt
                logger.info(f"Discovery found best file from robots.txt: {best_file}")
                return best_file

            # Check files in global priority order
            for filename in self.DISCOVERY_PRIORITY:
                # Try root location first
                file_url = urljoin(base_url, f"/{filename}")
                if self._check_url_exists(file_url):
                    logger.info(f"Discovery found best file: {file_url}")
                    return file_url

                # For llms files, also try common subdirectories
                if filename.startswith('llms'):
                    for subdir in ["static", "public", "docs", "assets", "doc", "api"]:
                        subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
                        if self._check_url_exists(subdir_url):
                            logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                            return subdir_url

                # For sitemap files, also try common subdirectories
                if filename.endswith('.xml') and not filename.startswith('.well-known'):
                    for subdir in ["sitemaps", "sitemap", "xml", "feed"]:
                        subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
                        if self._check_url_exists(subdir_url):
                            logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                            return subdir_url

            # Check HTML meta tags for sitemap references as final fallback
            html_sitemaps = self._parse_html_meta_tags(base_url)
            if html_sitemaps:
                best_file = html_sitemaps[0]
                logger.info(f"Discovery found best file from HTML meta tags: {best_file}")
                return best_file

            logger.info(f"Discovery completed for {base_url}: no files found")
            return None

        except Exception:
            logger.exception(f"Unexpected error during discovery for {base_url}")
            return None

    def _discover_best_sitemap(self, base_url: str) -> str | None:
        """
        Discover the best available sitemap using priority-based selection.

        Priority order:
        1. Sitemaps from robots.txt (highest priority - explicitly declared)
        2. Standard locations (sitemap_index.xml > sitemap-index.xml > sitemap.xml)
        3. Common subdirectory variations
        4. HTML meta tag references
        5. .well-known directory
        """
        try:
            # Priority 1: Check robots.txt for sitemap declarations
            robots_sitemaps = self._parse_robots_txt(base_url)
            if robots_sitemaps:
                return robots_sitemaps[0]  # Use first sitemap from robots.txt

            # Priority 2: Check standard locations in priority order
            for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
                sitemap_url = urljoin(base_url, f"/{filename}")
                if self._check_url_exists(sitemap_url):
                    return sitemap_url

            # Priority 3: Check common subdirectory variations
            subdirs = ["sitemaps", "sitemap", "xml", "feed"]
            for subdir in subdirs:
                for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
                    sitemap_url = urljoin(base_url, f"/{subdir}/{filename}")
                    if self._check_url_exists(sitemap_url):
                        return sitemap_url

            # Priority 4: Check HTML meta tag references
            html_sitemaps = self._parse_html_meta_tags(base_url)
            if html_sitemaps:
                return html_sitemaps[0]  # Use first sitemap from HTML

            # Priority 5: Check .well-known directory
            well_known_sitemap = urljoin(base_url, "/.well-known/sitemap.xml")
            if self._check_url_exists(well_known_sitemap):
                return well_known_sitemap

        except Exception:
            logger.exception(f"Error discovering best sitemap for {base_url}")

        return None

    def _discover_best_llms_file(self, base_url: str) -> str | None:
        """
        Discover the best available llms file using priority-based selection.

        Priority order:
        1. Standard locations (llms-full.txt > llms.txt > llms.md > llms.mdx > llms.markdown)
        2. Common subdirectory variations (static, public, docs, assets)
        3. .well-known directory variants
        """
        try:
            # Priority 1: Check standard root locations in priority order
            for filename in self.DISCOVERY_TARGETS["llms_files"]:
                llms_url = urljoin(base_url, f"/{filename}")
                if self._check_url_exists(llms_url):
                    return llms_url

            # Priority 2: Check common subdirectory variations
            subdirs = ["static", "public", "docs", "assets", "doc", "api"]
            for subdir in subdirs:
                for filename in self.DISCOVERY_TARGETS["llms_files"]:
                    llms_url = urljoin(base_url, f"/{subdir}/{filename}")
                    if self._check_url_exists(llms_url):
                        return llms_url

            # Priority 3: Check .well-known directory variants
            for well_known_file in [".well-known/ai.txt", ".well-known/llms.txt"]:
                well_known_url = urljoin(base_url, f"/{well_known_file}")
                if self._check_url_exists(well_known_url):
                    return well_known_url

        except Exception:
            logger.exception(f"Error discovering best llms file for {base_url}")

        return None

    def _discover_robots_file(self, base_url: str) -> str | None:
        """
        Discover robots.txt file (always single file at root).
        """
        try:
            robots_url = urljoin(base_url, "/robots.txt")
            if self._check_url_exists(robots_url):
                return robots_url
        except Exception:
            logger.exception(f"Error discovering robots file for {base_url}")

        return None

    def _check_url_exists(self, url: str) -> bool:
        """
        Check if a URL exists and returns a successful response.
        """
        try:
            resp = requests.get(url, timeout=5, allow_redirects=True)
            success = resp.status_code == 200
            logger.debug(f"URL check: {url} -> {resp.status_code} ({'exists' if success else 'not found'})")
            return success
        except Exception as e:
            logger.debug(f"URL check failed: {url} -> {e}")
            return False

    def _parse_robots_txt(self, base_url: str) -> list[str]:
        """
        Extract sitemap URLs from robots.txt.

        Args:
            base_url: Base URL to check robots.txt for

        Returns:
            List of sitemap URLs found in robots.txt
        """
        sitemaps: list[str] = []

        try:
            robots_url = urljoin(base_url, "/robots.txt")
            logger.info(f"Checking robots.txt at {robots_url}")

            resp = requests.get(robots_url, timeout=30)

            if resp.status_code != 200:
                logger.info(f"No robots.txt found: HTTP {resp.status_code}")
                return sitemaps

            # Parse robots.txt content for sitemap directives
            for line in resp.text.splitlines():
                line = line.strip().lower()
                if line.startswith("sitemap:"):
                    sitemap_url = line.split(":", 1)[1].strip()
                    # Validate URL format before adding
                    if sitemap_url and (sitemap_url.startswith('http://') or sitemap_url.startswith('https://')):
                        sitemaps.append(sitemap_url)
                        logger.info(f"Found sitemap in robots.txt: {sitemap_url}")

        except requests.exceptions.RequestException:
            logger.exception(f"Network error fetching robots.txt from {base_url}")
        except Exception:
            logger.exception(f"Unexpected error parsing robots.txt from {base_url}")

        return sitemaps

    def _check_standard_patterns(self, base_url: str) -> dict[str, list[str]]:
        """
        Check common file locations for discovery targets.

        Args:
            base_url: Base URL to check standard locations for

        Returns:
            Dictionary with file types and discovered URLs
        """
        discovered: dict[str, list[str]] = {
            "sitemaps": [],
            "llms_files": [],
            "robots_files": []
        }

        try:
            # Check all discovery targets at standard locations
            all_targets = []
            for target_type, files in self.DISCOVERY_TARGETS.items():
                if target_type != "well_known_files":  # Skip well-known, handled separately
                    for filename in files:
                        all_targets.append((target_type, filename))

            for target_type, filename in all_targets:
                try:
                    file_url = urljoin(base_url, f"/{filename}")
                    resp = requests.get(file_url, timeout=30, allow_redirects=True)

                    if resp.status_code == 200:
                        # Map target type to discovery category
                        if target_type == "sitemap_files":
                            discovered["sitemaps"].append(file_url)
                        elif target_type == "llms_files":
                            discovered["llms_files"].append(file_url)
                        elif target_type == "robots_files":
                            discovered["robots_files"].append(file_url)

                        logger.info(f"Found {target_type} file: {file_url}")

                except requests.exceptions.RequestException:
                    logger.debug(f"File not found or network error: {filename}")
                except Exception:
                    logger.exception(f"Unexpected error checking {filename}")

        except Exception:
            logger.exception(f"Unexpected error in standard pattern checking for {base_url}")

        return discovered

    def _parse_html_meta_tags(self, base_url: str) -> list[str]:
        """
        Extract sitemap references from HTML meta tags.

        Args:
            base_url: Base URL to check HTML for meta tags

        Returns:
            List of sitemap URLs found in HTML meta tags
        """
        sitemaps: list[str] = []

        try:
            logger.info(f"Checking HTML meta tags for sitemaps at {base_url}")
            resp = requests.get(base_url, timeout=30)

            if resp.status_code != 200:
                logger.debug(f"Could not fetch HTML for meta tag parsing: HTTP {resp.status_code}")
                return sitemaps

            content = resp.text.lower()

            # Look for sitemap meta tags or link elements
            import re

            # Check for <link rel="sitemap" href="...">
            sitemap_link_pattern = r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']'
            matches = re.findall(sitemap_link_pattern, content)

            for match in matches:
                sitemap_url = urljoin(base_url, match)
                sitemaps.append(sitemap_url)
                logger.info(f"Found sitemap in HTML link tag: {sitemap_url}")

            # Check for <meta name="sitemap" content="...">
            sitemap_meta_pattern = r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']'
            matches = re.findall(sitemap_meta_pattern, content)

            for match in matches:
                sitemap_url = urljoin(base_url, match)
                sitemaps.append(sitemap_url)
                logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}")

        except requests.exceptions.RequestException:
            logger.exception(f"Network error fetching HTML from {base_url}")
        except Exception:
            logger.exception(f"Unexpected error parsing HTML meta tags from {base_url}")

        return sitemaps

    def _check_well_known_directory(self, base_url: str) -> list[str]:
        """
        Check .well-known/* files for discovery targets.

        Args:
            base_url: Base URL to check .well-known directory for

        Returns:
            List of URLs found in .well-known directory
        """
        well_known_files: list[str] = []

        try:
            for filename in self.DISCOVERY_TARGETS["well_known_files"]:
                try:
                    file_url = urljoin(base_url, f"/{filename}")
                    resp = requests.get(file_url, timeout=30, allow_redirects=True)

                    if resp.status_code == 200:
                        well_known_files.append(file_url)
                        logger.info(f"Found .well-known file: {file_url}")

                except requests.exceptions.RequestException:
                    logger.debug(f"Well-known file not found or network error: {filename}")
                except Exception:
                    logger.exception(f"Unexpected error checking well-known file: {filename}")

        except Exception:
            logger.exception(f"Unexpected error checking .well-known directory for {base_url}")

        return well_known_files

    def _try_common_variations(self, base_url: str) -> dict[str, list[str]]:
        """
        Try pattern variations for discovery targets.

        Args:
            base_url: Base URL to try variations for

        Returns:
            Dictionary with file types and discovered variation URLs
        """
        discovered: dict[str, list[str]] = {
            "sitemaps": [],
            "llms_files": []
        }

        try:
            # Common subdirectories to check
            subdirs = ["public", "static", "assets", "docs", "doc", "api"]

            # Try llms.txt variants in subdirectories
            for subdir in subdirs:
                for llms_file in self.DISCOVERY_TARGETS["llms_files"]:
                    try:
                        file_url = urljoin(base_url, f"/{subdir}/{llms_file}")
                        resp = requests.get(file_url, timeout=30, allow_redirects=True)

                        if resp.status_code == 200:
                            discovered["llms_files"].append(file_url)
                            logger.info(f"Found llms file variant: {file_url}")

                    except requests.exceptions.RequestException:
                        logger.debug(f"Variant not found: {subdir}/{llms_file}")
                    except Exception:
                        logger.exception(f"Error checking variant: {subdir}/{llms_file}")

            # Try sitemap variants with different paths
            sitemap_paths = [
                "sitemaps/sitemap.xml",
                "sitemap/sitemap.xml",
                "xml/sitemap.xml",
                "feed/sitemap.xml"
            ]

            for sitemap_path in sitemap_paths:
                try:
                    file_url = urljoin(base_url, f"/{sitemap_path}")
                    resp = requests.get(file_url, timeout=30, allow_redirects=True)

                    if resp.status_code == 200:
                        discovered["sitemaps"].append(file_url)
                        logger.info(f"Found sitemap variant: {file_url}")

                except requests.exceptions.RequestException:
                    logger.debug(f"Sitemap variant not found: {sitemap_path}")
                except Exception:
                    logger.exception(f"Error checking sitemap variant: {sitemap_path}")

        except Exception:
            logger.exception(f"Unexpected error trying common variations for {base_url}")

        return discovered
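A short usage example of the new service (the import path is the one used by the tests further below); discover_files returns a single URL string or None:

from src.server.services.crawling.discovery_service import DiscoveryService

service = DiscoveryService()
best = service.discover_files("https://example.com")
if best:
    print(f"Crawl only this file: {best}")   # e.g. https://example.com/llms.txt
else:
    print("No guidance file found - crawl the main URL as usual.")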
@@ -6,8 +6,7 @@ Handles URL transformations and validations.

import hashlib
import re
from urllib.parse import urlparse, urljoin
from typing import List, Optional
from urllib.parse import urljoin, urlparse

from ....config.logfire_config import get_logger

@@ -33,8 +32,8 @@ class URLHandler:
        except Exception as e:
            logger.warning(f"Error checking if URL is sitemap: {e}")
            return False

    @staticmethod
    def is_markdown(url: str) -> bool:
        """
        Check if a URL points to a markdown file (.md, .mdx, .markdown).

@@ -274,9 +273,9 @@ class URLHandler:
            # Fallback: use a hash of the error message + url to still get something unique
            fallback = f"error_{redacted}_{str(e)}"
            return hashlib.sha256(fallback.encode("utf-8")).hexdigest()[:16]

    @staticmethod
    def extract_markdown_links(content: str, base_url: Optional[str] = None) -> List[str]:
    def extract_markdown_links(content: str, base_url: str | None = None) -> list[str]:
        """
        Extract markdown-style links from text content.

@@ -290,10 +289,10 @@ class URLHandler:
        try:
            if not content:
                return []

            # Ultimate URL pattern with comprehensive format support:
            # 1) [text](url) - markdown links
            # 2) <https://...> - autolinks
            # 3) https://... - bare URLs with protocol
            # 4) //example.com - protocol-relative URLs
            # 5) www.example.com - scheme-less www URLs

@@ -348,7 +347,7 @@ class URLHandler:
                    # Only include HTTP/HTTPS URLs
                    if url.startswith(('http://', 'https://')):
                        urls.append(url)

            # Remove duplicates while preserving order
            seen = set()
            unique_urls = []

@@ -356,16 +355,16 @@ class URLHandler:
                if url not in seen:
                    seen.add(url)
                    unique_urls.append(url)

            logger.info(f"Extracted {len(unique_urls)} unique links from content")
            return unique_urls

        except Exception as e:
            logger.error(f"Error extracting markdown links: {e}", exc_info=True)
            return []

    @staticmethod
    def is_link_collection_file(url: str, content: Optional[str] = None) -> bool:
    def is_link_collection_file(url: str, content: str | None = None) -> bool:
        """
        Check if a URL/file appears to be a link collection file like llms.txt.

@@ -380,7 +379,7 @@ class URLHandler:
            # Extract filename from URL
            parsed = urlparse(url)
            filename = parsed.path.split('/')[-1].lower()

            # Check for specific link collection filenames
            # Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links
            link_collection_patterns = [

@@ -391,12 +390,12 @@ class URLHandler:
                'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx',
                'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown',
            ]

            # Direct filename match
            if filename in link_collection_patterns:
                logger.info(f"Detected link collection file by filename: {filename}")
                return True

            # Pattern-based detection for variations, but exclude "full" variants
            # Only match files that are likely link collections, not complete content files
            if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):

@@ -407,7 +406,7 @@ class URLHandler:
                if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns):
                    logger.info(f"Detected potential link collection file: {filename}")
                    return True

            # Content-based detection if content is provided
            if content:
                # Never treat "full" variants as link collections to preserve single-page behavior

@@ -417,19 +416,19 @@ class URLHandler:
                # Reuse extractor to avoid regex divergence and maintain consistency
                extracted_links = URLHandler.extract_markdown_links(content, url)
                total_links = len(extracted_links)

                # Calculate link density (links per 100 characters)
                content_length = len(content.strip())
                if content_length > 0:
                    link_density = (total_links * 100) / content_length

                    # If more than 2% of content is links, likely a link collection
                    if link_density > 2.0 and total_links > 3:
                        logger.info(f"Detected link collection by content analysis: {total_links} links, density {link_density:.2f}%")
                        return True

            return False

        except Exception as e:
            logger.warning(f"Error checking if file is link collection: {e}", exc_info=True)
            return False

@@ -583,3 +582,92 @@ class URLHandler:
            logger.warning(f"Error extracting display name for {url}: {e}, using URL")
            # Fallback: return truncated URL
            return url[:50] + "..." if len(url) > 50 else url

    @staticmethod
    def is_robots_txt(url: str) -> bool:
        """
        Check if a URL is a robots.txt file with error handling.

        Args:
            url: URL to check

        Returns:
            True if URL is a robots.txt file, False otherwise
        """
        try:
            parsed = urlparse(url)
            # Normalize to lowercase and ignore query/fragment
            path = parsed.path.lower()
            # Only detect robots.txt at root level
            return path == '/robots.txt'
        except Exception as e:
            logger.warning(f"Error checking if URL is robots.txt: {e}", exc_info=True)
            return False

    @staticmethod
    def is_llms_variant(url: str) -> bool:
        """
        Check if a URL is a llms.txt/llms.md variant with error handling.

        Args:
            url: URL to check

        Returns:
            True if URL is a llms file variant, False otherwise
        """
        try:
            parsed = urlparse(url)
            # Normalize to lowercase and ignore query/fragment
            path = parsed.path.lower()
            filename = path.split('/')[-1] if '/' in path else path

            # Check for llms file variants
            llms_variants = ['llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown']
            return filename in llms_variants
        except Exception as e:
            logger.warning(f"Error checking if URL is llms variant: {e}", exc_info=True)
            return False

    @staticmethod
    def is_well_known_file(url: str) -> bool:
        """
        Check if a URL is a .well-known/* file with error handling.

        Args:
            url: URL to check

        Returns:
            True if URL is a .well-known file, False otherwise
        """
        try:
            parsed = urlparse(url)
            # Normalize to lowercase and ignore query/fragment
            path = parsed.path.lower()
            # Only detect .well-known files at root level
            return path.startswith('/.well-known/') and path.count('/.well-known/') == 1
        except Exception as e:
            logger.warning(f"Error checking if URL is well-known file: {e}", exc_info=True)
            return False

    @staticmethod
    def get_base_url(url: str) -> str:
        """
        Extract base domain URL for discovery with error handling.

        Args:
            url: URL to extract base from

        Returns:
            Base URL (scheme + netloc) or original URL if extraction fails
        """
        try:
            parsed = urlparse(url)
            # Ensure we have scheme and netloc
            if parsed.scheme and parsed.netloc:
                return f"{parsed.scheme}://{parsed.netloc}"
            else:
                logger.warning(f"URL missing scheme or netloc: {url}")
                return url
        except Exception as e:
            logger.warning(f"Error extracting base URL from {url}: {e}", exc_info=True)
            return url
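Quick usage of the new helper methods; the expected results follow the docstrings and tests, while the import path below is an assumption based on the relative imports in the hunk above:

# Import path assumed from the package layout implied by "from ....config..." above.
from src.server.services.crawling.helpers.url_handler import URLHandler

url = "https://example.com/docs/llms.txt?version=1"
print(URLHandler.is_llms_variant(url))   # True  - the filename is llms.txt
print(URLHandler.is_robots_txt(url))     # False - only /robots.txt at the root matches
print(URLHandler.get_base_url(url))      # https://example.com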
@@ -15,7 +15,8 @@ class ProgressMapper:
        "starting": (0, 1),
        "initializing": (0, 1),
        "analyzing": (1, 2), # URL analysis is very quick
        "crawling": (2, 5), # Crawling pages is relatively fast
        "discovery": (2, 3), # File discovery is quick
        "crawling": (3, 5), # Crawling pages is relatively fast
        "processing": (5, 8), # Content processing/chunking is quick
        "source_creation": (8, 10), # DB operations are fast
        "document_storage": (10, 30), # Embeddings + batch processing - significant but not longest
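The stage ranges above are slices of the overall progress bar. A minimal sketch of how such a range can be applied, assuming a simple linear mapping (the real ProgressMapper may additionally clamp values so progress never goes backwards, as the code-extraction callback above notes):

def map_stage_progress(stage_range: tuple[int, int], raw_percent: float) -> float:
    # Linearly place 0-100% progress of one stage inside its overall (start, end) window.
    start, end = stage_range
    clamped = max(0.0, min(float(raw_percent), 100.0))
    return start + (end - start) * clamped / 100.0

# Example: 50% through the new "discovery" stage (2, 3) shows as 2.5% overall.
assert map_stage_progress((2, 3), 50) == 2.5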
python/tests/test_discovery_service.py (new file, 449 lines)
@@ -0,0 +1,449 @@
"""Unit tests for DiscoveryService class."""
|
||||
import pytest
|
||||
from unittest.mock import patch, Mock
|
||||
from src.server.services.crawling.discovery_service import DiscoveryService
|
||||
|
||||
|
||||
class TestDiscoveryService:
|
||||
"""Test suite for DiscoveryService class."""
|
||||
|
||||
@patch('requests.get')
|
||||
def test_discover_files_basic(self, mock_get):
|
||||
"""Test main discovery method returns single best file."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
# Mock robots.txt response (no sitemaps)
|
||||
robots_response = Mock()
|
||||
robots_response.status_code = 200
|
||||
robots_response.text = "User-agent: *\nDisallow: /admin/"
|
||||
|
||||
# Mock file existence - llms-full.txt doesn't exist, but llms.txt does
|
||||
def mock_get_side_effect(url, **kwargs):
|
||||
response = Mock()
|
||||
if url.endswith('robots.txt'):
|
||||
return robots_response
|
||||
elif url.endswith('llms-full.txt'):
|
||||
response.status_code = 404 # Highest priority doesn't exist
|
||||
elif url.endswith('llms.txt'):
|
||||
response.status_code = 200 # Second priority exists
|
||||
else:
|
||||
response.status_code = 404
|
||||
return response
|
||||
|
||||
mock_get.side_effect = mock_get_side_effect
|
||||
|
||||
result = service.discover_files(base_url)
|
||||
|
||||
# Should return single URL string (not dict, not list)
|
||||
assert isinstance(result, str)
|
||||
assert result == 'https://example.com/llms.txt'
|
||||
|
||||
@patch('requests.get')
|
||||
def test_discover_files_no_files_found(self, mock_get):
|
||||
"""Test discovery when no files are found."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
# Mock all HTTP requests to return 404
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 404
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = service.discover_files(base_url)
|
||||
|
||||
# Should return None when no files found
|
||||
assert result is None
|
||||
|
||||
@patch('requests.get')
|
||||
def test_discover_files_priority_order(self, mock_get):
|
||||
"""Test that discovery follows the correct priority order."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
# Mock robots.txt response (no sitemaps declared)
|
||||
robots_response = Mock()
|
||||
robots_response.status_code = 200
|
||||
robots_response.text = "User-agent: *\nDisallow: /admin/"
|
||||
|
||||
# Mock file existence - both sitemap.xml and llms.txt exist, but llms.txt has higher priority
|
||||
def mock_get_side_effect(url, **kwargs):
|
||||
response = Mock()
|
||||
if url.endswith('robots.txt'):
|
||||
return robots_response
|
||||
elif url.endswith('llms.txt') or url.endswith('sitemap.xml'):
|
||||
response.status_code = 200 # Both exist
|
||||
else:
|
||||
response.status_code = 404
|
||||
return response
|
||||
|
||||
mock_get.side_effect = mock_get_side_effect
|
||||
|
||||
result = service.discover_files(base_url)
|
||||
|
||||
# Should return llms.txt since it has higher priority than sitemap.xml
|
||||
assert result == 'https://example.com/llms.txt'
|
||||
|
||||
@patch('requests.get')
|
||||
def test_discover_files_robots_sitemap_priority(self, mock_get):
|
||||
"""Test that robots.txt sitemap declarations have highest priority."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
# Mock robots.txt response WITH sitemap declaration
|
||||
robots_response = Mock()
|
||||
robots_response.status_code = 200
|
||||
robots_response.text = "User-agent: *\nSitemap: https://example.com/declared-sitemap.xml"
|
||||
|
||||
# Mock other files also exist
|
||||
def mock_get_side_effect(url, **kwargs):
|
||||
response = Mock()
|
||||
if url.endswith('robots.txt'):
|
||||
return robots_response
|
||||
elif 'llms' in url or 'sitemap' in url:
|
||||
response.status_code = 200
|
||||
else:
|
||||
response.status_code = 404
|
||||
return response
|
||||
|
||||
mock_get.side_effect = mock_get_side_effect
|
||||
|
||||
result = service.discover_files(base_url)
|
||||
|
||||
# Should return the sitemap declared in robots.txt (highest priority)
|
||||
assert result == 'https://example.com/declared-sitemap.xml'
|
||||
|
||||
@patch('requests.get')
|
||||
def test_discover_best_sitemap_robots_priority(self, mock_get):
|
||||
"""Test sitemap discovery prioritizes robots.txt declarations."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
# Mock robots.txt with sitemap
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.text = "Sitemap: https://example.com/robots-sitemap.xml"
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = service._discover_best_sitemap(base_url)
|
||||
|
||||
# Should return the sitemap from robots.txt (highest priority)
|
||||
assert result == "https://example.com/robots-sitemap.xml"
|
||||
|
||||
@patch('requests.get')
|
||||
def test_discover_best_llms_file_priority_order(self, mock_get):
|
||||
"""Test llms file discovery follows priority order."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
# Mock HTTP responses - only llms.txt exists, not llms-full.txt
|
||||
def mock_get_side_effect(url, **kwargs):
|
||||
response = Mock()
|
||||
if url.endswith('llms-full.txt'):
|
||||
response.status_code = 404 # Higher priority file doesn't exist
|
||||
elif url.endswith('llms.txt'):
|
||||
response.status_code = 200 # Standard file exists
|
||||
else:
|
||||
response.status_code = 404
|
||||
return response
|
||||
|
||||
mock_get.side_effect = mock_get_side_effect
|
||||
|
||||
result = service._discover_best_llms_file(base_url)
|
||||
|
||||
# Should find llms.txt since llms-full.txt doesn't exist
|
||||
assert result == "https://example.com/llms.txt"
|
||||
|
||||
@patch('requests.get')
|
||||
def test_discover_best_llms_file_subdirectory_fallback(self, mock_get):
|
||||
"""Test llms file discovery falls back to subdirectories."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
# Mock HTTP responses - no root files, but static/llms.txt exists
|
||||
def mock_get_side_effect(url, **kwargs):
|
||||
response = Mock()
|
||||
if '/static/llms.txt' in url:
|
||||
response.status_code = 200 # Found in subdirectory
|
||||
else:
|
||||
response.status_code = 404
|
||||
return response
|
||||
|
||||
mock_get.side_effect = mock_get_side_effect
|
||||
|
||||
result = service._discover_best_llms_file(base_url)
|
||||
|
||||
# Should find the file in static subdirectory
|
||||
assert result == "https://example.com/static/llms.txt"
|
||||
|
||||
@patch('requests.get')
|
||||
def test_check_url_exists(self, mock_get):
|
||||
"""Test URL existence checking."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Test successful response
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
assert service._check_url_exists("https://example.com/exists") is True
|
||||
|
||||
# Test 404 response
|
||||
mock_response.status_code = 404
|
||||
assert service._check_url_exists("https://example.com/not-found") is False
|
||||
|
||||
# Test network error
|
||||
mock_get.side_effect = Exception("Network error")
|
||||
assert service._check_url_exists("https://example.com/error") is False
|
||||
|
||||
@patch('requests.get')
|
||||
def test_parse_robots_txt_with_sitemap(self, mock_get):
|
||||
"""Test robots.txt parsing with sitemap directives."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock successful robots.txt response
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.text = """User-agent: *
|
||||
Disallow: /admin/
|
||||
Sitemap: https://example.com/sitemap.xml
|
||||
Sitemap: https://example.com/sitemap-news.xml"""
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = service._parse_robots_txt("https://example.com")
|
||||
|
||||
assert len(result) == 2
|
||||
assert "https://example.com/sitemap.xml" in result
|
||||
assert "https://example.com/sitemap-news.xml" in result
|
||||
mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30)
|
||||
|
||||
@patch('requests.get')
|
||||
def test_parse_robots_txt_no_sitemap(self, mock_get):
|
||||
"""Test robots.txt parsing without sitemap directives."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock robots.txt without sitemaps
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.text = """User-agent: *
|
||||
Disallow: /admin/
|
||||
Allow: /public/"""
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = service._parse_robots_txt("https://example.com")
|
||||
|
||||
assert len(result) == 0
|
||||
mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30)
|
||||
|
||||
@patch('requests.get')
|
||||
def test_parse_robots_txt_not_found(self, mock_get):
|
||||
"""Test robots.txt parsing when file is not found."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock 404 response
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 404
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = service._parse_robots_txt("https://example.com")
|
||||
|
||||
assert len(result) == 0
|
||||
mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30)
|
||||
|
||||
@patch('requests.get')
|
||||
def test_check_standard_patterns(self, mock_get):
|
||||
"""Test standard file pattern checking."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock responses for different files
|
||||
def mock_response_side_effect(url, **kwargs):
|
||||
mock_response = Mock()
|
||||
if 'llms.txt' in url:
|
||||
mock_response.status_code = 200
|
||||
elif 'sitemap.xml' in url:
|
||||
mock_response.status_code = 200
|
||||
else:
|
||||
mock_response.status_code = 404
|
||||
return mock_response
|
||||
|
||||
mock_get.side_effect = mock_response_side_effect
|
||||
|
||||
result = service._check_standard_patterns("https://example.com")
|
||||
|
||||
assert 'sitemaps' in result
|
||||
assert 'llms_files' in result
|
||||
assert 'robots_files' in result
|
||||
|
||||
# Should find the files that returned 200
|
||||
assert any('llms.txt' in url for url in result['llms_files'])
|
||||
assert any('sitemap.xml' in url for url in result['sitemaps'])
|
||||
|
||||
@patch('requests.get')
|
||||
def test_parse_html_meta_tags(self, mock_get):
|
||||
"""Test HTML meta tag parsing for sitemaps."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock HTML with sitemap references
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.text = """
|
||||
<html>
|
||||
<head>
|
||||
<link rel="sitemap" href="/sitemap.xml">
|
||||
<meta name="sitemap" content="https://example.com/sitemap-meta.xml">
|
||||
</head>
|
||||
<body>Content here</body>
|
||||
</html>
|
||||
"""
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = service._parse_html_meta_tags("https://example.com")
|
||||
|
||||
# Should find sitemaps from both link and meta tags
|
||||
assert len(result) >= 1
|
||||
assert any('sitemap' in url.lower() for url in result)
|
||||
mock_get.assert_called_once_with("https://example.com", timeout=30)
|
||||
|
||||
@patch('requests.get')
|
||||
def test_parse_html_meta_tags_not_found(self, mock_get):
|
||||
"""Test HTML meta tag parsing when page not found."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock 404 response
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 404
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = service._parse_html_meta_tags("https://example.com")
|
||||
|
||||
assert len(result) == 0
|
||||
mock_get.assert_called_once_with("https://example.com", timeout=30)
|
||||
|
||||
@patch('requests.get')
|
||||
def test_check_well_known_directory(self, mock_get):
|
||||
"""Test .well-known directory file checking."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock responses - some files exist, some don't
|
||||
def mock_response_side_effect(url, **kwargs):
|
||||
mock_response = Mock()
|
||||
if 'ai.txt' in url:
|
||||
mock_response.status_code = 200
|
||||
else:
|
||||
mock_response.status_code = 404
|
||||
return mock_response
|
||||
|
||||
mock_get.side_effect = mock_response_side_effect
|
||||
|
||||
result = service._check_well_known_directory("https://example.com")
|
||||
|
||||
# Should find the ai.txt file
|
||||
assert len(result) >= 1
|
||||
assert any('ai.txt' in url for url in result)
|
||||
|
||||
@patch('requests.get')
|
||||
def test_try_common_variations(self, mock_get):
|
||||
"""Test pattern variations for discovery targets."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock responses for variations
|
||||
def mock_response_side_effect(url, **kwargs):
|
||||
mock_response = Mock()
|
||||
if 'docs/llms.txt' in url or 'sitemaps/sitemap.xml' in url:
|
||||
mock_response.status_code = 200
|
||||
else:
|
||||
mock_response.status_code = 404
|
||||
return mock_response
|
||||
|
||||
mock_get.side_effect = mock_response_side_effect
|
||||
|
||||
result = service._try_common_variations("https://example.com")
|
||||
|
||||
assert 'sitemaps' in result
|
||||
assert 'llms_files' in result
|
||||
|
||||
# Should find at least one variation
|
||||
assert len(result['llms_files']) >= 1 or len(result['sitemaps']) >= 1
|
||||
|
||||
@patch('requests.get')
|
||||
def test_network_error_handling(self, mock_get):
|
||||
"""Test error scenarios with network failures."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock network error
|
||||
mock_get.side_effect = Exception("Network error")
|
||||
|
||||
# Should not raise exception, but return empty results
|
||||
result = service._parse_robots_txt("https://example.com")
|
||||
assert result == []
|
||||
|
||||
result = service._check_standard_patterns("https://example.com")
|
||||
assert isinstance(result, dict)
|
||||
|
||||
result = service._parse_html_meta_tags("https://example.com")
|
||||
assert result == []
|
||||
|
||||
result = service._check_well_known_directory("https://example.com")
|
||||
assert result == []
|
||||
|
||||
result = service._try_common_variations("https://example.com")
|
||||
assert isinstance(result, dict)
|
||||
|
||||
def test_discover_files_with_exceptions(self):
|
||||
"""Test main discovery method handles exceptions gracefully."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock methods to raise exceptions
|
||||
with patch.object(service, '_parse_robots_txt', side_effect=Exception("Test error")):
|
||||
with patch.object(service, '_check_standard_patterns', side_effect=Exception("Test error")):
|
||||
with patch.object(service, '_parse_html_meta_tags', side_effect=Exception("Test error")):
|
||||
with patch.object(service, '_check_well_known_directory', side_effect=Exception("Test error")):
|
||||
with patch.object(service, '_try_common_variations', side_effect=Exception("Test error")):
|
||||
result = service.discover_files("https://example.com")
|
||||
|
||||
# Should still return proper structure even with all methods failing
|
||||
assert isinstance(result, dict)
|
||||
assert 'sitemaps' in result
|
||||
assert 'llms_files' in result
|
||||
assert 'robots_files' in result
|
||||
assert 'well_known_files' in result
|
||||
|
||||
@patch('requests.get')
|
||||
def test_robots_txt_with_malformed_content(self, mock_get):
|
||||
"""Test robots.txt parsing with malformed content."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock malformed robots.txt content
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.text = """User-agent: *
|
||||
Disallow: /admin/
|
||||
Sitemap:
|
||||
Sitemap: not-a-valid-url
|
||||
Sitemap: https://example.com/valid-sitemap.xml"""
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = service._parse_robots_txt("https://example.com")
|
||||
|
||||
# Should only include the valid sitemap URL
|
||||
assert len(result) == 1
|
||||
assert "https://example.com/valid-sitemap.xml" in result
|
||||
|
||||
def test_discovery_targets_constant(self):
|
||||
"""Test that discovery targets constant is properly defined."""
|
||||
service = DiscoveryService()
|
||||
|
||||
assert hasattr(service, 'DISCOVERY_TARGETS')
|
||||
targets = service.DISCOVERY_TARGETS
|
||||
|
||||
# Verify required target types exist
|
||||
assert 'llms_files' in targets
|
||||
assert 'sitemap_files' in targets
|
||||
assert 'robots_files' in targets
|
||||
assert 'well_known_files' in targets
|
||||
|
||||
# Verify they contain expected files
|
||||
assert 'llms.txt' in targets['llms_files']
|
||||
assert 'sitemap.xml' in targets['sitemap_files']
|
||||
assert 'robots.txt' in targets['robots_files']
|
||||
assert '.well-known/ai.txt' in targets['well_known_files']
|
||||
@@ -122,4 +122,122 @@ class TestURLHandler:

        # Should not transform non-GitHub URLs
        other = "https://example.com/file"
        assert handler.transform_github_url(other) == other

    def test_is_robots_txt(self):
        """Test robots.txt detection."""
        handler = URLHandler()

        # Standard robots.txt URLs
        assert handler.is_robots_txt("https://example.com/robots.txt") is True
        assert handler.is_robots_txt("http://example.com/robots.txt") is True
        assert handler.is_robots_txt("https://sub.example.com/robots.txt") is True

        # Case sensitivity
        assert handler.is_robots_txt("https://example.com/ROBOTS.TXT") is True
        assert handler.is_robots_txt("https://example.com/Robots.Txt") is True

        # With query parameters (should still be detected)
        assert handler.is_robots_txt("https://example.com/robots.txt?v=1") is True
        assert handler.is_robots_txt("https://example.com/robots.txt#section") is True

        # Not robots.txt files
        assert handler.is_robots_txt("https://example.com/robots") is False
        assert handler.is_robots_txt("https://example.com/robots.html") is False
        assert handler.is_robots_txt("https://example.com/some-robots.txt") is False
        assert handler.is_robots_txt("https://example.com/path/robots.txt") is False
        assert handler.is_robots_txt("https://example.com/") is False

        # Edge case: malformed URL should not crash
        assert handler.is_robots_txt("not-a-url") is False

    def test_is_llms_variant(self):
        """Test llms file variant detection."""
        handler = URLHandler()

        # All llms variants
        assert handler.is_llms_variant("https://example.com/llms.txt") is True
        assert handler.is_llms_variant("https://example.com/llms.md") is True
        assert handler.is_llms_variant("https://example.com/llms.mdx") is True
        assert handler.is_llms_variant("https://example.com/llms.markdown") is True

        # Case sensitivity
        assert handler.is_llms_variant("https://example.com/LLMS.TXT") is True
        assert handler.is_llms_variant("https://example.com/Llms.Md") is True

        # With paths (should still detect)
        assert handler.is_llms_variant("https://example.com/docs/llms.txt") is True
        assert handler.is_llms_variant("https://example.com/public/llms.md") is True

        # With query parameters
        assert handler.is_llms_variant("https://example.com/llms.txt?version=1") is True
        assert handler.is_llms_variant("https://example.com/llms.md#section") is True

        # Not llms files
        assert handler.is_llms_variant("https://example.com/llms") is False
        assert handler.is_llms_variant("https://example.com/llms.html") is False
        assert handler.is_llms_variant("https://example.com/my-llms.txt") is False
        assert handler.is_llms_variant("https://example.com/llms-guide.txt") is False
        assert handler.is_llms_variant("https://example.com/readme.txt") is False

        # Edge case: malformed URL should not crash
        assert handler.is_llms_variant("not-a-url") is False

    def test_is_well_known_file(self):
        """Test .well-known file detection."""
        handler = URLHandler()

        # Standard .well-known files
        assert handler.is_well_known_file("https://example.com/.well-known/ai.txt") is True
        assert handler.is_well_known_file("https://example.com/.well-known/security.txt") is True
        assert handler.is_well_known_file("https://example.com/.well-known/change-password") is True

        # Case sensitivity (path should be case sensitive)
        assert handler.is_well_known_file("https://example.com/.WELL-KNOWN/ai.txt") is True
        assert handler.is_well_known_file("https://example.com/.Well-Known/ai.txt") is True

        # With query parameters
        assert handler.is_well_known_file("https://example.com/.well-known/ai.txt?v=1") is True
        assert handler.is_well_known_file("https://example.com/.well-known/ai.txt#top") is True

        # Not .well-known files
        assert handler.is_well_known_file("https://example.com/well-known/ai.txt") is False
        assert handler.is_well_known_file("https://example.com/.wellknown/ai.txt") is False
        assert handler.is_well_known_file("https://example.com/docs/.well-known/ai.txt") is False
        assert handler.is_well_known_file("https://example.com/ai.txt") is False
        assert handler.is_well_known_file("https://example.com/") is False

        # Edge case: malformed URL should not crash
        assert handler.is_well_known_file("not-a-url") is False

    def test_get_base_url(self):
        """Test base URL extraction."""
        handler = URLHandler()

        # Standard URLs
        assert handler.get_base_url("https://example.com") == "https://example.com"
        assert handler.get_base_url("https://example.com/") == "https://example.com"
        assert handler.get_base_url("https://example.com/path/to/page") == "https://example.com"
        assert handler.get_base_url("https://example.com/path/to/page?query=1") == "https://example.com"
        assert handler.get_base_url("https://example.com/path/to/page#fragment") == "https://example.com"

        # HTTP vs HTTPS
        assert handler.get_base_url("http://example.com/path") == "http://example.com"
        assert handler.get_base_url("https://example.com/path") == "https://example.com"

        # Subdomains and ports
        assert handler.get_base_url("https://api.example.com/v1/users") == "https://api.example.com"
        assert handler.get_base_url("https://example.com:8080/api") == "https://example.com:8080"
        assert handler.get_base_url("http://localhost:3000/dev") == "http://localhost:3000"

        # Complex cases
        assert handler.get_base_url("https://user:pass@example.com/path") == "https://user:pass@example.com"

        # Edge cases - malformed URLs should return original
        assert handler.get_base_url("not-a-url") == "not-a-url"
        assert handler.get_base_url("") == ""
        assert handler.get_base_url("ftp://example.com/file") == "ftp://example.com"

        # Missing scheme or netloc
        assert handler.get_base_url("//example.com/path") == "//example.com/path"  # Should return original
        assert handler.get_base_url("/path/to/resource") == "/path/to/resource"  # Should return original