From 1a55d93a4e019d20acf20a8afb891e42c4512239 Mon Sep 17 00:00:00 2001 From: leex279 Date: Mon, 8 Sep 2025 09:03:15 +0200 Subject: [PATCH] Implement priority-based automatic discovery of llms.txt and sitemap.xml files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add DiscoveryService with single-file priority selection - Priority: llms-full.txt > llms.txt > llms.md > llms.mdx > sitemap.xml > robots.txt - All files contain similar AI/crawling guidance, so only best one is needed - Robots.txt sitemap declarations have highest priority - Fallback to subdirectories for llms files - Enhance URLHandler with discovery helper methods - Add is_robots_txt, is_llms_variant, is_well_known_file, get_base_url methods - Follow existing patterns with proper error handling - Integrate discovery into CrawlingService orchestration - When discovery finds file: crawl ONLY discovered file (not main URL) - When no discovery: crawl main URL normally - Fixes issue where both main URL + discovered file were crawled - Add discovery stage to progress mapping - New "discovery" stage in progress flow - Clear progress messages for discovered files - Comprehensive test coverage - Tests for priority-based selection logic - Tests for robots.txt priority and fallback behavior - Updated existing tests for new return formats Resolves efficient crawling by selecting single best guidance file instead of crawling redundant content from multiple similar files. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- .../services/crawling/crawling_service.py | 92 +++- .../services/crawling/discovery_service.py | 441 +++++++++++++++++ .../services/crawling/helpers/url_handler.py | 130 ++++- .../services/crawling/progress_mapper.py | 3 +- python/tests/test_discovery_service.py | 449 ++++++++++++++++++ python/tests/test_url_handler.py | 120 ++++- 6 files changed, 1193 insertions(+), 42 deletions(-) create mode 100644 python/src/server/services/crawling/discovery_service.py create mode 100644 python/tests/test_discovery_service.py diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py index e85c1fa2..3faebf28 100644 --- a/python/src/server/services/crawling/crawling_service.py +++ b/python/src/server/services/crawling/crawling_service.py @@ -17,6 +17,7 @@ from ...utils.progress.progress_tracker import ProgressTracker # Import strategies # Import operations +from .discovery_service import DiscoveryService from .document_storage_operations import DocumentStorageOperations from .helpers.site_config import SiteConfig @@ -83,6 +84,7 @@ class CrawlingService: # Initialize operations self.doc_storage_ops = DocumentStorageOperations(self.supabase_client) + self.discovery_service = DiscoveryService() # Track progress state across all stages to prevent UI resets self.progress_state = {"progressId": self.progress_id} if self.progress_id else {} @@ -132,7 +134,7 @@ class CrawlingService: f"total_pages={kwargs.get('total_pages', 'N/A')} | processed_pages={kwargs.get('processed_pages', 'N/A')} | " f"kwargs_keys={list(kwargs.keys())}" ) - + # Update progress via tracker (stores in memory for HTTP polling) await self.progress_tracker.update( status=base_status, @@ -332,16 +334,68 @@ class CrawlingService: # Check for cancellation before proceeding self._check_cancellation() - # Analyzing stage - report initial page count (at least 1) - await update_mapped_progress( - "analyzing", 50, f"Analyzing URL type for {url}", - total_pages=1, # We know we have at least the start URL - processed_pages=0 - ) + # Discovery phase - find the single best related file + discovered_urls = [] + if request.get("auto_discovery", True): # Default enabled + await update_mapped_progress( + "discovery", 25, f"Discovering best related file for {url}", current_url=url + ) + try: + discovered_file = self.discovery_service.discover_files(url) + + # Add the single best discovered file to crawl list + if discovered_file: + safe_logfire_info(f"Discovery found file: {discovered_file}") + # Filter through is_binary_file() check like existing code + if not self.url_handler.is_binary_file(discovered_file): + discovered_urls.append(discovered_file) + safe_logfire_info(f"Adding discovered file to crawl: {discovered_file}") + else: + safe_logfire_info(f"Skipping binary file: {discovered_file}") + else: + safe_logfire_info(f"Discovery found no files for {url}") + + file_count = len(discovered_urls) + safe_logfire_info(f"Discovery selected {file_count} best file to crawl") + + await update_mapped_progress( + "discovery", 100, f"Discovery completed: selected {file_count} best file", current_url=url + ) + + except Exception as e: + safe_logfire_error(f"Discovery phase failed: {e}") + # Continue with regular crawl even if discovery fails + await update_mapped_progress( + "discovery", 100, "Discovery phase failed, continuing with regular crawl", current_url=url + ) + + # Analyzing stage - determine what to crawl + if discovered_urls: + # Discovery found a file - crawl ONLY the discovered file, not the main URL + total_urls_to_crawl = len(discovered_urls) + await update_mapped_progress( + "analyzing", 50, f"Analyzing discovered file: {discovered_urls[0]}", + total_pages=total_urls_to_crawl, + processed_pages=0 + ) + + # Crawl only the discovered file + safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_urls[0]}") + crawl_results, crawl_type = await self._crawl_by_url_type(discovered_urls[0], request) + + else: + # No discovery - crawl the main URL normally + total_urls_to_crawl = 1 + await update_mapped_progress( + "analyzing", 50, f"Analyzing URL type for {url}", + total_pages=total_urls_to_crawl, + processed_pages=0 + ) + + # Crawl the main URL + safe_logfire_info(f"No discovery file found, crawling main URL: {url}") + crawl_results, crawl_type = await self._crawl_by_url_type(url, request) - # Detect URL type and perform crawl - crawl_results, crawl_type = await self._crawl_by_url_type(url, request) - # Update progress tracker with crawl type if self.progress_tracker and crawl_type: await self.progress_tracker.update( @@ -415,7 +469,7 @@ class CrawlingService: if request.get("extract_code_examples", True) and actual_chunks_stored > 0: # Check for cancellation before starting code extraction self._check_cancellation() - + await update_mapped_progress("code_extraction", 0, "Starting code extraction...") # Create progress callback for code extraction @@ -424,7 +478,7 @@ class CrawlingService: # Use ProgressMapper to ensure progress never goes backwards raw_progress = data.get("progress", data.get("percentage", 0)) mapped_progress = self.progress_mapper.map_progress("code_extraction", raw_progress) - + # Update progress state via tracker await self.progress_tracker.update( status=data.get("status", "code_extraction"), @@ -445,7 +499,7 @@ class CrawlingService: # Check for cancellation after code extraction self._check_cancellation() - + # Send heartbeat after code extraction await send_heartbeat_if_needed() @@ -571,7 +625,7 @@ class CrawlingService: crawl_type = None if self.url_handler.is_txt(url) or self.url_handler.is_markdown(url): - # Handle text files + # Handle text files crawl_type = "llms-txt" if "llms" in url.lower() else "text_file" if self.progress_tracker: await self.progress_tracker.update( @@ -593,7 +647,7 @@ class CrawlingService: if self.url_handler.is_link_collection_file(url, content): # Extract links from the content extracted_links = self.url_handler.extract_markdown_links(content, url) - + # Filter out self-referential links to avoid redundant crawling if extracted_links: original_count = len(extracted_links) @@ -604,7 +658,7 @@ class CrawlingService: self_filtered_count = original_count - len(extracted_links) if self_filtered_count > 0: logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links") - + # Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling if extracted_links: original_count = len(extracted_links) @@ -612,7 +666,7 @@ class CrawlingService: filtered_count = original_count - len(extracted_links) if filtered_count > 0: logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links") - + if extracted_links: # Crawl the extracted links using batch crawling logger.info(f"Crawling {len(extracted_links)} extracted links from {url}") @@ -623,11 +677,11 @@ class CrawlingService: start_progress=10, end_progress=20, ) - + # Combine original text file results with batch results crawl_results.extend(batch_results) crawl_type = "link_collection_with_crawled_links" - + logger.info(f"Link collection crawling completed: {len(crawl_results)} total results (1 text file + {len(batch_results)} extracted links)") else: logger.info(f"No valid links found in link collection file: {url}") diff --git a/python/src/server/services/crawling/discovery_service.py b/python/src/server/services/crawling/discovery_service.py new file mode 100644 index 00000000..6a3762a4 --- /dev/null +++ b/python/src/server/services/crawling/discovery_service.py @@ -0,0 +1,441 @@ +""" +Discovery Service for Automatic File Detection + +Handles automatic discovery and parsing of llms.txt, sitemap.xml, and related files +to enhance crawling capabilities with priority-based discovery methods. +""" + +from urllib.parse import urljoin + +import requests + +from ...config.logfire_config import get_logger + +logger = get_logger(__name__) + + +class DiscoveryService: + """Service for discovering related files automatically during crawls.""" + + # Global priority order - select ONE best file from all categories + # All these files contain similar AI/crawling guidance content + DISCOVERY_PRIORITY = [ + # LLMs files (highest priority - most comprehensive AI guidance) + "llms-full.txt", + "llms.txt", + "llms.md", + "llms.mdx", + "llms.markdown", + + # Sitemap files (structural crawling guidance) + "sitemap_index.xml", + "sitemap-index.xml", + "sitemap.xml", + + # Robots file (basic crawling rules) + "robots.txt", + + # Well-known variants (alternative locations) + ".well-known/ai.txt", + ".well-known/llms.txt", + ".well-known/sitemap.xml" + ] + + def discover_files(self, base_url: str) -> str | None: + """ + Main discovery orchestrator - selects ONE best file across all categories. + All files contain similar AI/crawling guidance, so we only need the best one. + + Args: + base_url: Base URL to discover files for + + Returns: + Single best URL found, or None if no files discovered + """ + try: + logger.info(f"Starting single-file discovery for {base_url}") + + # First check robots.txt for explicit sitemap declarations (special case) + robots_sitemaps = self._parse_robots_txt(base_url) + if robots_sitemaps: + best_file = robots_sitemaps[0] # Use first sitemap from robots.txt + logger.info(f"Discovery found best file from robots.txt: {best_file}") + return best_file + + # Check files in global priority order + for filename in self.DISCOVERY_PRIORITY: + # Try root location first + file_url = urljoin(base_url, f"/{filename}") + if self._check_url_exists(file_url): + logger.info(f"Discovery found best file: {file_url}") + return file_url + + # For llms files, also try common subdirectories + if filename.startswith('llms'): + for subdir in ["static", "public", "docs", "assets", "doc", "api"]: + subdir_url = urljoin(base_url, f"/{subdir}/{filename}") + if self._check_url_exists(subdir_url): + logger.info(f"Discovery found best file in subdirectory: {subdir_url}") + return subdir_url + + # For sitemap files, also try common subdirectories + if filename.endswith('.xml') and not filename.startswith('.well-known'): + for subdir in ["sitemaps", "sitemap", "xml", "feed"]: + subdir_url = urljoin(base_url, f"/{subdir}/{filename}") + if self._check_url_exists(subdir_url): + logger.info(f"Discovery found best file in subdirectory: {subdir_url}") + return subdir_url + + # Check HTML meta tags for sitemap references as final fallback + html_sitemaps = self._parse_html_meta_tags(base_url) + if html_sitemaps: + best_file = html_sitemaps[0] + logger.info(f"Discovery found best file from HTML meta tags: {best_file}") + return best_file + + logger.info(f"Discovery completed for {base_url}: no files found") + return None + + except Exception: + logger.exception(f"Unexpected error during discovery for {base_url}") + return None + + def _discover_best_sitemap(self, base_url: str) -> str | None: + """ + Discover the best available sitemap using priority-based selection. + + Priority order: + 1. Sitemaps from robots.txt (highest priority - explicitly declared) + 2. Standard locations (sitemap_index.xml > sitemap-index.xml > sitemap.xml) + 3. Common subdirectory variations + 4. HTML meta tag references + 5. .well-known directory + """ + try: + # Priority 1: Check robots.txt for sitemap declarations + robots_sitemaps = self._parse_robots_txt(base_url) + if robots_sitemaps: + return robots_sitemaps[0] # Use first sitemap from robots.txt + + # Priority 2: Check standard locations in priority order + for filename in self.DISCOVERY_TARGETS["sitemap_files"]: + sitemap_url = urljoin(base_url, f"/{filename}") + if self._check_url_exists(sitemap_url): + return sitemap_url + + # Priority 3: Check common subdirectory variations + subdirs = ["sitemaps", "sitemap", "xml", "feed"] + for subdir in subdirs: + for filename in self.DISCOVERY_TARGETS["sitemap_files"]: + sitemap_url = urljoin(base_url, f"/{subdir}/{filename}") + if self._check_url_exists(sitemap_url): + return sitemap_url + + # Priority 4: Check HTML meta tag references + html_sitemaps = self._parse_html_meta_tags(base_url) + if html_sitemaps: + return html_sitemaps[0] # Use first sitemap from HTML + + # Priority 5: Check .well-known directory + well_known_sitemap = urljoin(base_url, "/.well-known/sitemap.xml") + if self._check_url_exists(well_known_sitemap): + return well_known_sitemap + + except Exception: + logger.exception(f"Error discovering best sitemap for {base_url}") + + return None + + def _discover_best_llms_file(self, base_url: str) -> str | None: + """ + Discover the best available llms file using priority-based selection. + + Priority order: + 1. Standard locations (llms-full.txt > llms.txt > llms.md > llms.mdx > llms.markdown) + 2. Common subdirectory variations (static, public, docs, assets) + 3. .well-known directory variants + """ + try: + # Priority 1: Check standard root locations in priority order + for filename in self.DISCOVERY_TARGETS["llms_files"]: + llms_url = urljoin(base_url, f"/{filename}") + if self._check_url_exists(llms_url): + return llms_url + + # Priority 2: Check common subdirectory variations + subdirs = ["static", "public", "docs", "assets", "doc", "api"] + for subdir in subdirs: + for filename in self.DISCOVERY_TARGETS["llms_files"]: + llms_url = urljoin(base_url, f"/{subdir}/{filename}") + if self._check_url_exists(llms_url): + return llms_url + + # Priority 3: Check .well-known directory variants + for well_known_file in [".well-known/ai.txt", ".well-known/llms.txt"]: + well_known_url = urljoin(base_url, f"/{well_known_file}") + if self._check_url_exists(well_known_url): + return well_known_url + + except Exception: + logger.exception(f"Error discovering best llms file for {base_url}") + + return None + + def _discover_robots_file(self, base_url: str) -> str | None: + """ + Discover robots.txt file (always single file at root). + """ + try: + robots_url = urljoin(base_url, "/robots.txt") + if self._check_url_exists(robots_url): + return robots_url + except Exception: + logger.exception(f"Error discovering robots file for {base_url}") + + return None + + def _check_url_exists(self, url: str) -> bool: + """ + Check if a URL exists and returns a successful response. + """ + try: + resp = requests.get(url, timeout=5, allow_redirects=True) + success = resp.status_code == 200 + logger.debug(f"URL check: {url} -> {resp.status_code} ({'exists' if success else 'not found'})") + return success + except Exception as e: + logger.debug(f"URL check failed: {url} -> {e}") + return False + + def _parse_robots_txt(self, base_url: str) -> list[str]: + """ + Extract sitemap URLs from robots.txt. + + Args: + base_url: Base URL to check robots.txt for + + Returns: + List of sitemap URLs found in robots.txt + """ + sitemaps: list[str] = [] + + try: + robots_url = urljoin(base_url, "/robots.txt") + logger.info(f"Checking robots.txt at {robots_url}") + + resp = requests.get(robots_url, timeout=30) + + if resp.status_code != 200: + logger.info(f"No robots.txt found: HTTP {resp.status_code}") + return sitemaps + + # Parse robots.txt content for sitemap directives + for line in resp.text.splitlines(): + line = line.strip().lower() + if line.startswith("sitemap:"): + sitemap_url = line.split(":", 1)[1].strip() + # Validate URL format before adding + if sitemap_url and (sitemap_url.startswith('http://') or sitemap_url.startswith('https://')): + sitemaps.append(sitemap_url) + logger.info(f"Found sitemap in robots.txt: {sitemap_url}") + + except requests.exceptions.RequestException: + logger.exception(f"Network error fetching robots.txt from {base_url}") + except Exception: + logger.exception(f"Unexpected error parsing robots.txt from {base_url}") + + return sitemaps + + def _check_standard_patterns(self, base_url: str) -> dict[str, list[str]]: + """ + Check common file locations for discovery targets. + + Args: + base_url: Base URL to check standard locations for + + Returns: + Dictionary with file types and discovered URLs + """ + discovered: dict[str, list[str]] = { + "sitemaps": [], + "llms_files": [], + "robots_files": [] + } + + try: + # Check all discovery targets at standard locations + all_targets = [] + for target_type, files in self.DISCOVERY_TARGETS.items(): + if target_type != "well_known_files": # Skip well-known, handled separately + for filename in files: + all_targets.append((target_type, filename)) + + for target_type, filename in all_targets: + try: + file_url = urljoin(base_url, f"/{filename}") + resp = requests.get(file_url, timeout=30, allow_redirects=True) + + if resp.status_code == 200: + # Map target type to discovery category + if target_type == "sitemap_files": + discovered["sitemaps"].append(file_url) + elif target_type == "llms_files": + discovered["llms_files"].append(file_url) + elif target_type == "robots_files": + discovered["robots_files"].append(file_url) + + logger.info(f"Found {target_type} file: {file_url}") + + except requests.exceptions.RequestException: + logger.debug(f"File not found or network error: {filename}") + except Exception: + logger.exception(f"Unexpected error checking {filename}") + + except Exception: + logger.exception(f"Unexpected error in standard pattern checking for {base_url}") + + return discovered + + def _parse_html_meta_tags(self, base_url: str) -> list[str]: + """ + Extract sitemap references from HTML meta tags. + + Args: + base_url: Base URL to check HTML for meta tags + + Returns: + List of sitemap URLs found in HTML meta tags + """ + sitemaps: list[str] = [] + + try: + logger.info(f"Checking HTML meta tags for sitemaps at {base_url}") + resp = requests.get(base_url, timeout=30) + + if resp.status_code != 200: + logger.debug(f"Could not fetch HTML for meta tag parsing: HTTP {resp.status_code}") + return sitemaps + + content = resp.text.lower() + + # Look for sitemap meta tags or link elements + import re + + # Check for + sitemap_link_pattern = r']*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']' + matches = re.findall(sitemap_link_pattern, content) + + for match in matches: + sitemap_url = urljoin(base_url, match) + sitemaps.append(sitemap_url) + logger.info(f"Found sitemap in HTML link tag: {sitemap_url}") + + # Check for + sitemap_meta_pattern = r']*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']' + matches = re.findall(sitemap_meta_pattern, content) + + for match in matches: + sitemap_url = urljoin(base_url, match) + sitemaps.append(sitemap_url) + logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}") + + except requests.exceptions.RequestException: + logger.exception(f"Network error fetching HTML from {base_url}") + except Exception: + logger.exception(f"Unexpected error parsing HTML meta tags from {base_url}") + + return sitemaps + + def _check_well_known_directory(self, base_url: str) -> list[str]: + """ + Check .well-known/* files for discovery targets. + + Args: + base_url: Base URL to check .well-known directory for + + Returns: + List of URLs found in .well-known directory + """ + well_known_files: list[str] = [] + + try: + for filename in self.DISCOVERY_TARGETS["well_known_files"]: + try: + file_url = urljoin(base_url, f"/{filename}") + resp = requests.get(file_url, timeout=30, allow_redirects=True) + + if resp.status_code == 200: + well_known_files.append(file_url) + logger.info(f"Found .well-known file: {file_url}") + + except requests.exceptions.RequestException: + logger.debug(f"Well-known file not found or network error: {filename}") + except Exception: + logger.exception(f"Unexpected error checking well-known file: {filename}") + + except Exception: + logger.exception(f"Unexpected error checking .well-known directory for {base_url}") + + return well_known_files + + def _try_common_variations(self, base_url: str) -> dict[str, list[str]]: + """ + Try pattern variations for discovery targets. + + Args: + base_url: Base URL to try variations for + + Returns: + Dictionary with file types and discovered variation URLs + """ + discovered: dict[str, list[str]] = { + "sitemaps": [], + "llms_files": [] + } + + try: + # Common subdirectories to check + subdirs = ["public", "static", "assets", "docs", "doc", "api"] + + # Try llms.txt variants in subdirectories + for subdir in subdirs: + for llms_file in self.DISCOVERY_TARGETS["llms_files"]: + try: + file_url = urljoin(base_url, f"/{subdir}/{llms_file}") + resp = requests.get(file_url, timeout=30, allow_redirects=True) + + if resp.status_code == 200: + discovered["llms_files"].append(file_url) + logger.info(f"Found llms file variant: {file_url}") + + except requests.exceptions.RequestException: + logger.debug(f"Variant not found: {subdir}/{llms_file}") + except Exception: + logger.exception(f"Error checking variant: {subdir}/{llms_file}") + + # Try sitemap variants with different paths + sitemap_paths = [ + "sitemaps/sitemap.xml", + "sitemap/sitemap.xml", + "xml/sitemap.xml", + "feed/sitemap.xml" + ] + + for sitemap_path in sitemap_paths: + try: + file_url = urljoin(base_url, f"/{sitemap_path}") + resp = requests.get(file_url, timeout=30, allow_redirects=True) + + if resp.status_code == 200: + discovered["sitemaps"].append(file_url) + logger.info(f"Found sitemap variant: {file_url}") + + except requests.exceptions.RequestException: + logger.debug(f"Sitemap variant not found: {sitemap_path}") + except Exception: + logger.exception(f"Error checking sitemap variant: {sitemap_path}") + + except Exception: + logger.exception(f"Unexpected error trying common variations for {base_url}") + + return discovered diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py index 33c75c57..b4bb0e58 100644 --- a/python/src/server/services/crawling/helpers/url_handler.py +++ b/python/src/server/services/crawling/helpers/url_handler.py @@ -6,8 +6,7 @@ Handles URL transformations and validations. import hashlib import re -from urllib.parse import urlparse, urljoin -from typing import List, Optional +from urllib.parse import urljoin, urlparse from ....config.logfire_config import get_logger @@ -33,8 +32,8 @@ class URLHandler: except Exception as e: logger.warning(f"Error checking if URL is sitemap: {e}") return False - - @staticmethod + + @staticmethod def is_markdown(url: str) -> bool: """ Check if a URL points to a markdown file (.md, .mdx, .markdown). @@ -274,9 +273,9 @@ class URLHandler: # Fallback: use a hash of the error message + url to still get something unique fallback = f"error_{redacted}_{str(e)}" return hashlib.sha256(fallback.encode("utf-8")).hexdigest()[:16] - + @staticmethod - def extract_markdown_links(content: str, base_url: Optional[str] = None) -> List[str]: + def extract_markdown_links(content: str, base_url: str | None = None) -> list[str]: """ Extract markdown-style links from text content. @@ -290,10 +289,10 @@ class URLHandler: try: if not content: return [] - + # Ultimate URL pattern with comprehensive format support: # 1) [text](url) - markdown links - # 2) - autolinks + # 2) - autolinks # 3) https://... - bare URLs with protocol # 4) //example.com - protocol-relative URLs # 5) www.example.com - scheme-less www URLs @@ -348,7 +347,7 @@ class URLHandler: # Only include HTTP/HTTPS URLs if url.startswith(('http://', 'https://')): urls.append(url) - + # Remove duplicates while preserving order seen = set() unique_urls = [] @@ -356,16 +355,16 @@ class URLHandler: if url not in seen: seen.add(url) unique_urls.append(url) - + logger.info(f"Extracted {len(unique_urls)} unique links from content") return unique_urls - + except Exception as e: logger.error(f"Error extracting markdown links: {e}", exc_info=True) return [] - + @staticmethod - def is_link_collection_file(url: str, content: Optional[str] = None) -> bool: + def is_link_collection_file(url: str, content: str | None = None) -> bool: """ Check if a URL/file appears to be a link collection file like llms.txt. @@ -380,7 +379,7 @@ class URLHandler: # Extract filename from URL parsed = urlparse(url) filename = parsed.path.split('/')[-1].lower() - + # Check for specific link collection filenames # Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links link_collection_patterns = [ @@ -391,12 +390,12 @@ class URLHandler: 'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx', 'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown', ] - + # Direct filename match if filename in link_collection_patterns: logger.info(f"Detected link collection file by filename: {filename}") return True - + # Pattern-based detection for variations, but exclude "full" variants # Only match files that are likely link collections, not complete content files if filename.endswith(('.txt', '.md', '.mdx', '.markdown')): @@ -407,7 +406,7 @@ class URLHandler: if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns): logger.info(f"Detected potential link collection file: {filename}") return True - + # Content-based detection if content is provided if content: # Never treat "full" variants as link collections to preserve single-page behavior @@ -417,19 +416,19 @@ class URLHandler: # Reuse extractor to avoid regex divergence and maintain consistency extracted_links = URLHandler.extract_markdown_links(content, url) total_links = len(extracted_links) - + # Calculate link density (links per 100 characters) content_length = len(content.strip()) if content_length > 0: link_density = (total_links * 100) / content_length - + # If more than 2% of content is links, likely a link collection if link_density > 2.0 and total_links > 3: logger.info(f"Detected link collection by content analysis: {total_links} links, density {link_density:.2f}%") return True - + return False - + except Exception as e: logger.warning(f"Error checking if file is link collection: {e}", exc_info=True) return False @@ -583,3 +582,92 @@ class URLHandler: logger.warning(f"Error extracting display name for {url}: {e}, using URL") # Fallback: return truncated URL return url[:50] + "..." if len(url) > 50 else url + + @staticmethod + def is_robots_txt(url: str) -> bool: + """ + Check if a URL is a robots.txt file with error handling. + + Args: + url: URL to check + + Returns: + True if URL is a robots.txt file, False otherwise + """ + try: + parsed = urlparse(url) + # Normalize to lowercase and ignore query/fragment + path = parsed.path.lower() + # Only detect robots.txt at root level + return path == '/robots.txt' + except Exception as e: + logger.warning(f"Error checking if URL is robots.txt: {e}", exc_info=True) + return False + + @staticmethod + def is_llms_variant(url: str) -> bool: + """ + Check if a URL is a llms.txt/llms.md variant with error handling. + + Args: + url: URL to check + + Returns: + True if URL is a llms file variant, False otherwise + """ + try: + parsed = urlparse(url) + # Normalize to lowercase and ignore query/fragment + path = parsed.path.lower() + filename = path.split('/')[-1] if '/' in path else path + + # Check for llms file variants + llms_variants = ['llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown'] + return filename in llms_variants + except Exception as e: + logger.warning(f"Error checking if URL is llms variant: {e}", exc_info=True) + return False + + @staticmethod + def is_well_known_file(url: str) -> bool: + """ + Check if a URL is a .well-known/* file with error handling. + + Args: + url: URL to check + + Returns: + True if URL is a .well-known file, False otherwise + """ + try: + parsed = urlparse(url) + # Normalize to lowercase and ignore query/fragment + path = parsed.path.lower() + # Only detect .well-known files at root level + return path.startswith('/.well-known/') and path.count('/.well-known/') == 1 + except Exception as e: + logger.warning(f"Error checking if URL is well-known file: {e}", exc_info=True) + return False + + @staticmethod + def get_base_url(url: str) -> str: + """ + Extract base domain URL for discovery with error handling. + + Args: + url: URL to extract base from + + Returns: + Base URL (scheme + netloc) or original URL if extraction fails + """ + try: + parsed = urlparse(url) + # Ensure we have scheme and netloc + if parsed.scheme and parsed.netloc: + return f"{parsed.scheme}://{parsed.netloc}" + else: + logger.warning(f"URL missing scheme or netloc: {url}") + return url + except Exception as e: + logger.warning(f"Error extracting base URL from {url}: {e}", exc_info=True) + return url diff --git a/python/src/server/services/crawling/progress_mapper.py b/python/src/server/services/crawling/progress_mapper.py index 473cac01..c806afd7 100644 --- a/python/src/server/services/crawling/progress_mapper.py +++ b/python/src/server/services/crawling/progress_mapper.py @@ -15,7 +15,8 @@ class ProgressMapper: "starting": (0, 1), "initializing": (0, 1), "analyzing": (1, 2), # URL analysis is very quick - "crawling": (2, 5), # Crawling pages is relatively fast + "discovery": (2, 3), # File discovery is quick + "crawling": (3, 5), # Crawling pages is relatively fast "processing": (5, 8), # Content processing/chunking is quick "source_creation": (8, 10), # DB operations are fast "document_storage": (10, 30), # Embeddings + batch processing - significant but not longest diff --git a/python/tests/test_discovery_service.py b/python/tests/test_discovery_service.py new file mode 100644 index 00000000..5c31b0e6 --- /dev/null +++ b/python/tests/test_discovery_service.py @@ -0,0 +1,449 @@ +"""Unit tests for DiscoveryService class.""" +import pytest +from unittest.mock import patch, Mock +from src.server.services.crawling.discovery_service import DiscoveryService + + +class TestDiscoveryService: + """Test suite for DiscoveryService class.""" + + @patch('requests.get') + def test_discover_files_basic(self, mock_get): + """Test main discovery method returns single best file.""" + service = DiscoveryService() + base_url = "https://example.com" + + # Mock robots.txt response (no sitemaps) + robots_response = Mock() + robots_response.status_code = 200 + robots_response.text = "User-agent: *\nDisallow: /admin/" + + # Mock file existence - llms-full.txt doesn't exist, but llms.txt does + def mock_get_side_effect(url, **kwargs): + response = Mock() + if url.endswith('robots.txt'): + return robots_response + elif url.endswith('llms-full.txt'): + response.status_code = 404 # Highest priority doesn't exist + elif url.endswith('llms.txt'): + response.status_code = 200 # Second priority exists + else: + response.status_code = 404 + return response + + mock_get.side_effect = mock_get_side_effect + + result = service.discover_files(base_url) + + # Should return single URL string (not dict, not list) + assert isinstance(result, str) + assert result == 'https://example.com/llms.txt' + + @patch('requests.get') + def test_discover_files_no_files_found(self, mock_get): + """Test discovery when no files are found.""" + service = DiscoveryService() + base_url = "https://example.com" + + # Mock all HTTP requests to return 404 + mock_response = Mock() + mock_response.status_code = 404 + mock_get.return_value = mock_response + + result = service.discover_files(base_url) + + # Should return None when no files found + assert result is None + + @patch('requests.get') + def test_discover_files_priority_order(self, mock_get): + """Test that discovery follows the correct priority order.""" + service = DiscoveryService() + base_url = "https://example.com" + + # Mock robots.txt response (no sitemaps declared) + robots_response = Mock() + robots_response.status_code = 200 + robots_response.text = "User-agent: *\nDisallow: /admin/" + + # Mock file existence - both sitemap.xml and llms.txt exist, but llms.txt has higher priority + def mock_get_side_effect(url, **kwargs): + response = Mock() + if url.endswith('robots.txt'): + return robots_response + elif url.endswith('llms.txt') or url.endswith('sitemap.xml'): + response.status_code = 200 # Both exist + else: + response.status_code = 404 + return response + + mock_get.side_effect = mock_get_side_effect + + result = service.discover_files(base_url) + + # Should return llms.txt since it has higher priority than sitemap.xml + assert result == 'https://example.com/llms.txt' + + @patch('requests.get') + def test_discover_files_robots_sitemap_priority(self, mock_get): + """Test that robots.txt sitemap declarations have highest priority.""" + service = DiscoveryService() + base_url = "https://example.com" + + # Mock robots.txt response WITH sitemap declaration + robots_response = Mock() + robots_response.status_code = 200 + robots_response.text = "User-agent: *\nSitemap: https://example.com/declared-sitemap.xml" + + # Mock other files also exist + def mock_get_side_effect(url, **kwargs): + response = Mock() + if url.endswith('robots.txt'): + return robots_response + elif 'llms' in url or 'sitemap' in url: + response.status_code = 200 + else: + response.status_code = 404 + return response + + mock_get.side_effect = mock_get_side_effect + + result = service.discover_files(base_url) + + # Should return the sitemap declared in robots.txt (highest priority) + assert result == 'https://example.com/declared-sitemap.xml' + + @patch('requests.get') + def test_discover_best_sitemap_robots_priority(self, mock_get): + """Test sitemap discovery prioritizes robots.txt declarations.""" + service = DiscoveryService() + base_url = "https://example.com" + + # Mock robots.txt with sitemap + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = "Sitemap: https://example.com/robots-sitemap.xml" + mock_get.return_value = mock_response + + result = service._discover_best_sitemap(base_url) + + # Should return the sitemap from robots.txt (highest priority) + assert result == "https://example.com/robots-sitemap.xml" + + @patch('requests.get') + def test_discover_best_llms_file_priority_order(self, mock_get): + """Test llms file discovery follows priority order.""" + service = DiscoveryService() + base_url = "https://example.com" + + # Mock HTTP responses - only llms.txt exists, not llms-full.txt + def mock_get_side_effect(url, **kwargs): + response = Mock() + if url.endswith('llms-full.txt'): + response.status_code = 404 # Higher priority file doesn't exist + elif url.endswith('llms.txt'): + response.status_code = 200 # Standard file exists + else: + response.status_code = 404 + return response + + mock_get.side_effect = mock_get_side_effect + + result = service._discover_best_llms_file(base_url) + + # Should find llms.txt since llms-full.txt doesn't exist + assert result == "https://example.com/llms.txt" + + @patch('requests.get') + def test_discover_best_llms_file_subdirectory_fallback(self, mock_get): + """Test llms file discovery falls back to subdirectories.""" + service = DiscoveryService() + base_url = "https://example.com" + + # Mock HTTP responses - no root files, but static/llms.txt exists + def mock_get_side_effect(url, **kwargs): + response = Mock() + if '/static/llms.txt' in url: + response.status_code = 200 # Found in subdirectory + else: + response.status_code = 404 + return response + + mock_get.side_effect = mock_get_side_effect + + result = service._discover_best_llms_file(base_url) + + # Should find the file in static subdirectory + assert result == "https://example.com/static/llms.txt" + + @patch('requests.get') + def test_check_url_exists(self, mock_get): + """Test URL existence checking.""" + service = DiscoveryService() + + # Test successful response + mock_response = Mock() + mock_response.status_code = 200 + mock_get.return_value = mock_response + + assert service._check_url_exists("https://example.com/exists") is True + + # Test 404 response + mock_response.status_code = 404 + assert service._check_url_exists("https://example.com/not-found") is False + + # Test network error + mock_get.side_effect = Exception("Network error") + assert service._check_url_exists("https://example.com/error") is False + + @patch('requests.get') + def test_parse_robots_txt_with_sitemap(self, mock_get): + """Test robots.txt parsing with sitemap directives.""" + service = DiscoveryService() + + # Mock successful robots.txt response + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = """User-agent: * +Disallow: /admin/ +Sitemap: https://example.com/sitemap.xml +Sitemap: https://example.com/sitemap-news.xml""" + mock_get.return_value = mock_response + + result = service._parse_robots_txt("https://example.com") + + assert len(result) == 2 + assert "https://example.com/sitemap.xml" in result + assert "https://example.com/sitemap-news.xml" in result + mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30) + + @patch('requests.get') + def test_parse_robots_txt_no_sitemap(self, mock_get): + """Test robots.txt parsing without sitemap directives.""" + service = DiscoveryService() + + # Mock robots.txt without sitemaps + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = """User-agent: * +Disallow: /admin/ +Allow: /public/""" + mock_get.return_value = mock_response + + result = service._parse_robots_txt("https://example.com") + + assert len(result) == 0 + mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30) + + @patch('requests.get') + def test_parse_robots_txt_not_found(self, mock_get): + """Test robots.txt parsing when file is not found.""" + service = DiscoveryService() + + # Mock 404 response + mock_response = Mock() + mock_response.status_code = 404 + mock_get.return_value = mock_response + + result = service._parse_robots_txt("https://example.com") + + assert len(result) == 0 + mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30) + + @patch('requests.get') + def test_check_standard_patterns(self, mock_get): + """Test standard file pattern checking.""" + service = DiscoveryService() + + # Mock responses for different files + def mock_response_side_effect(url, **kwargs): + mock_response = Mock() + if 'llms.txt' in url: + mock_response.status_code = 200 + elif 'sitemap.xml' in url: + mock_response.status_code = 200 + else: + mock_response.status_code = 404 + return mock_response + + mock_get.side_effect = mock_response_side_effect + + result = service._check_standard_patterns("https://example.com") + + assert 'sitemaps' in result + assert 'llms_files' in result + assert 'robots_files' in result + + # Should find the files that returned 200 + assert any('llms.txt' in url for url in result['llms_files']) + assert any('sitemap.xml' in url for url in result['sitemaps']) + + @patch('requests.get') + def test_parse_html_meta_tags(self, mock_get): + """Test HTML meta tag parsing for sitemaps.""" + service = DiscoveryService() + + # Mock HTML with sitemap references + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = """ + + + + + + Content here + + """ + mock_get.return_value = mock_response + + result = service._parse_html_meta_tags("https://example.com") + + # Should find sitemaps from both link and meta tags + assert len(result) >= 1 + assert any('sitemap' in url.lower() for url in result) + mock_get.assert_called_once_with("https://example.com", timeout=30) + + @patch('requests.get') + def test_parse_html_meta_tags_not_found(self, mock_get): + """Test HTML meta tag parsing when page not found.""" + service = DiscoveryService() + + # Mock 404 response + mock_response = Mock() + mock_response.status_code = 404 + mock_get.return_value = mock_response + + result = service._parse_html_meta_tags("https://example.com") + + assert len(result) == 0 + mock_get.assert_called_once_with("https://example.com", timeout=30) + + @patch('requests.get') + def test_check_well_known_directory(self, mock_get): + """Test .well-known directory file checking.""" + service = DiscoveryService() + + # Mock responses - some files exist, some don't + def mock_response_side_effect(url, **kwargs): + mock_response = Mock() + if 'ai.txt' in url: + mock_response.status_code = 200 + else: + mock_response.status_code = 404 + return mock_response + + mock_get.side_effect = mock_response_side_effect + + result = service._check_well_known_directory("https://example.com") + + # Should find the ai.txt file + assert len(result) >= 1 + assert any('ai.txt' in url for url in result) + + @patch('requests.get') + def test_try_common_variations(self, mock_get): + """Test pattern variations for discovery targets.""" + service = DiscoveryService() + + # Mock responses for variations + def mock_response_side_effect(url, **kwargs): + mock_response = Mock() + if 'docs/llms.txt' in url or 'sitemaps/sitemap.xml' in url: + mock_response.status_code = 200 + else: + mock_response.status_code = 404 + return mock_response + + mock_get.side_effect = mock_response_side_effect + + result = service._try_common_variations("https://example.com") + + assert 'sitemaps' in result + assert 'llms_files' in result + + # Should find at least one variation + assert len(result['llms_files']) >= 1 or len(result['sitemaps']) >= 1 + + @patch('requests.get') + def test_network_error_handling(self, mock_get): + """Test error scenarios with network failures.""" + service = DiscoveryService() + + # Mock network error + mock_get.side_effect = Exception("Network error") + + # Should not raise exception, but return empty results + result = service._parse_robots_txt("https://example.com") + assert result == [] + + result = service._check_standard_patterns("https://example.com") + assert isinstance(result, dict) + + result = service._parse_html_meta_tags("https://example.com") + assert result == [] + + result = service._check_well_known_directory("https://example.com") + assert result == [] + + result = service._try_common_variations("https://example.com") + assert isinstance(result, dict) + + def test_discover_files_with_exceptions(self): + """Test main discovery method handles exceptions gracefully.""" + service = DiscoveryService() + + # Mock methods to raise exceptions + with patch.object(service, '_parse_robots_txt', side_effect=Exception("Test error")): + with patch.object(service, '_check_standard_patterns', side_effect=Exception("Test error")): + with patch.object(service, '_parse_html_meta_tags', side_effect=Exception("Test error")): + with patch.object(service, '_check_well_known_directory', side_effect=Exception("Test error")): + with patch.object(service, '_try_common_variations', side_effect=Exception("Test error")): + result = service.discover_files("https://example.com") + + # Should still return proper structure even with all methods failing + assert isinstance(result, dict) + assert 'sitemaps' in result + assert 'llms_files' in result + assert 'robots_files' in result + assert 'well_known_files' in result + + @patch('requests.get') + def test_robots_txt_with_malformed_content(self, mock_get): + """Test robots.txt parsing with malformed content.""" + service = DiscoveryService() + + # Mock malformed robots.txt content + mock_response = Mock() + mock_response.status_code = 200 + mock_response.text = """User-agent: * +Disallow: /admin/ +Sitemap: +Sitemap: not-a-valid-url +Sitemap: https://example.com/valid-sitemap.xml""" + mock_get.return_value = mock_response + + result = service._parse_robots_txt("https://example.com") + + # Should only include the valid sitemap URL + assert len(result) == 1 + assert "https://example.com/valid-sitemap.xml" in result + + def test_discovery_targets_constant(self): + """Test that discovery targets constant is properly defined.""" + service = DiscoveryService() + + assert hasattr(service, 'DISCOVERY_TARGETS') + targets = service.DISCOVERY_TARGETS + + # Verify required target types exist + assert 'llms_files' in targets + assert 'sitemap_files' in targets + assert 'robots_files' in targets + assert 'well_known_files' in targets + + # Verify they contain expected files + assert 'llms.txt' in targets['llms_files'] + assert 'sitemap.xml' in targets['sitemap_files'] + assert 'robots.txt' in targets['robots_files'] + assert '.well-known/ai.txt' in targets['well_known_files'] \ No newline at end of file diff --git a/python/tests/test_url_handler.py b/python/tests/test_url_handler.py index 1310bd87..33c80d9e 100644 --- a/python/tests/test_url_handler.py +++ b/python/tests/test_url_handler.py @@ -122,4 +122,122 @@ class TestURLHandler: # Should not transform non-GitHub URLs other = "https://example.com/file" - assert handler.transform_github_url(other) == other \ No newline at end of file + assert handler.transform_github_url(other) == other + + def test_is_robots_txt(self): + """Test robots.txt detection.""" + handler = URLHandler() + + # Standard robots.txt URLs + assert handler.is_robots_txt("https://example.com/robots.txt") is True + assert handler.is_robots_txt("http://example.com/robots.txt") is True + assert handler.is_robots_txt("https://sub.example.com/robots.txt") is True + + # Case sensitivity + assert handler.is_robots_txt("https://example.com/ROBOTS.TXT") is True + assert handler.is_robots_txt("https://example.com/Robots.Txt") is True + + # With query parameters (should still be detected) + assert handler.is_robots_txt("https://example.com/robots.txt?v=1") is True + assert handler.is_robots_txt("https://example.com/robots.txt#section") is True + + # Not robots.txt files + assert handler.is_robots_txt("https://example.com/robots") is False + assert handler.is_robots_txt("https://example.com/robots.html") is False + assert handler.is_robots_txt("https://example.com/some-robots.txt") is False + assert handler.is_robots_txt("https://example.com/path/robots.txt") is False + assert handler.is_robots_txt("https://example.com/") is False + + # Edge case: malformed URL should not crash + assert handler.is_robots_txt("not-a-url") is False + + def test_is_llms_variant(self): + """Test llms file variant detection.""" + handler = URLHandler() + + # All llms variants + assert handler.is_llms_variant("https://example.com/llms.txt") is True + assert handler.is_llms_variant("https://example.com/llms.md") is True + assert handler.is_llms_variant("https://example.com/llms.mdx") is True + assert handler.is_llms_variant("https://example.com/llms.markdown") is True + + # Case sensitivity + assert handler.is_llms_variant("https://example.com/LLMS.TXT") is True + assert handler.is_llms_variant("https://example.com/Llms.Md") is True + + # With paths (should still detect) + assert handler.is_llms_variant("https://example.com/docs/llms.txt") is True + assert handler.is_llms_variant("https://example.com/public/llms.md") is True + + # With query parameters + assert handler.is_llms_variant("https://example.com/llms.txt?version=1") is True + assert handler.is_llms_variant("https://example.com/llms.md#section") is True + + # Not llms files + assert handler.is_llms_variant("https://example.com/llms") is False + assert handler.is_llms_variant("https://example.com/llms.html") is False + assert handler.is_llms_variant("https://example.com/my-llms.txt") is False + assert handler.is_llms_variant("https://example.com/llms-guide.txt") is False + assert handler.is_llms_variant("https://example.com/readme.txt") is False + + # Edge case: malformed URL should not crash + assert handler.is_llms_variant("not-a-url") is False + + def test_is_well_known_file(self): + """Test .well-known file detection.""" + handler = URLHandler() + + # Standard .well-known files + assert handler.is_well_known_file("https://example.com/.well-known/ai.txt") is True + assert handler.is_well_known_file("https://example.com/.well-known/security.txt") is True + assert handler.is_well_known_file("https://example.com/.well-known/change-password") is True + + # Case sensitivity (path should be case sensitive) + assert handler.is_well_known_file("https://example.com/.WELL-KNOWN/ai.txt") is True + assert handler.is_well_known_file("https://example.com/.Well-Known/ai.txt") is True + + # With query parameters + assert handler.is_well_known_file("https://example.com/.well-known/ai.txt?v=1") is True + assert handler.is_well_known_file("https://example.com/.well-known/ai.txt#top") is True + + # Not .well-known files + assert handler.is_well_known_file("https://example.com/well-known/ai.txt") is False + assert handler.is_well_known_file("https://example.com/.wellknown/ai.txt") is False + assert handler.is_well_known_file("https://example.com/docs/.well-known/ai.txt") is False + assert handler.is_well_known_file("https://example.com/ai.txt") is False + assert handler.is_well_known_file("https://example.com/") is False + + # Edge case: malformed URL should not crash + assert handler.is_well_known_file("not-a-url") is False + + def test_get_base_url(self): + """Test base URL extraction.""" + handler = URLHandler() + + # Standard URLs + assert handler.get_base_url("https://example.com") == "https://example.com" + assert handler.get_base_url("https://example.com/") == "https://example.com" + assert handler.get_base_url("https://example.com/path/to/page") == "https://example.com" + assert handler.get_base_url("https://example.com/path/to/page?query=1") == "https://example.com" + assert handler.get_base_url("https://example.com/path/to/page#fragment") == "https://example.com" + + # HTTP vs HTTPS + assert handler.get_base_url("http://example.com/path") == "http://example.com" + assert handler.get_base_url("https://example.com/path") == "https://example.com" + + # Subdomains and ports + assert handler.get_base_url("https://api.example.com/v1/users") == "https://api.example.com" + assert handler.get_base_url("https://example.com:8080/api") == "https://example.com:8080" + assert handler.get_base_url("http://localhost:3000/dev") == "http://localhost:3000" + + # Complex cases + assert handler.get_base_url("https://user:pass@example.com/path") == "https://user:pass@example.com" + + # Edge cases - malformed URLs should return original + assert handler.get_base_url("not-a-url") == "not-a-url" + assert handler.get_base_url("") == "" + assert handler.get_base_url("ftp://example.com/file") == "ftp://example.com" + + # Missing scheme or netloc + assert handler.get_base_url("//example.com/path") == "//example.com/path" # Should return original + assert handler.get_base_url("/path/to/resource") == "/path/to/resource" # Should return original \ No newline at end of file