Implement priority-based automatic discovery of llms.txt and sitemap.xml files

- Add DiscoveryService with single-file priority selection
  - Priority: llms-full.txt > llms.txt > llms.md > llms.mdx > sitemap.xml > robots.txt
  - All of these files carry similar AI/crawling guidance, so only the best one is needed
  - Sitemap declarations in robots.txt take highest priority (checked before the list above)
  - Falls back to common subdirectories for llms files

- Enhance URLHandler with discovery helper methods
  - Add is_robots_txt, is_llms_variant, is_well_known_file, get_base_url methods
  - Follow existing patterns with proper error handling

- Integrate discovery into CrawlingService orchestration
  - When discovery finds a file: crawl ONLY the discovered file (not the main URL)
  - When nothing is discovered: crawl the main URL normally
  - Fixes the issue where both the main URL and the discovered file were crawled

- Add discovery stage to progress mapping
  - New "discovery" stage in progress flow
  - Clear progress messages for discovered files

- Comprehensive test coverage
  - Tests for priority-based selection logic
  - Tests for robots.txt priority and fallback behavior
  - Updated existing tests for new return formats

Enables efficient crawling by selecting the single best guidance file instead
of crawling redundant content from multiple similar files.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
leex279
2025-09-08 09:03:15 +02:00
parent 012d2c58ed
commit 1a55d93a4e
6 changed files with 1193 additions and 42 deletions

View File

@@ -17,6 +17,7 @@ from ...utils.progress.progress_tracker import ProgressTracker
# Import strategies
# Import operations
from .discovery_service import DiscoveryService
from .document_storage_operations import DocumentStorageOperations
from .helpers.site_config import SiteConfig
@@ -83,6 +84,7 @@ class CrawlingService:
# Initialize operations
self.doc_storage_ops = DocumentStorageOperations(self.supabase_client)
self.discovery_service = DiscoveryService()
# Track progress state across all stages to prevent UI resets
self.progress_state = {"progressId": self.progress_id} if self.progress_id else {}
@@ -132,7 +134,7 @@ class CrawlingService:
f"total_pages={kwargs.get('total_pages', 'N/A')} | processed_pages={kwargs.get('processed_pages', 'N/A')} | "
f"kwargs_keys={list(kwargs.keys())}"
)
# Update progress via tracker (stores in memory for HTTP polling)
await self.progress_tracker.update(
status=base_status,
@@ -332,16 +334,68 @@ class CrawlingService:
# Check for cancellation before proceeding
self._check_cancellation()
# Discovery phase - find the single best related file
discovered_urls = []
if request.get("auto_discovery", True): # Default enabled
await update_mapped_progress(
"discovery", 25, f"Discovering best related file for {url}", current_url=url
)
try:
discovered_file = self.discovery_service.discover_files(url)
# Add the single best discovered file to crawl list
if discovered_file:
safe_logfire_info(f"Discovery found file: {discovered_file}")
# Filter through is_binary_file() check like existing code
if not self.url_handler.is_binary_file(discovered_file):
discovered_urls.append(discovered_file)
safe_logfire_info(f"Adding discovered file to crawl: {discovered_file}")
else:
safe_logfire_info(f"Skipping binary file: {discovered_file}")
else:
safe_logfire_info(f"Discovery found no files for {url}")
file_count = len(discovered_urls)
safe_logfire_info(f"Discovery selected {file_count} best file to crawl")
await update_mapped_progress(
"discovery", 100, f"Discovery completed: selected {file_count} best file", current_url=url
)
except Exception as e:
safe_logfire_error(f"Discovery phase failed: {e}")
# Continue with regular crawl even if discovery fails
await update_mapped_progress(
"discovery", 100, "Discovery phase failed, continuing with regular crawl", current_url=url
)
# Analyzing stage - determine what to crawl
if discovered_urls:
# Discovery found a file - crawl ONLY the discovered file, not the main URL
total_urls_to_crawl = len(discovered_urls)
await update_mapped_progress(
"analyzing", 50, f"Analyzing discovered file: {discovered_urls[0]}",
total_pages=total_urls_to_crawl,
processed_pages=0
)
# Crawl only the discovered file
safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_urls[0]}")
crawl_results, crawl_type = await self._crawl_by_url_type(discovered_urls[0], request)
else:
# No discovery - crawl the main URL normally
total_urls_to_crawl = 1
await update_mapped_progress(
"analyzing", 50, f"Analyzing URL type for {url}",
total_pages=total_urls_to_crawl,
processed_pages=0
)
# Crawl the main URL
safe_logfire_info(f"No discovery file found, crawling main URL: {url}")
crawl_results, crawl_type = await self._crawl_by_url_type(url, request)
# Update progress tracker with crawl type
if self.progress_tracker and crawl_type:
await self.progress_tracker.update(
@@ -415,7 +469,7 @@ class CrawlingService:
if request.get("extract_code_examples", True) and actual_chunks_stored > 0:
# Check for cancellation before starting code extraction
self._check_cancellation()
await update_mapped_progress("code_extraction", 0, "Starting code extraction...")
# Create progress callback for code extraction
@@ -424,7 +478,7 @@ class CrawlingService:
# Use ProgressMapper to ensure progress never goes backwards
raw_progress = data.get("progress", data.get("percentage", 0))
mapped_progress = self.progress_mapper.map_progress("code_extraction", raw_progress)
# Update progress state via tracker
await self.progress_tracker.update(
status=data.get("status", "code_extraction"),
@@ -445,7 +499,7 @@ class CrawlingService:
# Check for cancellation after code extraction
self._check_cancellation()
# Send heartbeat after code extraction
await send_heartbeat_if_needed()
@@ -571,7 +625,7 @@ class CrawlingService:
crawl_type = None
if self.url_handler.is_txt(url) or self.url_handler.is_markdown(url):
# Handle text files
crawl_type = "llms-txt" if "llms" in url.lower() else "text_file"
if self.progress_tracker:
await self.progress_tracker.update(
@@ -593,7 +647,7 @@ class CrawlingService:
if self.url_handler.is_link_collection_file(url, content):
# Extract links from the content
extracted_links = self.url_handler.extract_markdown_links(content, url)
# Filter out self-referential links to avoid redundant crawling
if extracted_links:
original_count = len(extracted_links)
@@ -604,7 +658,7 @@ class CrawlingService:
self_filtered_count = original_count - len(extracted_links)
if self_filtered_count > 0:
logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")
# Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
if extracted_links:
original_count = len(extracted_links)
@@ -612,7 +666,7 @@ class CrawlingService:
filtered_count = original_count - len(extracted_links)
if filtered_count > 0:
logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")
if extracted_links:
# Crawl the extracted links using batch crawling
logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")
@@ -623,11 +677,11 @@ class CrawlingService:
start_progress=10,
end_progress=20,
)
# Combine original text file results with batch results
crawl_results.extend(batch_results)
crawl_type = "link_collection_with_crawled_links"
logger.info(f"Link collection crawling completed: {len(crawl_results)} total results (1 text file + {len(batch_results)} extracted links)")
else:
logger.info(f"No valid links found in link collection file: {url}")

View File

@@ -0,0 +1,441 @@
"""
Discovery Service for Automatic File Detection
Handles automatic discovery and parsing of llms.txt, sitemap.xml, and related files
to enhance crawling capabilities with priority-based discovery methods.
"""
from urllib.parse import urljoin
import requests
from ...config.logfire_config import get_logger
logger = get_logger(__name__)
class DiscoveryService:
"""Service for discovering related files automatically during crawls."""
# Global priority order - select ONE best file from all categories
# All these files contain similar AI/crawling guidance content
DISCOVERY_PRIORITY = [
# LLMs files (highest priority - most comprehensive AI guidance)
"llms-full.txt",
"llms.txt",
"llms.md",
"llms.mdx",
"llms.markdown",
# Sitemap files (structural crawling guidance)
"sitemap_index.xml",
"sitemap-index.xml",
"sitemap.xml",
# Robots file (basic crawling rules)
"robots.txt",
# Well-known variants (alternative locations)
".well-known/ai.txt",
".well-known/llms.txt",
".well-known/sitemap.xml"
]
def discover_files(self, base_url: str) -> str | None:
"""
Main discovery orchestrator - selects ONE best file across all categories.
All files contain similar AI/crawling guidance, so we only need the best one.
Args:
base_url: Base URL to discover files for
Returns:
Single best URL found, or None if no files discovered
"""
try:
logger.info(f"Starting single-file discovery for {base_url}")
# First check robots.txt for explicit sitemap declarations (special case)
robots_sitemaps = self._parse_robots_txt(base_url)
if robots_sitemaps:
best_file = robots_sitemaps[0] # Use first sitemap from robots.txt
logger.info(f"Discovery found best file from robots.txt: {best_file}")
return best_file
# Check files in global priority order
for filename in self.DISCOVERY_PRIORITY:
# Try root location first
file_url = urljoin(base_url, f"/{filename}")
if self._check_url_exists(file_url):
logger.info(f"Discovery found best file: {file_url}")
return file_url
# For llms files, also try common subdirectories
if filename.startswith('llms'):
for subdir in ["static", "public", "docs", "assets", "doc", "api"]:
subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
if self._check_url_exists(subdir_url):
logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
return subdir_url
# For sitemap files, also try common subdirectories
if filename.endswith('.xml') and not filename.startswith('.well-known'):
for subdir in ["sitemaps", "sitemap", "xml", "feed"]:
subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
if self._check_url_exists(subdir_url):
logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
return subdir_url
# Check HTML meta tags for sitemap references as final fallback
html_sitemaps = self._parse_html_meta_tags(base_url)
if html_sitemaps:
best_file = html_sitemaps[0]
logger.info(f"Discovery found best file from HTML meta tags: {best_file}")
return best_file
logger.info(f"Discovery completed for {base_url}: no files found")
return None
except Exception:
logger.exception(f"Unexpected error during discovery for {base_url}")
return None
def _discover_best_sitemap(self, base_url: str) -> str | None:
"""
Discover the best available sitemap using priority-based selection.
Priority order:
1. Sitemaps from robots.txt (highest priority - explicitly declared)
2. Standard locations (sitemap_index.xml > sitemap-index.xml > sitemap.xml)
3. Common subdirectory variations
4. HTML meta tag references
5. .well-known directory
"""
try:
# Priority 1: Check robots.txt for sitemap declarations
robots_sitemaps = self._parse_robots_txt(base_url)
if robots_sitemaps:
return robots_sitemaps[0] # Use first sitemap from robots.txt
# Priority 2: Check standard locations in priority order
for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
sitemap_url = urljoin(base_url, f"/{filename}")
if self._check_url_exists(sitemap_url):
return sitemap_url
# Priority 3: Check common subdirectory variations
subdirs = ["sitemaps", "sitemap", "xml", "feed"]
for subdir in subdirs:
for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
sitemap_url = urljoin(base_url, f"/{subdir}/{filename}")
if self._check_url_exists(sitemap_url):
return sitemap_url
# Priority 4: Check HTML meta tag references
html_sitemaps = self._parse_html_meta_tags(base_url)
if html_sitemaps:
return html_sitemaps[0] # Use first sitemap from HTML
# Priority 5: Check .well-known directory
well_known_sitemap = urljoin(base_url, "/.well-known/sitemap.xml")
if self._check_url_exists(well_known_sitemap):
return well_known_sitemap
except Exception:
logger.exception(f"Error discovering best sitemap for {base_url}")
return None
def _discover_best_llms_file(self, base_url: str) -> str | None:
"""
Discover the best available llms file using priority-based selection.
Priority order:
1. Standard locations (llms-full.txt > llms.txt > llms.md > llms.mdx > llms.markdown)
2. Common subdirectory variations (static, public, docs, assets)
3. .well-known directory variants
"""
try:
# Priority 1: Check standard root locations in priority order
for filename in self.DISCOVERY_TARGETS["llms_files"]:
llms_url = urljoin(base_url, f"/{filename}")
if self._check_url_exists(llms_url):
return llms_url
# Priority 2: Check common subdirectory variations
subdirs = ["static", "public", "docs", "assets", "doc", "api"]
for subdir in subdirs:
for filename in self.DISCOVERY_TARGETS["llms_files"]:
llms_url = urljoin(base_url, f"/{subdir}/{filename}")
if self._check_url_exists(llms_url):
return llms_url
# Priority 3: Check .well-known directory variants
for well_known_file in [".well-known/ai.txt", ".well-known/llms.txt"]:
well_known_url = urljoin(base_url, f"/{well_known_file}")
if self._check_url_exists(well_known_url):
return well_known_url
except Exception:
logger.exception(f"Error discovering best llms file for {base_url}")
return None
def _discover_robots_file(self, base_url: str) -> str | None:
"""
Discover robots.txt file (always single file at root).
"""
try:
robots_url = urljoin(base_url, "/robots.txt")
if self._check_url_exists(robots_url):
return robots_url
except Exception:
logger.exception(f"Error discovering robots file for {base_url}")
return None
def _check_url_exists(self, url: str) -> bool:
"""
Check if a URL exists and returns a successful response.
"""
try:
resp = requests.get(url, timeout=5, allow_redirects=True)
success = resp.status_code == 200
logger.debug(f"URL check: {url} -> {resp.status_code} ({'exists' if success else 'not found'})")
return success
except Exception as e:
logger.debug(f"URL check failed: {url} -> {e}")
return False
def _parse_robots_txt(self, base_url: str) -> list[str]:
"""
Extract sitemap URLs from robots.txt.
Args:
base_url: Base URL to check robots.txt for
Returns:
List of sitemap URLs found in robots.txt
"""
sitemaps: list[str] = []
try:
robots_url = urljoin(base_url, "/robots.txt")
logger.info(f"Checking robots.txt at {robots_url}")
resp = requests.get(robots_url, timeout=30)
if resp.status_code != 200:
logger.info(f"No robots.txt found: HTTP {resp.status_code}")
return sitemaps
# Parse robots.txt content for sitemap directives
for line in resp.text.splitlines():
line = line.strip().lower()
if line.startswith("sitemap:"):
sitemap_url = line.split(":", 1)[1].strip()
# Validate URL format before adding
if sitemap_url and (sitemap_url.startswith('http://') or sitemap_url.startswith('https://')):
sitemaps.append(sitemap_url)
logger.info(f"Found sitemap in robots.txt: {sitemap_url}")
except requests.exceptions.RequestException:
logger.exception(f"Network error fetching robots.txt from {base_url}")
except Exception:
logger.exception(f"Unexpected error parsing robots.txt from {base_url}")
return sitemaps
def _check_standard_patterns(self, base_url: str) -> dict[str, list[str]]:
"""
Check common file locations for discovery targets.
Args:
base_url: Base URL to check standard locations for
Returns:
Dictionary with file types and discovered URLs
"""
discovered: dict[str, list[str]] = {
"sitemaps": [],
"llms_files": [],
"robots_files": []
}
try:
# Check all discovery targets at standard locations
all_targets = []
for target_type, files in self.DISCOVERY_TARGETS.items():
if target_type != "well_known_files": # Skip well-known, handled separately
for filename in files:
all_targets.append((target_type, filename))
for target_type, filename in all_targets:
try:
file_url = urljoin(base_url, f"/{filename}")
resp = requests.get(file_url, timeout=30, allow_redirects=True)
if resp.status_code == 200:
# Map target type to discovery category
if target_type == "sitemap_files":
discovered["sitemaps"].append(file_url)
elif target_type == "llms_files":
discovered["llms_files"].append(file_url)
elif target_type == "robots_files":
discovered["robots_files"].append(file_url)
logger.info(f"Found {target_type} file: {file_url}")
except requests.exceptions.RequestException:
logger.debug(f"File not found or network error: {filename}")
except Exception:
logger.exception(f"Unexpected error checking {filename}")
except Exception:
logger.exception(f"Unexpected error in standard pattern checking for {base_url}")
return discovered
def _parse_html_meta_tags(self, base_url: str) -> list[str]:
"""
Extract sitemap references from HTML meta tags.
Args:
base_url: Base URL to check HTML for meta tags
Returns:
List of sitemap URLs found in HTML meta tags
"""
sitemaps: list[str] = []
try:
logger.info(f"Checking HTML meta tags for sitemaps at {base_url}")
resp = requests.get(base_url, timeout=30)
if resp.status_code != 200:
logger.debug(f"Could not fetch HTML for meta tag parsing: HTTP {resp.status_code}")
return sitemaps
content = resp.text.lower()
# Look for sitemap meta tags or link elements
import re
# Check for <link rel="sitemap" href="...">
sitemap_link_pattern = r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']'
matches = re.findall(sitemap_link_pattern, content)
for match in matches:
sitemap_url = urljoin(base_url, match)
sitemaps.append(sitemap_url)
logger.info(f"Found sitemap in HTML link tag: {sitemap_url}")
# Check for <meta name="sitemap" content="...">
sitemap_meta_pattern = r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']'
matches = re.findall(sitemap_meta_pattern, content)
for match in matches:
sitemap_url = urljoin(base_url, match)
sitemaps.append(sitemap_url)
logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}")
except requests.exceptions.RequestException:
logger.exception(f"Network error fetching HTML from {base_url}")
except Exception:
logger.exception(f"Unexpected error parsing HTML meta tags from {base_url}")
return sitemaps
def _check_well_known_directory(self, base_url: str) -> list[str]:
"""
Check .well-known/* files for discovery targets.
Args:
base_url: Base URL to check .well-known directory for
Returns:
List of URLs found in .well-known directory
"""
well_known_files: list[str] = []
try:
for filename in self.DISCOVERY_TARGETS["well_known_files"]:
try:
file_url = urljoin(base_url, f"/{filename}")
resp = requests.get(file_url, timeout=30, allow_redirects=True)
if resp.status_code == 200:
well_known_files.append(file_url)
logger.info(f"Found .well-known file: {file_url}")
except requests.exceptions.RequestException:
logger.debug(f"Well-known file not found or network error: {filename}")
except Exception:
logger.exception(f"Unexpected error checking well-known file: {filename}")
except Exception:
logger.exception(f"Unexpected error checking .well-known directory for {base_url}")
return well_known_files
def _try_common_variations(self, base_url: str) -> dict[str, list[str]]:
"""
Try pattern variations for discovery targets.
Args:
base_url: Base URL to try variations for
Returns:
Dictionary with file types and discovered variation URLs
"""
discovered: dict[str, list[str]] = {
"sitemaps": [],
"llms_files": []
}
try:
# Common subdirectories to check
subdirs = ["public", "static", "assets", "docs", "doc", "api"]
# Try llms.txt variants in subdirectories
for subdir in subdirs:
for llms_file in self.DISCOVERY_TARGETS["llms_files"]:
try:
file_url = urljoin(base_url, f"/{subdir}/{llms_file}")
resp = requests.get(file_url, timeout=30, allow_redirects=True)
if resp.status_code == 200:
discovered["llms_files"].append(file_url)
logger.info(f"Found llms file variant: {file_url}")
except requests.exceptions.RequestException:
logger.debug(f"Variant not found: {subdir}/{llms_file}")
except Exception:
logger.exception(f"Error checking variant: {subdir}/{llms_file}")
# Try sitemap variants with different paths
sitemap_paths = [
"sitemaps/sitemap.xml",
"sitemap/sitemap.xml",
"xml/sitemap.xml",
"feed/sitemap.xml"
]
for sitemap_path in sitemap_paths:
try:
file_url = urljoin(base_url, f"/{sitemap_path}")
resp = requests.get(file_url, timeout=30, allow_redirects=True)
if resp.status_code == 200:
discovered["sitemaps"].append(file_url)
logger.info(f"Found sitemap variant: {file_url}")
except requests.exceptions.RequestException:
logger.debug(f"Sitemap variant not found: {sitemap_path}")
except Exception:
logger.exception(f"Error checking sitemap variant: {sitemap_path}")
except Exception:
logger.exception(f"Unexpected error trying common variations for {base_url}")
return discovered
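
A small usage sketch of the service above; the example.com URL is a placeholder, and the import path follows the test module further down.

from src.server.services.crawling.discovery_service import DiscoveryService

service = DiscoveryService()
best_file = service.discover_files("https://example.com")
if best_file:
    print(f"Best guidance file: {best_file}")  # e.g. https://example.com/llms.txt
else:
    print("No guidance file discovered; crawl the site directly")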

View File

@@ -6,8 +6,7 @@ Handles URL transformations and validations.
import hashlib
import re
from urllib.parse import urljoin, urlparse
from ....config.logfire_config import get_logger
@@ -33,8 +32,8 @@ class URLHandler:
except Exception as e:
logger.warning(f"Error checking if URL is sitemap: {e}")
return False
@staticmethod
def is_markdown(url: str) -> bool:
"""
Check if a URL points to a markdown file (.md, .mdx, .markdown).
@@ -274,9 +273,9 @@ class URLHandler:
# Fallback: use a hash of the error message + url to still get something unique
fallback = f"error_{redacted}_{str(e)}"
return hashlib.sha256(fallback.encode("utf-8")).hexdigest()[:16]
@staticmethod
def extract_markdown_links(content: str, base_url: str | None = None) -> list[str]:
"""
Extract markdown-style links from text content.
@@ -290,10 +289,10 @@ class URLHandler:
try:
if not content:
return []
# Ultimate URL pattern with comprehensive format support:
# 1) [text](url) - markdown links
# 2) <https://...> - autolinks
# 3) https://... - bare URLs with protocol
# 4) //example.com - protocol-relative URLs
# 5) www.example.com - scheme-less www URLs
@@ -348,7 +347,7 @@ class URLHandler:
# Only include HTTP/HTTPS URLs
if url.startswith(('http://', 'https://')):
urls.append(url)
# Remove duplicates while preserving order
seen = set()
unique_urls = []
@@ -356,16 +355,16 @@ class URLHandler:
if url not in seen:
seen.add(url)
unique_urls.append(url)
logger.info(f"Extracted {len(unique_urls)} unique links from content")
return unique_urls
except Exception as e:
logger.error(f"Error extracting markdown links: {e}", exc_info=True)
return []
@staticmethod
def is_link_collection_file(url: str, content: str | None = None) -> bool:
"""
Check if a URL/file appears to be a link collection file like llms.txt.
@@ -380,7 +379,7 @@ class URLHandler:
# Extract filename from URL
parsed = urlparse(url)
filename = parsed.path.split('/')[-1].lower()
# Check for specific link collection filenames
# Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links
link_collection_patterns = [
@@ -391,12 +390,12 @@ class URLHandler:
'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx',
'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown',
]
# Direct filename match
if filename in link_collection_patterns:
logger.info(f"Detected link collection file by filename: {filename}")
return True
# Pattern-based detection for variations, but exclude "full" variants
# Only match files that are likely link collections, not complete content files
if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):
@@ -407,7 +406,7 @@ class URLHandler:
if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns):
logger.info(f"Detected potential link collection file: {filename}")
return True
# Content-based detection if content is provided
if content:
# Never treat "full" variants as link collections to preserve single-page behavior
@@ -417,19 +416,19 @@ class URLHandler:
# Reuse extractor to avoid regex divergence and maintain consistency
extracted_links = URLHandler.extract_markdown_links(content, url)
total_links = len(extracted_links)
# Calculate link density (links per 100 characters)
content_length = len(content.strip())
if content_length > 0:
link_density = (total_links * 100) / content_length
# If more than 2% of content is links, likely a link collection
if link_density > 2.0 and total_links > 3:
logger.info(f"Detected link collection by content analysis: {total_links} links, density {link_density:.2f}%")
return True
return False
except Exception as e:
logger.warning(f"Error checking if file is link collection: {e}", exc_info=True)
return False
@@ -583,3 +582,92 @@ class URLHandler:
logger.warning(f"Error extracting display name for {url}: {e}, using URL")
# Fallback: return truncated URL
return url[:50] + "..." if len(url) > 50 else url
@staticmethod
def is_robots_txt(url: str) -> bool:
"""
Check if a URL is a robots.txt file with error handling.
Args:
url: URL to check
Returns:
True if URL is a robots.txt file, False otherwise
"""
try:
parsed = urlparse(url)
# Normalize to lowercase and ignore query/fragment
path = parsed.path.lower()
# Only detect robots.txt at root level
return path == '/robots.txt'
except Exception as e:
logger.warning(f"Error checking if URL is robots.txt: {e}", exc_info=True)
return False
@staticmethod
def is_llms_variant(url: str) -> bool:
"""
Check if a URL is a llms.txt/llms.md variant with error handling.
Args:
url: URL to check
Returns:
True if URL is a llms file variant, False otherwise
"""
try:
parsed = urlparse(url)
# Normalize to lowercase and ignore query/fragment
path = parsed.path.lower()
filename = path.split('/')[-1] if '/' in path else path
# Check for llms file variants
llms_variants = ['llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown']
return filename in llms_variants
except Exception as e:
logger.warning(f"Error checking if URL is llms variant: {e}", exc_info=True)
return False
@staticmethod
def is_well_known_file(url: str) -> bool:
"""
Check if a URL is a .well-known/* file with error handling.
Args:
url: URL to check
Returns:
True if URL is a .well-known file, False otherwise
"""
try:
parsed = urlparse(url)
# Normalize to lowercase and ignore query/fragment
path = parsed.path.lower()
# Only detect .well-known files at root level
return path.startswith('/.well-known/') and path.count('/.well-known/') == 1
except Exception as e:
logger.warning(f"Error checking if URL is well-known file: {e}", exc_info=True)
return False
@staticmethod
def get_base_url(url: str) -> str:
"""
Extract base domain URL for discovery with error handling.
Args:
url: URL to extract base from
Returns:
Base URL (scheme + netloc) or original URL if extraction fails
"""
try:
parsed = urlparse(url)
# Ensure we have scheme and netloc
if parsed.scheme and parsed.netloc:
return f"{parsed.scheme}://{parsed.netloc}"
else:
logger.warning(f"URL missing scheme or netloc: {url}")
return url
except Exception as e:
logger.warning(f"Error extracting base URL from {url}: {e}", exc_info=True)
return url
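
Illustrative calls to the new helpers, with expected results matching the tests further down (assuming URLHandler is imported as in those tests):

handler = URLHandler()
handler.is_robots_txt("https://example.com/robots.txt")                # True (root level only)
handler.is_llms_variant("https://example.com/docs/llms.md")            # True (any path depth)
handler.is_well_known_file("https://example.com/.well-known/ai.txt")   # True
handler.get_base_url("https://api.example.com/v1/users?page=2")        # "https://api.example.com"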

View File

@@ -15,7 +15,8 @@ class ProgressMapper:
"starting": (0, 1),
"initializing": (0, 1),
"analyzing": (1, 2), # URL analysis is very quick
"crawling": (2, 5), # Crawling pages is relatively fast
"discovery": (2, 3), # File discovery is quick
"crawling": (3, 5), # Crawling pages is relatively fast
"processing": (5, 8), # Content processing/chunking is quick
"source_creation": (8, 10), # DB operations are fast
"document_storage": (10, 30), # Embeddings + batch processing - significant but not longest

View File

@@ -0,0 +1,449 @@
"""Unit tests for DiscoveryService class."""
import pytest
from unittest.mock import patch, Mock
from src.server.services.crawling.discovery_service import DiscoveryService
class TestDiscoveryService:
"""Test suite for DiscoveryService class."""
@patch('requests.get')
def test_discover_files_basic(self, mock_get):
"""Test main discovery method returns single best file."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock robots.txt response (no sitemaps)
robots_response = Mock()
robots_response.status_code = 200
robots_response.text = "User-agent: *\nDisallow: /admin/"
# Mock file existence - llms-full.txt doesn't exist, but llms.txt does
def mock_get_side_effect(url, **kwargs):
response = Mock()
if url.endswith('robots.txt'):
return robots_response
elif url.endswith('llms-full.txt'):
response.status_code = 404 # Highest priority doesn't exist
elif url.endswith('llms.txt'):
response.status_code = 200 # Second priority exists
else:
response.status_code = 404
return response
mock_get.side_effect = mock_get_side_effect
result = service.discover_files(base_url)
# Should return single URL string (not dict, not list)
assert isinstance(result, str)
assert result == 'https://example.com/llms.txt'
@patch('requests.get')
def test_discover_files_no_files_found(self, mock_get):
"""Test discovery when no files are found."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock all HTTP requests to return 404
mock_response = Mock()
mock_response.status_code = 404
mock_get.return_value = mock_response
result = service.discover_files(base_url)
# Should return None when no files found
assert result is None
@patch('requests.get')
def test_discover_files_priority_order(self, mock_get):
"""Test that discovery follows the correct priority order."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock robots.txt response (no sitemaps declared)
robots_response = Mock()
robots_response.status_code = 200
robots_response.text = "User-agent: *\nDisallow: /admin/"
# Mock file existence - both sitemap.xml and llms.txt exist, but llms.txt has higher priority
def mock_get_side_effect(url, **kwargs):
response = Mock()
if url.endswith('robots.txt'):
return robots_response
elif url.endswith('llms.txt') or url.endswith('sitemap.xml'):
response.status_code = 200 # Both exist
else:
response.status_code = 404
return response
mock_get.side_effect = mock_get_side_effect
result = service.discover_files(base_url)
# Should return llms.txt since it has higher priority than sitemap.xml
assert result == 'https://example.com/llms.txt'
@patch('requests.get')
def test_discover_files_robots_sitemap_priority(self, mock_get):
"""Test that robots.txt sitemap declarations have highest priority."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock robots.txt response WITH sitemap declaration
robots_response = Mock()
robots_response.status_code = 200
robots_response.text = "User-agent: *\nSitemap: https://example.com/declared-sitemap.xml"
# Mock other files also exist
def mock_get_side_effect(url, **kwargs):
response = Mock()
if url.endswith('robots.txt'):
return robots_response
elif 'llms' in url or 'sitemap' in url:
response.status_code = 200
else:
response.status_code = 404
return response
mock_get.side_effect = mock_get_side_effect
result = service.discover_files(base_url)
# Should return the sitemap declared in robots.txt (highest priority)
assert result == 'https://example.com/declared-sitemap.xml'
@patch('requests.get')
def test_discover_best_sitemap_robots_priority(self, mock_get):
"""Test sitemap discovery prioritizes robots.txt declarations."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock robots.txt with sitemap
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = "Sitemap: https://example.com/robots-sitemap.xml"
mock_get.return_value = mock_response
result = service._discover_best_sitemap(base_url)
# Should return the sitemap from robots.txt (highest priority)
assert result == "https://example.com/robots-sitemap.xml"
@patch('requests.get')
def test_discover_best_llms_file_priority_order(self, mock_get):
"""Test llms file discovery follows priority order."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock HTTP responses - only llms.txt exists, not llms-full.txt
def mock_get_side_effect(url, **kwargs):
response = Mock()
if url.endswith('llms-full.txt'):
response.status_code = 404 # Higher priority file doesn't exist
elif url.endswith('llms.txt'):
response.status_code = 200 # Standard file exists
else:
response.status_code = 404
return response
mock_get.side_effect = mock_get_side_effect
result = service._discover_best_llms_file(base_url)
# Should find llms.txt since llms-full.txt doesn't exist
assert result == "https://example.com/llms.txt"
@patch('requests.get')
def test_discover_best_llms_file_subdirectory_fallback(self, mock_get):
"""Test llms file discovery falls back to subdirectories."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock HTTP responses - no root files, but static/llms.txt exists
def mock_get_side_effect(url, **kwargs):
response = Mock()
if '/static/llms.txt' in url:
response.status_code = 200 # Found in subdirectory
else:
response.status_code = 404
return response
mock_get.side_effect = mock_get_side_effect
result = service._discover_best_llms_file(base_url)
# Should find the file in static subdirectory
assert result == "https://example.com/static/llms.txt"
@patch('requests.get')
def test_check_url_exists(self, mock_get):
"""Test URL existence checking."""
service = DiscoveryService()
# Test successful response
mock_response = Mock()
mock_response.status_code = 200
mock_get.return_value = mock_response
assert service._check_url_exists("https://example.com/exists") is True
# Test 404 response
mock_response.status_code = 404
assert service._check_url_exists("https://example.com/not-found") is False
# Test network error
mock_get.side_effect = Exception("Network error")
assert service._check_url_exists("https://example.com/error") is False
@patch('requests.get')
def test_parse_robots_txt_with_sitemap(self, mock_get):
"""Test robots.txt parsing with sitemap directives."""
service = DiscoveryService()
# Mock successful robots.txt response
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = """User-agent: *
Disallow: /admin/
Sitemap: https://example.com/sitemap.xml
Sitemap: https://example.com/sitemap-news.xml"""
mock_get.return_value = mock_response
result = service._parse_robots_txt("https://example.com")
assert len(result) == 2
assert "https://example.com/sitemap.xml" in result
assert "https://example.com/sitemap-news.xml" in result
mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30)
@patch('requests.get')
def test_parse_robots_txt_no_sitemap(self, mock_get):
"""Test robots.txt parsing without sitemap directives."""
service = DiscoveryService()
# Mock robots.txt without sitemaps
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = """User-agent: *
Disallow: /admin/
Allow: /public/"""
mock_get.return_value = mock_response
result = service._parse_robots_txt("https://example.com")
assert len(result) == 0
mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30)
@patch('requests.get')
def test_parse_robots_txt_not_found(self, mock_get):
"""Test robots.txt parsing when file is not found."""
service = DiscoveryService()
# Mock 404 response
mock_response = Mock()
mock_response.status_code = 404
mock_get.return_value = mock_response
result = service._parse_robots_txt("https://example.com")
assert len(result) == 0
mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30)
@patch('requests.get')
def test_check_standard_patterns(self, mock_get):
"""Test standard file pattern checking."""
service = DiscoveryService()
# Mock responses for different files
def mock_response_side_effect(url, **kwargs):
mock_response = Mock()
if 'llms.txt' in url:
mock_response.status_code = 200
elif 'sitemap.xml' in url:
mock_response.status_code = 200
else:
mock_response.status_code = 404
return mock_response
mock_get.side_effect = mock_response_side_effect
result = service._check_standard_patterns("https://example.com")
assert 'sitemaps' in result
assert 'llms_files' in result
assert 'robots_files' in result
# Should find the files that returned 200
assert any('llms.txt' in url for url in result['llms_files'])
assert any('sitemap.xml' in url for url in result['sitemaps'])
@patch('requests.get')
def test_parse_html_meta_tags(self, mock_get):
"""Test HTML meta tag parsing for sitemaps."""
service = DiscoveryService()
# Mock HTML with sitemap references
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = """
<html>
<head>
<link rel="sitemap" href="/sitemap.xml">
<meta name="sitemap" content="https://example.com/sitemap-meta.xml">
</head>
<body>Content here</body>
</html>
"""
mock_get.return_value = mock_response
result = service._parse_html_meta_tags("https://example.com")
# Should find sitemaps from both link and meta tags
assert len(result) >= 1
assert any('sitemap' in url.lower() for url in result)
mock_get.assert_called_once_with("https://example.com", timeout=30)
@patch('requests.get')
def test_parse_html_meta_tags_not_found(self, mock_get):
"""Test HTML meta tag parsing when page not found."""
service = DiscoveryService()
# Mock 404 response
mock_response = Mock()
mock_response.status_code = 404
mock_get.return_value = mock_response
result = service._parse_html_meta_tags("https://example.com")
assert len(result) == 0
mock_get.assert_called_once_with("https://example.com", timeout=30)
@patch('requests.get')
def test_check_well_known_directory(self, mock_get):
"""Test .well-known directory file checking."""
service = DiscoveryService()
# Mock responses - some files exist, some don't
def mock_response_side_effect(url, **kwargs):
mock_response = Mock()
if 'ai.txt' in url:
mock_response.status_code = 200
else:
mock_response.status_code = 404
return mock_response
mock_get.side_effect = mock_response_side_effect
result = service._check_well_known_directory("https://example.com")
# Should find the ai.txt file
assert len(result) >= 1
assert any('ai.txt' in url for url in result)
@patch('requests.get')
def test_try_common_variations(self, mock_get):
"""Test pattern variations for discovery targets."""
service = DiscoveryService()
# Mock responses for variations
def mock_response_side_effect(url, **kwargs):
mock_response = Mock()
if 'docs/llms.txt' in url or 'sitemaps/sitemap.xml' in url:
mock_response.status_code = 200
else:
mock_response.status_code = 404
return mock_response
mock_get.side_effect = mock_response_side_effect
result = service._try_common_variations("https://example.com")
assert 'sitemaps' in result
assert 'llms_files' in result
# Should find at least one variation
assert len(result['llms_files']) >= 1 or len(result['sitemaps']) >= 1
@patch('requests.get')
def test_network_error_handling(self, mock_get):
"""Test error scenarios with network failures."""
service = DiscoveryService()
# Mock network error
mock_get.side_effect = Exception("Network error")
# Should not raise exception, but return empty results
result = service._parse_robots_txt("https://example.com")
assert result == []
result = service._check_standard_patterns("https://example.com")
assert isinstance(result, dict)
result = service._parse_html_meta_tags("https://example.com")
assert result == []
result = service._check_well_known_directory("https://example.com")
assert result == []
result = service._try_common_variations("https://example.com")
assert isinstance(result, dict)
def test_discover_files_with_exceptions(self):
"""Test main discovery method handles exceptions gracefully."""
service = DiscoveryService()
# Mock methods to raise exceptions
with patch.object(service, '_parse_robots_txt', side_effect=Exception("Test error")):
with patch.object(service, '_check_standard_patterns', side_effect=Exception("Test error")):
with patch.object(service, '_parse_html_meta_tags', side_effect=Exception("Test error")):
with patch.object(service, '_check_well_known_directory', side_effect=Exception("Test error")):
with patch.object(service, '_try_common_variations', side_effect=Exception("Test error")):
result = service.discover_files("https://example.com")
# Should return None when every discovery helper fails, matching the str | None return type
assert result is None
@patch('requests.get')
def test_robots_txt_with_malformed_content(self, mock_get):
"""Test robots.txt parsing with malformed content."""
service = DiscoveryService()
# Mock malformed robots.txt content
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = """User-agent: *
Disallow: /admin/
Sitemap:
Sitemap: not-a-valid-url
Sitemap: https://example.com/valid-sitemap.xml"""
mock_get.return_value = mock_response
result = service._parse_robots_txt("https://example.com")
# Should only include the valid sitemap URL
assert len(result) == 1
assert "https://example.com/valid-sitemap.xml" in result
def test_discovery_targets_constant(self):
"""Test that discovery targets constant is properly defined."""
service = DiscoveryService()
assert hasattr(service, 'DISCOVERY_TARGETS')
targets = service.DISCOVERY_TARGETS
# Verify required target types exist
assert 'llms_files' in targets
assert 'sitemap_files' in targets
assert 'robots_files' in targets
assert 'well_known_files' in targets
# Verify they contain expected files
assert 'llms.txt' in targets['llms_files']
assert 'sitemap.xml' in targets['sitemap_files']
assert 'robots.txt' in targets['robots_files']
assert '.well-known/ai.txt' in targets['well_known_files']

View File

@@ -122,4 +122,122 @@ class TestURLHandler:
# Should not transform non-GitHub URLs
other = "https://example.com/file"
assert handler.transform_github_url(other) == other
def test_is_robots_txt(self):
"""Test robots.txt detection."""
handler = URLHandler()
# Standard robots.txt URLs
assert handler.is_robots_txt("https://example.com/robots.txt") is True
assert handler.is_robots_txt("http://example.com/robots.txt") is True
assert handler.is_robots_txt("https://sub.example.com/robots.txt") is True
# Case sensitivity
assert handler.is_robots_txt("https://example.com/ROBOTS.TXT") is True
assert handler.is_robots_txt("https://example.com/Robots.Txt") is True
# With query parameters (should still be detected)
assert handler.is_robots_txt("https://example.com/robots.txt?v=1") is True
assert handler.is_robots_txt("https://example.com/robots.txt#section") is True
# Not robots.txt files
assert handler.is_robots_txt("https://example.com/robots") is False
assert handler.is_robots_txt("https://example.com/robots.html") is False
assert handler.is_robots_txt("https://example.com/some-robots.txt") is False
assert handler.is_robots_txt("https://example.com/path/robots.txt") is False
assert handler.is_robots_txt("https://example.com/") is False
# Edge case: malformed URL should not crash
assert handler.is_robots_txt("not-a-url") is False
def test_is_llms_variant(self):
"""Test llms file variant detection."""
handler = URLHandler()
# All llms variants
assert handler.is_llms_variant("https://example.com/llms.txt") is True
assert handler.is_llms_variant("https://example.com/llms.md") is True
assert handler.is_llms_variant("https://example.com/llms.mdx") is True
assert handler.is_llms_variant("https://example.com/llms.markdown") is True
# Case sensitivity
assert handler.is_llms_variant("https://example.com/LLMS.TXT") is True
assert handler.is_llms_variant("https://example.com/Llms.Md") is True
# With paths (should still detect)
assert handler.is_llms_variant("https://example.com/docs/llms.txt") is True
assert handler.is_llms_variant("https://example.com/public/llms.md") is True
# With query parameters
assert handler.is_llms_variant("https://example.com/llms.txt?version=1") is True
assert handler.is_llms_variant("https://example.com/llms.md#section") is True
# Not llms files
assert handler.is_llms_variant("https://example.com/llms") is False
assert handler.is_llms_variant("https://example.com/llms.html") is False
assert handler.is_llms_variant("https://example.com/my-llms.txt") is False
assert handler.is_llms_variant("https://example.com/llms-guide.txt") is False
assert handler.is_llms_variant("https://example.com/readme.txt") is False
# Edge case: malformed URL should not crash
assert handler.is_llms_variant("not-a-url") is False
def test_is_well_known_file(self):
"""Test .well-known file detection."""
handler = URLHandler()
# Standard .well-known files
assert handler.is_well_known_file("https://example.com/.well-known/ai.txt") is True
assert handler.is_well_known_file("https://example.com/.well-known/security.txt") is True
assert handler.is_well_known_file("https://example.com/.well-known/change-password") is True
# Case sensitivity (path should be case sensitive)
assert handler.is_well_known_file("https://example.com/.WELL-KNOWN/ai.txt") is True
assert handler.is_well_known_file("https://example.com/.Well-Known/ai.txt") is True
# With query parameters
assert handler.is_well_known_file("https://example.com/.well-known/ai.txt?v=1") is True
assert handler.is_well_known_file("https://example.com/.well-known/ai.txt#top") is True
# Not .well-known files
assert handler.is_well_known_file("https://example.com/well-known/ai.txt") is False
assert handler.is_well_known_file("https://example.com/.wellknown/ai.txt") is False
assert handler.is_well_known_file("https://example.com/docs/.well-known/ai.txt") is False
assert handler.is_well_known_file("https://example.com/ai.txt") is False
assert handler.is_well_known_file("https://example.com/") is False
# Edge case: malformed URL should not crash
assert handler.is_well_known_file("not-a-url") is False
def test_get_base_url(self):
"""Test base URL extraction."""
handler = URLHandler()
# Standard URLs
assert handler.get_base_url("https://example.com") == "https://example.com"
assert handler.get_base_url("https://example.com/") == "https://example.com"
assert handler.get_base_url("https://example.com/path/to/page") == "https://example.com"
assert handler.get_base_url("https://example.com/path/to/page?query=1") == "https://example.com"
assert handler.get_base_url("https://example.com/path/to/page#fragment") == "https://example.com"
# HTTP vs HTTPS
assert handler.get_base_url("http://example.com/path") == "http://example.com"
assert handler.get_base_url("https://example.com/path") == "https://example.com"
# Subdomains and ports
assert handler.get_base_url("https://api.example.com/v1/users") == "https://api.example.com"
assert handler.get_base_url("https://example.com:8080/api") == "https://example.com:8080"
assert handler.get_base_url("http://localhost:3000/dev") == "http://localhost:3000"
# Complex cases
assert handler.get_base_url("https://user:pass@example.com/path") == "https://user:pass@example.com"
# Edge cases - malformed URLs should return original
assert handler.get_base_url("not-a-url") == "not-a-url"
assert handler.get_base_url("") == ""
assert handler.get_base_url("ftp://example.com/file") == "ftp://example.com"
# Missing scheme or netloc
assert handler.get_base_url("//example.com/path") == "//example.com/path" # Should return original
assert handler.get_base_url("/path/to/resource") == "/path/to/resource" # Should return original