Implement priority-based automatic discovery of llms.txt and sitemap.xml files

- Add DiscoveryService with single-file priority selection
  - Priority: llms-full.txt > llms.txt > llms.md > llms.mdx > sitemap.xml > robots.txt
  - All files contain similar AI/crawling guidance, so only the best one is needed
  - Robots.txt sitemap declarations have highest priority
  - Fallback to subdirectories for llms files
- Enhance URLHandler with discovery helper methods
  - Add is_robots_txt, is_llms_variant, is_well_known_file, get_base_url methods
  - Follow existing patterns with proper error handling
- Integrate discovery into CrawlingService orchestration
  - When discovery finds a file: crawl ONLY the discovered file (not the main URL)
  - When no discovery: crawl the main URL normally
  - Fixes issue where both main URL + discovered file were crawled
- Add discovery stage to progress mapping
  - New "discovery" stage in progress flow
  - Clear progress messages for discovered files
- Comprehensive test coverage
  - Tests for priority-based selection logic
  - Tests for robots.txt priority and fallback behavior
  - Updated existing tests for new return formats

Makes crawling more efficient by selecting the single best guidance file instead of crawling redundant content from multiple similar files.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-30 21:49:30 -05:00 · 2025-09-08 09:03:15 +02:00
parent 012d2c58ed
commit 1a55d93a4e
6 changed files with 1193 additions and 42 deletions
--- a/python/tests/test_url_handler.py
+++ b/python/tests/test_url_handler.py
@@ -122,4 +122,122 @@ class TestURLHandler:
        
        # Should not transform non-GitHub URLs
        other = "https://example.com/file"
-        assert handler.transform_github_url(other) == other
+        assert handler.transform_github_url(other) == other
+
+    def test_is_robots_txt(self):
+        """Test robots.txt detection (root path only, case-insensitive)."""
+        handler = URLHandler()
+        
+        # Root-level robots.txt over http/https, including on a subdomain
+        assert handler.is_robots_txt("https://example.com/robots.txt") is True
+        assert handler.is_robots_txt("http://example.com/robots.txt") is True
+        assert handler.is_robots_txt("https://sub.example.com/robots.txt") is True
+        
+        # Filename match is case-insensitive
+        assert handler.is_robots_txt("https://example.com/ROBOTS.TXT") is True
+        assert handler.is_robots_txt("https://example.com/Robots.Txt") is True
+        
+        # A query string or fragment does not affect detection
+        assert handler.is_robots_txt("https://example.com/robots.txt?v=1") is True
+        assert handler.is_robots_txt("https://example.com/robots.txt#section") is True
+        
+        # Not robots.txt: wrong name/extension, extra prefix, or not at the site root
+        assert handler.is_robots_txt("https://example.com/robots") is False
+        assert handler.is_robots_txt("https://example.com/robots.html") is False
+        assert handler.is_robots_txt("https://example.com/some-robots.txt") is False
+        assert handler.is_robots_txt("https://example.com/path/robots.txt") is False
+        assert handler.is_robots_txt("https://example.com/") is False
+        
+        # Edge case: input that is not a URL yields False instead of raising
+        assert handler.is_robots_txt("not-a-url") is False
+
+    def test_is_llms_variant(self):
+        """Test llms file variant detection (any directory, case-insensitive)."""
+        handler = URLHandler()
+        
+        # Accepted llms.* extensions exercised here: .txt, .md, .mdx, .markdown
+        assert handler.is_llms_variant("https://example.com/llms.txt") is True
+        assert handler.is_llms_variant("https://example.com/llms.md") is True
+        assert handler.is_llms_variant("https://example.com/llms.mdx") is True
+        assert handler.is_llms_variant("https://example.com/llms.markdown") is True
+        
+        # Filename match is case-insensitive
+        assert handler.is_llms_variant("https://example.com/LLMS.TXT") is True
+        assert handler.is_llms_variant("https://example.com/Llms.Md") is True
+        
+        # Detected in subdirectories as well as at the site root
+        assert handler.is_llms_variant("https://example.com/docs/llms.txt") is True
+        assert handler.is_llms_variant("https://example.com/public/llms.md") is True
+        
+        # A query string or fragment does not affect detection
+        assert handler.is_llms_variant("https://example.com/llms.txt?version=1") is True
+        assert handler.is_llms_variant("https://example.com/llms.md#section") is True
+        
+        # Not llms files: no/unsupported extension, or extra text around the "llms" stem
+        assert handler.is_llms_variant("https://example.com/llms") is False
+        assert handler.is_llms_variant("https://example.com/llms.html") is False
+        assert handler.is_llms_variant("https://example.com/my-llms.txt") is False
+        assert handler.is_llms_variant("https://example.com/llms-guide.txt") is False
+        assert handler.is_llms_variant("https://example.com/readme.txt") is False
+        
+        # Edge case: input that is not a URL yields False instead of raising
+        assert handler.is_llms_variant("not-a-url") is False
+
+    def test_is_well_known_file(self):
+        """Test .well-known file detection (root-level directory, case-insensitive)."""
+        handler = URLHandler()
+        
+        # Files directly under /.well-known/, with or without an extension
+        assert handler.is_well_known_file("https://example.com/.well-known/ai.txt") is True
+        assert handler.is_well_known_file("https://example.com/.well-known/security.txt") is True
+        assert handler.is_well_known_file("https://example.com/.well-known/change-password") is True
+        
+        # The /.well-known/ prefix is matched case-insensitively
+        assert handler.is_well_known_file("https://example.com/.WELL-KNOWN/ai.txt") is True
+        assert handler.is_well_known_file("https://example.com/.Well-Known/ai.txt") is True
+        
+        # A query string or fragment does not affect detection
+        assert handler.is_well_known_file("https://example.com/.well-known/ai.txt?v=1") is True
+        assert handler.is_well_known_file("https://example.com/.well-known/ai.txt#top") is True
+        
+        # Not .well-known: misspelled directory, nested under another path, or absent
+        assert handler.is_well_known_file("https://example.com/well-known/ai.txt") is False
+        assert handler.is_well_known_file("https://example.com/.wellknown/ai.txt") is False
+        assert handler.is_well_known_file("https://example.com/docs/.well-known/ai.txt") is False
+        assert handler.is_well_known_file("https://example.com/ai.txt") is False
+        assert handler.is_well_known_file("https://example.com/") is False
+        
+        # Edge case: input that is not a URL yields False instead of raising
+        assert handler.is_well_known_file("not-a-url") is False
+
+    def test_get_base_url(self):
+        """Test base URL extraction (scheme://netloc; path, query, fragment dropped)."""
+        handler = URLHandler()
+        
+        # Trailing slash, path, query and fragment are all stripped
+        assert handler.get_base_url("https://example.com") == "https://example.com"
+        assert handler.get_base_url("https://example.com/") == "https://example.com"
+        assert handler.get_base_url("https://example.com/path/to/page") == "https://example.com"
+        assert handler.get_base_url("https://example.com/path/to/page?query=1") == "https://example.com"
+        assert handler.get_base_url("https://example.com/path/to/page#fragment") == "https://example.com"
+        
+        # The original scheme is preserved
+        assert handler.get_base_url("http://example.com/path") == "http://example.com"
+        assert handler.get_base_url("https://example.com/path") == "https://example.com"
+        
+        # Subdomains and explicit ports are kept as part of the host
+        assert handler.get_base_url("https://api.example.com/v1/users") == "https://api.example.com"
+        assert handler.get_base_url("https://example.com:8080/api") == "https://example.com:8080"
+        assert handler.get_base_url("http://localhost:3000/dev") == "http://localhost:3000"
+        
+        # Userinfo (user:pass@) is retained in the result
+        assert handler.get_base_url("https://user:pass@example.com/path") == "https://user:pass@example.com"
+        
+        # Edge cases - non-URL/empty input is returned unchanged; ftp:// still reduces to scheme://host
+        assert handler.get_base_url("not-a-url") == "not-a-url"
+        assert handler.get_base_url("") == ""
+        assert handler.get_base_url("ftp://example.com/file") == "ftp://example.com"
+        
+        # Missing scheme or netloc: input is returned unchanged
+        assert handler.get_base_url("//example.com/path") == "//example.com/path"  # Should return original
+        assert handler.get_base_url("/path/to/resource") == "/path/to/resource"  # Should return original