Implement priority-based automatic discovery of llms.txt and sitemap.xml files

- Add DiscoveryService with single-file priority selection
  - Priority: llms-full.txt > llms.txt > llms.md > llms.mdx > sitemap.xml > robots.txt
  - All files contain similar AI/crawling guidance, so only best one is needed
  - Robots.txt sitemap declarations have highest priority
  - Fallback to subdirectories for llms files

- Enhance URLHandler with discovery helper methods
  - Add is_robots_txt, is_llms_variant, is_well_known_file, get_base_url methods
  - Follow existing patterns with proper error handling

- Integrate discovery into CrawlingService orchestration
  - When discovery finds file: crawl ONLY discovered file (not main URL)
  - When no discovery: crawl main URL normally
  - Fixes issue where both main URL + discovered file were crawled

- Add discovery stage to progress mapping
  - New "discovery" stage in progress flow
  - Clear progress messages for discovered files

- Comprehensive test coverage
  - Tests for priority-based selection logic
  - Tests for robots.txt priority and fallback behavior
  - Updated existing tests for new return formats

Resolves efficient crawling by selecting single best guidance file instead
of crawling redundant content from multiple similar files.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
leex279
2025-09-08 09:03:15 +02:00
parent 012d2c58ed
commit 1a55d93a4e
6 changed files with 1193 additions and 42 deletions

View File

@@ -122,4 +122,122 @@ class TestURLHandler:
# Should not transform non-GitHub URLs
other = "https://example.com/file"
assert handler.transform_github_url(other) == other
assert handler.transform_github_url(other) == other
def test_is_robots_txt(self):
"""Test robots.txt detection."""
handler = URLHandler()
# Standard robots.txt URLs
assert handler.is_robots_txt("https://example.com/robots.txt") is True
assert handler.is_robots_txt("http://example.com/robots.txt") is True
assert handler.is_robots_txt("https://sub.example.com/robots.txt") is True
# Case sensitivity
assert handler.is_robots_txt("https://example.com/ROBOTS.TXT") is True
assert handler.is_robots_txt("https://example.com/Robots.Txt") is True
# With query parameters (should still be detected)
assert handler.is_robots_txt("https://example.com/robots.txt?v=1") is True
assert handler.is_robots_txt("https://example.com/robots.txt#section") is True
# Not robots.txt files
assert handler.is_robots_txt("https://example.com/robots") is False
assert handler.is_robots_txt("https://example.com/robots.html") is False
assert handler.is_robots_txt("https://example.com/some-robots.txt") is False
assert handler.is_robots_txt("https://example.com/path/robots.txt") is False
assert handler.is_robots_txt("https://example.com/") is False
# Edge case: malformed URL should not crash
assert handler.is_robots_txt("not-a-url") is False
def test_is_llms_variant(self):
"""Test llms file variant detection."""
handler = URLHandler()
# All llms variants
assert handler.is_llms_variant("https://example.com/llms.txt") is True
assert handler.is_llms_variant("https://example.com/llms.md") is True
assert handler.is_llms_variant("https://example.com/llms.mdx") is True
assert handler.is_llms_variant("https://example.com/llms.markdown") is True
# Case sensitivity
assert handler.is_llms_variant("https://example.com/LLMS.TXT") is True
assert handler.is_llms_variant("https://example.com/Llms.Md") is True
# With paths (should still detect)
assert handler.is_llms_variant("https://example.com/docs/llms.txt") is True
assert handler.is_llms_variant("https://example.com/public/llms.md") is True
# With query parameters
assert handler.is_llms_variant("https://example.com/llms.txt?version=1") is True
assert handler.is_llms_variant("https://example.com/llms.md#section") is True
# Not llms files
assert handler.is_llms_variant("https://example.com/llms") is False
assert handler.is_llms_variant("https://example.com/llms.html") is False
assert handler.is_llms_variant("https://example.com/my-llms.txt") is False
assert handler.is_llms_variant("https://example.com/llms-guide.txt") is False
assert handler.is_llms_variant("https://example.com/readme.txt") is False
# Edge case: malformed URL should not crash
assert handler.is_llms_variant("not-a-url") is False
def test_is_well_known_file(self):
"""Test .well-known file detection."""
handler = URLHandler()
# Standard .well-known files
assert handler.is_well_known_file("https://example.com/.well-known/ai.txt") is True
assert handler.is_well_known_file("https://example.com/.well-known/security.txt") is True
assert handler.is_well_known_file("https://example.com/.well-known/change-password") is True
# Case sensitivity (path should be case sensitive)
assert handler.is_well_known_file("https://example.com/.WELL-KNOWN/ai.txt") is True
assert handler.is_well_known_file("https://example.com/.Well-Known/ai.txt") is True
# With query parameters
assert handler.is_well_known_file("https://example.com/.well-known/ai.txt?v=1") is True
assert handler.is_well_known_file("https://example.com/.well-known/ai.txt#top") is True
# Not .well-known files
assert handler.is_well_known_file("https://example.com/well-known/ai.txt") is False
assert handler.is_well_known_file("https://example.com/.wellknown/ai.txt") is False
assert handler.is_well_known_file("https://example.com/docs/.well-known/ai.txt") is False
assert handler.is_well_known_file("https://example.com/ai.txt") is False
assert handler.is_well_known_file("https://example.com/") is False
# Edge case: malformed URL should not crash
assert handler.is_well_known_file("not-a-url") is False
def test_get_base_url(self):
"""Test base URL extraction."""
handler = URLHandler()
# Standard URLs
assert handler.get_base_url("https://example.com") == "https://example.com"
assert handler.get_base_url("https://example.com/") == "https://example.com"
assert handler.get_base_url("https://example.com/path/to/page") == "https://example.com"
assert handler.get_base_url("https://example.com/path/to/page?query=1") == "https://example.com"
assert handler.get_base_url("https://example.com/path/to/page#fragment") == "https://example.com"
# HTTP vs HTTPS
assert handler.get_base_url("http://example.com/path") == "http://example.com"
assert handler.get_base_url("https://example.com/path") == "https://example.com"
# Subdomains and ports
assert handler.get_base_url("https://api.example.com/v1/users") == "https://api.example.com"
assert handler.get_base_url("https://example.com:8080/api") == "https://example.com:8080"
assert handler.get_base_url("http://localhost:3000/dev") == "http://localhost:3000"
# Complex cases
assert handler.get_base_url("https://user:pass@example.com/path") == "https://user:pass@example.com"
# Edge cases - malformed URLs should return original
assert handler.get_base_url("not-a-url") == "not-a-url"
assert handler.get_base_url("") == ""
assert handler.get_base_url("ftp://example.com/file") == "ftp://example.com"
# Missing scheme or netloc
assert handler.get_base_url("//example.com/path") == "//example.com/path" # Should return original
assert handler.get_base_url("/path/to/resource") == "/path/to/resource" # Should return original