mirror of
https://github.com/coleam00/Archon.git
synced 2025-12-24 02:39:17 -05:00
Implements complete llms.txt link following functionality that crawls linked llms.txt files on the same domain/subdomain, along with critical bug fixes for discovery priority and variant detection. Backend Core Functionality: - Add _is_same_domain_or_subdomain method for subdomain matching - Fix is_llms_variant to detect .txt files in /llms/ directories - Implement llms.txt link extraction and following logic - Add two-phase discovery: prioritize ALL llms.txt before sitemaps - Enhanced progress reporting with discovery metadata Critical Bug Fixes: - Discovery priority: Fixed sitemap.xml being found before llms.txt - is_llms_variant: Now matches /llms/guides.txt, /llms/swift.txt, etc. - These were blocking bugs preventing link following from working Frontend UI: - Add discovery and linked files display to CrawlingProgress component - Update progress types to include discoveredFile, linkedFiles fields - Add new crawl types: llms_txt_with_linked_files, discovery_* - Add "discovery" to ProgressStatus enum and active statuses Testing: - 8 subdomain matching unit tests (test_crawling_service_subdomain.py) - 7 integration tests for link following (test_llms_txt_link_following.py) - All 15 tests passing - Validated against real Supabase llms.txt structure (1 main + 8 linked) Files Modified: Backend: - crawling_service.py: Core link following logic (lines 744-788, 862-920) - url_handler.py: Fixed variant detection (lines 633-665) - discovery_service.py: Two-phase discovery (lines 137-214) - 2 new comprehensive test files Frontend: - progress/types/progress.ts: Updated types with new fields - progress/components/CrawlingProgress.tsx: Added UI sections Real-world testing: Crawling supabase.com/docs now discovers /docs/llms.txt and automatically follows 8 linked llms.txt files, indexing complete documentation from all files. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
153 lines
5.2 KiB
Python
153 lines
5.2 KiB
Python
"""Unit tests for CrawlingService subdomain checking functionality."""
|
|
import pytest
|
|
from src.server.services.crawling.crawling_service import CrawlingService
|
|
|
|
|
|
class TestCrawlingServiceSubdomain:
    """Checks for CrawlingService's same-domain and subdomain comparison helpers.

    Each test feeds (url, base) pairs to ``_is_same_domain_or_subdomain`` or
    ``_is_same_domain`` and asserts the exact boolean that comes back.
    """

    @pytest.fixture
    def service(self):
        """Return a CrawlingService constructed with no crawler and no Supabase client."""
        # Neither collaborator is supplied; these tests only call the
        # domain-checking helpers.
        bare_service = CrawlingService(crawler=None, supabase_client=None)
        return bare_service

    def test_is_same_domain_or_subdomain_exact_match(self, service):
        """A URL on the base domain itself matches, whatever its path."""
        base = "https://supabase.com"
        same_domain_urls = (
            "https://supabase.com/docs",
            "https://supabase.com/path/to/page",
        )
        for url in same_domain_urls:
            assert service._is_same_domain_or_subdomain(url, base) is True

    def test_is_same_domain_or_subdomain_subdomains(self, service):
        """Subdomains of the base domain match, including nested ones."""
        base = "https://supabase.com"
        subdomain_urls = (
            # Single-level subdomains
            "https://docs.supabase.com/llms.txt",
            "https://api.supabase.com/v1/endpoint",
            # Multiple subdomain levels
            "https://dev.api.supabase.com/test",
        )
        for url in subdomain_urls:
            assert service._is_same_domain_or_subdomain(url, base) is True

    def test_is_same_domain_or_subdomain_different_domains(self, service):
        """URLs on an unrelated root domain are rejected."""
        base = "https://supabase.com"
        foreign_urls = (
            # Entirely different domains
            "https://external.com/llms.txt",
            "https://docs.other-site.com",
            # Same name, different top-level domain
            "https://supabase.org",
        )
        for url in foreign_urls:
            assert service._is_same_domain_or_subdomain(url, base) is False

    def test_is_same_domain_or_subdomain_protocols(self, service):
        """An http/https mismatch between URL and base does not prevent a match."""
        mixed_scheme_pairs = (
            ("http://supabase.com/docs", "https://supabase.com"),
            ("https://docs.supabase.com", "http://supabase.com"),
        )
        for url, base in mixed_scheme_pairs:
            assert service._is_same_domain_or_subdomain(url, base) is True

    def test_is_same_domain_or_subdomain_ports(self, service):
        """Port numbers are not part of the comparison: same host still matches."""
        port_pairs = (
            ("https://supabase.com:8080/api", "https://supabase.com"),
            ("http://localhost:3000/dev", "http://localhost:8080"),
        )
        for url, base in port_pairs:
            assert service._is_same_domain_or_subdomain(url, base) is True

    def test_is_same_domain_or_subdomain_edge_cases(self, service):
        """Empty or malformed input on either side yields False."""
        degenerate_pairs = (
            ("", "https://supabase.com"),
            ("https://supabase.com", ""),
            ("not-a-url", "https://supabase.com"),
        )
        for url, base in degenerate_pairs:
            assert service._is_same_domain_or_subdomain(url, base) is False

    def test_is_same_domain_or_subdomain_real_world_examples(self, service):
        """Spot checks against well-known sites and their subdomains."""
        cases = (
            # GitHub examples
            ("https://api.github.com/repos", "https://github.com", True),
            # githubusercontent.com is a different root domain
            ("https://raw.githubusercontent.com/owner/repo", "https://github.com", False),
            # Documentation sites
            ("https://docs.python.org/3/library", "https://python.org", True),
            ("https://api.stripe.com/v1", "https://stripe.com", True),
        )
        for url, base, expected in cases:
            assert service._is_same_domain_or_subdomain(url, base) is expected

    def test_is_same_domain_backward_compatibility(self, service):
        """``_is_same_domain`` accepts only the exact domain, never a subdomain."""
        base = "https://supabase.com"
        cases = (
            # Exact domain match should work
            ("https://supabase.com/docs", True),
            # Subdomain should NOT match with _is_same_domain
            # (only with _is_same_domain_or_subdomain)
            ("https://docs.supabase.com/llms.txt", False),
            # Different domain should not match
            ("https://external.com/llms.txt", False),
        )
        for url, expected in cases:
            assert service._is_same_domain(url, base) is expected
|