feat: Implement llms.txt link following with discovery priority fix

Implements llms.txt link following: when a crawled llms.txt links to other
llms.txt files on the same domain or a subdomain, those files are crawled
too. Also fixes two blocking bugs in discovery priority and variant
detection.

Backend Core Functionality:
- Add _is_same_domain_or_subdomain method for subdomain matching (sketch below)
- Fix is_llms_variant to detect .txt files in /llms/ directories
- Implement llms.txt link extraction and following logic
- Add two-phase discovery: prioritize ALL llms.txt before sitemaps
- Enhance progress reporting with discovery metadata
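
A minimal sketch of how the subdomain check can work, assuming plain
hostname suffix matching via urllib.parse (the real implementation lives in
crawling_service.py; the error handling here is illustrative):

```python
from urllib.parse import urlparse

def _is_same_domain_or_subdomain(url: str, base_url: str) -> bool:
    """True if url is on base_url's domain or any subdomain of it."""
    try:
        host = urlparse(url).hostname          # drops scheme and port
        base_host = urlparse(base_url).hostname
    except ValueError:
        return False
    if not host or not base_host:
        return False  # empty or malformed URLs never match
    # Exact host match, or a dotted-suffix match so docs.supabase.com
    # matches supabase.com while supabase.org does not.
    return host == base_host or host.endswith("." + base_host)
```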

Critical Bug Fixes:
- Discovery priority: sitemap.xml was being found before llms.txt; all
  llms.txt candidates are now tried first
- is_llms_variant: now matches /llms/guides.txt, /llms/swift.txt, etc.
  (sketch below)
- Both were blocking bugs that prevented link following from working
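
A hedged sketch of the variant fix: besides llms.txt itself, any .txt file
inside an /llms/ directory now counts as a variant. The regex and the
llms-full.txt case are assumptions; the real check lives in url_handler.py.

```python
import re
from urllib.parse import urlparse

# Assumed pattern: llms.txt / llms-full.txt at the end of the path,
# or any .txt file directly inside an /llms/ directory (the fixed case).
_LLMS_VARIANT_RE = re.compile(
    r"(?:^|/)llms(?:-full)?\.txt$"   # .../llms.txt, .../llms-full.txt
    r"|(?:^|/)llms/[^/]+\.txt$",     # .../llms/guides.txt, .../llms/swift.txt
    re.IGNORECASE,
)

def is_llms_variant(url: str) -> bool:
    return bool(_LLMS_VARIANT_RE.search(urlparse(url).path))
```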

Frontend UI:
- Add discovery and linked-files display to the CrawlingProgress component
- Update progress types to include discoveredFile, linkedFiles fields
  (example payload below)
- Add new crawl types: llms_txt_with_linked_files, discovery_*
- Add "discovery" to the ProgressStatus enum and the active statuses

Testing:
- 8 subdomain matching unit tests (test_crawling_service_subdomain.py)
- 7 integration tests for link following (test_llms_txt_link_following.py)
- All 15 tests passing
- Validated against real Supabase llms.txt structure (1 main + 8 linked)

Files Modified:
Backend:
- crawling_service.py: Core link following logic (lines 744-788, 862-920)
- url_handler.py: Fixed variant detection (lines 633-665)
- discovery_service.py: Two-phase discovery (lines 137-214; sketch below)
- 2 new test files (included in full below)

Frontend:
- progress/types/progress.ts: Updated types with new fields
- progress/components/CrawlingProgress.tsx: Added UI sections
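
A minimal sketch of the two-phase ordering in discovery_service.py. The
candidate paths and the _probe helper are assumptions; the point this commit
fixes is that every llms.txt candidate is tried before any sitemap.

```python
import httpx

LLMS_CANDIDATES = ["/llms.txt", "/llms-full.txt", "/docs/llms.txt"]
SITEMAP_CANDIDATES = ["/sitemap.xml", "/sitemap_index.xml"]

async def _probe(url: str) -> bool:
    """Return True if the URL answers 200 to a HEAD request."""
    try:
        async with httpx.AsyncClient(follow_redirects=True) as client:
            return (await client.head(url, timeout=10)).status_code == 200
    except httpx.HTTPError:
        return False

async def discover(base_url: str) -> str | None:
    # Phase 1: exhaust ALL llms.txt candidates first...
    for path in LLMS_CANDIDATES:
        if await _probe(base_url + path):
            return base_url + path
    # Phase 2: ...and only then fall back to sitemaps.
    for path in SITEMAP_CANDIDATES:
        if await _probe(base_url + path):
            return base_url + path
    return None
```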

Real-world testing: crawling supabase.com/docs now discovers
/docs/llms.txt and automatically follows the 8 linked llms.txt files,
indexing the complete documentation from all nine files.
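
The link-following step itself reduces to the filtering loop the integration
tests below exercise; reconstructed here as a sketch (the helper names come
from the tests, the surrounding wiring is illustrative):

```python
async def follow_llms_links(service, url, content, original_domain):
    # Extract (link, text) pairs from the main llms.txt markdown.
    links = service.url_handler.extract_markdown_links_with_text(content, url)
    # Keep only llms.txt variants on the original domain or its subdomains.
    llms_links = [
        link for link, _text in links
        if service.url_handler.is_llms_variant(link)
        and service._is_same_domain_or_subdomain(link, original_domain)
    ]
    # Batch-crawl the linked files; their results are appended to the main
    # file's, yielding crawl_type "llms_txt_with_linked_files".
    return await service.crawl_batch_with_progress(llms_links)
```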

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

test_crawling_service_subdomain.py

@@ -0,0 +1,152 @@
"""Unit tests for CrawlingService subdomain checking functionality."""
import pytest
from src.server.services.crawling.crawling_service import CrawlingService
class TestCrawlingServiceSubdomain:
"""Test suite for CrawlingService subdomain checking methods."""
@pytest.fixture
def service(self):
"""Create a CrawlingService instance for testing."""
# Create service without crawler or supabase for testing domain checking
return CrawlingService(crawler=None, supabase_client=None)
def test_is_same_domain_or_subdomain_exact_match(self, service):
"""Test exact domain matches."""
# Same domain should match
assert service._is_same_domain_or_subdomain(
"https://supabase.com/docs",
"https://supabase.com"
) is True
assert service._is_same_domain_or_subdomain(
"https://supabase.com/path/to/page",
"https://supabase.com"
) is True
def test_is_same_domain_or_subdomain_subdomains(self, service):
"""Test subdomain matching."""
# Subdomain should match
assert service._is_same_domain_or_subdomain(
"https://docs.supabase.com/llms.txt",
"https://supabase.com"
) is True
assert service._is_same_domain_or_subdomain(
"https://api.supabase.com/v1/endpoint",
"https://supabase.com"
) is True
# Multiple subdomain levels
assert service._is_same_domain_or_subdomain(
"https://dev.api.supabase.com/test",
"https://supabase.com"
) is True
def test_is_same_domain_or_subdomain_different_domains(self, service):
"""Test that different domains are rejected."""
# Different domain should not match
assert service._is_same_domain_or_subdomain(
"https://external.com/llms.txt",
"https://supabase.com"
) is False
assert service._is_same_domain_or_subdomain(
"https://docs.other-site.com",
"https://supabase.com"
) is False
# Similar but different domains
assert service._is_same_domain_or_subdomain(
"https://supabase.org",
"https://supabase.com"
) is False
def test_is_same_domain_or_subdomain_protocols(self, service):
"""Test that protocol differences don't affect matching."""
# Different protocols should still match
assert service._is_same_domain_or_subdomain(
"http://supabase.com/docs",
"https://supabase.com"
) is True
assert service._is_same_domain_or_subdomain(
"https://docs.supabase.com",
"http://supabase.com"
) is True
def test_is_same_domain_or_subdomain_ports(self, service):
"""Test handling of port numbers."""
# Same root domain with different ports should match
assert service._is_same_domain_or_subdomain(
"https://supabase.com:8080/api",
"https://supabase.com"
) is True
assert service._is_same_domain_or_subdomain(
"http://localhost:3000/dev",
"http://localhost:8080"
) is True
def test_is_same_domain_or_subdomain_edge_cases(self, service):
"""Test edge cases and error handling."""
# Empty or malformed URLs should return False
assert service._is_same_domain_or_subdomain(
"",
"https://supabase.com"
) is False
assert service._is_same_domain_or_subdomain(
"https://supabase.com",
""
) is False
assert service._is_same_domain_or_subdomain(
"not-a-url",
"https://supabase.com"
) is False
def test_is_same_domain_or_subdomain_real_world_examples(self, service):
"""Test with real-world examples."""
# GitHub examples
assert service._is_same_domain_or_subdomain(
"https://api.github.com/repos",
"https://github.com"
) is True
assert service._is_same_domain_or_subdomain(
"https://raw.githubusercontent.com/owner/repo",
"https://github.com"
) is False # githubusercontent.com is different root domain
# Documentation sites
assert service._is_same_domain_or_subdomain(
"https://docs.python.org/3/library",
"https://python.org"
) is True
assert service._is_same_domain_or_subdomain(
"https://api.stripe.com/v1",
"https://stripe.com"
) is True
def test_is_same_domain_backward_compatibility(self, service):
"""Test that _is_same_domain still works correctly for exact matches."""
# Exact domain match should work
assert service._is_same_domain(
"https://supabase.com/docs",
"https://supabase.com"
) is True
# Subdomain should NOT match with _is_same_domain (only with _is_same_domain_or_subdomain)
assert service._is_same_domain(
"https://docs.supabase.com/llms.txt",
"https://supabase.com"
) is False
# Different domain should not match
assert service._is_same_domain(
"https://external.com/llms.txt",
"https://supabase.com"
) is False

test_llms_txt_link_following.py

@@ -0,0 +1,217 @@
"""Integration tests for llms.txt link following functionality."""
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from src.server.services.crawling.crawling_service import CrawlingService
class TestLlmsTxtLinkFollowing:
"""Test suite for llms.txt link following feature."""
@pytest.fixture
def service(self):
"""Create a CrawlingService instance for testing."""
return CrawlingService(crawler=None, supabase_client=None)
@pytest.fixture
def supabase_llms_content(self):
"""Return the actual Supabase llms.txt content."""
return """# Supabase Docs
- [Supabase Guides](https://supabase.com/llms/guides.txt)
- [Supabase Reference (JavaScript)](https://supabase.com/llms/js.txt)
- [Supabase Reference (Dart)](https://supabase.com/llms/dart.txt)
- [Supabase Reference (Swift)](https://supabase.com/llms/swift.txt)
- [Supabase Reference (Kotlin)](https://supabase.com/llms/kotlin.txt)
- [Supabase Reference (Python)](https://supabase.com/llms/python.txt)
- [Supabase Reference (C#)](https://supabase.com/llms/csharp.txt)
- [Supabase CLI Reference](https://supabase.com/llms/cli.txt)
"""
def test_extract_links_from_supabase_llms_txt(self, service, supabase_llms_content):
"""Test that links are correctly extracted from Supabase llms.txt."""
url = "https://supabase.com/docs/llms.txt"
extracted_links = service.url_handler.extract_markdown_links_with_text(
supabase_llms_content, url
)
# Should extract 8 links
assert len(extracted_links) == 8
# Verify all extracted links
expected_links = [
"https://supabase.com/llms/guides.txt",
"https://supabase.com/llms/js.txt",
"https://supabase.com/llms/dart.txt",
"https://supabase.com/llms/swift.txt",
"https://supabase.com/llms/kotlin.txt",
"https://supabase.com/llms/python.txt",
"https://supabase.com/llms/csharp.txt",
"https://supabase.com/llms/cli.txt",
]
extracted_urls = [link for link, _ in extracted_links]
assert extracted_urls == expected_links
def test_all_links_are_llms_variants(self, service, supabase_llms_content):
"""Test that all extracted links are recognized as llms.txt variants."""
url = "https://supabase.com/docs/llms.txt"
extracted_links = service.url_handler.extract_markdown_links_with_text(
supabase_llms_content, url
)
# All links should be recognized as llms variants
for link, _ in extracted_links:
is_llms = service.url_handler.is_llms_variant(link)
assert is_llms, f"Link {link} should be recognized as llms.txt variant"
def test_all_links_are_same_domain(self, service, supabase_llms_content):
"""Test that all extracted links are from the same domain."""
url = "https://supabase.com/docs/llms.txt"
original_domain = "https://supabase.com"
extracted_links = service.url_handler.extract_markdown_links_with_text(
supabase_llms_content, url
)
# All links should be from the same domain
for link, _ in extracted_links:
is_same = service._is_same_domain_or_subdomain(link, original_domain)
assert is_same, f"Link {link} should match domain {original_domain}"
def test_filter_llms_links_from_supabase(self, service, supabase_llms_content):
"""Test the complete filtering logic for Supabase llms.txt."""
url = "https://supabase.com/docs/llms.txt"
original_domain = "https://supabase.com"
# Extract all links
extracted_links = service.url_handler.extract_markdown_links_with_text(
supabase_llms_content, url
)
# Filter for llms.txt files on same domain (mimics actual code)
llms_links = []
for link, text in extracted_links:
if service.url_handler.is_llms_variant(link):
if service._is_same_domain_or_subdomain(link, original_domain):
llms_links.append((link, text))
# Should have all 8 links
assert len(llms_links) == 8, f"Expected 8 llms links, got {len(llms_links)}"
@pytest.mark.asyncio
async def test_llms_txt_link_following_integration(self, service, supabase_llms_content):
"""Integration test for the complete llms.txt link following flow."""
url = "https://supabase.com/docs/llms.txt"
# Mock the crawl_batch_with_progress to verify it's called with correct URLs
mock_batch_results = [
{'url': f'https://supabase.com/llms/{name}.txt', 'markdown': f'# {name}', 'title': f'{name}'}
for name in ['guides', 'js', 'dart', 'swift', 'kotlin', 'python', 'csharp', 'cli']
]
service.crawl_batch_with_progress = AsyncMock(return_value=mock_batch_results)
service.crawl_markdown_file = AsyncMock(return_value=[{
'url': url,
'markdown': supabase_llms_content,
'title': 'Supabase Docs'
}])
# Create progress tracker mock
service.progress_tracker = MagicMock()
service.progress_tracker.update = AsyncMock()
# Simulate the request that would come from orchestration
request = {
"is_discovery_target": True,
"original_domain": "https://supabase.com",
"max_concurrent": 5
}
# Call the actual crawl method
crawl_results, crawl_type = await service._crawl_by_url_type(url, request)
# Verify batch crawl was called with the 8 llms.txt URLs
service.crawl_batch_with_progress.assert_called_once()
call_args = service.crawl_batch_with_progress.call_args
crawled_urls = call_args[0][0] # First positional argument
assert len(crawled_urls) == 8, f"Should crawl 8 linked files, got {len(crawled_urls)}"
expected_urls = [
"https://supabase.com/llms/guides.txt",
"https://supabase.com/llms/js.txt",
"https://supabase.com/llms/dart.txt",
"https://supabase.com/llms/swift.txt",
"https://supabase.com/llms/kotlin.txt",
"https://supabase.com/llms/python.txt",
"https://supabase.com/llms/csharp.txt",
"https://supabase.com/llms/cli.txt",
]
assert set(crawled_urls) == set(expected_urls)
# Verify total results include main file + linked files
assert len(crawl_results) == 9, f"Should have 9 total files (1 main + 8 linked), got {len(crawl_results)}"
# Verify crawl type
assert crawl_type == "llms_txt_with_linked_files"
def test_external_llms_links_are_filtered(self, service):
"""Test that external domain llms.txt links are filtered out."""
content = """# Test llms.txt
- [Internal Link](https://supabase.com/llms/internal.txt)
- [External Link](https://external.com/llms/external.txt)
- [Another Internal](https://docs.supabase.com/llms/docs.txt)
"""
url = "https://supabase.com/llms.txt"
original_domain = "https://supabase.com"
extracted_links = service.url_handler.extract_markdown_links_with_text(content, url)
# Filter for same-domain llms links
llms_links = []
for link, text in extracted_links:
if service.url_handler.is_llms_variant(link):
if service._is_same_domain_or_subdomain(link, original_domain):
llms_links.append((link, text))
# Should only have 2 links (internal and subdomain), external filtered out
assert len(llms_links) == 2
urls = [link for link, _ in llms_links]
assert "https://supabase.com/llms/internal.txt" in urls
assert "https://docs.supabase.com/llms/docs.txt" in urls
assert "https://external.com/llms/external.txt" not in urls
def test_non_llms_links_are_filtered(self, service):
"""Test that non-llms.txt links are filtered out."""
content = """# Test llms.txt
- [LLMs Link](https://supabase.com/llms/guide.txt)
- [Regular Doc](https://supabase.com/docs/guide)
- [PDF File](https://supabase.com/docs/guide.pdf)
- [Another LLMs](https://supabase.com/llms/api.txt)
"""
url = "https://supabase.com/llms.txt"
original_domain = "https://supabase.com"
extracted_links = service.url_handler.extract_markdown_links_with_text(content, url)
# Filter for llms links only
llms_links = []
for link, text in extracted_links:
if service.url_handler.is_llms_variant(link):
if service._is_same_domain_or_subdomain(link, original_domain):
llms_links.append((link, text))
# Should only have 2 llms.txt links
assert len(llms_links) == 2
urls = [link for link, _ in llms_links]
assert "https://supabase.com/llms/guide.txt" in urls
assert "https://supabase.com/llms/api.txt" in urls
assert "https://supabase.com/docs/guide" not in urls
assert "https://supabase.com/docs/guide.pdf" not in urls