= ({ onSwitchToBr
)}
+ {/* Discovery Information */}
+ {(operation as any).discovered_file && (
+
+ )}
+
+ {/* Linked Files */}
+ {(operation as any).linked_files && (operation as any).linked_files.length > 0 && (
+
+
+ Following {(operation as any).linked_files.length} Linked File
+ {(operation as any).linked_files.length > 1 ? "s" : ""}
+
+
+ {(operation as any).linked_files.map((file: string, idx: number) => (
+
+ • {file}
+
+ ))}
+
+
+ )}
+
{/* Current Action or Operation Type Info */}
{(operation.current_url || operation.operation_type) && (
diff --git a/archon-ui-main/src/features/progress/types/progress.ts b/archon-ui-main/src/features/progress/types/progress.ts
index f129d191..74cbc5b8 100644
--- a/archon-ui-main/src/features/progress/types/progress.ts
+++ b/archon-ui-main/src/features/progress/types/progress.ts
@@ -6,6 +6,7 @@
export type ProgressStatus =
| "starting"
| "initializing"
+ | "discovery"
| "analyzing"
| "crawling"
| "processing"
@@ -24,7 +25,16 @@ export type ProgressStatus =
| "cancelled"
| "stopping";
-export type CrawlType = "normal" | "sitemap" | "llms-txt" | "text_file" | "refresh";
+export type CrawlType =
+ | "normal"
+ | "sitemap"
+ | "llms-txt"
+ | "text_file"
+ | "refresh"
+ | "llms_txt_with_linked_files"
+ | "llms_txt_linked_files"
+ | "discovery_single_file"
+ | "discovery_sitemap";
export type UploadType = "document";
export interface BaseProgressData {
@@ -48,6 +58,10 @@ export interface CrawlProgressData extends BaseProgressData {
codeBlocksFound?: number;
totalSummaries?: number;
completedSummaries?: number;
+ // Discovery-related fields
+ discoveredFile?: string;
+ discoveredFileType?: string;
+ linkedFiles?: string[];
originalCrawlParams?: {
url: string;
knowledge_type?: string;
@@ -127,6 +141,13 @@ export interface ProgressResponse {
codeBlocksFound?: number;
totalSummaries?: number;
completedSummaries?: number;
+ // Discovery-related fields
+ discoveredFile?: string;
+ discovered_file?: string; // Snake case from backend
+ discoveredFileType?: string;
+ discovered_file_type?: string; // Snake case from backend
+ linkedFiles?: string[];
+ linked_files?: string[]; // Snake case from backend
fileName?: string;
fileSize?: number;
chunksProcessed?: number;
diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py
index ab8ccdad..c11a6312 100644
--- a/python/src/server/services/crawling/crawling_service.py
+++ b/python/src/server/services/crawling/crawling_service.py
@@ -385,17 +385,32 @@ class CrawlingService:
if not self.url_handler.is_binary_file(discovered_file):
discovered_urls.append(discovered_file)
safe_logfire_info(f"Adding discovered file to crawl: {discovered_file}")
+
+ # Determine file type for user feedback
+ discovered_file_type = "unknown"
+ if self.url_handler.is_llms_variant(discovered_file):
+ discovered_file_type = "llms.txt"
+ elif self.url_handler.is_sitemap(discovered_file):
+ discovered_file_type = "sitemap"
+ elif self.url_handler.is_robots_txt(discovered_file):
+ discovered_file_type = "robots.txt"
+
+ await update_mapped_progress(
+ "discovery", 100,
+ f"Discovery completed: found {discovered_file_type} file",
+ current_url=url,
+ discovered_file=discovered_file,
+ discovered_file_type=discovered_file_type
+ )
else:
safe_logfire_info(f"Skipping binary file: {discovered_file}")
else:
safe_logfire_info(f"Discovery found no files for {url}")
-
- file_count = len(discovered_urls)
- safe_logfire_info(f"Discovery selected {file_count} best file to crawl")
-
- await update_mapped_progress(
- "discovery", 100, f"Discovery completed: selected {file_count} best file", current_url=url
- )
+ await update_mapped_progress(
+ "discovery", 100,
+ "Discovery completed: no special files found, will crawl main URL",
+ current_url=url
+ )
except Exception as e:
safe_logfire_error(f"Discovery phase failed: {e}")
@@ -726,6 +741,52 @@ class CrawlingService:
# If parsing fails, be conservative and exclude the URL
return False
+ def _is_same_domain_or_subdomain(self, url: str, base_domain: str) -> bool:
+ """
+ Check if a URL belongs to the same root domain or subdomain.
+
+ Examples:
+ - docs.supabase.com matches supabase.com (subdomain)
+ - api.supabase.com matches supabase.com (subdomain)
+ - supabase.com matches supabase.com (exact match)
+ - external.com does NOT match supabase.com
+
+ Args:
+ url: URL to check
+ base_domain: Base domain URL to compare against
+
+ Returns:
+ True if the URL is from the same root domain or subdomain
+ """
+ try:
+ from urllib.parse import urlparse
+ u, b = urlparse(url), urlparse(base_domain)
+ url_host = (u.hostname or "").lower()
+ base_host = (b.hostname or "").lower()
+
+ if not url_host or not base_host:
+ return False
+
+ # Exact match
+ if url_host == base_host:
+ return True
+
+ # Check if url_host is a subdomain of base_host
+ # Extract root domain (last 2 labels, e.g. "supabase.com"); naive for multi-part TLDs
+ def get_root_domain(host: str) -> str:
+ parts = host.split('.')
+ if len(parts) >= 2:
+ return '.'.join(parts[-2:])
+ return host
+
+ url_root = get_root_domain(url_host)
+ base_root = get_root_domain(base_host)
+
+ return url_root == base_root
+ except Exception:
+ # If parsing fails, be conservative and exclude the URL
+ return False
+
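
Note that the last-two-labels heuristic above treats api.example.co.uk and other.co.uk as sharing the root domain "co.uk". A suffix-aware sketch, assuming the optional third-party tldextract package (not used by this PR):

# Hedged alternative, not part of this PR: compare registered domains via tldextract.
import tldextract

def same_registered_domain(url: str, base: str) -> bool:
    # registered_domain is "supabase.com" for "docs.supabase.com"
    # and "example.co.uk" for "api.example.co.uk".
    u = tldextract.extract(url).registered_domain
    b = tldextract.extract(base).registered_domain
    return bool(u) and u == b
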
def _is_self_link(self, link: str, base_url: str) -> bool:
"""
Check if a link is a self-referential link to the base URL.
@@ -798,8 +859,60 @@ class CrawlingService:
if crawl_results and len(crawl_results) > 0:
content = crawl_results[0].get('markdown', '')
if self.url_handler.is_link_collection_file(url, content):
- # If this file was selected by discovery, skip link extraction (single-file mode)
+ # If this file was selected by discovery, check if it's an llms.txt file
if request.get("is_discovery_target"):
+ # Check if this is an llms.txt file (not sitemap or other discovery targets)
+ is_llms_file = self.url_handler.is_llms_variant(url)
+
+ if is_llms_file:
+ logger.info(f"Discovery llms.txt mode: checking for linked llms.txt files at {url}")
+
+ # Extract all links from the file
+ extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url)
+
+ # Filter for llms.txt files only on same domain
+ llms_links = []
+ if extracted_links_with_text:
+ original_domain = request.get("original_domain")
+ if original_domain:
+ for link, text in extracted_links_with_text:
+ # Check if link is to another llms.txt file
+ if self.url_handler.is_llms_variant(link):
+ # Check same domain/subdomain
+ if self._is_same_domain_or_subdomain(link, original_domain):
+ llms_links.append((link, text))
+ logger.info(f"Found linked llms.txt: {link}")
+
+ if llms_links:
+ # Build mapping and extract just URLs
+ url_to_link_text = dict(llms_links)
+ extracted_llms_urls = [link for link, _ in llms_links]
+
+ logger.info(f"Following {len(extracted_llms_urls)} linked llms.txt files")
+
+ # Notify user about linked files being crawled
+ await update_crawl_progress(
+ 60, # 60% of crawling stage
+ f"Found {len(extracted_llms_urls)} linked llms.txt files, crawling them now...",
+ crawl_type="llms_txt_linked_files",
+ linked_files=extracted_llms_urls
+ )
+
+ # Crawl linked llms.txt files (no recursion, just one level)
+ batch_results = await self.crawl_batch_with_progress(
+ extracted_llms_urls,
+ max_concurrent=request.get('max_concurrent'),
+ progress_callback=await self._create_crawl_progress_callback("crawling"),
+ link_text_fallbacks=url_to_link_text,
+ )
+
+ # Combine original llms.txt with linked files
+ crawl_results.extend(batch_results)
+ crawl_type = "llms_txt_with_linked_files"
+ logger.info(f"llms.txt crawling completed: {len(crawl_results)} total files (1 main + {len(batch_results)} linked)")
+ return crawl_results, crawl_type
+
+ # For non-llms.txt discovery targets (sitemaps, robots.txt), keep single-file mode
logger.info(f"Discovery single-file mode: skipping link extraction for {url}")
crawl_type = "discovery_single_file"
logger.info(f"Discovery file crawling completed: {len(crawl_results)} result")
diff --git a/python/src/server/services/crawling/discovery_service.py b/python/src/server/services/crawling/discovery_service.py
index fc1671d0..28ea2f5e 100644
--- a/python/src/server/services/crawling/discovery_service.py
+++ b/python/src/server/services/crawling/discovery_service.py
@@ -135,51 +135,71 @@ class DiscoveryService:
logger.info(f"Starting single-file discovery for {base_url}")
# Check files in global priority order
- # Note: robots.txt sitemaps are not given special priority as llms files should be preferred
+ # IMPORTANT: Check root-level llms files BEFORE same-directory sitemaps
+ # This ensures llms.txt at root is preferred over /docs/sitemap.xml
+ from urllib.parse import urlparse
+
+ # Get the directory path of the base URL
+ parsed = urlparse(base_url)
+ base_path = parsed.path.rstrip('/')
+ # Extract directory (remove filename if present)
+ if '.' in base_path.split('/')[-1]:
+ base_dir = '/'.join(base_path.split('/')[:-1])
+ else:
+ base_dir = base_path
+
+ # Phase 1: Check llms files at ALL priority levels before checking sitemaps
for filename in self.DISCOVERY_PRIORITY:
- from urllib.parse import urlparse
+ if not filename.startswith('llms') and not filename.startswith('.well-known/llms') and not filename.startswith('.well-known/ai'):
+ continue # Skip non-llms files in this phase
- # Get the directory path of the base URL
- parsed = urlparse(base_url)
- base_path = parsed.path.rstrip('/')
- # Extract directory (remove filename if present)
- if '.' in base_path.split('/')[-1]:
- base_dir = '/'.join(base_path.split('/')[:-1])
- else:
- base_dir = base_path
-
- # Priority 1: Check same directory as base_url (e.g., /docs/llms.txt for /docs URL)
+ # Priority 1a: Check same directory for llms files
if base_dir and base_dir != '/':
same_dir_url = f"{parsed.scheme}://{parsed.netloc}{base_dir}/{filename}"
if self._check_url_exists(same_dir_url):
logger.info(f"Discovery found best file in same directory: {same_dir_url}")
return same_dir_url
- # Priority 2: Check root-level (standard urljoin behavior)
+ # Priority 1b: Check root-level for llms files
file_url = urljoin(base_url, filename)
if self._check_url_exists(file_url):
logger.info(f"Discovery found best file at root: {file_url}")
return file_url
- # Priority 3: For llms files, check common subdirectories (including base directory name)
- if filename.startswith('llms'):
- # Extract base directory name to check it first
- subdirs = []
- if base_dir and base_dir != '/':
- base_dir_name = base_dir.split('/')[-1]
- if base_dir_name:
- subdirs.append(base_dir_name)
- subdirs.extend(["docs", "static", "public", "assets", "doc", "api"])
+ # Priority 1c: Check subdirectories for llms files
+ subdirs = []
+ if base_dir and base_dir != '/':
+ base_dir_name = base_dir.split('/')[-1]
+ if base_dir_name:
+ subdirs.append(base_dir_name)
+ subdirs.extend(["docs", "static", "public", "assets", "doc", "api"])
- for subdir in subdirs:
- subdir_url = urljoin(base_url, f"{subdir}/{filename}")
- if self._check_url_exists(subdir_url):
- logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
- return subdir_url
+ for subdir in subdirs:
+ subdir_url = urljoin(base_url, f"{subdir}/{filename}")
+ if self._check_url_exists(subdir_url):
+ logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
+ return subdir_url
- # Priority 4: For sitemap files, check common subdirectories (including base directory name)
+ # Phase 2: Check sitemaps and robots.txt (only if no llms files found)
+ for filename in self.DISCOVERY_PRIORITY:
+ if filename.startswith('llms') or filename.startswith('.well-known/llms') or filename.startswith('.well-known/ai'):
+ continue # Skip llms files, already checked
+
+ # Priority 2a: Check same directory
+ if base_dir and base_dir != '/':
+ same_dir_url = f"{parsed.scheme}://{parsed.netloc}{base_dir}/{filename}"
+ if self._check_url_exists(same_dir_url):
+ logger.info(f"Discovery found best file in same directory: {same_dir_url}")
+ return same_dir_url
+
+ # Priority 2b: Check root-level
+ file_url = urljoin(base_url, filename)
+ if self._check_url_exists(file_url):
+ logger.info(f"Discovery found best file at root: {file_url}")
+ return file_url
+
+ # Priority 2c: For sitemap files, check common subdirectories
if filename.endswith('.xml') and not filename.startswith('.well-known'):
- # Extract base directory name to check it first
subdirs = []
if base_dir and base_dir != '/':
base_dir_name = base_dir.split('/')[-1]
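
The reordering above boils down to two passes over DISCOVERY_PRIORITY: every llms candidate location is probed before any sitemap or robots.txt candidate. A hedged sketch of the resulting candidate ordering (subdirectory probing omitted for brevity; names follow the diff, the function itself is illustrative):

from urllib.parse import urljoin, urlparse

def ordered_candidates(base_url: str, priority: list[str]) -> list[str]:
    # Illustrative only: same-directory then root-level candidates, llms phase first.
    parsed = urlparse(base_url)
    base_dir = parsed.path.rstrip("/")
    if "." in base_dir.split("/")[-1]:
        base_dir = "/".join(base_dir.split("/")[:-1])

    def is_llms(name: str) -> bool:
        return name.startswith(("llms", ".well-known/llms", ".well-known/ai"))

    ordered = [f for f in priority if is_llms(f)] + [f for f in priority if not is_llms(f)]
    candidates: list[str] = []
    for filename in ordered:
        if base_dir and base_dir != "/":
            candidates.append(f"{parsed.scheme}://{parsed.netloc}{base_dir}/{filename}")
        candidates.append(urljoin(base_url, filename))
    return candidates

For "https://supabase.com/docs" this probes /docs/llms.txt and /llms.txt (and the other llms variants) before any sitemap candidate, which is the intent stated in the diff comments.
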
diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py
index fa79ebe3..ac8513fe 100644
--- a/python/src/server/services/crawling/helpers/url_handler.py
+++ b/python/src/server/services/crawling/helpers/url_handler.py
@@ -634,6 +634,10 @@ class URLHandler:
"""
Check if a URL is a llms.txt/llms.md variant with error handling.
+ Matches:
+ - Exact filename matches: llms.txt, llms-full.txt, llms.md, etc.
+ - Files in /llms/ directories: /llms/guides.txt, /llms/swift.txt, etc.
+
Args:
url: URL to check
@@ -646,9 +650,16 @@ class URLHandler:
path = parsed.path.lower()
filename = path.split('/')[-1] if '/' in path else path
- # Check for llms file variants
+ # Check for exact llms file variants (llms.txt, llms.md, etc.)
llms_variants = ['llms-full.txt', 'llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown']
- return filename in llms_variants
+ if filename in llms_variants:
+ return True
+
+ # Check for .txt files in /llms/ directory (e.g., /llms/guides.txt, /llms/swift.txt)
+ if '/llms/' in path and path.endswith('.txt'):
+ return True
+
+ return False
except Exception as e:
logger.warning(f"Error checking if URL is llms variant: {e}", exc_info=True)
return False
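
Hedged examples of what the widened matcher accepts; the import path is inferred from the diff and the zero-argument URLHandler() construction is an assumption:

from src.server.services.crawling.helpers.url_handler import URLHandler

handler = URLHandler()  # construction assumed; adjust to the real initializer
assert handler.is_llms_variant("https://supabase.com/llms.txt")            # exact filename
assert handler.is_llms_variant("https://supabase.com/llms-full.txt")       # known variant
assert handler.is_llms_variant("https://supabase.com/llms/guides.txt")     # .txt under an /llms/ directory
assert not handler.is_llms_variant("https://supabase.com/docs/guide.pdf")  # unrelated file
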
diff --git a/python/tests/test_crawling_service_subdomain.py b/python/tests/test_crawling_service_subdomain.py
new file mode 100644
index 00000000..543423c8
--- /dev/null
+++ b/python/tests/test_crawling_service_subdomain.py
@@ -0,0 +1,152 @@
+"""Unit tests for CrawlingService subdomain checking functionality."""
+import pytest
+from src.server.services.crawling.crawling_service import CrawlingService
+
+
+class TestCrawlingServiceSubdomain:
+ """Test suite for CrawlingService subdomain checking methods."""
+
+ @pytest.fixture
+ def service(self):
+ """Create a CrawlingService instance for testing."""
+ # Create service without crawler or supabase for testing domain checking
+ return CrawlingService(crawler=None, supabase_client=None)
+
+ def test_is_same_domain_or_subdomain_exact_match(self, service):
+ """Test exact domain matches."""
+ # Same domain should match
+ assert service._is_same_domain_or_subdomain(
+ "https://supabase.com/docs",
+ "https://supabase.com"
+ ) is True
+
+ assert service._is_same_domain_or_subdomain(
+ "https://supabase.com/path/to/page",
+ "https://supabase.com"
+ ) is True
+
+ def test_is_same_domain_or_subdomain_subdomains(self, service):
+ """Test subdomain matching."""
+ # Subdomain should match
+ assert service._is_same_domain_or_subdomain(
+ "https://docs.supabase.com/llms.txt",
+ "https://supabase.com"
+ ) is True
+
+ assert service._is_same_domain_or_subdomain(
+ "https://api.supabase.com/v1/endpoint",
+ "https://supabase.com"
+ ) is True
+
+ # Multiple subdomain levels
+ assert service._is_same_domain_or_subdomain(
+ "https://dev.api.supabase.com/test",
+ "https://supabase.com"
+ ) is True
+
+ def test_is_same_domain_or_subdomain_different_domains(self, service):
+ """Test that different domains are rejected."""
+ # Different domain should not match
+ assert service._is_same_domain_or_subdomain(
+ "https://external.com/llms.txt",
+ "https://supabase.com"
+ ) is False
+
+ assert service._is_same_domain_or_subdomain(
+ "https://docs.other-site.com",
+ "https://supabase.com"
+ ) is False
+
+ # Similar but different domains
+ assert service._is_same_domain_or_subdomain(
+ "https://supabase.org",
+ "https://supabase.com"
+ ) is False
+
+ def test_is_same_domain_or_subdomain_protocols(self, service):
+ """Test that protocol differences don't affect matching."""
+ # Different protocols should still match
+ assert service._is_same_domain_or_subdomain(
+ "http://supabase.com/docs",
+ "https://supabase.com"
+ ) is True
+
+ assert service._is_same_domain_or_subdomain(
+ "https://docs.supabase.com",
+ "http://supabase.com"
+ ) is True
+
+ def test_is_same_domain_or_subdomain_ports(self, service):
+ """Test handling of port numbers."""
+ # Same root domain with different ports should match
+ assert service._is_same_domain_or_subdomain(
+ "https://supabase.com:8080/api",
+ "https://supabase.com"
+ ) is True
+
+ assert service._is_same_domain_or_subdomain(
+ "http://localhost:3000/dev",
+ "http://localhost:8080"
+ ) is True
+
+ def test_is_same_domain_or_subdomain_edge_cases(self, service):
+ """Test edge cases and error handling."""
+ # Empty or malformed URLs should return False
+ assert service._is_same_domain_or_subdomain(
+ "",
+ "https://supabase.com"
+ ) is False
+
+ assert service._is_same_domain_or_subdomain(
+ "https://supabase.com",
+ ""
+ ) is False
+
+ assert service._is_same_domain_or_subdomain(
+ "not-a-url",
+ "https://supabase.com"
+ ) is False
+
+ def test_is_same_domain_or_subdomain_real_world_examples(self, service):
+ """Test with real-world examples."""
+ # GitHub examples
+ assert service._is_same_domain_or_subdomain(
+ "https://api.github.com/repos",
+ "https://github.com"
+ ) is True
+
+ assert service._is_same_domain_or_subdomain(
+ "https://raw.githubusercontent.com/owner/repo",
+ "https://github.com"
+ ) is False # githubusercontent.com is different root domain
+
+ # Documentation sites
+ assert service._is_same_domain_or_subdomain(
+ "https://docs.python.org/3/library",
+ "https://python.org"
+ ) is True
+
+ assert service._is_same_domain_or_subdomain(
+ "https://api.stripe.com/v1",
+ "https://stripe.com"
+ ) is True
+
+ def test_is_same_domain_backward_compatibility(self, service):
+ """Test that _is_same_domain still works correctly for exact matches."""
+ # Exact domain match should work
+ assert service._is_same_domain(
+ "https://supabase.com/docs",
+ "https://supabase.com"
+ ) is True
+
+ # Subdomain should NOT match with _is_same_domain (only with _is_same_domain_or_subdomain)
+ assert service._is_same_domain(
+ "https://docs.supabase.com/llms.txt",
+ "https://supabase.com"
+ ) is False
+
+ # Different domain should not match
+ assert service._is_same_domain(
+ "https://external.com/llms.txt",
+ "https://supabase.com"
+ ) is False
diff --git a/python/tests/test_llms_txt_link_following.py b/python/tests/test_llms_txt_link_following.py
new file mode 100644
index 00000000..93cabb15
--- /dev/null
+++ b/python/tests/test_llms_txt_link_following.py
@@ -0,0 +1,217 @@
+"""Integration tests for llms.txt link following functionality."""
+import pytest
+from unittest.mock import AsyncMock, MagicMock, patch
+from src.server.services.crawling.crawling_service import CrawlingService
+
+
+class TestLlmsTxtLinkFollowing:
+ """Test suite for llms.txt link following feature."""
+
+ @pytest.fixture
+ def service(self):
+ """Create a CrawlingService instance for testing."""
+ return CrawlingService(crawler=None, supabase_client=None)
+
+ @pytest.fixture
+ def supabase_llms_content(self):
+ """Return the actual Supabase llms.txt content."""
+ return """# Supabase Docs
+
+- [Supabase Guides](https://supabase.com/llms/guides.txt)
+- [Supabase Reference (JavaScript)](https://supabase.com/llms/js.txt)
+- [Supabase Reference (Dart)](https://supabase.com/llms/dart.txt)
+- [Supabase Reference (Swift)](https://supabase.com/llms/swift.txt)
+- [Supabase Reference (Kotlin)](https://supabase.com/llms/kotlin.txt)
+- [Supabase Reference (Python)](https://supabase.com/llms/python.txt)
+- [Supabase Reference (C#)](https://supabase.com/llms/csharp.txt)
+- [Supabase CLI Reference](https://supabase.com/llms/cli.txt)
+"""
+
+ def test_extract_links_from_supabase_llms_txt(self, service, supabase_llms_content):
+ """Test that links are correctly extracted from Supabase llms.txt."""
+ url = "https://supabase.com/docs/llms.txt"
+
+ extracted_links = service.url_handler.extract_markdown_links_with_text(
+ supabase_llms_content, url
+ )
+
+ # Should extract 8 links
+ assert len(extracted_links) == 8
+
+ # Verify all extracted links
+ expected_links = [
+ "https://supabase.com/llms/guides.txt",
+ "https://supabase.com/llms/js.txt",
+ "https://supabase.com/llms/dart.txt",
+ "https://supabase.com/llms/swift.txt",
+ "https://supabase.com/llms/kotlin.txt",
+ "https://supabase.com/llms/python.txt",
+ "https://supabase.com/llms/csharp.txt",
+ "https://supabase.com/llms/cli.txt",
+ ]
+
+ extracted_urls = [link for link, _ in extracted_links]
+ assert extracted_urls == expected_links
+
+ def test_all_links_are_llms_variants(self, service, supabase_llms_content):
+ """Test that all extracted links are recognized as llms.txt variants."""
+ url = "https://supabase.com/docs/llms.txt"
+
+ extracted_links = service.url_handler.extract_markdown_links_with_text(
+ supabase_llms_content, url
+ )
+
+ # All links should be recognized as llms variants
+ for link, _ in extracted_links:
+ is_llms = service.url_handler.is_llms_variant(link)
+ assert is_llms, f"Link {link} should be recognized as llms.txt variant"
+
+ def test_all_links_are_same_domain(self, service, supabase_llms_content):
+ """Test that all extracted links are from the same domain."""
+ url = "https://supabase.com/docs/llms.txt"
+ original_domain = "https://supabase.com"
+
+ extracted_links = service.url_handler.extract_markdown_links_with_text(
+ supabase_llms_content, url
+ )
+
+ # All links should be from the same domain
+ for link, _ in extracted_links:
+ is_same = service._is_same_domain_or_subdomain(link, original_domain)
+ assert is_same, f"Link {link} should match domain {original_domain}"
+
+ def test_filter_llms_links_from_supabase(self, service, supabase_llms_content):
+ """Test the complete filtering logic for Supabase llms.txt."""
+ url = "https://supabase.com/docs/llms.txt"
+ original_domain = "https://supabase.com"
+
+ # Extract all links
+ extracted_links = service.url_handler.extract_markdown_links_with_text(
+ supabase_llms_content, url
+ )
+
+ # Filter for llms.txt files on same domain (mimics actual code)
+ llms_links = []
+ for link, text in extracted_links:
+ if service.url_handler.is_llms_variant(link):
+ if service._is_same_domain_or_subdomain(link, original_domain):
+ llms_links.append((link, text))
+
+ # Should have all 8 links
+ assert len(llms_links) == 8, f"Expected 8 llms links, got {len(llms_links)}"
+
+ @pytest.mark.asyncio
+ async def test_llms_txt_link_following_integration(self, service, supabase_llms_content):
+ """Integration test for the complete llms.txt link following flow."""
+ url = "https://supabase.com/docs/llms.txt"
+
+ # Mock the crawl_batch_with_progress to verify it's called with correct URLs
+ mock_batch_results = [
+ {'url': f'https://supabase.com/llms/{name}.txt', 'markdown': f'# {name}', 'title': f'{name}'}
+ for name in ['guides', 'js', 'dart', 'swift', 'kotlin', 'python', 'csharp', 'cli']
+ ]
+
+ service.crawl_batch_with_progress = AsyncMock(return_value=mock_batch_results)
+ service.crawl_markdown_file = AsyncMock(return_value=[{
+ 'url': url,
+ 'markdown': supabase_llms_content,
+ 'title': 'Supabase Docs'
+ }])
+
+ # Create progress tracker mock
+ service.progress_tracker = MagicMock()
+ service.progress_tracker.update = AsyncMock()
+
+ # Simulate the request that would come from orchestration
+ request = {
+ "is_discovery_target": True,
+ "original_domain": "https://supabase.com",
+ "max_concurrent": 5
+ }
+
+ # Call the actual crawl method
+ crawl_results, crawl_type = await service._crawl_by_url_type(url, request)
+
+ # Verify batch crawl was called with the 8 llms.txt URLs
+ service.crawl_batch_with_progress.assert_called_once()
+ call_args = service.crawl_batch_with_progress.call_args
+ crawled_urls = call_args[0][0] # First positional argument
+
+ assert len(crawled_urls) == 8, f"Should crawl 8 linked files, got {len(crawled_urls)}"
+
+ expected_urls = [
+ "https://supabase.com/llms/guides.txt",
+ "https://supabase.com/llms/js.txt",
+ "https://supabase.com/llms/dart.txt",
+ "https://supabase.com/llms/swift.txt",
+ "https://supabase.com/llms/kotlin.txt",
+ "https://supabase.com/llms/python.txt",
+ "https://supabase.com/llms/csharp.txt",
+ "https://supabase.com/llms/cli.txt",
+ ]
+
+ assert set(crawled_urls) == set(expected_urls)
+
+ # Verify total results include main file + linked files
+ assert len(crawl_results) == 9, f"Should have 9 total files (1 main + 8 linked), got {len(crawl_results)}"
+
+ # Verify crawl type
+ assert crawl_type == "llms_txt_with_linked_files"
+
+ def test_external_llms_links_are_filtered(self, service):
+ """Test that external domain llms.txt links are filtered out."""
+ content = """# Test llms.txt
+
+- [Internal Link](https://supabase.com/llms/internal.txt)
+- [External Link](https://external.com/llms/external.txt)
+- [Another Internal](https://docs.supabase.com/llms/docs.txt)
+"""
+ url = "https://supabase.com/llms.txt"
+ original_domain = "https://supabase.com"
+
+ extracted_links = service.url_handler.extract_markdown_links_with_text(content, url)
+
+ # Filter for same-domain llms links
+ llms_links = []
+ for link, text in extracted_links:
+ if service.url_handler.is_llms_variant(link):
+ if service._is_same_domain_or_subdomain(link, original_domain):
+ llms_links.append((link, text))
+
+ # Should only have 2 links (internal and subdomain), external filtered out
+ assert len(llms_links) == 2
+
+ urls = [link for link, _ in llms_links]
+ assert "https://supabase.com/llms/internal.txt" in urls
+ assert "https://docs.supabase.com/llms/docs.txt" in urls
+ assert "https://external.com/llms/external.txt" not in urls
+
+ def test_non_llms_links_are_filtered(self, service):
+ """Test that non-llms.txt links are filtered out."""
+ content = """# Test llms.txt
+
+- [LLMs Link](https://supabase.com/llms/guide.txt)
+- [Regular Doc](https://supabase.com/docs/guide)
+- [PDF File](https://supabase.com/docs/guide.pdf)
+- [Another LLMs](https://supabase.com/llms/api.txt)
+"""
+ url = "https://supabase.com/llms.txt"
+ original_domain = "https://supabase.com"
+
+ extracted_links = service.url_handler.extract_markdown_links_with_text(content, url)
+
+ # Filter for llms links only
+ llms_links = []
+ for link, text in extracted_links:
+ if service.url_handler.is_llms_variant(link):
+ if service._is_same_domain_or_subdomain(link, original_domain):
+ llms_links.append((link, text))
+
+ # Should only have 2 llms.txt links
+ assert len(llms_links) == 2
+
+ urls = [link for link, _ in llms_links]
+ assert "https://supabase.com/llms/guide.txt" in urls
+ assert "https://supabase.com/llms/api.txt" in urls
+ assert "https://supabase.com/docs/guide" not in urls
+ assert "https://supabase.com/docs/guide.pdf" not in urls