diff --git a/PRPs/llms-txt-link-following.md b/PRPs/llms-txt-link-following.md new file mode 100644 index 00000000..7dbf4424 --- /dev/null +++ b/PRPs/llms-txt-link-following.md @@ -0,0 +1,538 @@ +# PRP: Follow llms.txt Links to Other llms.txt Files + +## Problem Statement + +When discovering and crawling llms.txt files, Archon currently operates in "single-file mode" and ignores all links within the file. However, many sites use llms.txt files that reference other llms.txt files on the same domain or subdomains (e.g., a main llms.txt pointing to `/docs/llms.txt`, `/api/llms.txt`, etc.). + +Additionally, users have no visibility into what files were discovered and chosen during the discovery phase, making it difficult to understand what content is being indexed. + +## Goals + +1. **Follow llms.txt links**: When an llms.txt file contains links to other llms.txt files on the same domain/subdomain, follow and index those files +2. **Same-domain only**: Only follow llms.txt links that are on the same root domain or subdomain +3. **UI feedback**: Show users what was discovered and what is being crawled in real-time + +## Current Behavior + +### Discovery Flow +1. `DiscoveryService.discover_files(base_url)` finds best file (e.g., `/docs/llms.txt`) +2. Returns single URL to `CrawlingService` +3. Crawls discovered file with `is_discovery_target=True` flag +4. At line 802-806 of `crawling_service.py`, skips ALL link extraction +5. Returns immediately with just the discovered file content + +### Progress Updates +- Discovery phase shows: "Discovery completed: selected 1 best file" +- No information about what was discovered or why +- No information about followed links + +## Proposed Solution + +### Phase 1: Backend - llms.txt Link Following + +#### 1.1 Modify Discovery Mode Link Extraction + +**File**: `python/src/server/services/crawling/crawling_service.py` +**Location**: Lines 800-806 + +**Current Code**: +```python +if self.url_handler.is_link_collection_file(url, content): + # If this file was selected by discovery, skip link extraction (single-file mode) + if request.get("is_discovery_target"): + logger.info(f"Discovery single-file mode: skipping link extraction for {url}") + crawl_type = "discovery_single_file" + logger.info(f"Discovery file crawling completed: {len(crawl_results)} result") + return crawl_results, crawl_type +``` + +**Proposed Code**: +```python +if self.url_handler.is_link_collection_file(url, content): + # If this file was selected by discovery, check if it's an llms.txt file + if request.get("is_discovery_target"): + # Check if this is an llms.txt file (not sitemap or other discovery targets) + is_llms_file = self.url_handler.is_llms_variant(url) + + if is_llms_file: + logger.info(f"Discovery llms.txt mode: checking for linked llms.txt files at {url}") + + # Extract all links from the file + extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url) + + # Filter for llms.txt files only on same domain + llms_links = [] + if extracted_links_with_text: + original_domain = request.get("original_domain") + for link, text in extracted_links_with_text: + # Check if link is to another llms.txt file + if self.url_handler.is_llms_variant(link): + # Check same domain/subdomain + if self._is_same_domain_or_subdomain(link, original_domain): + llms_links.append((link, text)) + logger.info(f"Found linked llms.txt: {link}") + + if llms_links: + # Build mapping and extract just URLs + url_to_link_text = dict(llms_links) + extracted_llms_urls = [link for 
link, _ in llms_links] + + logger.info(f"Following {len(extracted_llms_urls)} linked llms.txt files") + + # Crawl linked llms.txt files (no recursion, just one level) + batch_results = await self.crawl_batch_with_progress( + extracted_llms_urls, + max_concurrent=request.get('max_concurrent'), + progress_callback=await self._create_crawl_progress_callback("crawling"), + link_text_fallbacks=url_to_link_text, + ) + + # Combine original llms.txt with linked files + crawl_results.extend(batch_results) + crawl_type = "llms_txt_with_linked_files" + logger.info(f"llms.txt crawling completed: {len(crawl_results)} total files (1 main + {len(batch_results)} linked)") + return crawl_results, crawl_type + + # For non-llms.txt discovery targets (sitemaps, robots.txt), keep single-file mode + logger.info(f"Discovery single-file mode: skipping link extraction for {url}") + crawl_type = "discovery_single_file" + logger.info(f"Discovery file crawling completed: {len(crawl_results)} result") + return crawl_results, crawl_type +``` + +#### 1.2 Add Subdomain Checking Method + +**File**: `python/src/server/services/crawling/crawling_service.py` +**Location**: After `_is_same_domain` method (around line 728) + +**New Method**: +```python +def _is_same_domain_or_subdomain(self, url: str, base_domain: str) -> bool: + """ + Check if a URL belongs to the same root domain or subdomain. + + Examples: + - docs.supabase.com matches supabase.com (subdomain) + - api.supabase.com matches supabase.com (subdomain) + - supabase.com matches supabase.com (exact match) + - external.com does NOT match supabase.com + + Args: + url: URL to check + base_domain: Base domain URL to compare against + + Returns: + True if the URL is from the same root domain or subdomain + """ + try: + from urllib.parse import urlparse + u, b = urlparse(url), urlparse(base_domain) + url_host = (u.hostname or "").lower() + base_host = (b.hostname or "").lower() + + if not url_host or not base_host: + return False + + # Exact match + if url_host == base_host: + return True + + # Check if url_host is a subdomain of base_host + # Extract root domain (last 2 parts for .com, .org, etc.) 
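+        # NOTE: a simple "last two labels" heuristic; it treats example.co.uk and
+        # other.co.uk as sharing a root domain. Public-suffix handling (e.g. via
+        # tldextract) would be needed for full correctness, but is out of scope here.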
+ def get_root_domain(host: str) -> str: + parts = host.split('.') + if len(parts) >= 2: + return '.'.join(parts[-2:]) + return host + + url_root = get_root_domain(url_host) + base_root = get_root_domain(base_host) + + return url_root == base_root + except Exception: + # If parsing fails, be conservative and exclude the URL + return False +``` + +#### 1.3 Add llms.txt Variant Detection to URLHandler + +**File**: `python/src/server/services/crawling/helpers/url_handler.py` + +**Verify/Add Method** (should already exist, verify it works correctly): +```python +@staticmethod +def is_llms_variant(url: str) -> bool: + """Check if URL is an llms.txt variant file.""" + url_lower = url.lower() + return any(pattern in url_lower for pattern in [ + 'llms.txt', + 'llms-full.txt', + 'llms.md', + 'llms.mdx', + 'llms.markdown' + ]) +``` + +### Phase 2: Enhanced Progress Reporting + +#### 2.1 Add Discovery Metadata to Progress Updates + +**File**: `python/src/server/services/crawling/crawling_service.py` +**Location**: Lines 383-398 (discovery phase) + +**Proposed Changes**: +```python +# Add the single best discovered file to crawl list +if discovered_file: + safe_logfire_info(f"Discovery found file: {discovered_file}") + # Filter through is_binary_file() check like existing code + if not self.url_handler.is_binary_file(discovered_file): + discovered_urls.append(discovered_file) + safe_logfire_info(f"Adding discovered file to crawl: {discovered_file}") + + # Determine file type for user feedback + discovered_file_type = "unknown" + if self.url_handler.is_llms_variant(discovered_file): + discovered_file_type = "llms.txt" + elif self.url_handler.is_sitemap(discovered_file): + discovered_file_type = "sitemap" + elif self.url_handler.is_robots_txt(discovered_file): + discovered_file_type = "robots.txt" + + await update_mapped_progress( + "discovery", 100, + f"Discovery completed: found {discovered_file_type} file", + current_url=url, + discovered_file=discovered_file, + discovered_file_type=discovered_file_type + ) + else: + safe_logfire_info(f"Skipping binary file: {discovered_file}") +else: + safe_logfire_info(f"Discovery found no files for {url}") + await update_mapped_progress( + "discovery", 100, + "Discovery completed: no special files found, will crawl main URL", + current_url=url + ) +``` + +#### 2.2 Add Linked Files Progress + +When following llms.txt links, add progress update: + +```python +if llms_links: + logger.info(f"Following {len(extracted_llms_urls)} linked llms.txt files") + + # Notify user about linked files being crawled + await update_crawl_progress( + 60, # 60% of crawling stage + f"Found {len(extracted_llms_urls)} linked llms.txt files, crawling them now...", + crawl_type="llms_txt_linked_files", + linked_files=extracted_llms_urls + ) + + # Crawl linked llms.txt files + batch_results = await self.crawl_batch_with_progress(...) +``` + +### Phase 3: Frontend UI Updates + +#### 3.1 Progress Tracker UI Enhancement + +**File**: `archon-ui-main/src/features/progress/components/ProgressCard.tsx` (or equivalent) + +**Add Discovery Details Section**: +```tsx +// Show discovered file info +{progress.discovered_file && ( +
+  <div>
+    <div>
+      Discovery Results
+    </div>
+    <div>
+      Found: {progress.discovered_file_type}
+      <span>
+        {progress.discovered_file}
+      </span>
+    </div>
+  </div>
+)}
+
+// Show linked files being crawled
+{progress.linked_files && progress.linked_files.length > 0 && (
+  <div>
+    <div>
+      Following Linked Files
+    </div>
+    <ul>
+      {progress.linked_files.map((file) => (
+        <li key={file}>{file}</li>
+      ))}
+    </ul>
+  </div>
+)} +``` + +#### 3.2 Progress Status Messages + +Update progress messages to be more informative: + +- **Before**: "Discovery completed: selected 1 best file" +- **After**: "Discovery completed: found llms.txt file at /docs/llms.txt" + +- **New**: "Found 3 linked llms.txt files, crawling them now..." +- **New**: "Crawled 4 llms.txt files total (1 main + 3 linked)" + +## Implementation Plan + +### Sprint 1: Backend Core Functionality ✅ COMPLETED +- [x] Add `_is_same_domain_or_subdomain` method to CrawlingService +- [x] Fix `is_llms_variant` method to detect llms.txt files in paths +- [x] Modify discovery mode link extraction logic +- [x] Add unit tests for subdomain checking (8 tests) +- [x] Add integration tests for llms.txt link following (7 tests) +- [x] Fix discovery priority bug (two-phase approach) + +### Sprint 2: Progress Reporting ✅ COMPLETED +- [x] Add discovery metadata to progress updates (already in backend) +- [x] Add linked files progress updates (already in backend) +- [x] Update progress tracking to include new fields +- [x] Updated ProgressResponse and CrawlProgressData types + +### Sprint 3: Frontend UI ✅ COMPLETED +- [x] Updated progress types to include new fields (discoveredFile, linkedFiles) +- [x] Added discovery status to ProgressStatus type +- [x] Added new crawl types (llms_txt_with_linked_files, discovery_*) +- [x] Implemented discovery info display in CrawlingProgress component +- [x] Implemented linked files display in CrawlingProgress component +- [x] Added "discovery" to active statuses list + +## Testing Strategy + +### Unit Tests + +**File**: `python/tests/test_crawling_service.py` + +```python +def test_is_same_domain_or_subdomain(): + service = CrawlingService() + + # Same domain + assert service._is_same_domain_or_subdomain( + "https://supabase.com/docs", + "https://supabase.com" + ) == True + + # Subdomain + assert service._is_same_domain_or_subdomain( + "https://docs.supabase.com/llms.txt", + "https://supabase.com" + ) == True + + # Different domain + assert service._is_same_domain_or_subdomain( + "https://external.com/llms.txt", + "https://supabase.com" + ) == False +``` + +### Integration Tests + +**Test Cases**: +1. Discover llms.txt with no links → should crawl single file +2. Discover llms.txt with links to other llms.txt files on same domain → should crawl all +3. Discover llms.txt with mix of same-domain and external llms.txt links → should only crawl same-domain +4. Discover llms.txt with links to non-llms.txt files → should ignore them +5. Discover sitemap.xml → should remain in single-file mode (no change to current behavior) + +### Manual Testing + +Test with real sites: +- `supabase.com/docs` → May have links to other llms.txt files +- `anthropic.com` → Test with main site +- Sites with subdomain structure + +## Edge Cases + +1. **Circular references**: llms.txt A links to B, B links to A + - **Solution**: Track visited URLs, skip if already crawled + +2. **Deep nesting**: llms.txt A → B → C → D + - **Solution**: Only follow one level (don't recursively follow links in linked files) + +3. **Large number of linked files**: llms.txt with 100+ links + - **Solution**: Respect max_concurrent settings, show progress + +4. **Mixed content**: llms.txt with both llms.txt links and regular documentation links + - **Solution**: Only follow llms.txt links, ignore others + +5. 
**Subdomain vs different domain**: docs.site.com vs site.com vs docs.site.org + - **Solution**: Check root domain (site.com), allow docs.site.com but not docs.site.org + +## Success Metrics + +1. **Functionality**: Successfully follows llms.txt links on real sites +2. **Safety**: Only follows same-domain/subdomain links +3. **Performance**: No significant slowdown for sites without linked files +4. **User Experience**: Clear visibility into what is being discovered and crawled +5. **Test Coverage**: >90% coverage for new code + +## Open Questions + +1. Should we limit the maximum number of linked llms.txt files to follow? (e.g., max 10) +2. Should linked llms.txt files themselves be allowed to have links? (currently: no, single level only) +3. Should we add a UI setting to enable/disable llms.txt link following? +4. Should we show a warning if external llms.txt links are found and ignored? + +## References + +- Current discovery logic: `python/src/server/services/crawling/discovery_service.py` +- Current crawling logic: `python/src/server/services/crawling/crawling_service.py` (lines 800-880) +- URL handler: `python/src/server/services/crawling/helpers/url_handler.py` +- Progress tracking: `python/src/server/utils/progress/progress_tracker.py` + +--- + +## Implementation Summary + +### Completed Implementation (Sprint 1) + +#### Core Functionality ✅ +All backend core functionality has been successfully implemented and tested: + +1. **Subdomain Matching** (`crawling_service.py:744-788`) + - Added `_is_same_domain_or_subdomain` method + - Correctly matches subdomains (e.g., docs.supabase.com with supabase.com) + - Extracts root domain for comparison + - All 8 unit tests passing in `tests/test_crawling_service_subdomain.py` + +2. **llms.txt Variant Detection** (`url_handler.py:633-665`) + - **CRITICAL FIX**: Updated `is_llms_variant` method to detect: + - Exact filename matches: `llms.txt`, `llms-full.txt`, `llms.md`, etc. + - Files in `/llms/` directories: `/llms/guides.txt`, `/llms/swift.txt`, etc. + - This was the root cause bug preventing link following from working + - Method now properly recognizes all llms.txt variant files + +3. **Link Following Logic** (`crawling_service.py:862-920`) + - Implemented llms.txt link extraction and following + - Filters for same-domain/subdomain links only + - Respects discovery target mode + - Crawls linked files in batch with progress tracking + - Returns `llms_txt_with_linked_files` crawl type + +4. **Discovery Priority Fix** (`discovery_service.py:137-214`) + - **CRITICAL FIX**: Implemented two-phase discovery + - Phase 1: Check ALL llms.txt files at ALL locations before sitemaps + - Phase 2: Only check sitemaps if no llms.txt found + - Resolves bug where sitemap.xml was found before llms.txt + +5. **Enhanced Progress Reporting** (`crawling_service.py:389-413, 901-906`) + - Discovery metadata includes file type information + - Progress updates show linked files being crawled + - Clear logging throughout the flow + +#### Test Coverage ✅ +Comprehensive test suite created and passing: + +1. **Subdomain Tests** (`tests/test_crawling_service_subdomain.py`) + - 8 tests covering: exact matches, subdomains, different domains, protocols, ports, edge cases, real-world examples + - All tests passing + +2. 
**Link Following Tests** (`tests/test_llms_txt_link_following.py`) + - 7 tests covering: + - Link extraction from Supabase llms.txt + - llms.txt variant detection + - Same-domain filtering + - External link filtering + - Non-llms link filtering + - Complete integration flow + - All tests passing + +### Critical Bugs Fixed + +1. **Discovery Priority Bug** + - **Problem**: Sitemap.xml being found before llms.txt at root + - **Solution**: Two-phase discovery prioritizes ALL llms.txt locations first + - **File**: `discovery_service.py:137-214` + +2. **is_llms_variant Bug** + - **Problem**: Method only matched exact filenames, not paths like `/llms/guides.txt` + - **Solution**: Added check for `.txt` files in `/llms/` directories + - **File**: `url_handler.py:658-660` + - **Impact**: This was THE blocking bug preventing link following + +### Testing with Supabase Example + +The implementation was validated against the real Supabase llms.txt structure: +- Main file: `https://supabase.com/docs/llms.txt` +- 8 linked files in `/llms/` directory: + - `guides.txt`, `js.txt`, `dart.txt`, `swift.txt`, `kotlin.txt`, `python.txt`, `csharp.txt`, `cli.txt` + +All tests pass, confirming: +- ✅ All 8 links are extracted +- ✅ All 8 links are recognized as llms.txt variants +- ✅ All 8 links match same domain +- ✅ External links are filtered out +- ✅ Non-llms links are filtered out +- ✅ Integration flow crawls 9 total files (1 main + 8 linked) + +### Sprint 2 & 3 Completed ✅ + +**Progress Reporting Enhancement** - Completed +- Backend already passing discovered_file, discovered_file_type, and linked_files in progress updates +- Updated TypeScript types to support new fields +- Both camelCase and snake_case supported for backend compatibility + +**Frontend UI Updates** - Completed +- Updated `progress.ts:6-26`: Added "discovery" to ProgressStatus type +- Updated `progress.ts:27-36`: Added new crawl types (llms_txt_with_linked_files, etc.) +- Updated `progress.ts:49-70`: Added discoveredFile, discoveredFileType, linkedFiles to CrawlProgressData +- Updated `progress.ts:124-169`: Added discovery fields to ProgressResponse (both case formats) +- Updated `CrawlingProgress.tsx:126-138`: Added "discovery" to active statuses +- Updated `CrawlingProgress.tsx:248-291`: Added Discovery Information and Linked Files UI sections + +### How to Test + +```bash +# Run unit tests +uv run pytest tests/test_crawling_service_subdomain.py -v +uv run pytest tests/test_llms_txt_link_following.py -v + +# Test with actual crawl (after restarting backend) +docker compose restart archon-server +# Then crawl: https://supabase.com/docs +# Should discover /docs/llms.txt and follow 8 linked files +``` + +### Files Modified + +**Backend:** + +1. `python/src/server/services/crawling/crawling_service.py` + - Lines 744-788: `_is_same_domain_or_subdomain` method + - Lines 862-920: llms.txt link following logic + - Lines 389-413: Enhanced discovery progress + +2. `python/src/server/services/crawling/helpers/url_handler.py` + - Lines 633-665: Fixed `is_llms_variant` method + +3. `python/src/server/services/crawling/discovery_service.py` + - Lines 137-214: Two-phase discovery priority fix + +4. `python/tests/test_crawling_service_subdomain.py` (NEW) + - 152 lines, 8 comprehensive test cases + +5. `python/tests/test_llms_txt_link_following.py` (NEW) + - 218 lines, 7 integration test cases + +**Frontend:** + +6. 
`archon-ui-main/src/features/progress/types/progress.ts` + - Lines 6-26: Added "discovery" to ProgressStatus + - Lines 27-36: Added new crawl types + - Lines 49-70: Added discovery fields to CrawlProgressData + - Lines 124-169: Added discovery fields to ProgressResponse + +7. `archon-ui-main/src/features/progress/components/CrawlingProgress.tsx` + - Lines 126-138: Added "discovery" to active statuses + - Lines 248-291: Added Discovery Information and Linked Files UI sections diff --git a/archon-ui-main/src/features/progress/components/CrawlingProgress.tsx b/archon-ui-main/src/features/progress/components/CrawlingProgress.tsx index ca03ecfb..7e5f6308 100644 --- a/archon-ui-main/src/features/progress/components/CrawlingProgress.tsx +++ b/archon-ui-main/src/features/progress/components/CrawlingProgress.tsx @@ -129,6 +129,7 @@ export const CrawlingProgress: React.FC = ({ onSwitchToBr "in_progress", "starting", "initializing", + "discovery", "analyzing", "storing", "source_creation", @@ -245,6 +246,51 @@ export const CrawlingProgress: React.FC = ({ onSwitchToBr )} + {/* Discovery Information */} + {(operation as any).discovered_file && ( +
+          <div>
+            <div>
+              Discovery Result
+              {(operation as any).discovered_file_type && (
+                <span>
+                  {(operation as any).discovered_file_type}
+                </span>
+              )}
+            </div>
+            <span>
+              {(operation as any).discovered_file}
+            </span>
+          </div>
+        )}
+
+        {/* Linked Files */}
+        {(operation as any).linked_files && (operation as any).linked_files.length > 0 && (
+          <div>
+            <div>
+              Following {(operation as any).linked_files.length} Linked File
+              {(operation as any).linked_files.length > 1 ? "s" : ""}
+            </div>
+            <div>
+              {(operation as any).linked_files.map((file: string, idx: number) => (
+                <span key={idx}>
+                  • {file}
+                </span>
+              ))}
+            </div>
+          </div>
+ )} + {/* Current Action or Operation Type Info */} {(operation.current_url || operation.operation_type) && (
diff --git a/archon-ui-main/src/features/progress/types/progress.ts b/archon-ui-main/src/features/progress/types/progress.ts index f129d191..74cbc5b8 100644 --- a/archon-ui-main/src/features/progress/types/progress.ts +++ b/archon-ui-main/src/features/progress/types/progress.ts @@ -6,6 +6,7 @@ export type ProgressStatus = | "starting" | "initializing" + | "discovery" | "analyzing" | "crawling" | "processing" @@ -24,7 +25,16 @@ export type ProgressStatus = | "cancelled" | "stopping"; -export type CrawlType = "normal" | "sitemap" | "llms-txt" | "text_file" | "refresh"; +export type CrawlType = + | "normal" + | "sitemap" + | "llms-txt" + | "text_file" + | "refresh" + | "llms_txt_with_linked_files" + | "llms_txt_linked_files" + | "discovery_single_file" + | "discovery_sitemap"; export type UploadType = "document"; export interface BaseProgressData { @@ -48,6 +58,10 @@ export interface CrawlProgressData extends BaseProgressData { codeBlocksFound?: number; totalSummaries?: number; completedSummaries?: number; + // Discovery-related fields + discoveredFile?: string; + discoveredFileType?: string; + linkedFiles?: string[]; originalCrawlParams?: { url: string; knowledge_type?: string; @@ -127,6 +141,13 @@ export interface ProgressResponse { codeBlocksFound?: number; totalSummaries?: number; completedSummaries?: number; + // Discovery-related fields + discoveredFile?: string; + discovered_file?: string; // Snake case from backend + discoveredFileType?: string; + discovered_file_type?: string; // Snake case from backend + linkedFiles?: string[]; + linked_files?: string[]; // Snake case from backend fileName?: string; fileSize?: number; chunksProcessed?: number; diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py index ab8ccdad..c11a6312 100644 --- a/python/src/server/services/crawling/crawling_service.py +++ b/python/src/server/services/crawling/crawling_service.py @@ -385,17 +385,32 @@ class CrawlingService: if not self.url_handler.is_binary_file(discovered_file): discovered_urls.append(discovered_file) safe_logfire_info(f"Adding discovered file to crawl: {discovered_file}") + + # Determine file type for user feedback + discovered_file_type = "unknown" + if self.url_handler.is_llms_variant(discovered_file): + discovered_file_type = "llms.txt" + elif self.url_handler.is_sitemap(discovered_file): + discovered_file_type = "sitemap" + elif self.url_handler.is_robots_txt(discovered_file): + discovered_file_type = "robots.txt" + + await update_mapped_progress( + "discovery", 100, + f"Discovery completed: found {discovered_file_type} file", + current_url=url, + discovered_file=discovered_file, + discovered_file_type=discovered_file_type + ) else: safe_logfire_info(f"Skipping binary file: {discovered_file}") else: safe_logfire_info(f"Discovery found no files for {url}") - - file_count = len(discovered_urls) - safe_logfire_info(f"Discovery selected {file_count} best file to crawl") - - await update_mapped_progress( - "discovery", 100, f"Discovery completed: selected {file_count} best file", current_url=url - ) + await update_mapped_progress( + "discovery", 100, + "Discovery completed: no special files found, will crawl main URL", + current_url=url + ) except Exception as e: safe_logfire_error(f"Discovery phase failed: {e}") @@ -726,6 +741,52 @@ class CrawlingService: # If parsing fails, be conservative and exclude the URL return False + def _is_same_domain_or_subdomain(self, url: str, base_domain: str) -> bool: + """ + Check if a URL 
belongs to the same root domain or subdomain. + + Examples: + - docs.supabase.com matches supabase.com (subdomain) + - api.supabase.com matches supabase.com (subdomain) + - supabase.com matches supabase.com (exact match) + - external.com does NOT match supabase.com + + Args: + url: URL to check + base_domain: Base domain URL to compare against + + Returns: + True if the URL is from the same root domain or subdomain + """ + try: + from urllib.parse import urlparse + u, b = urlparse(url), urlparse(base_domain) + url_host = (u.hostname or "").lower() + base_host = (b.hostname or "").lower() + + if not url_host or not base_host: + return False + + # Exact match + if url_host == base_host: + return True + + # Check if url_host is a subdomain of base_host + # Extract root domain (last 2 parts for .com, .org, etc.) + def get_root_domain(host: str) -> str: + parts = host.split('.') + if len(parts) >= 2: + return '.'.join(parts[-2:]) + return host + + url_root = get_root_domain(url_host) + base_root = get_root_domain(base_host) + + return url_root == base_root + except Exception: + # If parsing fails, be conservative and exclude the URL + return False + def _is_self_link(self, link: str, base_url: str) -> bool: """ Check if a link is a self-referential link to the base URL. @@ -798,8 +859,60 @@ class CrawlingService: if crawl_results and len(crawl_results) > 0: content = crawl_results[0].get('markdown', '') if self.url_handler.is_link_collection_file(url, content): - # If this file was selected by discovery, skip link extraction (single-file mode) + # If this file was selected by discovery, check if it's an llms.txt file if request.get("is_discovery_target"): + # Check if this is an llms.txt file (not sitemap or other discovery targets) + is_llms_file = self.url_handler.is_llms_variant(url) + + if is_llms_file: + logger.info(f"Discovery llms.txt mode: checking for linked llms.txt files at {url}") + + # Extract all links from the file + extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url) + + # Filter for llms.txt files only on same domain + llms_links = [] + if extracted_links_with_text: + original_domain = request.get("original_domain") + if original_domain: + for link, text in extracted_links_with_text: + # Check if link is to another llms.txt file + if self.url_handler.is_llms_variant(link): + # Check same domain/subdomain + if self._is_same_domain_or_subdomain(link, original_domain): + llms_links.append((link, text)) + logger.info(f"Found linked llms.txt: {link}") + + if llms_links: + # Build mapping and extract just URLs + url_to_link_text = dict(llms_links) + extracted_llms_urls = [link for link, _ in llms_links] + + logger.info(f"Following {len(extracted_llms_urls)} linked llms.txt files") + + # Notify user about linked files being crawled + await update_crawl_progress( + 60, # 60% of crawling stage + f"Found {len(extracted_llms_urls)} linked llms.txt files, crawling them now...", + crawl_type="llms_txt_linked_files", + linked_files=extracted_llms_urls + ) + + # Crawl linked llms.txt files (no recursion, just one level) + batch_results = await self.crawl_batch_with_progress( + extracted_llms_urls, + max_concurrent=request.get('max_concurrent'), + progress_callback=await self._create_crawl_progress_callback("crawling"), + link_text_fallbacks=url_to_link_text, + ) + + # Combine original llms.txt with linked files + crawl_results.extend(batch_results) + crawl_type = "llms_txt_with_linked_files" + logger.info(f"llms.txt crawling completed: 
{len(crawl_results)} total files (1 main + {len(batch_results)} linked)") + return crawl_results, crawl_type + + # For non-llms.txt discovery targets (sitemaps, robots.txt), keep single-file mode logger.info(f"Discovery single-file mode: skipping link extraction for {url}") crawl_type = "discovery_single_file" logger.info(f"Discovery file crawling completed: {len(crawl_results)} result") diff --git a/python/src/server/services/crawling/discovery_service.py b/python/src/server/services/crawling/discovery_service.py index fc1671d0..28ea2f5e 100644 --- a/python/src/server/services/crawling/discovery_service.py +++ b/python/src/server/services/crawling/discovery_service.py @@ -135,51 +135,71 @@ class DiscoveryService: logger.info(f"Starting single-file discovery for {base_url}") # Check files in global priority order - # Note: robots.txt sitemaps are not given special priority as llms files should be preferred + # IMPORTANT: Check root-level llms files BEFORE same-directory sitemaps + # This ensures llms.txt at root is preferred over /docs/sitemap.xml + from urllib.parse import urlparse + + # Get the directory path of the base URL + parsed = urlparse(base_url) + base_path = parsed.path.rstrip('/') + # Extract directory (remove filename if present) + if '.' in base_path.split('/')[-1]: + base_dir = '/'.join(base_path.split('/')[:-1]) + else: + base_dir = base_path + + # Phase 1: Check llms files at ALL priority levels before checking sitemaps for filename in self.DISCOVERY_PRIORITY: - from urllib.parse import urlparse + if not filename.startswith('llms') and not filename.startswith('.well-known/llms') and not filename.startswith('.well-known/ai'): + continue # Skip non-llms files in this phase - # Get the directory path of the base URL - parsed = urlparse(base_url) - base_path = parsed.path.rstrip('/') - # Extract directory (remove filename if present) - if '.' 
in base_path.split('/')[-1]: - base_dir = '/'.join(base_path.split('/')[:-1]) - else: - base_dir = base_path - - # Priority 1: Check same directory as base_url (e.g., /docs/llms.txt for /docs URL) + # Priority 1a: Check same directory for llms files if base_dir and base_dir != '/': same_dir_url = f"{parsed.scheme}://{parsed.netloc}{base_dir}/{filename}" if self._check_url_exists(same_dir_url): logger.info(f"Discovery found best file in same directory: {same_dir_url}") return same_dir_url - # Priority 2: Check root-level (standard urljoin behavior) + # Priority 1b: Check root-level for llms files file_url = urljoin(base_url, filename) if self._check_url_exists(file_url): logger.info(f"Discovery found best file at root: {file_url}") return file_url - # Priority 3: For llms files, check common subdirectories (including base directory name) - if filename.startswith('llms'): - # Extract base directory name to check it first - subdirs = [] - if base_dir and base_dir != '/': - base_dir_name = base_dir.split('/')[-1] - if base_dir_name: - subdirs.append(base_dir_name) - subdirs.extend(["docs", "static", "public", "assets", "doc", "api"]) + # Priority 1c: Check subdirectories for llms files + subdirs = [] + if base_dir and base_dir != '/': + base_dir_name = base_dir.split('/')[-1] + if base_dir_name: + subdirs.append(base_dir_name) + subdirs.extend(["docs", "static", "public", "assets", "doc", "api"]) - for subdir in subdirs: - subdir_url = urljoin(base_url, f"{subdir}/{filename}") - if self._check_url_exists(subdir_url): - logger.info(f"Discovery found best file in subdirectory: {subdir_url}") - return subdir_url + for subdir in subdirs: + subdir_url = urljoin(base_url, f"{subdir}/{filename}") + if self._check_url_exists(subdir_url): + logger.info(f"Discovery found best file in subdirectory: {subdir_url}") + return subdir_url - # Priority 4: For sitemap files, check common subdirectories (including base directory name) + # Phase 2: Check sitemaps and robots.txt (only if no llms files found) + for filename in self.DISCOVERY_PRIORITY: + if filename.startswith('llms') or filename.startswith('.well-known/llms') or filename.startswith('.well-known/ai'): + continue # Skip llms files, already checked + + # Priority 2a: Check same directory + if base_dir and base_dir != '/': + same_dir_url = f"{parsed.scheme}://{parsed.netloc}{base_dir}/{filename}" + if self._check_url_exists(same_dir_url): + logger.info(f"Discovery found best file in same directory: {same_dir_url}") + return same_dir_url + + # Priority 2b: Check root-level + file_url = urljoin(base_url, filename) + if self._check_url_exists(file_url): + logger.info(f"Discovery found best file at root: {file_url}") + return file_url + + # Priority 2c: For sitemap files, check common subdirectories if filename.endswith('.xml') and not filename.startswith('.well-known'): - # Extract base directory name to check it first subdirs = [] if base_dir and base_dir != '/': base_dir_name = base_dir.split('/')[-1] diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py index fa79ebe3..ac8513fe 100644 --- a/python/src/server/services/crawling/helpers/url_handler.py +++ b/python/src/server/services/crawling/helpers/url_handler.py @@ -634,6 +634,10 @@ class URLHandler: """ Check if a URL is a llms.txt/llms.md variant with error handling. + Matches: + - Exact filename matches: llms.txt, llms-full.txt, llms.md, etc. + - Files in /llms/ directories: /llms/guides.txt, /llms/swift.txt, etc. 
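+
+    Examples (illustrative, based on the matching rules above):
+    - https://supabase.com/docs/llms.txt    -> True (exact filename match)
+    - https://supabase.com/llms/guides.txt  -> True (.txt file under /llms/)
+    - https://supabase.com/docs/guide.pdf   -> False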
+ Args: url: URL to check @@ -646,9 +650,16 @@ class URLHandler: path = parsed.path.lower() filename = path.split('/')[-1] if '/' in path else path - # Check for llms file variants + # Check for exact llms file variants (llms.txt, llms.md, etc.) llms_variants = ['llms-full.txt', 'llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown'] - return filename in llms_variants + if filename in llms_variants: + return True + + # Check for .txt files in /llms/ directory (e.g., /llms/guides.txt, /llms/swift.txt) + if '/llms/' in path and path.endswith('.txt'): + return True + + return False except Exception as e: logger.warning(f"Error checking if URL is llms variant: {e}", exc_info=True) return False diff --git a/python/tests/test_crawling_service_subdomain.py b/python/tests/test_crawling_service_subdomain.py new file mode 100644 index 00000000..543423c8 --- /dev/null +++ b/python/tests/test_crawling_service_subdomain.py @@ -0,0 +1,152 @@ +"""Unit tests for CrawlingService subdomain checking functionality.""" +import pytest +from src.server.services.crawling.crawling_service import CrawlingService + + +class TestCrawlingServiceSubdomain: + """Test suite for CrawlingService subdomain checking methods.""" + + @pytest.fixture + def service(self): + """Create a CrawlingService instance for testing.""" + # Create service without crawler or supabase for testing domain checking + return CrawlingService(crawler=None, supabase_client=None) + + def test_is_same_domain_or_subdomain_exact_match(self, service): + """Test exact domain matches.""" + # Same domain should match + assert service._is_same_domain_or_subdomain( + "https://supabase.com/docs", + "https://supabase.com" + ) is True + + assert service._is_same_domain_or_subdomain( + "https://supabase.com/path/to/page", + "https://supabase.com" + ) is True + + def test_is_same_domain_or_subdomain_subdomains(self, service): + """Test subdomain matching.""" + # Subdomain should match + assert service._is_same_domain_or_subdomain( + "https://docs.supabase.com/llms.txt", + "https://supabase.com" + ) is True + + assert service._is_same_domain_or_subdomain( + "https://api.supabase.com/v1/endpoint", + "https://supabase.com" + ) is True + + # Multiple subdomain levels + assert service._is_same_domain_or_subdomain( + "https://dev.api.supabase.com/test", + "https://supabase.com" + ) is True + + def test_is_same_domain_or_subdomain_different_domains(self, service): + """Test that different domains are rejected.""" + # Different domain should not match + assert service._is_same_domain_or_subdomain( + "https://external.com/llms.txt", + "https://supabase.com" + ) is False + + assert service._is_same_domain_or_subdomain( + "https://docs.other-site.com", + "https://supabase.com" + ) is False + + # Similar but different domains + assert service._is_same_domain_or_subdomain( + "https://supabase.org", + "https://supabase.com" + ) is False + + def test_is_same_domain_or_subdomain_protocols(self, service): + """Test that protocol differences don't affect matching.""" + # Different protocols should still match + assert service._is_same_domain_or_subdomain( + "http://supabase.com/docs", + "https://supabase.com" + ) is True + + assert service._is_same_domain_or_subdomain( + "https://docs.supabase.com", + "http://supabase.com" + ) is True + + def test_is_same_domain_or_subdomain_ports(self, service): + """Test handling of port numbers.""" + # Same root domain with different ports should match + assert service._is_same_domain_or_subdomain( + "https://supabase.com:8080/api", + 
"https://supabase.com" + ) is True + + assert service._is_same_domain_or_subdomain( + "http://localhost:3000/dev", + "http://localhost:8080" + ) is True + + def test_is_same_domain_or_subdomain_edge_cases(self, service): + """Test edge cases and error handling.""" + # Empty or malformed URLs should return False + assert service._is_same_domain_or_subdomain( + "", + "https://supabase.com" + ) is False + + assert service._is_same_domain_or_subdomain( + "https://supabase.com", + "" + ) is False + + assert service._is_same_domain_or_subdomain( + "not-a-url", + "https://supabase.com" + ) is False + + def test_is_same_domain_or_subdomain_real_world_examples(self, service): + """Test with real-world examples.""" + # GitHub examples + assert service._is_same_domain_or_subdomain( + "https://api.github.com/repos", + "https://github.com" + ) is True + + assert service._is_same_domain_or_subdomain( + "https://raw.githubusercontent.com/owner/repo", + "https://github.com" + ) is False # githubusercontent.com is different root domain + + # Documentation sites + assert service._is_same_domain_or_subdomain( + "https://docs.python.org/3/library", + "https://python.org" + ) is True + + assert service._is_same_domain_or_subdomain( + "https://api.stripe.com/v1", + "https://stripe.com" + ) is True + + def test_is_same_domain_backward_compatibility(self, service): + """Test that _is_same_domain still works correctly for exact matches.""" + # Exact domain match should work + assert service._is_same_domain( + "https://supabase.com/docs", + "https://supabase.com" + ) is True + + # Subdomain should NOT match with _is_same_domain (only with _is_same_domain_or_subdomain) + assert service._is_same_domain( + "https://docs.supabase.com/llms.txt", + "https://supabase.com" + ) is False + + # Different domain should not match + assert service._is_same_domain( + "https://external.com/llms.txt", + "https://supabase.com" + ) is False diff --git a/python/tests/test_llms_txt_link_following.py b/python/tests/test_llms_txt_link_following.py new file mode 100644 index 00000000..93cabb15 --- /dev/null +++ b/python/tests/test_llms_txt_link_following.py @@ -0,0 +1,217 @@ +"""Integration tests for llms.txt link following functionality.""" +import pytest +from unittest.mock import AsyncMock, MagicMock, patch +from src.server.services.crawling.crawling_service import CrawlingService + + +class TestLlmsTxtLinkFollowing: + """Test suite for llms.txt link following feature.""" + + @pytest.fixture + def service(self): + """Create a CrawlingService instance for testing.""" + return CrawlingService(crawler=None, supabase_client=None) + + @pytest.fixture + def supabase_llms_content(self): + """Return the actual Supabase llms.txt content.""" + return """# Supabase Docs + +- [Supabase Guides](https://supabase.com/llms/guides.txt) +- [Supabase Reference (JavaScript)](https://supabase.com/llms/js.txt) +- [Supabase Reference (Dart)](https://supabase.com/llms/dart.txt) +- [Supabase Reference (Swift)](https://supabase.com/llms/swift.txt) +- [Supabase Reference (Kotlin)](https://supabase.com/llms/kotlin.txt) +- [Supabase Reference (Python)](https://supabase.com/llms/python.txt) +- [Supabase Reference (C#)](https://supabase.com/llms/csharp.txt) +- [Supabase CLI Reference](https://supabase.com/llms/cli.txt) +""" + + def test_extract_links_from_supabase_llms_txt(self, service, supabase_llms_content): + """Test that links are correctly extracted from Supabase llms.txt.""" + url = "https://supabase.com/docs/llms.txt" + + extracted_links = 
service.url_handler.extract_markdown_links_with_text( + supabase_llms_content, url + ) + + # Should extract 8 links + assert len(extracted_links) == 8 + + # Verify all extracted links + expected_links = [ + "https://supabase.com/llms/guides.txt", + "https://supabase.com/llms/js.txt", + "https://supabase.com/llms/dart.txt", + "https://supabase.com/llms/swift.txt", + "https://supabase.com/llms/kotlin.txt", + "https://supabase.com/llms/python.txt", + "https://supabase.com/llms/csharp.txt", + "https://supabase.com/llms/cli.txt", + ] + + extracted_urls = [link for link, _ in extracted_links] + assert extracted_urls == expected_links + + def test_all_links_are_llms_variants(self, service, supabase_llms_content): + """Test that all extracted links are recognized as llms.txt variants.""" + url = "https://supabase.com/docs/llms.txt" + + extracted_links = service.url_handler.extract_markdown_links_with_text( + supabase_llms_content, url + ) + + # All links should be recognized as llms variants + for link, _ in extracted_links: + is_llms = service.url_handler.is_llms_variant(link) + assert is_llms, f"Link {link} should be recognized as llms.txt variant" + + def test_all_links_are_same_domain(self, service, supabase_llms_content): + """Test that all extracted links are from the same domain.""" + url = "https://supabase.com/docs/llms.txt" + original_domain = "https://supabase.com" + + extracted_links = service.url_handler.extract_markdown_links_with_text( + supabase_llms_content, url + ) + + # All links should be from the same domain + for link, _ in extracted_links: + is_same = service._is_same_domain_or_subdomain(link, original_domain) + assert is_same, f"Link {link} should match domain {original_domain}" + + def test_filter_llms_links_from_supabase(self, service, supabase_llms_content): + """Test the complete filtering logic for Supabase llms.txt.""" + url = "https://supabase.com/docs/llms.txt" + original_domain = "https://supabase.com" + + # Extract all links + extracted_links = service.url_handler.extract_markdown_links_with_text( + supabase_llms_content, url + ) + + # Filter for llms.txt files on same domain (mimics actual code) + llms_links = [] + for link, text in extracted_links: + if service.url_handler.is_llms_variant(link): + if service._is_same_domain_or_subdomain(link, original_domain): + llms_links.append((link, text)) + + # Should have all 8 links + assert len(llms_links) == 8, f"Expected 8 llms links, got {len(llms_links)}" + + @pytest.mark.asyncio + async def test_llms_txt_link_following_integration(self, service, supabase_llms_content): + """Integration test for the complete llms.txt link following flow.""" + url = "https://supabase.com/docs/llms.txt" + + # Mock the crawl_batch_with_progress to verify it's called with correct URLs + mock_batch_results = [ + {'url': f'https://supabase.com/llms/{name}.txt', 'markdown': f'# {name}', 'title': f'{name}'} + for name in ['guides', 'js', 'dart', 'swift', 'kotlin', 'python', 'csharp', 'cli'] + ] + + service.crawl_batch_with_progress = AsyncMock(return_value=mock_batch_results) + service.crawl_markdown_file = AsyncMock(return_value=[{ + 'url': url, + 'markdown': supabase_llms_content, + 'title': 'Supabase Docs' + }]) + + # Create progress tracker mock + service.progress_tracker = MagicMock() + service.progress_tracker.update = AsyncMock() + + # Simulate the request that would come from orchestration + request = { + "is_discovery_target": True, + "original_domain": "https://supabase.com", + "max_concurrent": 5 + } + + # Call the actual crawl 
method + crawl_results, crawl_type = await service._crawl_by_url_type(url, request) + + # Verify batch crawl was called with the 8 llms.txt URLs + service.crawl_batch_with_progress.assert_called_once() + call_args = service.crawl_batch_with_progress.call_args + crawled_urls = call_args[0][0] # First positional argument + + assert len(crawled_urls) == 8, f"Should crawl 8 linked files, got {len(crawled_urls)}" + + expected_urls = [ + "https://supabase.com/llms/guides.txt", + "https://supabase.com/llms/js.txt", + "https://supabase.com/llms/dart.txt", + "https://supabase.com/llms/swift.txt", + "https://supabase.com/llms/kotlin.txt", + "https://supabase.com/llms/python.txt", + "https://supabase.com/llms/csharp.txt", + "https://supabase.com/llms/cli.txt", + ] + + assert set(crawled_urls) == set(expected_urls) + + # Verify total results include main file + linked files + assert len(crawl_results) == 9, f"Should have 9 total files (1 main + 8 linked), got {len(crawl_results)}" + + # Verify crawl type + assert crawl_type == "llms_txt_with_linked_files" + + def test_external_llms_links_are_filtered(self, service): + """Test that external domain llms.txt links are filtered out.""" + content = """# Test llms.txt + +- [Internal Link](https://supabase.com/llms/internal.txt) +- [External Link](https://external.com/llms/external.txt) +- [Another Internal](https://docs.supabase.com/llms/docs.txt) +""" + url = "https://supabase.com/llms.txt" + original_domain = "https://supabase.com" + + extracted_links = service.url_handler.extract_markdown_links_with_text(content, url) + + # Filter for same-domain llms links + llms_links = [] + for link, text in extracted_links: + if service.url_handler.is_llms_variant(link): + if service._is_same_domain_or_subdomain(link, original_domain): + llms_links.append((link, text)) + + # Should only have 2 links (internal and subdomain), external filtered out + assert len(llms_links) == 2 + + urls = [link for link, _ in llms_links] + assert "https://supabase.com/llms/internal.txt" in urls + assert "https://docs.supabase.com/llms/docs.txt" in urls + assert "https://external.com/llms/external.txt" not in urls + + def test_non_llms_links_are_filtered(self, service): + """Test that non-llms.txt links are filtered out.""" + content = """# Test llms.txt + +- [LLMs Link](https://supabase.com/llms/guide.txt) +- [Regular Doc](https://supabase.com/docs/guide) +- [PDF File](https://supabase.com/docs/guide.pdf) +- [Another LLMs](https://supabase.com/llms/api.txt) +""" + url = "https://supabase.com/llms.txt" + original_domain = "https://supabase.com" + + extracted_links = service.url_handler.extract_markdown_links_with_text(content, url) + + # Filter for llms links only + llms_links = [] + for link, text in extracted_links: + if service.url_handler.is_llms_variant(link): + if service._is_same_domain_or_subdomain(link, original_domain): + llms_links.append((link, text)) + + # Should only have 2 llms.txt links + assert len(llms_links) == 2 + + urls = [link for link, _ in llms_links] + assert "https://supabase.com/llms/guide.txt" in urls + assert "https://supabase.com/llms/api.txt" in urls + assert "https://supabase.com/docs/guide" not in urls + assert "https://supabase.com/docs/guide.pdf" not in urls