feat: Improve discovery system with SSRF protection and optimize file detection

## Backend Improvements

### Discovery Service
- Fix SSRF protection: use requests.Session() so the max_redirects limit is enforced (see the sketch after this list)
- Add comprehensive IP validation (_is_safe_ip, _resolve_and_validate_hostname)
- Add hostname DNS resolution validation before requests
- Fix llms.txt link following to crawl ALL same-domain pages (not just llms.txt files)
- Remove unused file variants: llms.md, llms.markdown, sitemap_index.xml, sitemap-index.xml
- Optimize DISCOVERY_PRIORITY based on real-world usage research
- Update priority: llms.txt > llms-full.txt > sitemap.xml > robots.txt
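
A minimal sketch of the redirect and IP handling described above. The helper names `_is_safe_ip` and `_resolve_and_validate_hostname` come from this commit; their bodies and the surrounding fetch function are illustrative assumptions, not the actual implementation:

```python
# Hedged sketch only: mirrors the commit's helper names, not its exact code.
import ipaddress
import socket
from urllib.parse import urlparse

import requests


def _is_safe_ip(ip_str: str) -> bool:
    """Reject loopback, private, link-local, reserved, multicast, and unspecified addresses."""
    ip = ipaddress.ip_address(ip_str.split("%")[0])  # drop IPv6 zone id if present
    return not (
        ip.is_private
        or ip.is_loopback
        or ip.is_link_local
        or ip.is_reserved
        or ip.is_multicast
        or ip.is_unspecified
    )


def _resolve_and_validate_hostname(hostname: str) -> bool:
    """Resolve the hostname via DNS and require every returned address to be safe."""
    infos = socket.getaddrinfo(hostname, None)
    return bool(infos) and all(_is_safe_ip(info[4][0]) for info in infos)


def fetch_discovery_file(url: str, timeout: float = 10.0) -> requests.Response:
    hostname = urlparse(url).hostname or ""
    if not _resolve_and_validate_hostname(hostname):
        raise ValueError(f"Blocked potentially unsafe host: {hostname}")

    # requests.get() has no max_redirects argument, so a Session is used instead.
    session = requests.Session()
    session.max_redirects = 5
    response = session.get(url, timeout=timeout, allow_redirects=True)

    # Redirects can hop to another host; response.history / response.url
    # (the attributes the updated test mocks provide) allow re-validation.
    if response.history:
        final_host = urlparse(response.url).hostname or ""
        if not _resolve_and_validate_hostname(final_host):
            raise ValueError(f"Redirect landed on unsafe host: {final_host}")
    return response
```

Setting `max_redirects` on a `Session` is what makes the limit take effect, since plain `requests.get()` does not accept that parameter.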

### URL Handler
- Fix .well-known path matching to be case-sensitive per RFC 8615
- Remove llms.md, llms.markdown, llms.mdx from variant detection
- Simplify link collection patterns to only .txt files (most common)
- Update llms_variants list to only include spec-compliant files (behavior sketched after this list)
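
The detection rules above reduce to a few lines. Here is a self-contained sketch with hypothetical helper names that mirrors the URLHandler changes in the diff below:

```python
# Standalone sketch (hypothetical helpers), mirroring the simplified detection rules.
from urllib.parse import urlparse

LLMS_VARIANTS = {"llms.txt", "llms-full.txt"}  # spec-compliant files only


def looks_like_llms_variant(url: str) -> bool:
    path = urlparse(url).path.lower()
    filename = path.rsplit("/", 1)[-1]
    return filename in LLMS_VARIANTS


def looks_like_well_known(url: str) -> bool:
    # RFC 8615: /.well-known/ is case-sensitive, so the path is NOT lowercased here
    path = urlparse(url).path
    return path.startswith("/.well-known/") and path.count("/.well-known/") == 1


assert looks_like_llms_variant("https://docs.example.com/llms-full.txt")
assert not looks_like_llms_variant("https://docs.example.com/llms.md")
assert not looks_like_well_known("https://example.com/.Well-Known/security.txt")
```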

### Crawling Service
- Add tldextract for proper root domain extraction (handles .co.uk, .com.au, etc.)
- Replace naive last-two-labels domain extraction with a robust get_root_domain() function (contrast sketched below)
- Add tldextract>=5.0.0 to dependencies
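
The difference from the previous last-two-labels heuristic is easiest to see on a multi-part suffix (illustrative values; tldextract may download and cache the public suffix list on first use):

```python
import tldextract

host = "api.example.co.uk"

# Old heuristic: keep the last two dot-separated labels -- wrong for multi-part suffixes
naive_root = ".".join(host.split(".")[-2:])
print(naive_root)  # co.uk  (just the public suffix, not a registrable domain)

# tldextract consults the public suffix list, so domain + suffix is the real root
ext = tldextract.extract(host)
print(f"{ext.domain}.{ext.suffix}")  # example.co.uk
```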

## Frontend Improvements

### Type Safety
- Extend ActiveOperation type with discovery fields (discovered_file, discovered_file_type, linked_files)
- Remove all type casting (operation as any) from CrawlingProgress component
- Add proper TypeScript types for discovery information

### Security
- Create URL validation utility (urlValidation.ts)
- Only render clickable links for validated HTTP/HTTPS URLs
- Reject unsafe protocols (javascript:, data:, vbscript:, file:)
- Display invalid URLs as plain text instead of links

## Testing
- Update test mocks to include history and url attributes for redirect checking
- Fix .well-known case sensitivity tests (must be lowercase per RFC 8615; see the sketch after this list)
- Update discovery priority tests to match new order
- Remove tests for deprecated file variants
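
Roughly what the case-sensitivity tests assert; the test name and parametrization are illustrative, and it assumes `is_well_known_file` is a static method on `URLHandler` (import path omitted):

```python
import pytest

# Assumes: from <url handler module> import URLHandler  (path omitted here)


@pytest.mark.parametrize(
    "url, expected",
    [
        ("https://example.com/.well-known/security.txt", True),
        ("https://example.com/.Well-Known/security.txt", False),  # uppercase rejected
        ("https://example.com/docs/.well-known/security.txt", False),  # not at root level
    ],
)
def test_well_known_detection_is_case_sensitive(url, expected):
    assert URLHandler.is_well_known_file(url) is expected
```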

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
leex279
2025-10-19 15:31:08 +02:00
parent ddcd364cb5
commit 13796abbe8
10 changed files with 714 additions and 653 deletions


@@ -11,6 +11,8 @@ import uuid
 from collections.abc import Awaitable, Callable
 from typing import Any, Optional
+import tldextract
 from ...config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info
 from ...utils import get_supabase_client
 from ...utils.progress.progress_tracker import ProgressTracker
@@ -38,6 +40,34 @@ _active_orchestrations: dict[str, "CrawlingService"] = {}
 _orchestration_lock: asyncio.Lock | None = None
+def get_root_domain(host: str) -> str:
+    """
+    Extract the root domain from a hostname using tldextract.
+    Handles multi-part public suffixes correctly (e.g., .co.uk, .com.au).
+    Args:
+        host: Hostname to extract root domain from
+    Returns:
+        Root domain (domain + suffix) or original host if extraction fails
+    Examples:
+        - "docs.example.com" -> "example.com"
+        - "api.example.co.uk" -> "example.co.uk"
+        - "localhost" -> "localhost"
+    """
+    try:
+        extracted = tldextract.extract(host)
+        # Return domain.suffix if both are present
+        if extracted.domain and extracted.suffix:
+            return f"{extracted.domain}.{extracted.suffix}"
+        # Fallback to original host if extraction yields no domain or suffix
+        return host
+    except Exception:
+        # If extraction fails, return original host
+        return host
 def _ensure_orchestration_lock() -> asyncio.Lock:
     global _orchestration_lock
     if _orchestration_lock is None:
@@ -771,14 +801,7 @@ class CrawlingService:
         if url_host == base_host:
             return True
-        # Check if url_host is a subdomain of base_host
-        # Extract root domain (last 2 parts for .com, .org, etc.)
-        def get_root_domain(host: str) -> str:
-            parts = host.split('.')
-            if len(parts) >= 2:
-                return '.'.join(parts[-2:])
-            return host
+        # Check if url_host is a subdomain of base_host using tldextract
         url_root = get_root_domain(url_host)
         base_root = get_root_domain(base_host)
@@ -865,51 +888,49 @@ class CrawlingService:
         is_llms_file = self.url_handler.is_llms_variant(url)
         if is_llms_file:
-            logger.info(f"Discovery llms.txt mode: checking for linked llms.txt files at {url}")
+            logger.info(f"Discovery llms.txt mode: following ALL same-domain links from {url}")
             # Extract all links from the file
             extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url)
-            # Filter for llms.txt files only on same domain
-            llms_links = []
+            # Filter for same-domain links (all types, not just llms.txt)
+            same_domain_links = []
             if extracted_links_with_text:
                 original_domain = request.get("original_domain")
                 if original_domain:
                     for link, text in extracted_links_with_text:
-                        # Check if link is to another llms.txt file
-                        if self.url_handler.is_llms_variant(link):
-                            # Check same domain/subdomain
-                            if self._is_same_domain_or_subdomain(link, original_domain):
-                                llms_links.append((link, text))
-                                logger.info(f"Found linked llms.txt: {link}")
+                        # Check same domain/subdomain for ALL links
+                        if self._is_same_domain_or_subdomain(link, original_domain):
+                            same_domain_links.append((link, text))
+                            logger.debug(f"Found same-domain link: {link}")
-            if llms_links:
+            if same_domain_links:
                 # Build mapping and extract just URLs
-                url_to_link_text = dict(llms_links)
-                extracted_llms_urls = [link for link, _ in llms_links]
+                url_to_link_text = dict(same_domain_links)
+                extracted_urls = [link for link, _ in same_domain_links]
-                logger.info(f"Following {len(extracted_llms_urls)} linked llms.txt files")
+                logger.info(f"Following {len(extracted_urls)} same-domain links from llms.txt")
                 # Notify user about linked files being crawled
                 await update_crawl_progress(
                     60,  # 60% of crawling stage
-                    f"Found {len(extracted_llms_urls)} linked llms.txt files, crawling them now...",
+                    f"Found {len(extracted_urls)} links in llms.txt, crawling them now...",
                     crawl_type="llms_txt_linked_files",
-                    linked_files=extracted_llms_urls
+                    linked_files=extracted_urls
                 )
-                # Crawl linked llms.txt files (no recursion, just one level)
+                # Crawl all same-domain links from llms.txt (no recursion, just one level)
                 batch_results = await self.crawl_batch_with_progress(
-                    extracted_llms_urls,
+                    extracted_urls,
                     max_concurrent=request.get('max_concurrent'),
                     progress_callback=await self._create_crawl_progress_callback("crawling"),
                     link_text_fallbacks=url_to_link_text,
                 )
-                # Combine original llms.txt with linked files
+                # Combine original llms.txt with linked pages
                 crawl_results.extend(batch_results)
-                crawl_type = "llms_txt_with_linked_files"
-                logger.info(f"llms.txt crawling completed: {len(crawl_results)} total files (1 main + {len(batch_results)} linked)")
+                crawl_type = "llms_txt_with_linked_pages"
+                logger.info(f"llms.txt crawling completed: {len(crawl_results)} total pages (1 llms.txt + {len(batch_results)} linked pages)")
             return crawl_results, crawl_type
         # For non-llms.txt discovery targets (sitemaps, robots.txt), keep single-file mode

File diff suppressed because it is too large.


@@ -405,13 +405,10 @@ class URLHandler:
         # Check for specific link collection filenames
         # Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links
+        # Only includes commonly used formats found in the wild
        link_collection_patterns = [
             # .txt variants - files that typically contain lists of links
             'llms.txt', 'links.txt', 'resources.txt', 'references.txt',
-            # .md/.mdx/.markdown variants
-            'llms.md', 'links.md', 'resources.md', 'references.md',
-            'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx',
-            'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown',
         ]
         # Direct filename match
@@ -421,7 +418,7 @@ class URLHandler:
         # Pattern-based detection for variations, but exclude "full" variants
         # Only match files that are likely link collections, not complete content files
-        if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):
+        if filename.endswith('.txt'):
             # Exclude files with "full" as standalone token (avoid false positives like "helpful.md")
             import re
             if not re.search(r'(^|[._-])full([._-]|$)', filename):
@@ -650,8 +647,8 @@ class URLHandler:
         path = parsed.path.lower()
         filename = path.split('/')[-1] if '/' in path else path
-        # Check for exact llms file variants (llms.txt, llms.md, etc.)
-        llms_variants = ['llms-full.txt', 'llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown']
+        # Check for exact llms file variants (only standard spec files)
+        llms_variants = ['llms.txt', 'llms-full.txt']
         if filename in llms_variants:
             return True
@@ -668,6 +665,7 @@ class URLHandler:
     def is_well_known_file(url: str) -> bool:
         """
         Check if a URL is a .well-known/* file with error handling.
+        Per RFC 8615, the path is case-sensitive and must be lowercase.
         Args:
             url: URL to check
@@ -677,8 +675,8 @@ class URLHandler:
         """
         try:
             parsed = urlparse(url)
-            # Normalize to lowercase and ignore query/fragment
-            path = parsed.path.lower()
+            # RFC 8615: path segments are case-sensitive, must be lowercase
+            path = parsed.path
             # Only detect .well-known files at root level
             return path.startswith('/.well-known/') and path.count('/.well-known/') == 1
         except Exception as e: