feat: Improve discovery system with SSRF protection and optimize file detection

## Backend Improvements

### Discovery Service
- Fix SSRF protection: use requests.Session() so the max_redirects limit is enforced (see the sketch after this list)
- Add comprehensive IP validation (_is_safe_ip, _resolve_and_validate_hostname)
- Add hostname DNS resolution validation before requests
- Fix llms.txt link following to crawl ALL same-domain pages (not just llms.txt files)
- Remove unused file variants: llms.md, llms.markdown, sitemap_index.xml, sitemap-index.xml
- Optimize DISCOVERY_PRIORITY based on real-world usage research
- Update priority: llms.txt > llms-full.txt > sitemap.xml > robots.txt
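
A minimal sketch of the redirect and IP handling described above. The helper names `_is_safe_ip` and `_resolve_and_validate_hostname` come from this commit; their bodies and the surrounding fetch function are illustrative assumptions, not the actual implementation:

```python
# Hedged sketch only: mirrors the commit's helper names, not its exact code.
import ipaddress
import socket
from urllib.parse import urlparse

import requests


def _is_safe_ip(ip_str: str) -> bool:
    """Reject loopback, private, link-local, reserved, multicast, and unspecified addresses."""
    ip = ipaddress.ip_address(ip_str.split("%")[0])  # drop IPv6 zone id if present
    return not (
        ip.is_private
        or ip.is_loopback
        or ip.is_link_local
        or ip.is_reserved
        or ip.is_multicast
        or ip.is_unspecified
    )


def _resolve_and_validate_hostname(hostname: str) -> bool:
    """Resolve the hostname via DNS and require every returned address to be safe."""
    infos = socket.getaddrinfo(hostname, None)
    return bool(infos) and all(_is_safe_ip(info[4][0]) for info in infos)


def fetch_discovery_file(url: str, timeout: float = 10.0) -> requests.Response:
    hostname = urlparse(url).hostname or ""
    if not _resolve_and_validate_hostname(hostname):
        raise ValueError(f"Blocked potentially unsafe host: {hostname}")

    # requests.get() has no max_redirects argument, so a Session is used instead.
    session = requests.Session()
    session.max_redirects = 5
    response = session.get(url, timeout=timeout, allow_redirects=True)

    # Redirects can hop to another host; response.history / response.url
    # (the attributes the updated test mocks provide) allow re-validation.
    if response.history:
        final_host = urlparse(response.url).hostname or ""
        if not _resolve_and_validate_hostname(final_host):
            raise ValueError(f"Redirect landed on unsafe host: {final_host}")
    return response
```

Setting `max_redirects` on a `Session` is what makes the limit take effect, since plain `requests.get()` does not accept that parameter.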

### URL Handler
- Fix .well-known path matching to be case-sensitive per RFC 8615
- Remove llms.md, llms.markdown, llms.mdx from variant detection
- Simplify link collection patterns to only .txt files (most common)
- Update llms_variants list to only include spec-compliant files (behavior sketched after this list)
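
The detection rules above reduce to a few lines. Here is a self-contained sketch with hypothetical helper names that mirrors the URLHandler changes in the diff below:

```python
# Standalone sketch (hypothetical helpers), mirroring the simplified detection rules.
from urllib.parse import urlparse

LLMS_VARIANTS = {"llms.txt", "llms-full.txt"}  # spec-compliant files only


def looks_like_llms_variant(url: str) -> bool:
    path = urlparse(url).path.lower()
    filename = path.rsplit("/", 1)[-1]
    return filename in LLMS_VARIANTS


def looks_like_well_known(url: str) -> bool:
    # RFC 8615: /.well-known/ is case-sensitive, so the path is NOT lowercased here
    path = urlparse(url).path
    return path.startswith("/.well-known/") and path.count("/.well-known/") == 1


assert looks_like_llms_variant("https://docs.example.com/llms-full.txt")
assert not looks_like_llms_variant("https://docs.example.com/llms.md")
assert not looks_like_well_known("https://example.com/.Well-Known/security.txt")
```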

### Crawling Service
- Add tldextract for proper root domain extraction (handles .co.uk, .com.au, etc.)
- Replace naive last-two-labels domain extraction with a robust get_root_domain() function (contrast sketched below)
- Add tldextract>=5.0.0 to dependencies
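
The difference from the previous last-two-labels heuristic is easiest to see on a multi-part suffix (illustrative values; tldextract may download and cache the public suffix list on first use):

```python
import tldextract

host = "api.example.co.uk"

# Old heuristic: keep the last two dot-separated labels -- wrong for multi-part suffixes
naive_root = ".".join(host.split(".")[-2:])
print(naive_root)  # co.uk  (just the public suffix, not a registrable domain)

# tldextract consults the public suffix list, so domain + suffix is the real root
ext = tldextract.extract(host)
print(f"{ext.domain}.{ext.suffix}")  # example.co.uk
```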

## Frontend Improvements

### Type Safety
- Extend ActiveOperation type with discovery fields (discovered_file, discovered_file_type, linked_files)
- Remove all type casting (operation as any) from CrawlingProgress component
- Add proper TypeScript types for discovery information

### Security
- Create URL validation utility (urlValidation.ts)
- Only render clickable links for validated HTTP/HTTPS URLs
- Reject unsafe protocols (javascript:, data:, vbscript:, file:)
- Display invalid URLs as plain text instead of links

## Testing
- Update test mocks to include history and url attributes for redirect checking
- Fix .well-known case sensitivity tests (must be lowercase per RFC 8615; see the sketch after this list)
- Update discovery priority tests to match new order
- Remove tests for deprecated file variants
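
Roughly what the case-sensitivity tests assert; the test name and parametrization are illustrative, and it assumes `is_well_known_file` is a static method on `URLHandler` (import path omitted):

```python
import pytest

# Assumes: from <url handler module> import URLHandler  (path omitted here)


@pytest.mark.parametrize(
    "url, expected",
    [
        ("https://example.com/.well-known/security.txt", True),
        ("https://example.com/.Well-Known/security.txt", False),  # uppercase rejected
        ("https://example.com/docs/.well-known/security.txt", False),  # not at root level
    ],
)
def test_well_known_detection_is_case_sensitive(url, expected):
    assert URLHandler.is_well_known_file(url) is expected
```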

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
leex279
2025-10-19 15:31:08 +02:00
parent ddcd364cb5
commit 13796abbe8
10 changed files with 714 additions and 653 deletions


@@ -11,6 +11,8 @@ import uuid
 from collections.abc import Awaitable, Callable
 from typing import Any, Optional
+import tldextract
 from ...config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info
 from ...utils import get_supabase_client
 from ...utils.progress.progress_tracker import ProgressTracker
@@ -38,6 +40,34 @@ _active_orchestrations: dict[str, "CrawlingService"] = {}
 _orchestration_lock: asyncio.Lock | None = None
+def get_root_domain(host: str) -> str:
+    """
+    Extract the root domain from a hostname using tldextract.
+    Handles multi-part public suffixes correctly (e.g., .co.uk, .com.au).
+    Args:
+        host: Hostname to extract root domain from
+    Returns:
+        Root domain (domain + suffix) or original host if extraction fails
+    Examples:
+        - "docs.example.com" -> "example.com"
+        - "api.example.co.uk" -> "example.co.uk"
+        - "localhost" -> "localhost"
+    """
+    try:
+        extracted = tldextract.extract(host)
+        # Return domain.suffix if both are present
+        if extracted.domain and extracted.suffix:
+            return f"{extracted.domain}.{extracted.suffix}"
+        # Fallback to original host if extraction yields no domain or suffix
+        return host
+    except Exception:
+        # If extraction fails, return original host
+        return host
 def _ensure_orchestration_lock() -> asyncio.Lock:
     global _orchestration_lock
     if _orchestration_lock is None:
@@ -771,14 +801,7 @@ class CrawlingService:
         if url_host == base_host:
             return True
-        # Check if url_host is a subdomain of base_host
-        # Extract root domain (last 2 parts for .com, .org, etc.)
-        def get_root_domain(host: str) -> str:
-            parts = host.split('.')
-            if len(parts) >= 2:
-                return '.'.join(parts[-2:])
-            return host
+        # Check if url_host is a subdomain of base_host using tldextract
         url_root = get_root_domain(url_host)
         base_root = get_root_domain(base_host)
@@ -865,51 +888,49 @@ class CrawlingService:
         is_llms_file = self.url_handler.is_llms_variant(url)
         if is_llms_file:
-            logger.info(f"Discovery llms.txt mode: checking for linked llms.txt files at {url}")
+            logger.info(f"Discovery llms.txt mode: following ALL same-domain links from {url}")
             # Extract all links from the file
             extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url)
-            # Filter for llms.txt files only on same domain
-            llms_links = []
+            # Filter for same-domain links (all types, not just llms.txt)
+            same_domain_links = []
             if extracted_links_with_text:
                 original_domain = request.get("original_domain")
                 if original_domain:
                     for link, text in extracted_links_with_text:
-                        # Check if link is to another llms.txt file
-                        if self.url_handler.is_llms_variant(link):
-                            # Check same domain/subdomain
-                            if self._is_same_domain_or_subdomain(link, original_domain):
-                                llms_links.append((link, text))
-                                logger.info(f"Found linked llms.txt: {link}")
+                        # Check same domain/subdomain for ALL links
+                        if self._is_same_domain_or_subdomain(link, original_domain):
+                            same_domain_links.append((link, text))
+                            logger.debug(f"Found same-domain link: {link}")
-            if llms_links:
+            if same_domain_links:
                 # Build mapping and extract just URLs
-                url_to_link_text = dict(llms_links)
-                extracted_llms_urls = [link for link, _ in llms_links]
+                url_to_link_text = dict(same_domain_links)
+                extracted_urls = [link for link, _ in same_domain_links]
-                logger.info(f"Following {len(extracted_llms_urls)} linked llms.txt files")
+                logger.info(f"Following {len(extracted_urls)} same-domain links from llms.txt")
                 # Notify user about linked files being crawled
                 await update_crawl_progress(
                     60,  # 60% of crawling stage
-                    f"Found {len(extracted_llms_urls)} linked llms.txt files, crawling them now...",
+                    f"Found {len(extracted_urls)} links in llms.txt, crawling them now...",
                     crawl_type="llms_txt_linked_files",
-                    linked_files=extracted_llms_urls
+                    linked_files=extracted_urls
                 )
-                # Crawl linked llms.txt files (no recursion, just one level)
+                # Crawl all same-domain links from llms.txt (no recursion, just one level)
                 batch_results = await self.crawl_batch_with_progress(
-                    extracted_llms_urls,
+                    extracted_urls,
                     max_concurrent=request.get('max_concurrent'),
                     progress_callback=await self._create_crawl_progress_callback("crawling"),
                     link_text_fallbacks=url_to_link_text,
                 )
-                # Combine original llms.txt with linked files
+                # Combine original llms.txt with linked pages
                 crawl_results.extend(batch_results)
-                crawl_type = "llms_txt_with_linked_files"
-                logger.info(f"llms.txt crawling completed: {len(crawl_results)} total files (1 main + {len(batch_results)} linked)")
+                crawl_type = "llms_txt_with_linked_pages"
+                logger.info(f"llms.txt crawling completed: {len(crawl_results)} total pages (1 llms.txt + {len(batch_results)} linked pages)")
             return crawl_results, crawl_type
         # For non-llms.txt discovery targets (sitemaps, robots.txt), keep single-file mode

File diff suppressed because it is too large.


@@ -405,13 +405,10 @@ class URLHandler:
         # Check for specific link collection filenames
         # Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links
+        # Only includes commonly used formats found in the wild
        link_collection_patterns = [
             # .txt variants - files that typically contain lists of links
             'llms.txt', 'links.txt', 'resources.txt', 'references.txt',
-            # .md/.mdx/.markdown variants
-            'llms.md', 'links.md', 'resources.md', 'references.md',
-            'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx',
-            'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown',
         ]
         # Direct filename match
@@ -421,7 +418,7 @@ class URLHandler:
         # Pattern-based detection for variations, but exclude "full" variants
         # Only match files that are likely link collections, not complete content files
-        if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):
+        if filename.endswith('.txt'):
             # Exclude files with "full" as standalone token (avoid false positives like "helpful.md")
             import re
             if not re.search(r'(^|[._-])full([._-]|$)', filename):
@@ -650,8 +647,8 @@ class URLHandler:
         path = parsed.path.lower()
         filename = path.split('/')[-1] if '/' in path else path
-        # Check for exact llms file variants (llms.txt, llms.md, etc.)
-        llms_variants = ['llms-full.txt', 'llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown']
+        # Check for exact llms file variants (only standard spec files)
+        llms_variants = ['llms.txt', 'llms-full.txt']
         if filename in llms_variants:
             return True
@@ -668,6 +665,7 @@ class URLHandler:
     def is_well_known_file(url: str) -> bool:
         """
         Check if a URL is a .well-known/* file with error handling.
+        Per RFC 8615, the path is case-sensitive and must be lowercase.
         Args:
             url: URL to check
@@ -677,8 +675,8 @@ class URLHandler:
         """
         try:
             parsed = urlparse(url)
-            # Normalize to lowercase and ignore query/fragment
-            path = parsed.path.lower()
+            # RFC 8615: path segments are case-sensitive, must be lowercase
+            path = parsed.path
             # Only detect .well-known files at root level
             return path.startswith('/.well-known/') and path.count('/.well-known/') == 1
         except Exception as e: