Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-30 21:49:30 -05:00)
feat: Improve discovery system with SSRF protection and optimize file detection
## Backend Improvements

### Discovery Service
- Fix SSRF protection: use requests.Session() for the max_redirects parameter
- Add comprehensive IP validation (_is_safe_ip, _resolve_and_validate_hostname)
- Add hostname DNS resolution validation before requests
- Fix llms.txt link following to crawl ALL same-domain pages (not just llms.txt files)
- Remove unused file variants: llms.md, llms.markdown, sitemap_index.xml, sitemap-index.xml
- Optimize DISCOVERY_PRIORITY based on real-world usage research
- Update priority: llms.txt > llms-full.txt > sitemap.xml > robots.txt

### URL Handler
- Fix .well-known path to be case-sensitive per RFC 8615
- Remove llms.md, llms.markdown, llms.mdx from variant detection
- Simplify link collection patterns to only .txt files (most common)
- Update llms_variants list to only include spec-compliant files

### Crawling Service
- Add tldextract for proper root domain extraction (handles .co.uk, .com.au, etc.)
- Replace naive domain extraction with a robust get_root_domain() function
- Add tldextract>=5.0.0 to dependencies

## Frontend Improvements

### Type Safety
- Extend ActiveOperation type with discovery fields (discovered_file, discovered_file_type, linked_files)
- Remove all type casting (operation as any) from the CrawlingProgress component
- Add proper TypeScript types for discovery information

### Security
- Create URL validation utility (urlValidation.ts)
- Only render clickable links for validated HTTP/HTTPS URLs
- Reject unsafe protocols (javascript:, data:, vbscript:, file:)
- Display invalid URLs as plain text instead of links

## Testing
- Update test mocks to include history and url attributes for redirect checking
- Fix .well-known case-sensitivity tests (paths must be lowercase per RFC 8615)
- Update discovery priority tests to match the new order
- Remove tests for deprecated file variants

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
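The SSRF fixes listed above land in the discovery service, whose diff is suppressed below as too large. A minimal sketch of the kind of guard the message describes: the helper names _is_safe_ip and _resolve_and_validate_hostname come from the commit message, but their bodies here are illustrative assumptions, not the commit's actual code.

```python
import ipaddress
import socket

import requests


def _is_safe_ip(ip_str: str) -> bool:
    """Reject private, loopback, link-local, and reserved addresses."""
    ip = ipaddress.ip_address(ip_str.split("%")[0])  # strip any IPv6 zone id
    return not (ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved)


def _resolve_and_validate_hostname(hostname: str) -> bool:
    """Resolve the hostname via DNS and require every returned address to be safe."""
    try:
        infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror:
        return False
    return all(_is_safe_ip(info[4][0]) for info in infos)


# max_redirects is a Session attribute, not a requests.get() keyword argument,
# which is why the fix above switches to requests.Session()
session = requests.Session()
session.max_redirects = 3
```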
@@ -11,6 +11,8 @@ import uuid
 from collections.abc import Awaitable, Callable
 from typing import Any, Optional
 
+import tldextract
+
 from ...config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info
 from ...utils import get_supabase_client
 from ...utils.progress.progress_tracker import ProgressTracker
@@ -38,6 +40,34 @@ _active_orchestrations: dict[str, "CrawlingService"] = {}
 _orchestration_lock: asyncio.Lock | None = None
 
 
+def get_root_domain(host: str) -> str:
+    """
+    Extract the root domain from a hostname using tldextract.
+    Handles multi-part public suffixes correctly (e.g., .co.uk, .com.au).
+
+    Args:
+        host: Hostname to extract root domain from
+
+    Returns:
+        Root domain (domain + suffix) or original host if extraction fails
+
+    Examples:
+        - "docs.example.com" -> "example.com"
+        - "api.example.co.uk" -> "example.co.uk"
+        - "localhost" -> "localhost"
+    """
+    try:
+        extracted = tldextract.extract(host)
+        # Return domain.suffix if both are present
+        if extracted.domain and extracted.suffix:
+            return f"{extracted.domain}.{extracted.suffix}"
+        # Fallback to original host if extraction yields no domain or suffix
+        return host
+    except Exception:
+        # If extraction fails, return original host
+        return host
+
+
 def _ensure_orchestration_lock() -> asyncio.Lock:
     global _orchestration_lock
     if _orchestration_lock is None:
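For context on the new helper: tldextract consults the Public Suffix List, which is what makes the multi-part-suffix cases in the docstring work. A quick, runnable illustration:

```python
import tldextract

# A naive host.split('.')[-2:] yields "co.uk" here, i.e. just the public suffix
ext = tldextract.extract("api.example.co.uk")
print(f"{ext.domain}.{ext.suffix}")  # example.co.uk

# Hosts without a recognized suffix leave .suffix empty, which is what
# triggers the original-host fallback in get_root_domain() above
print(tldextract.extract("localhost").suffix)  # ''
```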
@@ -771,14 +801,7 @@ class CrawlingService:
         if url_host == base_host:
             return True
 
-        # Check if url_host is a subdomain of base_host
-        # Extract root domain (last 2 parts for .com, .org, etc.)
-        def get_root_domain(host: str) -> str:
-            parts = host.split('.')
-            if len(parts) >= 2:
-                return '.'.join(parts[-2:])
-            return host
-
+        # Check if url_host is a subdomain of base_host using tldextract
         url_root = get_root_domain(url_host)
         base_root = get_root_domain(base_host)
 
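A short illustration of what this swap fixes: under the old two-label logic, both .co.uk hosts below reduced to "co.uk" and were treated as the same site.

```python
get_root_domain("docs.example.com") == get_root_domain("example.com")   # True: subdomain
get_root_domain("api.example.co.uk") == get_root_domain("other.co.uk")  # False: unrelated sites
```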
@@ -865,51 +888,49 @@ class CrawlingService:
         is_llms_file = self.url_handler.is_llms_variant(url)
 
         if is_llms_file:
-            logger.info(f"Discovery llms.txt mode: checking for linked llms.txt files at {url}")
+            logger.info(f"Discovery llms.txt mode: following ALL same-domain links from {url}")
 
             # Extract all links from the file
             extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url)
 
-            # Filter for llms.txt files only on same domain
-            llms_links = []
+            # Filter for same-domain links (all types, not just llms.txt)
+            same_domain_links = []
             if extracted_links_with_text:
                 original_domain = request.get("original_domain")
                 if original_domain:
                     for link, text in extracted_links_with_text:
-                        # Check if link is to another llms.txt file
-                        if self.url_handler.is_llms_variant(link):
-                            # Check same domain/subdomain
-                            if self._is_same_domain_or_subdomain(link, original_domain):
-                                llms_links.append((link, text))
-                                logger.info(f"Found linked llms.txt: {link}")
+                        # Check same domain/subdomain for ALL links
+                        if self._is_same_domain_or_subdomain(link, original_domain):
+                            same_domain_links.append((link, text))
+                            logger.debug(f"Found same-domain link: {link}")
 
-            if llms_links:
+            if same_domain_links:
                 # Build mapping and extract just URLs
-                url_to_link_text = dict(llms_links)
-                extracted_llms_urls = [link for link, _ in llms_links]
+                url_to_link_text = dict(same_domain_links)
+                extracted_urls = [link for link, _ in same_domain_links]
 
-                logger.info(f"Following {len(extracted_llms_urls)} linked llms.txt files")
+                logger.info(f"Following {len(extracted_urls)} same-domain links from llms.txt")
 
                 # Notify user about linked files being crawled
                 await update_crawl_progress(
                     60,  # 60% of crawling stage
-                    f"Found {len(extracted_llms_urls)} linked llms.txt files, crawling them now...",
+                    f"Found {len(extracted_urls)} links in llms.txt, crawling them now...",
                     crawl_type="llms_txt_linked_files",
-                    linked_files=extracted_llms_urls
+                    linked_files=extracted_urls
                 )
 
-                # Crawl linked llms.txt files (no recursion, just one level)
+                # Crawl all same-domain links from llms.txt (no recursion, just one level)
                 batch_results = await self.crawl_batch_with_progress(
-                    extracted_llms_urls,
+                    extracted_urls,
                     max_concurrent=request.get('max_concurrent'),
                     progress_callback=await self._create_crawl_progress_callback("crawling"),
                     link_text_fallbacks=url_to_link_text,
                 )
 
-                # Combine original llms.txt with linked files
+                # Combine original llms.txt with linked pages
                 crawl_results.extend(batch_results)
-                crawl_type = "llms_txt_with_linked_files"
-                logger.info(f"llms.txt crawling completed: {len(crawl_results)} total files (1 main + {len(batch_results)} linked)")
+                crawl_type = "llms_txt_with_linked_pages"
+                logger.info(f"llms.txt crawling completed: {len(crawl_results)} total pages (1 llms.txt + {len(batch_results)} linked pages)")
                 return crawl_results, crawl_type
 
         # For non-llms.txt discovery targets (sitemaps, robots.txt), keep single-file mode
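The hunk above relies on url_handler.extract_markdown_links_with_text, which is not part of the visible diff. A minimal sketch of such an extractor, assuming plain [text](url) markdown links resolved against the source URL; the real method may differ:

```python
import re
from urllib.parse import urljoin

_MD_LINK = re.compile(r'\[([^\]]*)\]\(([^)\s]+)\)')


def extract_markdown_links_with_text(content: str, base_url: str) -> list[tuple[str, str]]:
    """Return (absolute_url, link_text) pairs for each markdown link in content."""
    return [(urljoin(base_url, href), text) for text, href in _MD_LINK.findall(content)]
```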
File diff suppressed because it is too large
@@ -405,13 +405,10 @@ class URLHandler:
 
         # Check for specific link collection filenames
         # Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links
+        # Only includes commonly used formats found in the wild
         link_collection_patterns = [
             # .txt variants - files that typically contain lists of links
             'llms.txt', 'links.txt', 'resources.txt', 'references.txt',
-            # .md/.mdx/.markdown variants
-            'llms.md', 'links.md', 'resources.md', 'references.md',
-            'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx',
-            'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown',
         ]
 
         # Direct filename match
@@ -421,7 +418,7 @@ class URLHandler:
 
         # Pattern-based detection for variations, but exclude "full" variants
         # Only match files that are likely link collections, not complete content files
-        if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):
+        if filename.endswith('.txt'):
             # Exclude files with "full" as standalone token (avoid false positives like "helpful.md")
             import re
             if not re.search(r'(^|[._-])full([._-]|$)', filename):
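The regex in this hunk treats "full" as a standalone token delimited by `.`, `_`, or `-`, so substrings do not trigger the exclusion:

```python
import re

pattern = r'(^|[._-])full([._-]|$)'

bool(re.search(pattern, 'llms-full.txt'))  # True: excluded as a full-content file
bool(re.search(pattern, 'docs_full.txt'))  # True: excluded
bool(re.search(pattern, 'fullstack.txt'))  # False: "full" is not a standalone token
```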
@@ -650,8 +647,8 @@ class URLHandler:
            path = parsed.path.lower()
            filename = path.split('/')[-1] if '/' in path else path
 
-           # Check for exact llms file variants (llms.txt, llms.md, etc.)
-           llms_variants = ['llms-full.txt', 'llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown']
+           # Check for exact llms file variants (only standard spec files)
+           llms_variants = ['llms.txt', 'llms-full.txt']
            if filename in llms_variants:
                return True
 
@@ -668,6 +665,7 @@ class URLHandler:
     def is_well_known_file(url: str) -> bool:
         """
         Check if a URL is a .well-known/* file with error handling.
+        Per RFC 8615, the path is case-sensitive and must be lowercase.
 
         Args:
             url: URL to check
@@ -677,8 +675,8 @@ class URLHandler:
         """
         try:
             parsed = urlparse(url)
-            # Normalize to lowercase and ignore query/fragment
-            path = parsed.path.lower()
+            # RFC 8615: path segments are case-sensitive, must be lowercase
+            path = parsed.path
             # Only detect .well-known files at root level
             return path.startswith('/.well-known/') and path.count('/.well-known/') == 1
         except Exception as e:
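A few cases showing the behavioral change in this final hunk, using a hypothetical standalone version of the check (is_well_known_path is not a name from the diff):

```python
def is_well_known_path(path: str) -> bool:
    # Mirrors is_well_known_file() after the fix: no lowercasing of the path
    return path.startswith('/.well-known/') and path.count('/.well-known/') == 1


is_well_known_path('/.well-known/security.txt')  # True
is_well_known_path('/.Well-Known/security.txt')  # False (accepted before the fix)
is_well_known_path('/docs/.well-known/foo')      # False (not at the root)
```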