mirror of
https://github.com/coleam00/Archon.git
synced 2025-12-24 02:39:17 -05:00
feat: Improve discovery system with SSRF protection and optimize file detection
## Backend Improvements

### Discovery Service
- Fix SSRF protection: Use requests.Session() for max_redirects parameter
- Add comprehensive IP validation (_is_safe_ip, _resolve_and_validate_hostname)
- Add hostname DNS resolution validation before requests
- Fix llms.txt link following to crawl ALL same-domain pages (not just llms.txt files)
- Remove unused file variants: llms.md, llms.markdown, sitemap_index.xml, sitemap-index.xml
- Optimize DISCOVERY_PRIORITY based on real-world usage research
- Update priority: llms.txt > llms-full.txt > sitemap.xml > robots.txt

### URL Handler
- Fix .well-known path to be case-sensitive per RFC 8615
- Remove llms.md, llms.markdown, llms.mdx from variant detection
- Simplify link collection patterns to only .txt files (most common)
- Update llms_variants list to only include spec-compliant files

### Crawling Service
- Add tldextract for proper root domain extraction (handles .co.uk, .com.au, etc.)
- Replace naive domain extraction with robust get_root_domain() function
- Add tldextract>=5.0.0 to dependencies

## Frontend Improvements

### Type Safety
- Extend ActiveOperation type with discovery fields (discovered_file, discovered_file_type, linked_files)
- Remove all type casting (operation as any) from CrawlingProgress component
- Add proper TypeScript types for discovery information

### Security
- Create URL validation utility (urlValidation.ts)
- Only render clickable links for validated HTTP/HTTPS URLs
- Reject unsafe protocols (javascript:, data:, vbscript:, file:)
- Display invalid URLs as plain text instead of links

## Testing
- Update test mocks to include history and url attributes for redirect checking
- Fix .well-known case sensitivity tests (must be lowercase per RFC 8615)
- Update discovery priority tests to match new order
- Remove tests for deprecated file variants

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
29
python/uv.lock
generated
29
python/uv.lock
generated
@@ -247,6 +247,7 @@ server = [
|
||||
{ name = "python-multipart" },
|
||||
{ name = "slowapi" },
|
||||
{ name = "supabase" },
|
||||
{ name = "tldextract" },
|
||||
{ name = "uvicorn" },
|
||||
{ name = "watchfiles" },
|
||||
]
|
||||
@@ -342,6 +343,7 @@ server = [
|
||||
{ name = "python-multipart", specifier = ">=0.0.20" },
|
||||
{ name = "slowapi", specifier = ">=0.1.9" },
|
||||
{ name = "supabase", specifier = "==2.15.1" },
|
||||
{ name = "tldextract", specifier = ">=5.0.0" },
|
||||
{ name = "uvicorn", specifier = ">=0.24.0" },
|
||||
{ name = "watchfiles", specifier = ">=0.18" },
|
||||
]
|
||||
@@ -2601,6 +2603,18 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "requests-file"
|
||||
version = "3.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "requests" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/fe/5e/2aca791207e542a16a8cc91fd0e19f5c26f4dff030ee3062deb5606f84ae/requests_file-3.0.0.tar.gz", hash = "sha256:68789589cfde7098e8933fe3e69bbd864f7f0c22f118937b424d94d0e1b7760f", size = 6897 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/e4/85/689c218feb21a66919bd667969d4ed60a64db67f6ea5ceb00c9795ae19b0/requests_file-3.0.0-py2.py3-none-any.whl", hash = "sha256:aca222ec94a19310be2a0ed6bdcdebb09058b0f6c3e984af56361c8fca59653c", size = 4486 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rich"
|
||||
version = "14.0.0"
|
||||
@@ -3086,6 +3100,21 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/de/a8/8f499c179ec900783ffe133e9aab10044481679bb9aad78436d239eee716/tiktoken-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5ea0edb6f83dc56d794723286215918c1cde03712cbbafa0348b33448faf5b95", size = 894669 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tldextract"
|
||||
version = "5.3.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "filelock" },
|
||||
{ name = "idna" },
|
||||
{ name = "requests" },
|
||||
{ name = "requests-file" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/97/78/182641ea38e3cfd56e9c7b3c0d48a53d432eea755003aa544af96403d4ac/tldextract-5.3.0.tar.gz", hash = "sha256:b3d2b70a1594a0ecfa6967d57251527d58e00bb5a91a74387baa0d87a0678609", size = 128502 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/67/7c/ea488ef48f2f544566947ced88541bc45fae9e0e422b2edbf165ee07da99/tldextract-5.3.0-py3-none-any.whl", hash = "sha256:f70f31d10b55c83993f55e91ecb7c5d84532a8972f22ec578ecfbe5ea2292db2", size = 107384 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokenizers"
|
||||
version = "0.21.1"
|
||||
|
||||
Reference in New Issue
Block a user