diff --git a/archon-ui-main/src/features/progress/utils/urlValidation.ts b/archon-ui-main/src/features/progress/utils/urlValidation.ts index 6a8d8564..79f70bda 100644 --- a/archon-ui-main/src/features/progress/utils/urlValidation.ts +++ b/archon-ui-main/src/features/progress/utils/urlValidation.ts @@ -4,7 +4,6 @@ */ const SAFE_PROTOCOLS = ["http:", "https:"]; -const UNSAFE_PROTOCOLS = ["javascript:", "data:", "vbscript:", "file:"]; /** * Validates that a URL is safe to render as a clickable link. @@ -32,11 +31,6 @@ export function isValidHttpUrl(url: string | undefined | null): boolean { return false; } - // Explicitly reject known unsafe protocols - if (UNSAFE_PROTOCOLS.some((unsafe) => trimmed.toLowerCase().startsWith(unsafe))) { - return false; - } - // Basic hostname validation (must have at least one dot or be localhost) if (!parsed.hostname.includes(".") && parsed.hostname !== "localhost") { return false; diff --git a/python/tests/test_discovery_service.py b/python/tests/test_discovery_service.py index e7de6170..b7b41a95 100644 --- a/python/tests/test_discovery_service.py +++ b/python/tests/test_discovery_service.py @@ -13,14 +13,14 @@ def create_mock_dns_response(): ] -def create_mock_response(status_code: int, text: str = "") -> Mock: +def create_mock_response(status_code: int, text: str = "", url: str = "https://example.com") -> Mock: """Create a mock response object that supports streaming API.""" response = Mock() response.status_code = status_code response.text = text response.encoding = 'utf-8' response.history = [] # Empty list for no redirects - response.url = "" # Mock URL for redirect checks + response.url = url # Mock URL for redirect checks (must be string, not Mock) # Mock iter_content to yield text in chunks as bytes text_bytes = text.encode('utf-8') @@ -40,8 +40,9 @@ class TestDiscoveryService: """Test suite for DiscoveryService class.""" @patch('socket.getaddrinfo', return_value=create_mock_dns_response()) + @patch('requests.Session') @patch('requests.get') - def test_discover_files_basic(self, mock_get, mock_dns): + def test_discover_files_basic(self, mock_get, mock_session, mock_dns): """Test main discovery method returns single best file.""" service = DiscoveryService() base_url = "https://example.com" @@ -61,6 +62,7 @@ class TestDiscoveryService: return create_mock_response(404) mock_get.side_effect = mock_get_side_effect + mock_session.return_value.get.side_effect = mock_get_side_effect result = service.discover_files(base_url) @@ -69,14 +71,16 @@ class TestDiscoveryService: assert result == 'https://example.com/llms.txt' @patch('socket.getaddrinfo', return_value=create_mock_dns_response()) + @patch('requests.Session') @patch('requests.get') - def test_discover_files_no_files_found(self, mock_get, mock_dns): + def test_discover_files_no_files_found(self, mock_get, mock_session, mock_dns): """Test discovery when no files are found.""" service = DiscoveryService() base_url = "https://example.com" # Mock all HTTP requests to return 404 mock_get.return_value = create_mock_response(404) + mock_session.return_value.get.return_value = create_mock_response(404) result = service.discover_files(base_url) @@ -84,8 +88,9 @@ class TestDiscoveryService: assert result is None @patch('socket.getaddrinfo', return_value=create_mock_dns_response()) + @patch('requests.Session') @patch('requests.get') - def test_discover_files_priority_order(self, mock_get, mock_dns): + def test_discover_files_priority_order(self, mock_get, mock_session, mock_dns): """Test that discovery follows the correct priority order.""" service = DiscoveryService() base_url = "https://example.com" @@ -103,6 +108,7 @@ class TestDiscoveryService: return create_mock_response(404) mock_get.side_effect = mock_get_side_effect + mock_session.return_value.get.side_effect = mock_get_side_effect result = service.discover_files(base_url) @@ -110,8 +116,9 @@ class TestDiscoveryService: assert result == 'https://example.com/llms.txt' @patch('socket.getaddrinfo', return_value=create_mock_dns_response()) + @patch('requests.Session') @patch('requests.get') - def test_discover_files_robots_sitemap_priority(self, mock_get, mock_dns): + def test_discover_files_robots_sitemap_priority(self, mock_get, mock_session, mock_dns): """Test that llms files have priority over robots.txt sitemap declarations.""" service = DiscoveryService() base_url = "https://example.com" @@ -129,16 +136,18 @@ class TestDiscoveryService: return create_mock_response(404) mock_get.side_effect = mock_get_side_effect + mock_session.return_value.get.side_effect = mock_get_side_effect result = service.discover_files(base_url) - # Should return llms-full.txt since llms files have priority over sitemaps + # Should return llms.txt (highest priority llms file) since llms files have priority over sitemaps # even when sitemaps are declared in robots.txt - assert result == 'https://example.com/llms-full.txt' + assert result == 'https://example.com/llms.txt' @patch('socket.getaddrinfo', return_value=create_mock_dns_response()) + @patch('requests.Session') @patch('requests.get') - def test_discover_files_subdirectory_fallback(self, mock_get, mock_dns): + def test_discover_files_subdirectory_fallback(self, mock_get, mock_session, mock_dns): """Test discovery falls back to subdirectories for llms files.""" service = DiscoveryService() base_url = "https://example.com" @@ -156,6 +165,7 @@ class TestDiscoveryService: return create_mock_response(404) mock_get.side_effect = mock_get_side_effect + mock_session.return_value.get.side_effect = mock_get_side_effect result = service.discover_files(base_url) @@ -163,26 +173,31 @@ class TestDiscoveryService: assert result == 'https://example.com/static/llms.txt' @patch('socket.getaddrinfo', return_value=create_mock_dns_response()) + @patch('requests.Session') @patch('requests.get') - def test_check_url_exists(self, mock_get, mock_dns): + def test_check_url_exists(self, mock_get, mock_session, mock_dns): """Test URL existence checking.""" service = DiscoveryService() # Test successful response mock_get.return_value = create_mock_response(200) + mock_session.return_value.get.return_value = create_mock_response(200) assert service._check_url_exists("https://example.com/exists") is True # Test 404 response mock_get.return_value = create_mock_response(404) + mock_session.return_value.get.return_value = create_mock_response(404) assert service._check_url_exists("https://example.com/not-found") is False # Test network error - mock_get.side_effect = Exception("Network error") + mock_get.side_effect = Exception + mock_session.return_value.get.side_effect = Exception("Network error") assert service._check_url_exists("https://example.com/error") is False @patch('socket.getaddrinfo', return_value=create_mock_dns_response()) + @patch('requests.Session') @patch('requests.get') - def test_parse_robots_txt_with_sitemap(self, mock_get, mock_dns): + def test_parse_robots_txt_with_sitemap(self, mock_get, mock_session, mock_dns): """Test robots.txt parsing with sitemap directives.""" service = DiscoveryService() @@ -201,8 +216,9 @@ Sitemap: https://example.com/sitemap-news.xml""" mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30, stream=True, verify=True, headers={'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'}) @patch('socket.getaddrinfo', return_value=create_mock_dns_response()) + @patch('requests.Session') @patch('requests.get') - def test_parse_robots_txt_no_sitemap(self, mock_get, mock_dns): + def test_parse_robots_txt_no_sitemap(self, mock_get, mock_session, mock_dns): """Test robots.txt parsing without sitemap directives.""" service = DiscoveryService() @@ -218,8 +234,9 @@ Allow: /public/""" mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30, stream=True, verify=True, headers={'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'}) @patch('socket.getaddrinfo', return_value=create_mock_dns_response()) + @patch('requests.Session') @patch('requests.get') - def test_parse_html_meta_tags(self, mock_get, mock_dns): + def test_parse_html_meta_tags(self, mock_get, mock_session, mock_dns): """Test HTML meta tag parsing for sitemaps.""" service = DiscoveryService() @@ -243,8 +260,9 @@ Allow: /public/""" mock_get.assert_called_once_with("https://example.com", timeout=30, stream=True, verify=True, headers={'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'}) @patch('socket.getaddrinfo', return_value=create_mock_dns_response()) + @patch('requests.Session') @patch('requests.get') - def test_discovery_priority_behavior(self, mock_get, mock_dns): + def test_discovery_priority_behavior(self, mock_get, mock_session, mock_dns): """Test that discovery returns highest-priority file when multiple files exist.""" service = DiscoveryService() base_url = "https://example.com" @@ -262,6 +280,7 @@ Allow: /public/""" return create_mock_response(404) mock_get.side_effect = mock_all_exist + mock_session.return_value.get.side_effect = mock_all_exist result = service.discover_files(base_url) assert result == 'https://example.com/llms.txt', "Should return llms.txt when all files exist (highest priority)" @@ -277,6 +296,7 @@ Allow: /public/""" return create_mock_response(404) mock_get.side_effect = mock_without_txt + mock_session.return_value.get.side_effect = mock_without_txt result = service.discover_files(base_url) assert result == 'https://example.com/llms-full.txt', "Should return llms-full.txt when llms.txt is missing" @@ -292,6 +312,7 @@ Allow: /public/""" return create_mock_response(404) mock_get.side_effect = mock_only_sitemaps + mock_session.return_value.get.side_effect = mock_only_sitemaps result = service.discover_files(base_url) assert result == 'https://example.com/sitemap.xml', "Should return sitemap.xml when llms files are missing" @@ -305,17 +326,20 @@ Allow: /public/""" return create_mock_response(404) mock_get.side_effect = mock_llms_and_sitemap + mock_session.return_value.get.side_effect = mock_llms_and_sitemap result = service.discover_files(base_url) assert result == 'https://example.com/llms.txt', "Should prefer llms.txt over sitemap.xml" @patch('socket.getaddrinfo', return_value=create_mock_dns_response()) + @patch('requests.Session') @patch('requests.get') - def test_network_error_handling(self, mock_get, mock_dns): + def test_network_error_handling(self, mock_get, mock_session, mock_dns): """Test error scenarios with network failures.""" service = DiscoveryService() # Mock network error mock_get.side_effect = Exception("Network error") + mock_session.return_value.get.side_effect = Exception("Network error") # Should not raise exception, but return None result = service.discover_files("https://example.com")