diff --git a/python/src/server/services/crawling/discovery_service.py b/python/src/server/services/crawling/discovery_service.py
index 511b63a2..467f1673 100644
--- a/python/src/server/services/crawling/discovery_service.py
+++ b/python/src/server/services/crawling/discovery_service.py
@@ -17,30 +17,109 @@ logger = get_logger(__name__)
 
 class DiscoveryService:
     """Service for discovering related files automatically during crawls."""
 
+    # Maximum response size to prevent memory exhaustion (10MB default)
+    MAX_RESPONSE_SIZE = 10 * 1024 * 1024  # 10 MB
+
     # Global priority order - select ONE best file from all categories
     # All these files contain similar AI/crawling guidance content
     DISCOVERY_PRIORITY = [
         # LLMs files (highest priority - most comprehensive AI guidance)
         "llms-full.txt",
-        "llms.txt",
+        "llms.txt",
         "llms.md",
         "llms.mdx",
         "llms.markdown",
-
-        # Sitemap files (structural crawling guidance)
+
+        # Sitemap files (structural crawling guidance)
         "sitemap_index.xml",
         "sitemap-index.xml",
         "sitemap.xml",
-
+
         # Robots file (basic crawling rules)
         "robots.txt",
-
+
         # Well-known variants (alternative locations)
         ".well-known/ai.txt",
-        ".well-known/llms.txt",
+        ".well-known/llms.txt",
         ".well-known/sitemap.xml"
     ]
 
+    # Categorized discovery targets for helper methods
+    # Maintains the same order and values as DISCOVERY_PRIORITY
+    DISCOVERY_TARGETS = {
+        "llms_files": [
+            "llms-full.txt",
+            "llms.txt",
+            "llms.md",
+            "llms.mdx",
+            "llms.markdown",
+        ],
+        "sitemap_files": [
+            "sitemap_index.xml",
+            "sitemap-index.xml",
+            "sitemap.xml",
+        ],
+        "robots_files": [
+            "robots.txt",
+        ],
+        "well_known_files": [
+            ".well-known/ai.txt",
+            ".well-known/llms.txt",
+            ".well-known/sitemap.xml",
+        ],
+    }
+
+    def _read_response_with_limit(self, response: requests.Response, url: str, max_size: int | None = None) -> str:
+        """
+        Read response content with size limit to prevent memory exhaustion.
+
+        Args:
+            response: The response object to read from
+            url: URL being read (for logging)
+            max_size: Maximum bytes to read (defaults to MAX_RESPONSE_SIZE)
+
+        Returns:
+            Response text content
+
+        Raises:
+            ValueError: If response exceeds size limit
+        """
+        if max_size is None:
+            max_size = self.MAX_RESPONSE_SIZE
+
+        try:
+            chunks = []
+            total_size = 0
+
+            # Read response in chunks to enforce size limit
+            for chunk in response.iter_content(chunk_size=8192, decode_unicode=False):
+                if chunk:
+                    total_size += len(chunk)
+                    if total_size > max_size:
+                        response.close()
+                        size_mb = max_size / (1024 * 1024)
+                        logger.warning(
+                            f"Response size exceeded limit of {size_mb:.1f}MB for {url}, "
+                            f"received {total_size / (1024 * 1024):.1f}MB"
+                        )
+                        raise ValueError(f"Response size exceeds {size_mb:.1f}MB limit")
+                    chunks.append(chunk)
+
+            # Decode the complete response
+            content_bytes = b''.join(chunks)
+            # Try to decode with the response encoding or fall back to utf-8
+            encoding = response.encoding or 'utf-8'
+            try:
+                return content_bytes.decode(encoding)
+            except UnicodeDecodeError:
+                # Fallback to utf-8 with error replacement
+                return content_bytes.decode('utf-8', errors='replace')
+
+        except Exception:
+            # Ensure response is closed on any error
+            response.close()
+            raise
+
     def discover_files(self, base_url: str) -> str | None:
         """
         Main discovery orchestrator - selects ONE best file across all categories.
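A minimal usage sketch (not part of the diff) of how stream=True and the size-limited reader are meant to compose; the URL is hypothetical and the private helper is called directly only for illustration:

import requests
from src.server.services.crawling.discovery_service import DiscoveryService

service = DiscoveryService()
url = "https://example.com/llms-full.txt"  # hypothetical URL

# stream=True defers the body download, so an oversized response is never fully buffered
resp = requests.get(url, timeout=30, stream=True, verify=True)
try:
    text = None
    if resp.status_code == 200:
        try:
            text = service._read_response_with_limit(resp, url)
        except ValueError:
            text = None  # body exceeded MAX_RESPONSE_SIZE; skip this file
finally:
    resp.close()  # double-close is safe; the helper already closes on failure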
""" try: - resp = requests.get(url, timeout=5, allow_redirects=True) + resp = requests.get(url, timeout=5, allow_redirects=True, verify=True) success = resp.status_code == 200 logger.debug(f"URL check: {url} -> {resp.status_code} ({'exists' if success else 'not found'})") + resp.close() return success except Exception as e: logger.debug(f"URL check failed: {url} -> {e}") @@ -224,24 +304,35 @@ class DiscoveryService: robots_url = urljoin(base_url, "robots.txt") logger.info(f"Checking robots.txt at {robots_url}") - resp = requests.get(robots_url, timeout=30) + resp = requests.get(robots_url, timeout=30, stream=True, verify=True) - if resp.status_code != 200: - logger.info(f"No robots.txt found: HTTP {resp.status_code}") - return sitemaps + try: + if resp.status_code != 200: + logger.info(f"No robots.txt found: HTTP {resp.status_code}") + return sitemaps - # Parse robots.txt content for sitemap directives - for line in resp.text.splitlines(): - line = line.strip().lower() - if line.startswith("sitemap:"): - sitemap_url = line.split(":", 1)[1].strip() - # Validate URL format before adding - if sitemap_url and (sitemap_url.startswith('http://') or sitemap_url.startswith('https://')): - sitemaps.append(sitemap_url) - logger.info(f"Found sitemap in robots.txt: {sitemap_url}") + # Read response with size limit + content = self._read_response_with_limit(resp, robots_url) + + # Parse robots.txt content for sitemap directives + for line in content.splitlines(): + line = line.strip().lower() + if line.startswith("sitemap:"): + sitemap_url = line.split(":", 1)[1].strip() + # Validate URL format before adding + if sitemap_url and (sitemap_url.startswith('http://') or sitemap_url.startswith('https://')): + sitemaps.append(sitemap_url) + logger.info(f"Found sitemap in robots.txt: {sitemap_url}") + + finally: + # Ensure response is always closed + resp.close() except requests.exceptions.RequestException: logger.exception(f"Network error fetching robots.txt from {base_url}") + except ValueError as e: + # Size limit exceeded + logger.warning(f"robots.txt too large at {base_url}: {e}") except Exception: logger.exception(f"Unexpected error parsing robots.txt from {base_url}") @@ -274,18 +365,22 @@ class DiscoveryService: for target_type, filename in all_targets: try: file_url = urljoin(base_url, filename) - resp = requests.get(file_url, timeout=30, allow_redirects=True) + resp = requests.get(file_url, timeout=30, allow_redirects=True, stream=True, verify=True) - if resp.status_code == 200: - # Map target type to discovery category - if target_type == "sitemap_files": - discovered["sitemaps"].append(file_url) - elif target_type == "llms_files": - discovered["llms_files"].append(file_url) - elif target_type == "robots_files": - discovered["robots_files"].append(file_url) + try: + if resp.status_code == 200: + # Map target type to discovery category + if target_type == "sitemap_files": + discovered["sitemaps"].append(file_url) + elif target_type == "llms_files": + discovered["llms_files"].append(file_url) + elif target_type == "robots_files": + discovered["robots_files"].append(file_url) - logger.info(f"Found {target_type} file: {file_url}") + logger.info(f"Found {target_type} file: {file_url}") + + finally: + resp.close() except requests.exceptions.RequestException: logger.debug(f"File not found or network error: {filename}") @@ -311,37 +406,46 @@ class DiscoveryService: try: logger.info(f"Checking HTML meta tags for sitemaps at {base_url}") - resp = requests.get(base_url, timeout=30) + resp = 
@@ -311,37 +406,46 @@
         try:
             logger.info(f"Checking HTML meta tags for sitemaps at {base_url}")
 
-            resp = requests.get(base_url, timeout=30)
+            resp = requests.get(base_url, timeout=30, stream=True, verify=True)
 
-            if resp.status_code != 200:
-                logger.debug(f"Could not fetch HTML for meta tag parsing: HTTP {resp.status_code}")
-                return sitemaps
+            try:
+                if resp.status_code != 200:
+                    logger.debug(f"Could not fetch HTML for meta tag parsing: HTTP {resp.status_code}")
+                    return sitemaps
 
-            content = resp.text.lower()
+                # Read response with size limit
+                content = self._read_response_with_limit(resp, base_url)
+                content = content.lower()
 
-            # Look for sitemap meta tags or link elements
-            import re
+                # Look for sitemap meta tags or link elements
+                import re
 
-            # Check for <link rel="sitemap" href="..."> tags
-            sitemap_link_pattern = r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']'
-            matches = re.findall(sitemap_link_pattern, content)
+                # Check for <link rel="sitemap" href="..."> tags
+                sitemap_link_pattern = r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']'
+                matches = re.findall(sitemap_link_pattern, content)
 
-            for match in matches:
-                sitemap_url = urljoin(base_url, match)
-                sitemaps.append(sitemap_url)
-                logger.info(f"Found sitemap in HTML link tag: {sitemap_url}")
+                for match in matches:
+                    sitemap_url = urljoin(base_url, match)
+                    sitemaps.append(sitemap_url)
+                    logger.info(f"Found sitemap in HTML link tag: {sitemap_url}")
 
-            # Check for <meta name="sitemap" content="..."> tags
-            sitemap_meta_pattern = r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']'
-            matches = re.findall(sitemap_meta_pattern, content)
+                # Check for <meta name="sitemap" content="..."> tags
+                sitemap_meta_pattern = r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']'
+                matches = re.findall(sitemap_meta_pattern, content)
 
-            for match in matches:
-                sitemap_url = urljoin(base_url, match)
-                sitemaps.append(sitemap_url)
-                logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}")
+                for match in matches:
+                    sitemap_url = urljoin(base_url, match)
+                    sitemaps.append(sitemap_url)
+                    logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}")
+
+            finally:
+                resp.close()
 
         except requests.exceptions.RequestException:
             logger.exception(f"Network error fetching HTML from {base_url}")
+        except ValueError as e:
+            # Size limit exceeded
+            logger.warning(f"HTML response too large at {base_url}: {e}")
         except Exception:
             logger.exception(f"Unexpected error parsing HTML meta tags from {base_url}")
@@ -363,11 +467,15 @@
         for filename in self.DISCOVERY_TARGETS["well_known_files"]:
             try:
                 file_url = urljoin(base_url, filename)
-                resp = requests.get(file_url, timeout=30, allow_redirects=True)
+                resp = requests.get(file_url, timeout=30, allow_redirects=True, stream=True, verify=True)
 
-                if resp.status_code == 200:
-                    well_known_files.append(file_url)
-                    logger.info(f"Found .well-known file: {file_url}")
+                try:
+                    if resp.status_code == 200:
+                        well_known_files.append(file_url)
+                        logger.info(f"Found .well-known file: {file_url}")
+
+                finally:
+                    resp.close()
 
             except requests.exceptions.RequestException:
                 logger.debug(f"Well-known file not found or network error: {filename}")
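A quick illustration (not part of the diff) of the two patterns above against a made-up HTML fragment; the markup and URLs are hypothetical:

import re

html = '''
<link rel="sitemap" type="application/xml" href="/sitemap.xml">
<meta name="sitemap" content="https://example.com/meta-sitemap.xml">
'''.lower()

link_pattern = r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']'
meta_pattern = r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']'

print(re.findall(link_pattern, html))  # ['/sitemap.xml']
print(re.findall(meta_pattern, html))  # ['https://example.com/meta-sitemap.xml']

As with any regex-over-HTML approach, attribute order matters: rel must precede href and name must precede content for a match.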
@@ -403,11 +511,15 @@
             for llms_file in self.DISCOVERY_TARGETS["llms_files"]:
                 try:
                     file_url = urljoin(base_url, f"{subdir}/{llms_file}")
-                    resp = requests.get(file_url, timeout=30, allow_redirects=True)
+                    resp = requests.get(file_url, timeout=30, allow_redirects=True, stream=True, verify=True)
 
-                    if resp.status_code == 200:
-                        discovered["llms_files"].append(file_url)
-                        logger.info(f"Found llms file variant: {file_url}")
+                    try:
+                        if resp.status_code == 200:
+                            discovered["llms_files"].append(file_url)
+                            logger.info(f"Found llms file variant: {file_url}")
+
+                    finally:
+                        resp.close()
 
                 except requests.exceptions.RequestException:
                     logger.debug(f"Variant not found: {subdir}/{llms_file}")
@@ -425,11 +537,15 @@
         for sitemap_path in sitemap_paths:
             try:
                 file_url = urljoin(base_url, sitemap_path)
-                resp = requests.get(file_url, timeout=30, allow_redirects=True)
+                resp = requests.get(file_url, timeout=30, allow_redirects=True, stream=True, verify=True)
 
-                if resp.status_code == 200:
-                    discovered["sitemaps"].append(file_url)
-                    logger.info(f"Found sitemap variant: {file_url}")
+                try:
+                    if resp.status_code == 200:
+                        discovered["sitemaps"].append(file_url)
+                        logger.info(f"Found sitemap variant: {file_url}")
+
+                finally:
+                    resp.close()
 
             except requests.exceptions.RequestException:
                 logger.debug(f"Sitemap variant not found: {sitemap_path}")
diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py
index f9955e4d..fa79ebe3 100644
--- a/python/src/server/services/crawling/helpers/url_handler.py
+++ b/python/src/server/services/crawling/helpers/url_handler.py
@@ -6,6 +6,7 @@ Handles URL transformations and validations.
 
 import hashlib
 import re
+from typing import List, Optional
 from urllib.parse import urljoin, urlparse
 
 from ....config.logfire_config import get_logger
diff --git a/python/tests/progress_tracking/test_progress_mapper.py b/python/tests/progress_tracking/test_progress_mapper.py
index 6492cf64..37532f88 100644
--- a/python/tests/progress_tracking/test_progress_mapper.py
+++ b/python/tests/progress_tracking/test_progress_mapper.py
@@ -265,10 +265,10 @@ class TestProgressMapper:
         # Code extraction (longest phase)
         assert mapper.map_progress("code_extraction", 0) == 40
         progress_25 = mapper.map_progress("code_extraction", 25)
-        assert progress_25 in [52, 53]  # 40 + (25% of 50) = 52.5, could round to 52 or 53
+        assert progress_25 in [52, 53]  # 40 + (25% of 50) = 52.5, banker's rounding rounds to 52 (even)
         assert mapper.map_progress("code_extraction", 50) == 65  # 40 + (50% of 50) = 65
         progress_75 = mapper.map_progress("code_extraction", 75)
-        assert progress_75 in [77, 78]  # 40 + (75% of 50) = 77.5, could round to 77 or 78
+        assert progress_75 == 78  # 40 + (75% of 50) = 77.5 -> 78 (rounds to even per banker's rounding)
         assert mapper.map_progress("code_extraction", 100) == 90
 
         # Finalization
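For context on the tightened assertions: Python 3's built-in round() uses round-half-to-even (banker's rounding), so both half-way values resolve deterministically; by the same reasoning the 25% case could be pinned to == 52.

print(round(40 + 0.25 * 50))  # 52.5 -> 52 (tie goes to the even neighbour)
print(round(40 + 0.75 * 50))  # 77.5 -> 78 (even neighbour)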
diff --git a/python/tests/test_discovery_service.py b/python/tests/test_discovery_service.py
index 47915362..c815c29b 100644
--- a/python/tests/test_discovery_service.py
+++ b/python/tests/test_discovery_service.py
@@ -4,6 +4,27 @@ from unittest.mock import Mock, patch
 
 from src.server.services.crawling.discovery_service import DiscoveryService
 
 
+def create_mock_response(status_code: int, text: str = "") -> Mock:
+    """Create a mock response object that supports streaming API."""
+    response = Mock()
+    response.status_code = status_code
+    response.text = text
+    response.encoding = 'utf-8'
+
+    # Mock iter_content to yield text in chunks as bytes
+    text_bytes = text.encode('utf-8')
+    chunk_size = 8192
+    chunks = [text_bytes[i:i+chunk_size] for i in range(0, len(text_bytes), chunk_size)]
+    if not chunks:
+        chunks = [b'']  # Ensure at least one empty chunk
+    response.iter_content = Mock(return_value=iter(chunks))
+
+    # Mock close method
+    response.close = Mock()
+
+    return response
+
+
 class TestDiscoveryService:
     """Test suite for DiscoveryService class."""
@@ -14,22 +35,18 @@
         base_url = "https://example.com"
 
         # Mock robots.txt response (no sitemaps)
-        robots_response = Mock()
-        robots_response.status_code = 200
-        robots_response.text = "User-agent: *\nDisallow: /admin/"
+        robots_response = create_mock_response(200, "User-agent: *\nDisallow: /admin/")
 
         # Mock file existence - llms-full.txt doesn't exist, but llms.txt does
         def mock_get_side_effect(url, **kwargs):
-            response = Mock()
             if url.endswith('robots.txt'):
                 return robots_response
             elif url.endswith('llms-full.txt'):
-                response.status_code = 404  # Highest priority doesn't exist
+                return create_mock_response(404)  # Highest priority doesn't exist
             elif url.endswith('llms.txt'):
-                response.status_code = 200  # Second priority exists
+                return create_mock_response(200)  # Second priority exists
             else:
-                response.status_code = 404
-            return response
+                return create_mock_response(404)
 
         mock_get.side_effect = mock_get_side_effect
@@ -46,9 +63,7 @@
         base_url = "https://example.com"
 
         # Mock all HTTP requests to return 404
-        mock_response = Mock()
-        mock_response.status_code = 404
-        mock_get.return_value = mock_response
+        mock_get.return_value = create_mock_response(404)
 
         result = service.discover_files(base_url)
@@ -62,20 +77,16 @@
         base_url = "https://example.com"
 
         # Mock robots.txt response (no sitemaps declared)
-        robots_response = Mock()
-        robots_response.status_code = 200
-        robots_response.text = "User-agent: *\nDisallow: /admin/"
+        robots_response = create_mock_response(200, "User-agent: *\nDisallow: /admin/")
 
         # Mock file existence - both sitemap.xml and llms.txt exist, but llms.txt has higher priority
         def mock_get_side_effect(url, **kwargs):
-            response = Mock()
             if url.endswith('robots.txt'):
                 return robots_response
             elif url.endswith('llms.txt') or url.endswith('sitemap.xml'):
-                response.status_code = 200  # Both exist
+                return create_mock_response(200)  # Both exist
             else:
-                response.status_code = 404
-            return response
+                return create_mock_response(404)
 
         mock_get.side_effect = mock_get_side_effect
@@ -91,20 +102,16 @@
         base_url = "https://example.com"
 
         # Mock robots.txt response WITH sitemap declaration
-        robots_response = Mock()
-        robots_response.status_code = 200
-        robots_response.text = "User-agent: *\nSitemap: https://example.com/declared-sitemap.xml"
+        robots_response = create_mock_response(200, "User-agent: *\nSitemap: https://example.com/declared-sitemap.xml")
 
         # Mock other files also exist
         def mock_get_side_effect(url, **kwargs):
-            response = Mock()
             if url.endswith('robots.txt'):
                 return robots_response
             elif 'llms' in url or 'sitemap' in url:
-                response.status_code = 200
+                return create_mock_response(200)
             else:
-                response.status_code = 404
-            return response
+                return create_mock_response(404)
 
         mock_get.side_effect = mock_get_side_effect
@@ -120,20 +127,16 @@
         base_url = "https://example.com"
 
         # Mock robots.txt response (no sitemaps declared)
-        robots_response = Mock()
-        robots_response.status_code = 200
-        robots_response.text = "User-agent: *\nDisallow: /admin/"
+        robots_response = create_mock_response(200, "User-agent: *\nDisallow: /admin/")
 
         # Mock file existence - no root llms files, but static/llms.txt exists
        def mock_get_side_effect(url, **kwargs):
-            response = Mock()
             if url.endswith('robots.txt'):
                 return robots_response
             elif '/static/llms.txt' in url:
-                response.status_code = 200  # Found in subdirectory
+                return create_mock_response(200)  # Found in subdirectory
             else:
-                response.status_code = 404
-            return response
+                return create_mock_response(404)
 
         mock_get.side_effect = mock_get_side_effect
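One gap in the updated tests: nothing exercises the size limit itself. A possible addition (the test name and the 512-byte override are illustrative), reusing the streaming-aware mock helper defined at the top of this module:

import pytest

def test_read_response_with_limit_rejects_oversized_body():
    service = DiscoveryService()
    oversized = create_mock_response(200, "x" * 1024)  # 1 KB body

    with pytest.raises(ValueError):
        # max_size is overridden so the test stays small; production uses MAX_RESPONSE_SIZE
        service._read_response_with_limit(oversized, "https://example.com/llms.txt", max_size=512)

    oversized.close.assert_called()  # the helper closes the response when the limit is hit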
@@ -148,14 +151,11 @@
         service = DiscoveryService()
 
         # Test successful response
-        mock_response = Mock()
-        mock_response.status_code = 200
-        mock_get.return_value = mock_response
-
+        mock_get.return_value = create_mock_response(200)
         assert service._check_url_exists("https://example.com/exists") is True
 
         # Test 404 response
-        mock_response.status_code = 404
+        mock_get.return_value = create_mock_response(404)
         assert service._check_url_exists("https://example.com/not-found") is False
 
         # Test network error
@@ -168,20 +168,18 @@
         service = DiscoveryService()
 
         # Mock successful robots.txt response
-        mock_response = Mock()
-        mock_response.status_code = 200
-        mock_response.text = """User-agent: *
+        robots_text = """User-agent: *
 Disallow: /admin/
 Sitemap: https://example.com/sitemap.xml
 Sitemap: https://example.com/sitemap-news.xml"""
-        mock_get.return_value = mock_response
+        mock_get.return_value = create_mock_response(200, robots_text)
 
         result = service._parse_robots_txt("https://example.com")
 
         assert len(result) == 2
         assert "https://example.com/sitemap.xml" in result
         assert "https://example.com/sitemap-news.xml" in result
-        mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30)
+        mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30, stream=True, verify=True)
 
     @patch('requests.get')
     def test_parse_robots_txt_no_sitemap(self, mock_get):
@@ -189,17 +187,15 @@
         service = DiscoveryService()
 
         # Mock robots.txt without sitemaps
-        mock_response = Mock()
-        mock_response.status_code = 200
-        mock_response.text = """User-agent: *
+        robots_text = """User-agent: *
 Disallow: /admin/
 Allow: /public/"""
-        mock_get.return_value = mock_response
+        mock_get.return_value = create_mock_response(200, robots_text)
 
         result = service._parse_robots_txt("https://example.com")
 
         assert len(result) == 0
-        mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30)
+        mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30, stream=True, verify=True)
 
     @patch('requests.get')
     def test_parse_html_meta_tags(self, mock_get):
@@ -207,9 +203,7 @@ Allow: /public/"""
         service = DiscoveryService()
 
         # Mock HTML with sitemap references
-        mock_response = Mock()
-        mock_response.status_code = 200
-        mock_response.text = """
+        html_content = """
@@ -218,31 +212,79 @@ Allow: /public/"""
 Content here
 """
-        mock_get.return_value = mock_response
+        mock_get.return_value = create_mock_response(200, html_content)
 
         result = service._parse_html_meta_tags("https://example.com")
 
         # Should find sitemaps from both link and meta tags
         assert len(result) >= 1
         assert any('sitemap' in url.lower() for url in result)
-        mock_get.assert_called_once_with("https://example.com", timeout=30)
+        mock_get.assert_called_once_with("https://example.com", timeout=30, stream=True, verify=True)
 
-    def test_discovery_priority_constant(self):
-        """Test that discovery priority constant is properly defined."""
+    @patch('requests.get')
+    def test_discovery_priority_behavior(self, mock_get):
+        """Test that discovery returns highest-priority file when multiple files exist."""
         service = DiscoveryService()
+        base_url = "https://example.com"
 
-        # Verify the priority list exists and has expected order
-        assert hasattr(service, 'DISCOVERY_PRIORITY')
-        assert isinstance(service.DISCOVERY_PRIORITY, list)
-        assert len(service.DISCOVERY_PRIORITY) > 0
+        # Mock robots.txt response (no sitemaps declared)
+        robots_response = create_mock_response(200, "User-agent: *\nDisallow: /admin/")
 
-        # Verify llms-full.txt is first (highest priority)
-        assert service.DISCOVERY_PRIORITY[0] == 'llms-full.txt'
+        # Scenario 1: All files exist - should return llms-full.txt (highest priority)
+        def mock_all_exist(url, **kwargs):
+            if url.endswith('robots.txt'):
+                return robots_response
+            elif any(file in url for file in ['llms-full.txt', 'llms.txt', 'llms.md', 'sitemap.xml', 'sitemap_index.xml']):
+                return create_mock_response(200)
+            else:
+                return create_mock_response(404)
 
-        # Verify llms.txt comes before sitemap files
-        llms_txt_index = service.DISCOVERY_PRIORITY.index('llms.txt')
-        sitemap_index = service.DISCOVERY_PRIORITY.index('sitemap.xml')
-        assert llms_txt_index < sitemap_index
+        mock_get.side_effect = mock_all_exist
+        result = service.discover_files(base_url)
+        assert result == 'https://example.com/llms-full.txt', "Should return llms-full.txt when all files exist"
+
+        # Scenario 2: llms-full.txt missing, others exist - should return llms.txt
+        def mock_without_full(url, **kwargs):
+            if url.endswith('robots.txt'):
+                return robots_response
+            elif url.endswith('llms-full.txt'):
+                return create_mock_response(404)
+            elif any(file in url for file in ['llms.txt', 'llms.md', 'sitemap.xml']):
+                return create_mock_response(200)
+            else:
+                return create_mock_response(404)
+
+        mock_get.side_effect = mock_without_full
+        result = service.discover_files(base_url)
+        assert result == 'https://example.com/llms.txt', "Should return llms.txt when llms-full.txt is missing"
+
+        # Scenario 3: Only sitemap files exist - should return sitemap_index.xml over sitemap.xml
+        def mock_only_sitemaps(url, **kwargs):
+            if url.endswith('robots.txt'):
+                return robots_response
+            elif any(file in url for file in ['llms-full.txt', 'llms.txt', 'llms.md']):
+                return create_mock_response(404)
+            elif any(file in url for file in ['sitemap_index.xml', 'sitemap.xml']):
+                return create_mock_response(200)
+            else:
+                return create_mock_response(404)
+
+        mock_get.side_effect = mock_only_sitemaps
+        result = service.discover_files(base_url)
+        assert result == 'https://example.com/sitemap_index.xml', "Should return sitemap_index.xml when llms files are missing"
+
+        # Scenario 4: llms files have priority over sitemap files
+        def mock_llms_and_sitemap(url, **kwargs):
+            if url.endswith('robots.txt'):
+                return robots_response
+            elif url.endswith('llms.txt') or url.endswith('sitemap.xml'):
+                return create_mock_response(200)
+            else:
+                return create_mock_response(404)
+
+        mock_get.side_effect = mock_llms_and_sitemap
+        result = service.discover_files(base_url)
+        assert result == 'https://example.com/llms.txt', "Should prefer llms.txt over sitemap.xml"
 
     @patch('requests.get')
     def test_network_error_handling(self, mock_get):
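For the root-level files exercised above, the four scenarios reduce to one invariant: discovery returns the first DISCOVERY_PRIORITY entry whose URL actually exists. A property-style restatement (the helper name and the set of existing files are illustrative), assuming the DiscoveryService import at the top of this test module:

from urllib.parse import urljoin

def expected_discovery(base_url: str, existing: set[str]) -> str | None:
    """First priority entry that exists, as an absolute URL; None when nothing matches."""
    for filename in DiscoveryService.DISCOVERY_PRIORITY:
        if filename in existing:
            return urljoin(base_url, filename)
    return None

# expected_discovery("https://example.com", {"llms.txt", "sitemap.xml"})
# -> 'https://example.com/llms.txt'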