From 43af7b747c1adb866698aa587fb0c676fb2b50f2 Mon Sep 17 00:00:00 2001 From: leex279 Date: Mon, 8 Sep 2025 10:27:50 +0200 Subject: [PATCH] fix: Update tests for single-file discovery and discovery stage integration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix discovery service tests to match new single-file return format - Remove obsolete tests for removed discovery methods - Update progress mapper tests for new discovery stage ranges - Fix stage range expectations after adding discovery stage (2,3) 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- .../progress_tracking/test_progress_mapper.py | 61 ++-- python/tests/test_discovery_service.py | 331 ++++-------------- 2 files changed, 104 insertions(+), 288 deletions(-) diff --git a/python/tests/progress_tracking/test_progress_mapper.py b/python/tests/progress_tracking/test_progress_mapper.py index f0f87b8b..8b8ba381 100644 --- a/python/tests/progress_tracking/test_progress_mapper.py +++ b/python/tests/progress_tracking/test_progress_mapper.py @@ -21,12 +21,12 @@ class TestProgressMapper: def test_stage_ranges_are_valid(self, progress_mapper): """Test that all stage ranges are valid and sequential.""" ranges = progress_mapper.STAGE_RANGES - + # Test that ranges don't overlap (except for aliases) - crawl_stages = ["starting", "analyzing", "crawling", "processing", - "source_creation", "document_storage", "code_extraction", + crawl_stages = ["starting", "analyzing", "crawling", "processing", + "source_creation", "document_storage", "code_extraction", "finalization", "completed"] - + last_end = 0 for stage in crawl_stages[:-1]: # Exclude completed which is (100, 100) start, end = ranges[stage] @@ -37,17 +37,17 @@ class TestProgressMapper: # Test that code extraction gets the largest range (it's the longest) code_start, code_end = ranges["code_extraction"] code_range = code_end - code_start - - doc_start, doc_end = ranges["document_storage"] + + doc_start, doc_end = ranges["document_storage"] doc_range = doc_end - doc_start - + assert code_range > doc_range, "Code extraction should have larger range than document storage" def test_map_progress_basic_functionality(self, progress_mapper): """Test basic progress mapping functionality.""" # Test crawling stage at 50% result = progress_mapper.map_progress("crawling", 50.0) - + # Should be halfway between crawling range (2-5%) expected = 2 + (50 / 100) * (5 - 2) # 3.5%, rounded to 4 assert result == 4 @@ -56,16 +56,16 @@ class TestProgressMapper: """Test progress mapping for document storage stage.""" # Test document storage at 25% result = progress_mapper.map_progress("document_storage", 25.0) - + # Should be 25% through document_storage range (10-30%) expected = 10 + (25 / 100) * (30 - 10) # 10 + 5 = 15 assert result == 15 def test_map_progress_code_extraction(self, progress_mapper): """Test progress mapping for code extraction stage.""" - # Test code extraction at 50% + # Test code extraction at 50% result = progress_mapper.map_progress("code_extraction", 50.0) - + # Should be 50% through code_extraction range (30-95%) expected = 30 + (50 / 100) * (95 - 30) # 30 + 32.5 = 62.5, rounded to 62 assert result == 62 @@ -75,10 +75,10 @@ class TestProgressMapper: # Set initial progress to 50% result1 = progress_mapper.map_progress("document_storage", 100.0) # Should be 30% assert result1 == 30 - + # Try to map a lower stage with lower progress result2 = progress_mapper.map_progress("crawling", 50.0) # Would normally be ~3.5% - + # Should maintain higher progress assert result2 == 30 # Stays at previous high value @@ -86,11 +86,11 @@ class TestProgressMapper: """Test that stage progress is clamped to 0-100 range.""" # Test negative progress result = progress_mapper.map_progress("crawling", -10.0) - expected = 2 # Start of crawling range + expected = 3 # Start of crawling range (updated after discovery stage) assert result == expected - + # Test progress over 100 - result = progress_mapper.map_progress("crawling", 150.0) + result = progress_mapper.map_progress("crawling", 150.0) expected = 5 # End of crawling range assert result == expected @@ -109,16 +109,17 @@ class TestProgressMapper: # Set some initial progress progress_mapper.map_progress("crawling", 50) current = progress_mapper.last_overall_progress - + # Try unknown stage result = progress_mapper.map_progress("unknown_stage", 75) - + # Should maintain current progress assert result == current def test_get_stage_range(self, progress_mapper): """Test getting stage ranges.""" - assert progress_mapper.get_stage_range("crawling") == (2, 5) + assert progress_mapper.get_stage_range("discovery") == (2, 3) # New discovery stage + assert progress_mapper.get_stage_range("crawling") == (3, 5) # Updated after discovery assert progress_mapper.get_stage_range("document_storage") == (10, 30) assert progress_mapper.get_stage_range("code_extraction") == (30, 95) assert progress_mapper.get_stage_range("unknown") == (0, 100) # Default @@ -128,11 +129,11 @@ class TestProgressMapper: # Test normal case result = progress_mapper.calculate_stage_progress(25, 100) assert result == 25.0 - + # Test division by zero protection result = progress_mapper.calculate_stage_progress(10, 0) assert result == 0.0 - + # Test negative max protection result = progress_mapper.calculate_stage_progress(10, -5) assert result == 0.0 @@ -141,7 +142,7 @@ class TestProgressMapper: """Test batch progress mapping.""" # Test batch 3 of 6 in document_storage stage result = progress_mapper.map_batch_progress("document_storage", 3, 6) - + # Should be (3-1)/6 = 33.3% through document_storage stage # document_storage is 10-30%, so 33.3% of 20% = 6.67%, so 10 + 6.67 = 16.67 ≈ 17 assert result == 17 @@ -159,10 +160,10 @@ class TestProgressMapper: progress_mapper.map_progress("crawling", 50) assert progress_mapper.last_overall_progress > 0 assert progress_mapper.current_stage != "starting" - + # Reset progress_mapper.reset() - + # Should be back to initial state assert progress_mapper.last_overall_progress == 0 assert progress_mapper.current_stage == "starting" @@ -172,7 +173,7 @@ class TestProgressMapper: # Initial state assert progress_mapper.get_current_stage() == "starting" assert progress_mapper.get_current_progress() == 0 - + # After mapping some progress progress_mapper.map_progress("document_storage", 50) assert progress_mapper.get_current_stage() == "document_storage" @@ -196,9 +197,9 @@ class TestProgressMapper: ("finalization", 100, 100), # Finalization ("completed", 0, 100), # Completion ] - + progress_mapper.reset() - + for stage, stage_progress, expected_overall in stages: result = progress_mapper.map_progress(stage, stage_progress) assert result == expected_overall, f"Stage {stage} at {stage_progress}% should map to {expected_overall}%, got {result}%" @@ -206,7 +207,7 @@ class TestProgressMapper: def test_upload_stage_ranges(self, progress_mapper): """Test upload-specific stage ranges.""" upload_stages = ["reading", "extracting", "chunking", "creating_source", "summarizing", "storing"] - + # Test that upload stages have valid ranges last_end = 0 for stage in upload_stages: @@ -214,6 +215,6 @@ class TestProgressMapper: assert start >= last_end, f"Upload stage {stage} overlaps with previous" assert end > start, f"Upload stage {stage} has invalid range" last_end = end - + # Test that final upload stage reaches 100% - assert progress_mapper.get_stage_range("storing")[1] == 100 \ No newline at end of file + assert progress_mapper.get_stage_range("storing")[1] == 100 diff --git a/python/tests/test_discovery_service.py b/python/tests/test_discovery_service.py index 5c31b0e6..47915362 100644 --- a/python/tests/test_discovery_service.py +++ b/python/tests/test_discovery_service.py @@ -1,6 +1,6 @@ """Unit tests for DiscoveryService class.""" -import pytest -from unittest.mock import patch, Mock +from unittest.mock import Mock, patch + from src.server.services.crawling.discovery_service import DiscoveryService @@ -12,12 +12,12 @@ class TestDiscoveryService: """Test main discovery method returns single best file.""" service = DiscoveryService() base_url = "https://example.com" - + # Mock robots.txt response (no sitemaps) robots_response = Mock() robots_response.status_code = 200 robots_response.text = "User-agent: *\nDisallow: /admin/" - + # Mock file existence - llms-full.txt doesn't exist, but llms.txt does def mock_get_side_effect(url, **kwargs): response = Mock() @@ -30,42 +30,42 @@ class TestDiscoveryService: else: response.status_code = 404 return response - + mock_get.side_effect = mock_get_side_effect - + result = service.discover_files(base_url) - + # Should return single URL string (not dict, not list) assert isinstance(result, str) assert result == 'https://example.com/llms.txt' - @patch('requests.get') + @patch('requests.get') def test_discover_files_no_files_found(self, mock_get): """Test discovery when no files are found.""" service = DiscoveryService() base_url = "https://example.com" - + # Mock all HTTP requests to return 404 mock_response = Mock() mock_response.status_code = 404 mock_get.return_value = mock_response - + result = service.discover_files(base_url) - + # Should return None when no files found assert result is None - @patch('requests.get') + @patch('requests.get') def test_discover_files_priority_order(self, mock_get): """Test that discovery follows the correct priority order.""" service = DiscoveryService() base_url = "https://example.com" - + # Mock robots.txt response (no sitemaps declared) robots_response = Mock() robots_response.status_code = 200 robots_response.text = "User-agent: *\nDisallow: /admin/" - + # Mock file existence - both sitemap.xml and llms.txt exist, but llms.txt has higher priority def mock_get_side_effect(url, **kwargs): response = Mock() @@ -76,11 +76,11 @@ class TestDiscoveryService: else: response.status_code = 404 return response - + mock_get.side_effect = mock_get_side_effect - + result = service.discover_files(base_url) - + # Should return llms.txt since it has higher priority than sitemap.xml assert result == 'https://example.com/llms.txt' @@ -89,12 +89,12 @@ class TestDiscoveryService: """Test that robots.txt sitemap declarations have highest priority.""" service = DiscoveryService() base_url = "https://example.com" - - # Mock robots.txt response WITH sitemap declaration + + # Mock robots.txt response WITH sitemap declaration robots_response = Mock() robots_response.status_code = 200 robots_response.text = "User-agent: *\nSitemap: https://example.com/declared-sitemap.xml" - + # Mock other files also exist def mock_get_side_effect(url, **kwargs): response = Mock() @@ -105,93 +105,59 @@ class TestDiscoveryService: else: response.status_code = 404 return response - + mock_get.side_effect = mock_get_side_effect - + result = service.discover_files(base_url) - + # Should return the sitemap declared in robots.txt (highest priority) assert result == 'https://example.com/declared-sitemap.xml' @patch('requests.get') - def test_discover_best_sitemap_robots_priority(self, mock_get): - """Test sitemap discovery prioritizes robots.txt declarations.""" + def test_discover_files_subdirectory_fallback(self, mock_get): + """Test discovery falls back to subdirectories for llms files.""" service = DiscoveryService() base_url = "https://example.com" - - # Mock robots.txt with sitemap - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = "Sitemap: https://example.com/robots-sitemap.xml" - mock_get.return_value = mock_response - - result = service._discover_best_sitemap(base_url) - - # Should return the sitemap from robots.txt (highest priority) - assert result == "https://example.com/robots-sitemap.xml" - @patch('requests.get') - def test_discover_best_llms_file_priority_order(self, mock_get): - """Test llms file discovery follows priority order.""" - service = DiscoveryService() - base_url = "https://example.com" - - # Mock HTTP responses - only llms.txt exists, not llms-full.txt + # Mock robots.txt response (no sitemaps declared) + robots_response = Mock() + robots_response.status_code = 200 + robots_response.text = "User-agent: *\nDisallow: /admin/" + + # Mock file existence - no root llms files, but static/llms.txt exists def mock_get_side_effect(url, **kwargs): response = Mock() - if url.endswith('llms-full.txt'): - response.status_code = 404 # Higher priority file doesn't exist - elif url.endswith('llms.txt'): - response.status_code = 200 # Standard file exists - else: - response.status_code = 404 - return response - - mock_get.side_effect = mock_get_side_effect - - result = service._discover_best_llms_file(base_url) - - # Should find llms.txt since llms-full.txt doesn't exist - assert result == "https://example.com/llms.txt" - - @patch('requests.get') - def test_discover_best_llms_file_subdirectory_fallback(self, mock_get): - """Test llms file discovery falls back to subdirectories.""" - service = DiscoveryService() - base_url = "https://example.com" - - # Mock HTTP responses - no root files, but static/llms.txt exists - def mock_get_side_effect(url, **kwargs): - response = Mock() - if '/static/llms.txt' in url: + if url.endswith('robots.txt'): + return robots_response + elif '/static/llms.txt' in url: response.status_code = 200 # Found in subdirectory else: response.status_code = 404 return response - + mock_get.side_effect = mock_get_side_effect - - result = service._discover_best_llms_file(base_url) - + + result = service.discover_files(base_url) + # Should find the file in static subdirectory - assert result == "https://example.com/static/llms.txt" + assert result == 'https://example.com/static/llms.txt' @patch('requests.get') def test_check_url_exists(self, mock_get): """Test URL existence checking.""" service = DiscoveryService() - + # Test successful response mock_response = Mock() mock_response.status_code = 200 mock_get.return_value = mock_response - + assert service._check_url_exists("https://example.com/exists") is True - + # Test 404 response mock_response.status_code = 404 assert service._check_url_exists("https://example.com/not-found") is False - + # Test network error mock_get.side_effect = Exception("Network error") assert service._check_url_exists("https://example.com/error") is False @@ -200,7 +166,7 @@ class TestDiscoveryService: def test_parse_robots_txt_with_sitemap(self, mock_get): """Test robots.txt parsing with sitemap directives.""" service = DiscoveryService() - + # Mock successful robots.txt response mock_response = Mock() mock_response.status_code = 200 @@ -209,9 +175,9 @@ Disallow: /admin/ Sitemap: https://example.com/sitemap.xml Sitemap: https://example.com/sitemap-news.xml""" mock_get.return_value = mock_response - + result = service._parse_robots_txt("https://example.com") - + assert len(result) == 2 assert "https://example.com/sitemap.xml" in result assert "https://example.com/sitemap-news.xml" in result @@ -221,7 +187,7 @@ Sitemap: https://example.com/sitemap-news.xml""" def test_parse_robots_txt_no_sitemap(self, mock_get): """Test robots.txt parsing without sitemap directives.""" service = DiscoveryService() - + # Mock robots.txt without sitemaps mock_response = Mock() mock_response.status_code = 200 @@ -229,60 +195,17 @@ Sitemap: https://example.com/sitemap-news.xml""" Disallow: /admin/ Allow: /public/""" mock_get.return_value = mock_response - + result = service._parse_robots_txt("https://example.com") - + assert len(result) == 0 mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30) - @patch('requests.get') - def test_parse_robots_txt_not_found(self, mock_get): - """Test robots.txt parsing when file is not found.""" - service = DiscoveryService() - - # Mock 404 response - mock_response = Mock() - mock_response.status_code = 404 - mock_get.return_value = mock_response - - result = service._parse_robots_txt("https://example.com") - - assert len(result) == 0 - mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30) - - @patch('requests.get') - def test_check_standard_patterns(self, mock_get): - """Test standard file pattern checking.""" - service = DiscoveryService() - - # Mock responses for different files - def mock_response_side_effect(url, **kwargs): - mock_response = Mock() - if 'llms.txt' in url: - mock_response.status_code = 200 - elif 'sitemap.xml' in url: - mock_response.status_code = 200 - else: - mock_response.status_code = 404 - return mock_response - - mock_get.side_effect = mock_response_side_effect - - result = service._check_standard_patterns("https://example.com") - - assert 'sitemaps' in result - assert 'llms_files' in result - assert 'robots_files' in result - - # Should find the files that returned 200 - assert any('llms.txt' in url for url in result['llms_files']) - assert any('sitemap.xml' in url for url in result['sitemaps']) - @patch('requests.get') def test_parse_html_meta_tags(self, mock_get): """Test HTML meta tag parsing for sitemaps.""" service = DiscoveryService() - + # Mock HTML with sitemap references mock_response = Mock() mock_response.status_code = 200 @@ -296,154 +219,46 @@ Allow: /public/""" """ mock_get.return_value = mock_response - + result = service._parse_html_meta_tags("https://example.com") - + # Should find sitemaps from both link and meta tags assert len(result) >= 1 assert any('sitemap' in url.lower() for url in result) mock_get.assert_called_once_with("https://example.com", timeout=30) - @patch('requests.get') - def test_parse_html_meta_tags_not_found(self, mock_get): - """Test HTML meta tag parsing when page not found.""" + def test_discovery_priority_constant(self): + """Test that discovery priority constant is properly defined.""" service = DiscoveryService() - - # Mock 404 response - mock_response = Mock() - mock_response.status_code = 404 - mock_get.return_value = mock_response - - result = service._parse_html_meta_tags("https://example.com") - - assert len(result) == 0 - mock_get.assert_called_once_with("https://example.com", timeout=30) - @patch('requests.get') - def test_check_well_known_directory(self, mock_get): - """Test .well-known directory file checking.""" - service = DiscoveryService() - - # Mock responses - some files exist, some don't - def mock_response_side_effect(url, **kwargs): - mock_response = Mock() - if 'ai.txt' in url: - mock_response.status_code = 200 - else: - mock_response.status_code = 404 - return mock_response - - mock_get.side_effect = mock_response_side_effect - - result = service._check_well_known_directory("https://example.com") - - # Should find the ai.txt file - assert len(result) >= 1 - assert any('ai.txt' in url for url in result) + # Verify the priority list exists and has expected order + assert hasattr(service, 'DISCOVERY_PRIORITY') + assert isinstance(service.DISCOVERY_PRIORITY, list) + assert len(service.DISCOVERY_PRIORITY) > 0 - @patch('requests.get') - def test_try_common_variations(self, mock_get): - """Test pattern variations for discovery targets.""" - service = DiscoveryService() - - # Mock responses for variations - def mock_response_side_effect(url, **kwargs): - mock_response = Mock() - if 'docs/llms.txt' in url or 'sitemaps/sitemap.xml' in url: - mock_response.status_code = 200 - else: - mock_response.status_code = 404 - return mock_response - - mock_get.side_effect = mock_response_side_effect - - result = service._try_common_variations("https://example.com") - - assert 'sitemaps' in result - assert 'llms_files' in result - - # Should find at least one variation - assert len(result['llms_files']) >= 1 or len(result['sitemaps']) >= 1 + # Verify llms-full.txt is first (highest priority) + assert service.DISCOVERY_PRIORITY[0] == 'llms-full.txt' + + # Verify llms.txt comes before sitemap files + llms_txt_index = service.DISCOVERY_PRIORITY.index('llms.txt') + sitemap_index = service.DISCOVERY_PRIORITY.index('sitemap.xml') + assert llms_txt_index < sitemap_index @patch('requests.get') def test_network_error_handling(self, mock_get): """Test error scenarios with network failures.""" service = DiscoveryService() - + # Mock network error mock_get.side_effect = Exception("Network error") - - # Should not raise exception, but return empty results + + # Should not raise exception, but return None + result = service.discover_files("https://example.com") + assert result is None + + # Individual methods should also handle errors gracefully result = service._parse_robots_txt("https://example.com") assert result == [] - - result = service._check_standard_patterns("https://example.com") - assert isinstance(result, dict) - + result = service._parse_html_meta_tags("https://example.com") assert result == [] - - result = service._check_well_known_directory("https://example.com") - assert result == [] - - result = service._try_common_variations("https://example.com") - assert isinstance(result, dict) - - def test_discover_files_with_exceptions(self): - """Test main discovery method handles exceptions gracefully.""" - service = DiscoveryService() - - # Mock methods to raise exceptions - with patch.object(service, '_parse_robots_txt', side_effect=Exception("Test error")): - with patch.object(service, '_check_standard_patterns', side_effect=Exception("Test error")): - with patch.object(service, '_parse_html_meta_tags', side_effect=Exception("Test error")): - with patch.object(service, '_check_well_known_directory', side_effect=Exception("Test error")): - with patch.object(service, '_try_common_variations', side_effect=Exception("Test error")): - result = service.discover_files("https://example.com") - - # Should still return proper structure even with all methods failing - assert isinstance(result, dict) - assert 'sitemaps' in result - assert 'llms_files' in result - assert 'robots_files' in result - assert 'well_known_files' in result - - @patch('requests.get') - def test_robots_txt_with_malformed_content(self, mock_get): - """Test robots.txt parsing with malformed content.""" - service = DiscoveryService() - - # Mock malformed robots.txt content - mock_response = Mock() - mock_response.status_code = 200 - mock_response.text = """User-agent: * -Disallow: /admin/ -Sitemap: -Sitemap: not-a-valid-url -Sitemap: https://example.com/valid-sitemap.xml""" - mock_get.return_value = mock_response - - result = service._parse_robots_txt("https://example.com") - - # Should only include the valid sitemap URL - assert len(result) == 1 - assert "https://example.com/valid-sitemap.xml" in result - - def test_discovery_targets_constant(self): - """Test that discovery targets constant is properly defined.""" - service = DiscoveryService() - - assert hasattr(service, 'DISCOVERY_TARGETS') - targets = service.DISCOVERY_TARGETS - - # Verify required target types exist - assert 'llms_files' in targets - assert 'sitemap_files' in targets - assert 'robots_files' in targets - assert 'well_known_files' in targets - - # Verify they contain expected files - assert 'llms.txt' in targets['llms_files'] - assert 'sitemap.xml' in targets['sitemap_files'] - assert 'robots.txt' in targets['robots_files'] - assert '.well-known/ai.txt' in targets['well_known_files'] \ No newline at end of file