mirror of
https://github.com/coleam00/Archon.git
synced 2025-12-30 21:49:30 -05:00
fix: Update tests for single-file discovery and discovery stage integration
- Fix discovery service tests to match new single-file return format
- Remove obsolete tests for removed discovery methods
- Update progress mapper tests for new discovery stage ranges
- Fix stage range expectations after adding discovery stage (2,3)

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -21,12 +21,12 @@ class TestProgressMapper:
|
||||
def test_stage_ranges_are_valid(self, progress_mapper):
|
||||
"""Test that all stage ranges are valid and sequential."""
|
||||
ranges = progress_mapper.STAGE_RANGES
|
||||
|
||||
|
||||
# Test that ranges don't overlap (except for aliases)
|
||||
crawl_stages = ["starting", "analyzing", "crawling", "processing",
|
||||
"source_creation", "document_storage", "code_extraction",
|
||||
crawl_stages = ["starting", "analyzing", "crawling", "processing",
|
||||
"source_creation", "document_storage", "code_extraction",
|
||||
"finalization", "completed"]
|
||||
|
||||
|
||||
last_end = 0
|
||||
for stage in crawl_stages[:-1]: # Exclude completed which is (100, 100)
|
||||
start, end = ranges[stage]
|
||||
@@ -37,17 +37,17 @@ class TestProgressMapper:
|
||||
# Test that code extraction gets the largest range (it's the longest)
|
||||
code_start, code_end = ranges["code_extraction"]
|
||||
code_range = code_end - code_start
|
||||
|
||||
doc_start, doc_end = ranges["document_storage"]
|
||||
|
||||
doc_start, doc_end = ranges["document_storage"]
|
||||
doc_range = doc_end - doc_start
|
||||
|
||||
|
||||
assert code_range > doc_range, "Code extraction should have larger range than document storage"
|
||||
|
||||
def test_map_progress_basic_functionality(self, progress_mapper):
|
||||
"""Test basic progress mapping functionality."""
|
||||
# Test crawling stage at 50%
|
||||
result = progress_mapper.map_progress("crawling", 50.0)
|
||||
|
||||
|
||||
# Should be halfway between crawling range (2-5%)
|
||||
expected = 2 + (50 / 100) * (5 - 2) # 3.5%, rounded to 4
|
||||
assert result == 4
|
||||
@@ -56,16 +56,16 @@ class TestProgressMapper:
|
||||
"""Test progress mapping for document storage stage."""
|
||||
# Test document storage at 25%
|
||||
result = progress_mapper.map_progress("document_storage", 25.0)
|
||||
|
||||
|
||||
# Should be 25% through document_storage range (10-30%)
|
||||
expected = 10 + (25 / 100) * (30 - 10) # 10 + 5 = 15
|
||||
assert result == 15
|
||||
|
||||
def test_map_progress_code_extraction(self, progress_mapper):
|
||||
"""Test progress mapping for code extraction stage."""
|
||||
# Test code extraction at 50%
|
||||
# Test code extraction at 50%
|
||||
result = progress_mapper.map_progress("code_extraction", 50.0)
|
||||
|
||||
|
||||
# Should be 50% through code_extraction range (30-95%)
|
||||
expected = 30 + (50 / 100) * (95 - 30) # 30 + 32.5 = 62.5, rounded to 62
|
||||
assert result == 62
|
||||
@@ -75,10 +75,10 @@ class TestProgressMapper:
|
||||
# Set initial progress to 30% (document_storage at 100%)
|
||||
result1 = progress_mapper.map_progress("document_storage", 100.0) # Should be 30%
|
||||
assert result1 == 30
|
||||
|
||||
|
||||
# Try to map a lower stage with lower progress
|
||||
result2 = progress_mapper.map_progress("crawling", 50.0) # Would normally be ~3.5%
|
||||
|
||||
|
||||
# Should maintain higher progress
|
||||
assert result2 == 30 # Stays at previous high value
|
||||
|
||||
@@ -86,11 +86,11 @@ class TestProgressMapper:
|
||||
"""Test that stage progress is clamped to 0-100 range."""
|
||||
# Test negative progress
|
||||
result = progress_mapper.map_progress("crawling", -10.0)
|
||||
expected = 2 # Start of crawling range
|
||||
expected = 3 # Start of crawling range (updated after discovery stage)
|
||||
assert result == expected
|
||||
|
||||
|
||||
# Test progress over 100
|
||||
result = progress_mapper.map_progress("crawling", 150.0)
|
||||
result = progress_mapper.map_progress("crawling", 150.0)
|
||||
expected = 5 # End of crawling range
|
||||
assert result == expected
|
||||
|
||||
@@ -109,16 +109,17 @@ class TestProgressMapper:
|
||||
# Set some initial progress
|
||||
progress_mapper.map_progress("crawling", 50)
|
||||
current = progress_mapper.last_overall_progress
|
||||
|
||||
|
||||
# Try unknown stage
|
||||
result = progress_mapper.map_progress("unknown_stage", 75)
|
||||
|
||||
|
||||
# Should maintain current progress
|
||||
assert result == current
|
||||
|
||||
def test_get_stage_range(self, progress_mapper):
|
||||
"""Test getting stage ranges."""
|
||||
assert progress_mapper.get_stage_range("crawling") == (2, 5)
|
||||
assert progress_mapper.get_stage_range("discovery") == (2, 3) # New discovery stage
|
||||
assert progress_mapper.get_stage_range("crawling") == (3, 5) # Updated after discovery
|
||||
assert progress_mapper.get_stage_range("document_storage") == (10, 30)
|
||||
assert progress_mapper.get_stage_range("code_extraction") == (30, 95)
|
||||
assert progress_mapper.get_stage_range("unknown") == (0, 100) # Default
|
||||
@@ -128,11 +129,11 @@ class TestProgressMapper:
|
||||
# Test normal case
|
||||
result = progress_mapper.calculate_stage_progress(25, 100)
|
||||
assert result == 25.0
|
||||
|
||||
|
||||
# Test division by zero protection
|
||||
result = progress_mapper.calculate_stage_progress(10, 0)
|
||||
assert result == 0.0
|
||||
|
||||
|
||||
# Test negative max protection
|
||||
result = progress_mapper.calculate_stage_progress(10, -5)
|
||||
assert result == 0.0
|
||||
@@ -141,7 +142,7 @@ class TestProgressMapper:
|
||||
"""Test batch progress mapping."""
|
||||
# Test batch 3 of 6 in document_storage stage
|
||||
result = progress_mapper.map_batch_progress("document_storage", 3, 6)
|
||||
|
||||
|
||||
# Should be (3-1)/6 = 33.3% through document_storage stage
|
||||
# document_storage is 10-30%, so 33.3% of 20% = 6.67%, so 10 + 6.67 = 16.67 ≈ 17
|
||||
assert result == 17
|
||||
@@ -159,10 +160,10 @@ class TestProgressMapper:
|
||||
progress_mapper.map_progress("crawling", 50)
|
||||
assert progress_mapper.last_overall_progress > 0
|
||||
assert progress_mapper.current_stage != "starting"
|
||||
|
||||
|
||||
# Reset
|
||||
progress_mapper.reset()
|
||||
|
||||
|
||||
# Should be back to initial state
|
||||
assert progress_mapper.last_overall_progress == 0
|
||||
assert progress_mapper.current_stage == "starting"
|
||||
@@ -172,7 +173,7 @@ class TestProgressMapper:
|
||||
# Initial state
|
||||
assert progress_mapper.get_current_stage() == "starting"
|
||||
assert progress_mapper.get_current_progress() == 0
|
||||
|
||||
|
||||
# After mapping some progress
|
||||
progress_mapper.map_progress("document_storage", 50)
|
||||
assert progress_mapper.get_current_stage() == "document_storage"
|
||||
@@ -196,9 +197,9 @@ class TestProgressMapper:
|
||||
("finalization", 100, 100), # Finalization
|
||||
("completed", 0, 100), # Completion
|
||||
]
|
||||
|
||||
|
||||
progress_mapper.reset()
|
||||
|
||||
|
||||
for stage, stage_progress, expected_overall in stages:
|
||||
result = progress_mapper.map_progress(stage, stage_progress)
|
||||
assert result == expected_overall, f"Stage {stage} at {stage_progress}% should map to {expected_overall}%, got {result}%"
|
||||
@@ -206,7 +207,7 @@ class TestProgressMapper:
|
||||
def test_upload_stage_ranges(self, progress_mapper):
|
||||
"""Test upload-specific stage ranges."""
|
||||
upload_stages = ["reading", "extracting", "chunking", "creating_source", "summarizing", "storing"]
|
||||
|
||||
|
||||
# Test that upload stages have valid ranges
|
||||
last_end = 0
|
||||
for stage in upload_stages:
|
||||
@@ -214,6 +215,6 @@ class TestProgressMapper:
|
||||
assert start >= last_end, f"Upload stage {stage} overlaps with previous"
|
||||
assert end > start, f"Upload stage {stage} has invalid range"
|
||||
last_end = end
|
||||
|
||||
|
||||
# Test that final upload stage reaches 100%
|
||||
assert progress_mapper.get_stage_range("storing")[1] == 100
|
||||
assert progress_mapper.get_stage_range("storing")[1] == 100
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
"""Unit tests for DiscoveryService class."""
|
||||
import pytest
|
||||
from unittest.mock import patch, Mock
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
from src.server.services.crawling.discovery_service import DiscoveryService
|
||||
|
||||
|
||||
@@ -12,12 +12,12 @@ class TestDiscoveryService:
|
||||
"""Test main discovery method returns single best file."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
|
||||
# Mock robots.txt response (no sitemaps)
|
||||
robots_response = Mock()
|
||||
robots_response.status_code = 200
|
||||
robots_response.text = "User-agent: *\nDisallow: /admin/"
|
||||
|
||||
|
||||
# Mock file existence - llms-full.txt doesn't exist, but llms.txt does
|
||||
def mock_get_side_effect(url, **kwargs):
|
||||
response = Mock()
|
||||
@@ -30,42 +30,42 @@ class TestDiscoveryService:
|
||||
else:
|
||||
response.status_code = 404
|
||||
return response
|
||||
|
||||
|
||||
mock_get.side_effect = mock_get_side_effect
|
||||
|
||||
|
||||
result = service.discover_files(base_url)
|
||||
|
||||
|
||||
# Should return single URL string (not dict, not list)
|
||||
assert isinstance(result, str)
|
||||
assert result == 'https://example.com/llms.txt'
|
||||
|
||||
@patch('requests.get')
|
||||
@patch('requests.get')
|
||||
def test_discover_files_no_files_found(self, mock_get):
|
||||
"""Test discovery when no files are found."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
|
||||
# Mock all HTTP requests to return 404
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 404
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
|
||||
result = service.discover_files(base_url)
|
||||
|
||||
|
||||
# Should return None when no files found
|
||||
assert result is None
|
||||
|
||||
@patch('requests.get')
|
||||
@patch('requests.get')
|
||||
def test_discover_files_priority_order(self, mock_get):
|
||||
"""Test that discovery follows the correct priority order."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
|
||||
# Mock robots.txt response (no sitemaps declared)
|
||||
robots_response = Mock()
|
||||
robots_response.status_code = 200
|
||||
robots_response.text = "User-agent: *\nDisallow: /admin/"
|
||||
|
||||
|
||||
# Mock file existence - both sitemap.xml and llms.txt exist, but llms.txt has higher priority
|
||||
def mock_get_side_effect(url, **kwargs):
|
||||
response = Mock()
|
||||
@@ -76,11 +76,11 @@ class TestDiscoveryService:
|
||||
else:
|
||||
response.status_code = 404
|
||||
return response
|
||||
|
||||
|
||||
mock_get.side_effect = mock_get_side_effect
|
||||
|
||||
|
||||
result = service.discover_files(base_url)
|
||||
|
||||
|
||||
# Should return llms.txt since it has higher priority than sitemap.xml
|
||||
assert result == 'https://example.com/llms.txt'
|
||||
|
||||
@@ -89,12 +89,12 @@ class TestDiscoveryService:
|
||||
"""Test that robots.txt sitemap declarations have highest priority."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
# Mock robots.txt response WITH sitemap declaration
|
||||
|
||||
# Mock robots.txt response WITH sitemap declaration
|
||||
robots_response = Mock()
|
||||
robots_response.status_code = 200
|
||||
robots_response.text = "User-agent: *\nSitemap: https://example.com/declared-sitemap.xml"
|
||||
|
||||
|
||||
# Mock other files also exist
|
||||
def mock_get_side_effect(url, **kwargs):
|
||||
response = Mock()
|
||||
@@ -105,93 +105,59 @@ class TestDiscoveryService:
|
||||
else:
|
||||
response.status_code = 404
|
||||
return response
|
||||
|
||||
|
||||
mock_get.side_effect = mock_get_side_effect
|
||||
|
||||
|
||||
result = service.discover_files(base_url)
|
||||
|
||||
|
||||
# Should return the sitemap declared in robots.txt (highest priority)
|
||||
assert result == 'https://example.com/declared-sitemap.xml'
|
||||
|
||||
@patch('requests.get')
|
||||
def test_discover_best_sitemap_robots_priority(self, mock_get):
|
||||
"""Test sitemap discovery prioritizes robots.txt declarations."""
|
||||
def test_discover_files_subdirectory_fallback(self, mock_get):
|
||||
"""Test discovery falls back to subdirectories for llms files."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
# Mock robots.txt with sitemap
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.text = "Sitemap: https://example.com/robots-sitemap.xml"
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = service._discover_best_sitemap(base_url)
|
||||
|
||||
# Should return the sitemap from robots.txt (highest priority)
|
||||
assert result == "https://example.com/robots-sitemap.xml"
|
||||
|
||||
@patch('requests.get')
|
||||
def test_discover_best_llms_file_priority_order(self, mock_get):
|
||||
"""Test llms file discovery follows priority order."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
# Mock HTTP responses - only llms.txt exists, not llms-full.txt
|
||||
# Mock robots.txt response (no sitemaps declared)
|
||||
robots_response = Mock()
|
||||
robots_response.status_code = 200
|
||||
robots_response.text = "User-agent: *\nDisallow: /admin/"
|
||||
|
||||
# Mock file existence - no root llms files, but static/llms.txt exists
|
||||
def mock_get_side_effect(url, **kwargs):
|
||||
response = Mock()
|
||||
if url.endswith('llms-full.txt'):
|
||||
response.status_code = 404 # Higher priority file doesn't exist
|
||||
elif url.endswith('llms.txt'):
|
||||
response.status_code = 200 # Standard file exists
|
||||
else:
|
||||
response.status_code = 404
|
||||
return response
|
||||
|
||||
mock_get.side_effect = mock_get_side_effect
|
||||
|
||||
result = service._discover_best_llms_file(base_url)
|
||||
|
||||
# Should find llms.txt since llms-full.txt doesn't exist
|
||||
assert result == "https://example.com/llms.txt"
|
||||
|
||||
@patch('requests.get')
|
||||
def test_discover_best_llms_file_subdirectory_fallback(self, mock_get):
|
||||
"""Test llms file discovery falls back to subdirectories."""
|
||||
service = DiscoveryService()
|
||||
base_url = "https://example.com"
|
||||
|
||||
# Mock HTTP responses - no root files, but static/llms.txt exists
|
||||
def mock_get_side_effect(url, **kwargs):
|
||||
response = Mock()
|
||||
if '/static/llms.txt' in url:
|
||||
if url.endswith('robots.txt'):
|
||||
return robots_response
|
||||
elif '/static/llms.txt' in url:
|
||||
response.status_code = 200 # Found in subdirectory
|
||||
else:
|
||||
response.status_code = 404
|
||||
return response
|
||||
|
||||
|
||||
mock_get.side_effect = mock_get_side_effect
|
||||
|
||||
result = service._discover_best_llms_file(base_url)
|
||||
|
||||
|
||||
result = service.discover_files(base_url)
|
||||
|
||||
# Should find the file in static subdirectory
|
||||
assert result == "https://example.com/static/llms.txt"
|
||||
assert result == 'https://example.com/static/llms.txt'
|
||||
|
||||
@patch('requests.get')
|
||||
def test_check_url_exists(self, mock_get):
|
||||
"""Test URL existence checking."""
|
||||
service = DiscoveryService()
|
||||
|
||||
|
||||
# Test successful response
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
|
||||
assert service._check_url_exists("https://example.com/exists") is True
|
||||
|
||||
|
||||
# Test 404 response
|
||||
mock_response.status_code = 404
|
||||
assert service._check_url_exists("https://example.com/not-found") is False
|
||||
|
||||
|
||||
# Test network error
|
||||
mock_get.side_effect = Exception("Network error")
|
||||
assert service._check_url_exists("https://example.com/error") is False
|
||||
@@ -200,7 +166,7 @@ class TestDiscoveryService:
|
||||
def test_parse_robots_txt_with_sitemap(self, mock_get):
|
||||
"""Test robots.txt parsing with sitemap directives."""
|
||||
service = DiscoveryService()
|
||||
|
||||
|
||||
# Mock successful robots.txt response
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
@@ -209,9 +175,9 @@ Disallow: /admin/
|
||||
Sitemap: https://example.com/sitemap.xml
|
||||
Sitemap: https://example.com/sitemap-news.xml"""
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
|
||||
result = service._parse_robots_txt("https://example.com")
|
||||
|
||||
|
||||
assert len(result) == 2
|
||||
assert "https://example.com/sitemap.xml" in result
|
||||
assert "https://example.com/sitemap-news.xml" in result
|
||||
@@ -221,7 +187,7 @@ Sitemap: https://example.com/sitemap-news.xml"""
|
||||
def test_parse_robots_txt_no_sitemap(self, mock_get):
|
||||
"""Test robots.txt parsing without sitemap directives."""
|
||||
service = DiscoveryService()
|
||||
|
||||
|
||||
# Mock robots.txt without sitemaps
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
@@ -229,60 +195,17 @@ Sitemap: https://example.com/sitemap-news.xml"""
|
||||
Disallow: /admin/
|
||||
Allow: /public/"""
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
|
||||
result = service._parse_robots_txt("https://example.com")
|
||||
|
||||
|
||||
assert len(result) == 0
|
||||
mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30)
|
||||
|
||||
@patch('requests.get')
|
||||
def test_parse_robots_txt_not_found(self, mock_get):
|
||||
"""Test robots.txt parsing when file is not found."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock 404 response
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 404
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = service._parse_robots_txt("https://example.com")
|
||||
|
||||
assert len(result) == 0
|
||||
mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30)
|
||||
|
||||
@patch('requests.get')
|
||||
def test_check_standard_patterns(self, mock_get):
|
||||
"""Test standard file pattern checking."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock responses for different files
|
||||
def mock_response_side_effect(url, **kwargs):
|
||||
mock_response = Mock()
|
||||
if 'llms.txt' in url:
|
||||
mock_response.status_code = 200
|
||||
elif 'sitemap.xml' in url:
|
||||
mock_response.status_code = 200
|
||||
else:
|
||||
mock_response.status_code = 404
|
||||
return mock_response
|
||||
|
||||
mock_get.side_effect = mock_response_side_effect
|
||||
|
||||
result = service._check_standard_patterns("https://example.com")
|
||||
|
||||
assert 'sitemaps' in result
|
||||
assert 'llms_files' in result
|
||||
assert 'robots_files' in result
|
||||
|
||||
# Should find the files that returned 200
|
||||
assert any('llms.txt' in url for url in result['llms_files'])
|
||||
assert any('sitemap.xml' in url for url in result['sitemaps'])
|
||||
|
||||
@patch('requests.get')
|
||||
def test_parse_html_meta_tags(self, mock_get):
|
||||
"""Test HTML meta tag parsing for sitemaps."""
|
||||
service = DiscoveryService()
|
||||
|
||||
|
||||
# Mock HTML with sitemap references
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
@@ -296,154 +219,46 @@ Allow: /public/"""
|
||||
</html>
|
||||
"""
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
|
||||
result = service._parse_html_meta_tags("https://example.com")
|
||||
|
||||
|
||||
# Should find sitemaps from both link and meta tags
|
||||
assert len(result) >= 1
|
||||
assert any('sitemap' in url.lower() for url in result)
|
||||
mock_get.assert_called_once_with("https://example.com", timeout=30)
|
||||
|
||||
@patch('requests.get')
|
||||
def test_parse_html_meta_tags_not_found(self, mock_get):
|
||||
"""Test HTML meta tag parsing when page not found."""
|
||||
def test_discovery_priority_constant(self):
|
||||
"""Test that discovery priority constant is properly defined."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock 404 response
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 404
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = service._parse_html_meta_tags("https://example.com")
|
||||
|
||||
assert len(result) == 0
|
||||
mock_get.assert_called_once_with("https://example.com", timeout=30)
|
||||
|
||||
@patch('requests.get')
|
||||
def test_check_well_known_directory(self, mock_get):
|
||||
"""Test .well-known directory file checking."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock responses - some files exist, some don't
|
||||
def mock_response_side_effect(url, **kwargs):
|
||||
mock_response = Mock()
|
||||
if 'ai.txt' in url:
|
||||
mock_response.status_code = 200
|
||||
else:
|
||||
mock_response.status_code = 404
|
||||
return mock_response
|
||||
|
||||
mock_get.side_effect = mock_response_side_effect
|
||||
|
||||
result = service._check_well_known_directory("https://example.com")
|
||||
|
||||
# Should find the ai.txt file
|
||||
assert len(result) >= 1
|
||||
assert any('ai.txt' in url for url in result)
|
||||
# Verify the priority list exists and has expected order
|
||||
assert hasattr(service, 'DISCOVERY_PRIORITY')
|
||||
assert isinstance(service.DISCOVERY_PRIORITY, list)
|
||||
assert len(service.DISCOVERY_PRIORITY) > 0
|
||||
|
||||
@patch('requests.get')
|
||||
def test_try_common_variations(self, mock_get):
|
||||
"""Test pattern variations for discovery targets."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock responses for variations
|
||||
def mock_response_side_effect(url, **kwargs):
|
||||
mock_response = Mock()
|
||||
if 'docs/llms.txt' in url or 'sitemaps/sitemap.xml' in url:
|
||||
mock_response.status_code = 200
|
||||
else:
|
||||
mock_response.status_code = 404
|
||||
return mock_response
|
||||
|
||||
mock_get.side_effect = mock_response_side_effect
|
||||
|
||||
result = service._try_common_variations("https://example.com")
|
||||
|
||||
assert 'sitemaps' in result
|
||||
assert 'llms_files' in result
|
||||
|
||||
# Should find at least one variation
|
||||
assert len(result['llms_files']) >= 1 or len(result['sitemaps']) >= 1
|
||||
# Verify llms-full.txt is first (highest priority)
|
||||
assert service.DISCOVERY_PRIORITY[0] == 'llms-full.txt'
|
||||
|
||||
# Verify llms.txt comes before sitemap files
|
||||
llms_txt_index = service.DISCOVERY_PRIORITY.index('llms.txt')
|
||||
sitemap_index = service.DISCOVERY_PRIORITY.index('sitemap.xml')
|
||||
assert llms_txt_index < sitemap_index
|
||||
|
||||
@patch('requests.get')
|
||||
def test_network_error_handling(self, mock_get):
|
||||
"""Test error scenarios with network failures."""
|
||||
service = DiscoveryService()
|
||||
|
||||
|
||||
# Mock network error
|
||||
mock_get.side_effect = Exception("Network error")
|
||||
|
||||
# Should not raise exception, but return empty results
|
||||
|
||||
# Should not raise exception, but return None
|
||||
result = service.discover_files("https://example.com")
|
||||
assert result is None
|
||||
|
||||
# Individual methods should also handle errors gracefully
|
||||
result = service._parse_robots_txt("https://example.com")
|
||||
assert result == []
|
||||
|
||||
result = service._check_standard_patterns("https://example.com")
|
||||
assert isinstance(result, dict)
|
||||
|
||||
|
||||
result = service._parse_html_meta_tags("https://example.com")
|
||||
assert result == []
|
||||
|
||||
result = service._check_well_known_directory("https://example.com")
|
||||
assert result == []
|
||||
|
||||
result = service._try_common_variations("https://example.com")
|
||||
assert isinstance(result, dict)
|
||||
|
||||
def test_discover_files_with_exceptions(self):
|
||||
"""Test main discovery method handles exceptions gracefully."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock methods to raise exceptions
|
||||
with patch.object(service, '_parse_robots_txt', side_effect=Exception("Test error")):
|
||||
with patch.object(service, '_check_standard_patterns', side_effect=Exception("Test error")):
|
||||
with patch.object(service, '_parse_html_meta_tags', side_effect=Exception("Test error")):
|
||||
with patch.object(service, '_check_well_known_directory', side_effect=Exception("Test error")):
|
||||
with patch.object(service, '_try_common_variations', side_effect=Exception("Test error")):
|
||||
result = service.discover_files("https://example.com")
|
||||
|
||||
# Should still return proper structure even with all methods failing
|
||||
assert isinstance(result, dict)
|
||||
assert 'sitemaps' in result
|
||||
assert 'llms_files' in result
|
||||
assert 'robots_files' in result
|
||||
assert 'well_known_files' in result
|
||||
|
||||
@patch('requests.get')
|
||||
def test_robots_txt_with_malformed_content(self, mock_get):
|
||||
"""Test robots.txt parsing with malformed content."""
|
||||
service = DiscoveryService()
|
||||
|
||||
# Mock malformed robots.txt content
|
||||
mock_response = Mock()
|
||||
mock_response.status_code = 200
|
||||
mock_response.text = """User-agent: *
|
||||
Disallow: /admin/
|
||||
Sitemap:
|
||||
Sitemap: not-a-valid-url
|
||||
Sitemap: https://example.com/valid-sitemap.xml"""
|
||||
mock_get.return_value = mock_response
|
||||
|
||||
result = service._parse_robots_txt("https://example.com")
|
||||
|
||||
# Should only include the valid sitemap URL
|
||||
assert len(result) == 1
|
||||
assert "https://example.com/valid-sitemap.xml" in result
|
||||
|
||||
def test_discovery_targets_constant(self):
|
||||
"""Test that discovery targets constant is properly defined."""
|
||||
service = DiscoveryService()
|
||||
|
||||
assert hasattr(service, 'DISCOVERY_TARGETS')
|
||||
targets = service.DISCOVERY_TARGETS
|
||||
|
||||
# Verify required target types exist
|
||||
assert 'llms_files' in targets
|
||||
assert 'sitemap_files' in targets
|
||||
assert 'robots_files' in targets
|
||||
assert 'well_known_files' in targets
|
||||
|
||||
# Verify they contain expected files
|
||||
assert 'llms.txt' in targets['llms_files']
|
||||
assert 'sitemap.xml' in targets['sitemap_files']
|
||||
assert 'robots.txt' in targets['robots_files']
|
||||
assert '.well-known/ai.txt' in targets['well_known_files']
|
||||
Reference in New Issue
Block a user