archon/python/tests/test_source_url_shadowing.py
Rasmus Widing · aad773eccf
fix: resolve test failures after merge by fixing async/sync mismatch
After merging main into refactor-remove-sockets, 14 tests failed due to
architecture mismatches between the two branches.

Key fixes:
- Removed asyncio.to_thread calls for extract_source_summary and update_source_info,
  since they are already async functions (see the first sketch below)
- Updated test_source_race_condition.py to handle async functions properly
  by running them on an event loop from sync test contexts (second sketch below)
- Fixed mock return values in test_source_url_shadowing.py to return
  a proper statistics dict instead of None (visible in the file below)
- Adjusted URL normalization expectations in test_source_id_refactor.py
  to match the actual behavior, which preserves path case (third sketch below)

All 350 tests now passing.
2025-08-29 15:50:25 +03:00
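
A minimal sketch of the first fix; extract_source_summary is the helper named
above, but its body and the calling code here are hypothetical. asyncio.to_thread
expects a synchronous callable, so wrapping an already-async function in it
produces an un-awaited coroutine object instead of a result:

import asyncio

async def extract_source_summary(source_id: str) -> str:
    # Already a coroutine function in the codebase; this body is a stand-in.
    await asyncio.sleep(0)
    return f"summary for {source_id}"

async def process_source(source_id: str) -> str:
    # Before (broken): to_thread runs the function in a worker thread, which
    # merely creates a coroutine object that nothing ever awaits:
    #     summary = await asyncio.to_thread(extract_source_summary, source_id)
    # After (fixed): await the coroutine directly on the running event loop.
    return await extract_source_summary(source_id)

print(asyncio.run(process_source("test123")))  # -> summary for test123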

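The second fix is the same mismatch seen from the other side: a synchronous test
cannot await, so it must drive the coroutine on an event loop itself. A minimal
sketch with a hypothetical test body; asyncio.run is the stock way to run a
coroutine from sync code:

import asyncio

async def update_source_info(source_id: str) -> dict:
    # Stand-in for the real async helper named in the commit message.
    return {"source_id": source_id, "updated": True}

def test_update_source_info_from_sync_context():
    # asyncio.run creates a fresh event loop, runs the coroutine to
    # completion, and returns its result to the sync caller.
    result = asyncio.run(update_source_info("test123"))
    assert result["updated"] is True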
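
For the fourth fix, a sketch of the expectation the tests were adjusted to;
normalize_url is hypothetical, but the rule it encodes is standard (per
RFC 3986, scheme and host are case-insensitive while the path is not):

from urllib.parse import urlsplit, urlunsplit

def normalize_url(url: str) -> str:
    # Hypothetical normalizer matching the adjusted expectation: lowercase
    # the scheme and host, leave the path untouched.
    parts = urlsplit(url)
    return urlunsplit((parts.scheme.lower(), parts.netloc.lower(),
                       parts.path, parts.query, parts.fragment))

assert normalize_url("HTTPS://Example.COM/Docs/Page") == "https://example.com/Docs/Page"

The file below, archon/python/tests/test_source_url_shadowing.py, is the one
touched by the third bullet; note mock_add.return_value returning a statistics
dict rather than None.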

"""
Test that source_url parameter is not shadowed by document URLs.
This test ensures that the original crawl URL (e.g., sitemap URL)
is correctly passed to _create_source_records and not overwritten
by individual document URLs during processing.
"""
import pytest
from unittest.mock import Mock, AsyncMock, MagicMock, patch
from src.server.services.crawling.document_storage_operations import DocumentStorageOperations
class TestSourceUrlShadowing:
"""Test that source_url parameter is preserved correctly."""
@pytest.mark.asyncio
async def test_source_url_not_shadowed(self):
"""Test that the original source_url is passed to _create_source_records."""
# Create mock supabase client
mock_supabase = Mock()
# Create DocumentStorageOperations instance
doc_storage = DocumentStorageOperations(mock_supabase)
# Mock the storage service
doc_storage.doc_storage_service.smart_chunk_text = Mock(return_value=["chunk1", "chunk2"])
# Track what gets passed to _create_source_records
captured_source_url = None
async def mock_create_source_records(all_metadatas, all_contents, source_word_counts,
request, source_url, source_display_name):
nonlocal captured_source_url
captured_source_url = source_url
doc_storage._create_source_records = mock_create_source_records
# Mock add_documents_to_supabase
with patch('src.server.services.crawling.document_storage_operations.add_documents_to_supabase') as mock_add:
mock_add.return_value = {"chunks_stored": 3}
# Test data - simulating a sitemap crawl
original_source_url = "https://mem0.ai/sitemap.xml"
crawl_results = [
{
"url": "https://mem0.ai/page1",
"markdown": "Content of page 1",
"title": "Page 1"
},
{
"url": "https://mem0.ai/page2",
"markdown": "Content of page 2",
"title": "Page 2"
},
{
"url": "https://mem0.ai/models/openai-o3", # Last document URL
"markdown": "Content of models page",
"title": "Models"
}
]
request = {"knowledge_type": "documentation", "tags": []}
# Call the method
result = await doc_storage.process_and_store_documents(
crawl_results=crawl_results,
request=request,
crawl_type="sitemap",
original_source_id="test123",
progress_callback=None,
cancellation_check=None,
source_url=original_source_url, # This should NOT be overwritten
source_display_name="Test Sitemap"
)
# Verify the original source_url was preserved
assert captured_source_url == original_source_url, \
f"source_url should be '{original_source_url}', not '{captured_source_url}'"
# Verify it's NOT the last document's URL
assert captured_source_url != "https://mem0.ai/models/openai-o3", \
"source_url should NOT be overwritten with the last document's URL"
# Verify url_to_full_document has correct URLs
assert "https://mem0.ai/page1" in result["url_to_full_document"]
assert "https://mem0.ai/page2" in result["url_to_full_document"]
assert "https://mem0.ai/models/openai-o3" in result["url_to_full_document"]
@pytest.mark.asyncio
async def test_metadata_uses_document_urls(self):
"""Test that metadata correctly uses individual document URLs."""
mock_supabase = Mock()
doc_storage = DocumentStorageOperations(mock_supabase)
# Mock the storage service
doc_storage.doc_storage_service.smart_chunk_text = Mock(return_value=["chunk1"])
# Capture metadata
captured_metadatas = None
async def mock_create_source_records(all_metadatas, all_contents, source_word_counts,
request, source_url, source_display_name):
nonlocal captured_metadatas
captured_metadatas = all_metadatas
doc_storage._create_source_records = mock_create_source_records
with patch('src.server.services.crawling.document_storage_operations.add_documents_to_supabase') as mock_add:
mock_add.return_value = {"chunks_stored": 2}
crawl_results = [
{"url": "https://example.com/doc1", "markdown": "Doc 1"},
{"url": "https://example.com/doc2", "markdown": "Doc 2"}
]
await doc_storage.process_and_store_documents(
crawl_results=crawl_results,
request={},
crawl_type="normal",
original_source_id="test456",
source_url="https://example.com",
source_display_name="Example"
)
# Each metadata should have the correct document URL
assert captured_metadatas[0]["url"] == "https://example.com/doc1"
assert captured_metadatas[1]["url"] == "https://example.com/doc2"
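
Assuming pytest-asyncio is installed (the @pytest.mark.asyncio markers require
it) and the working directory is archon/python, the file can be run on its own:

pytest tests/test_source_url_shadowing.py -v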