Mirror of https://github.com/coleam00/Archon.git · synced 2025-12-24 02:39:17 -05:00
* Fix race condition in concurrent crawling with unique source IDs
  - Add unique hash-based source_id generation to prevent conflicts
  - Separate source identification from display with three fields:
    - source_id: 16-char SHA256 hash for unique identification
    - source_url: Original URL for tracking
    - source_display_name: Human-friendly name for UI
  - Add comprehensive test suite validating the fix
  - Migrate existing data with backward compatibility
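A minimal sketch of the ID scheme described above, assuming the ID is simply the first 16 hex characters of a SHA-256 digest over a normalized URL (the real URLHandler may normalize differently; the test file below pins down the observable behavior):

    import hashlib

    def generate_unique_source_id(url: str) -> str:
        # Illustrative only: lowercase/trim stands in for the real
        # normalization; 16 hex chars keep 64 bits of the digest.
        normalized = url.strip().lower()
        return hashlib.sha256(normalized.encode("utf-8")).hexdigest()[:16]

At 64 bits, collisions stay negligible well past the sub-million source counts mentioned later in this message.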
* Fix title generation to use source_display_name for better AI context
  - Pass source_display_name to title generation function
  - Use display name in AI prompt instead of hash-based source_id
  - Results in more specific, meaningful titles for each source

* Skip AI title generation when display name is available
  - Use source_display_name directly as title to avoid unnecessary AI calls
  - More efficient and predictable than AI-generated titles
  - Keep AI generation only as fallback for backward compatibility

* Fix critical issues from code review
  - Add missing os import to prevent NameError crash
  - Remove unused imports (pytest, Mock, patch, hashlib, urlparse, etc.)
  - Fix GitHub API capitalization consistency
  - Reuse existing DocumentStorageService instance
  - Update test expectations to match corrected capitalization

  Addresses CodeRabbit review feedback on PR #472

* Add safety improvements from code review
  - Truncate display names to 100 chars when used as titles
  - Document hash collision probability (negligible for <1M sources)

  Simple, pragmatic fixes per KISS principle

* Fix code extraction to use hash-based source_ids and improve display names
  - Fixed critical bug where code extraction was using old domain-based source_ids
  - Updated code extraction service to accept source_id as a parameter instead of extracting it from the URL
  - Added special handling for llms.txt and sitemap.xml files in display names
  - Added comprehensive tests for source_id handling in code extraction
  - Removed unused urlparse import from code_extraction_service.py

  This fixes the foreign key constraint errors that were preventing code examples from being stored after the source_id architecture refactor.

  Co-Authored-By: Claude <noreply@anthropic.com>

* Fix critical variable shadowing and source_type determination issues
  - Fixed variable shadowing in document_storage_operations.py where the source_url parameter was being overwritten by document URLs, causing an incorrect source_url in the database
  - Fixed source_type determination to use actual URLs instead of the hash-based source_id
  - Added comprehensive tests for source URL preservation
  - Ensure source_type is correctly set to "file" for file uploads and "url" for web crawls

  The variable shadowing bug was causing sitemap sources to have the wrong source_url (the last crawled page instead of the sitemap URL). The source_type bug would mark all sources as "url" even for file uploads, because hash-based IDs do not start with "file_".

  Co-Authored-By: Claude <noreply@anthropic.com>

* Fix URL canonicalization and document metrics calculation
  - Implement proper URL canonicalization to prevent duplicate sources
    - Remove trailing slashes (except root)
    - Remove URL fragments
    - Remove tracking parameters (utm_*, gclid, fbclid, etc.)
    - Sort query parameters for consistency
    - Remove default ports (80 for HTTP, 443 for HTTPS)
    - Normalize scheme and domain to lowercase
  - Fix avg_chunks_per_doc calculation to avoid division by zero
    - Track processed_docs count separately from total crawl_results
    - Handle all-empty document sets gracefully
    - Show processed/total in logs for better visibility
  - Add comprehensive tests for both fixes
    - 10 test cases for URL canonicalization edge cases
    - 4 test cases for document metrics calculation

  This prevents database constraint violations when crawling the same content with URL variations and provides accurate metrics in logs.
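A standard-library sketch of the canonicalization rules just listed (function and constant names here are illustrative, not the service's actual code):

    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    TRACKING_PARAMS = {"gclid", "fbclid"}  # utm_* handled by the prefix check below

    def canonicalize_url(url: str) -> str:
        parts = urlsplit(url)
        scheme = parts.scheme.lower()
        host = (parts.hostname or "").lower()
        # Keep an explicit port only when it is not the scheme's default.
        default_port = {"http": 80, "https": 443}.get(scheme)
        if parts.port and parts.port != default_port:
            host = f"{host}:{parts.port}"
        # Drop trailing slashes except on the root path.
        path = parts.path if parts.path in ("", "/") else parts.path.rstrip("/")
        # Remove tracking parameters and sort the rest; the fragment is dropped.
        pairs = [(k, v) for k, v in parse_qsl(parts.query, keep_blank_values=True)
                 if k not in TRACKING_PARAMS and not k.startswith("utm_")]
        return urlunsplit((scheme, host, path, urlencode(sorted(pairs)), ""))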
* Fix synchronous extract_source_summary blocking async event loop
  - Run extract_source_summary in thread pool using asyncio.to_thread (see the sketch after this message)
  - Prevents blocking the async event loop during AI summary generation
  - Preserves exact error handling and fallback behavior
  - Variables (source_id, combined_content) properly passed to thread

  Added comprehensive tests verifying:
  - Function runs in thread without blocking
  - Error handling works correctly with fallback
  - Multiple sources can be processed
  - Thread safety with variable passing

* Fix synchronous update_source_info blocking async event loop
  - Run update_source_info in thread pool using asyncio.to_thread
  - Prevents blocking the async event loop during database operations
  - Preserves exact error handling and fallback behavior
  - All kwargs properly passed to thread execution

  Added comprehensive tests verifying:
  - Function runs in thread without blocking
  - Error handling triggers fallback correctly
  - All kwargs are preserved when passed to thread
  - Existing extract_source_summary tests still pass

* Fix race condition in source creation using upsert
  - Replace INSERT with UPSERT for new sources to prevent PRIMARY KEY violations
  - Handles concurrent crawls attempting to create the same source
  - Maintains existing UPDATE behavior for sources that already exist

  Added comprehensive tests verifying:
  - Concurrent source creation doesn't fail
  - Upsert is used for new sources (not insert)
  - Update is still used for existing sources
  - Async concurrent operations work correctly
  - Race conditions with delays are handled

  This prevents database constraint errors when multiple crawls target the same URL simultaneously.

* Add migration detection UI components

  Add MigrationBanner component with clear user instructions for database schema updates. Add useMigrationStatus hook for periodic health check monitoring with graceful error handling.

* Integrate migration banner into main app

  Add migration status monitoring and banner display to App.tsx. Shows the migration banner when database schema updates are required.

* Enhance backend startup error instructions

  Add detailed Docker restart instructions and migration script guidance. Improves user experience when encountering startup failures.

* Add database schema caching to health endpoint

  Implement smart caching for schema validation to prevent repeated database queries. Cache successful validations permanently and throttle failures to 30-second intervals. Replace debug prints with proper logging.

* Clean up knowledge API imports and logging

  Remove duplicate import statements and redundant logging. Improves code clarity and reduces log noise.

* Remove unused instructions prop from MigrationBanner

  Clean up the component API by removing the instructions prop that was accepted but never rendered. Simplifies the interface and eliminates dead code while keeping the functional hardcoded migration steps.

* Add schema_valid flag to migration_required health response

  Add a schema_valid: false flag to the health endpoint response when a database schema migration is required. Improves API consistency without changing existing behavior.

---------

Co-authored-by: Claude <noreply@anthropic.com>
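The two event-loop fixes above share one pattern: hand the existing synchronous helper to asyncio.to_thread so the loop keeps serving other work while the slow call runs in a worker thread. A minimal sketch with hypothetical names; the real helpers live in the crawling services and keep their original error handling and fallbacks:

    import asyncio

    def extract_source_summary(source_id: str, combined_content: str) -> str:
        # Stand-in for the real synchronous, potentially slow AI call.
        ...

    async def summarize_source(source_id: str, combined_content: str) -> str:
        try:
            # Arguments are passed through to the worker thread unchanged.
            return await asyncio.to_thread(extract_source_summary, source_id, combined_content)
        except Exception:
            # Placeholder fallback; the real service keeps its existing fallback behavior.
            return f"Documentation for source {source_id}"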
357 lines
14 KiB
Python
"""
|
|
Test Suite for Source ID Architecture Refactor
|
|
|
|
Tests the new unique source ID generation and display name extraction
|
|
to ensure the race condition fix works correctly.
|
|
"""
|
|
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
|
# Import the URLHandler class
|
|
from src.server.services.crawling.helpers.url_handler import URLHandler
|
|
|
|
|
|
class TestSourceIDGeneration:
|
|
"""Test the unique source ID generation."""
|
|
|
|
def test_unique_id_generation_basic(self):
|
|
"""Test basic unique ID generation."""
|
|
handler = URLHandler()
|
|
|
|
# Test various URLs
|
|
test_urls = [
|
|
"https://github.com/microsoft/typescript",
|
|
"https://github.com/facebook/react",
|
|
"https://docs.python.org/3/",
|
|
"https://fastapi.tiangolo.com/",
|
|
"https://pydantic.dev/",
|
|
]
|
|
|
|
source_ids = []
|
|
for url in test_urls:
|
|
source_id = handler.generate_unique_source_id(url)
|
|
source_ids.append(source_id)
|
|
|
|
# Check that ID is a 16-character hex string
|
|
assert len(source_id) == 16, f"ID should be 16 chars, got {len(source_id)}"
|
|
assert all(c in '0123456789abcdef' for c in source_id), f"ID should be hex: {source_id}"
|
|
|
|
# All IDs should be unique
|
|
assert len(set(source_ids)) == len(source_ids), "All source IDs should be unique"
|
|
|
|
def test_same_domain_different_ids(self):
|
|
"""Test that same domain with different paths generates different IDs."""
|
|
handler = URLHandler()
|
|
|
|
# Multiple GitHub repos (same domain, different paths)
|
|
github_urls = [
|
|
"https://github.com/owner1/repo1",
|
|
"https://github.com/owner1/repo2",
|
|
"https://github.com/owner2/repo1",
|
|
]
|
|
|
|
ids = [handler.generate_unique_source_id(url) for url in github_urls]
|
|
|
|
# All should be unique despite same domain
|
|
assert len(set(ids)) == len(ids), "Same domain should generate different IDs for different URLs"
|
|
|
|
def test_id_consistency(self):
|
|
"""Test that the same URL always generates the same ID."""
|
|
handler = URLHandler()
|
|
url = "https://github.com/microsoft/typescript"
|
|
|
|
# Generate ID multiple times
|
|
ids = [handler.generate_unique_source_id(url) for _ in range(5)]
|
|
|
|
# All should be identical
|
|
assert len(set(ids)) == 1, f"Same URL should always generate same ID, got: {set(ids)}"
|
|
assert ids[0] == ids[4], "First and last ID should match"
|
|
|
|
def test_url_normalization(self):
|
|
"""Test that URL normalization works correctly."""
|
|
handler = URLHandler()
|
|
|
|
# These should all generate the same ID (after normalization)
|
|
url_variations = [
|
|
"https://github.com/Microsoft/TypeScript",
|
|
"HTTPS://GITHUB.COM/MICROSOFT/TYPESCRIPT",
|
|
"https://GitHub.com/Microsoft/TypeScript",
|
|
]
|
|
|
|
ids = [handler.generate_unique_source_id(url) for url in url_variations]
|
|
|
|
# All normalized versions should generate the same ID
|
|
assert len(set(ids)) == 1, f"Normalized URLs should generate same ID, got: {set(ids)}"
|
|
|
|
def test_concurrent_crawl_simulation(self):
|
|
"""Simulate concurrent crawls to verify no race conditions."""
|
|
handler = URLHandler()
|
|
|
|
# URLs that would previously conflict
|
|
concurrent_urls = [
|
|
"https://github.com/coleam00/archon",
|
|
"https://github.com/microsoft/typescript",
|
|
"https://github.com/facebook/react",
|
|
"https://github.com/vercel/next.js",
|
|
"https://github.com/vuejs/vue",
|
|
]
|
|
|
|
def generate_id(url):
|
|
"""Simulate a crawl generating an ID."""
|
|
time.sleep(0.001) # Simulate some processing time
|
|
return handler.generate_unique_source_id(url)
|
|
|
|
# Run concurrent ID generation
|
|
with ThreadPoolExecutor(max_workers=5) as executor:
|
|
futures = [executor.submit(generate_id, url) for url in concurrent_urls]
|
|
source_ids = [future.result() for future in futures]
|
|
|
|
# All IDs should be unique
|
|
assert len(set(source_ids)) == len(source_ids), "Concurrent crawls should generate unique IDs"
|
|
|
|
def test_error_handling(self):
|
|
"""Test error handling for edge cases."""
|
|
handler = URLHandler()
|
|
|
|
# Test various edge cases
|
|
edge_cases = [
|
|
"", # Empty string
|
|
"not-a-url", # Invalid URL
|
|
"https://", # Incomplete URL
|
|
None, # None should be handled gracefully in real code
|
|
]
|
|
|
|
for url in edge_cases:
|
|
if url is None:
|
|
continue # Skip None for this test
|
|
|
|
# Should not raise exception
|
|
source_id = handler.generate_unique_source_id(url)
|
|
assert source_id is not None, f"Should generate ID even for edge case: {url}"
|
|
assert len(source_id) == 16, f"Edge case should still generate 16-char ID: {url}"
|
|
|
|
|
|
class TestDisplayNameExtraction:
|
|
"""Test the human-readable display name extraction."""
|
|
|
|
def test_github_display_names(self):
|
|
"""Test GitHub repository display name extraction."""
|
|
handler = URLHandler()
|
|
|
|
test_cases = [
|
|
("https://github.com/microsoft/typescript", "GitHub - microsoft/typescript"),
|
|
("https://github.com/facebook/react", "GitHub - facebook/react"),
|
|
("https://github.com/vercel/next.js", "GitHub - vercel/next.js"),
|
|
("https://github.com/owner", "GitHub - owner"),
|
|
("https://github.com/", "GitHub"),
|
|
]
|
|
|
|
for url, expected in test_cases:
|
|
display_name = handler.extract_display_name(url)
|
|
assert display_name == expected, f"URL {url} should display as '{expected}', got '{display_name}'"
|
|
|
|
def test_documentation_display_names(self):
|
|
"""Test documentation site display name extraction."""
|
|
handler = URLHandler()
|
|
|
|
test_cases = [
|
|
("https://docs.python.org/3/", "Python Documentation"),
|
|
("https://docs.djangoproject.com/", "Djangoproject Documentation"),
|
|
("https://fastapi.tiangolo.com/", "FastAPI Documentation"),
|
|
("https://pydantic.dev/", "Pydantic Documentation"),
|
|
("https://numpy.org/doc/", "NumPy Documentation"),
|
|
("https://pandas.pydata.org/", "Pandas Documentation"),
|
|
("https://project.readthedocs.io/", "Project Docs"),
|
|
]
|
|
|
|
for url, expected in test_cases:
|
|
display_name = handler.extract_display_name(url)
|
|
assert display_name == expected, f"URL {url} should display as '{expected}', got '{display_name}'"
|
|
|
|
def test_api_display_names(self):
|
|
"""Test API endpoint display name extraction."""
|
|
handler = URLHandler()
|
|
|
|
test_cases = [
|
|
("https://api.github.com/", "GitHub API"),
|
|
("https://api.openai.com/v1/", "Openai API"),
|
|
("https://example.com/api/v2/", "Example"),
|
|
]
|
|
|
|
for url, expected in test_cases:
|
|
display_name = handler.extract_display_name(url)
|
|
assert display_name == expected, f"URL {url} should display as '{expected}', got '{display_name}'"
|
|
|
|
def test_generic_display_names(self):
|
|
"""Test generic website display name extraction."""
|
|
handler = URLHandler()
|
|
|
|
test_cases = [
|
|
("https://example.com/", "Example"),
|
|
("https://my-site.org/", "My Site"),
|
|
("https://test_project.io/", "Test Project"),
|
|
("https://some.subdomain.example.com/", "Some Subdomain Example"),
|
|
]
|
|
|
|
for url, expected in test_cases:
|
|
display_name = handler.extract_display_name(url)
|
|
assert display_name == expected, f"URL {url} should display as '{expected}', got '{display_name}'"
|
|
|
|
def test_edge_case_display_names(self):
|
|
"""Test edge cases for display name extraction."""
|
|
handler = URLHandler()
|
|
|
|
# Edge cases
|
|
test_cases = [
|
|
("", ""), # Empty URL
|
|
("not-a-url", "not-a-url"), # Invalid URL
|
|
("/local/file/path", "Local: path"), # Local file path
|
|
("https://", "https://"), # Incomplete URL
|
|
]
|
|
|
|
for url, expected_contains in test_cases:
|
|
display_name = handler.extract_display_name(url)
|
|
assert expected_contains in display_name or display_name == expected_contains, \
|
|
f"Edge case {url} handling failed: {display_name}"
|
|
|
|
def test_special_file_display_names(self):
|
|
"""Test that special files like llms.txt and sitemap.xml are properly displayed."""
|
|
handler = URLHandler()
|
|
|
|
test_cases = [
|
|
# llms.txt files
|
|
("https://docs.mem0.ai/llms-full.txt", "Mem0 - Llms.Txt"),
|
|
("https://example.com/llms.txt", "Example - Llms.Txt"),
|
|
("https://api.example.com/llms.txt", "Example API"), # API takes precedence
|
|
|
|
# sitemap.xml files
|
|
("https://mem0.ai/sitemap.xml", "Mem0 - Sitemap.Xml"),
|
|
("https://docs.example.com/sitemap.xml", "Example - Sitemap.Xml"),
|
|
("https://example.org/sitemap.xml", "Example - Sitemap.Xml"),
|
|
|
|
# Regular .txt files on docs sites
|
|
("https://docs.example.com/readme.txt", "Example - Readme.Txt"),
|
|
|
|
# Non-special files should not get special treatment
|
|
("https://docs.example.com/guide", "Example Documentation"),
|
|
("https://example.com/page.html", "Example - Page.Html"), # Path gets added for single file
|
|
]
|
|
|
|
for url, expected in test_cases:
|
|
display_name = handler.extract_display_name(url)
|
|
assert display_name == expected, f"URL {url} should display as '{expected}', got '{display_name}'"
|
|
|
|
def test_git_extension_removal(self):
|
|
"""Test that .git extension is removed from GitHub repos."""
|
|
handler = URLHandler()
|
|
|
|
test_cases = [
|
|
("https://github.com/owner/repo.git", "GitHub - owner/repo"),
|
|
("https://github.com/owner/repo", "GitHub - owner/repo"),
|
|
]
|
|
|
|
for url, expected in test_cases:
|
|
display_name = handler.extract_display_name(url)
|
|
assert display_name == expected, f"URL {url} should display as '{expected}', got '{display_name}'"
|
|
|
|
|
|
class TestRaceConditionFix:
|
|
"""Test that the race condition is actually fixed."""
|
|
|
|
def test_no_domain_conflicts(self):
|
|
"""Test that multiple sources from same domain don't conflict."""
|
|
handler = URLHandler()
|
|
|
|
# These would all have source_id = "github.com" in the old system
|
|
github_urls = [
|
|
"https://github.com/microsoft/typescript",
|
|
"https://github.com/microsoft/vscode",
|
|
"https://github.com/facebook/react",
|
|
"https://github.com/vercel/next.js",
|
|
"https://github.com/vuejs/vue",
|
|
]
|
|
|
|
source_ids = [handler.generate_unique_source_id(url) for url in github_urls]
|
|
|
|
# All should be unique
|
|
assert len(set(source_ids)) == len(source_ids), \
|
|
"Race condition not fixed: duplicate source IDs for same domain"
|
|
|
|
# None should be just "github.com"
|
|
for source_id in source_ids:
|
|
assert source_id != "github.com", \
|
|
"Source ID should not be just the domain"
|
|
|
|
def test_hash_properties(self):
|
|
"""Test that the hash has good properties."""
|
|
handler = URLHandler()
|
|
|
|
# Similar URLs should still generate very different hashes
|
|
url1 = "https://github.com/owner/repo1"
|
|
url2 = "https://github.com/owner/repo2" # Only differs by one character
|
|
|
|
id1 = handler.generate_unique_source_id(url1)
|
|
id2 = handler.generate_unique_source_id(url2)
|
|
|
|
# IDs should be completely different (good hash distribution)
|
|
matching_chars = sum(1 for a, b in zip(id1, id2) if a == b)
|
|
assert matching_chars < 8, \
|
|
f"Similar URLs should generate very different hashes, {matching_chars}/16 chars match"
|
|
|
|
|
|
class TestIntegration:
|
|
"""Integration tests for the complete source ID system."""
|
|
|
|
def test_full_source_creation_flow(self):
|
|
"""Test the complete flow of creating a source with all fields."""
|
|
handler = URLHandler()
|
|
url = "https://github.com/microsoft/typescript"
|
|
|
|
# Generate all source fields
|
|
source_id = handler.generate_unique_source_id(url)
|
|
source_display_name = handler.extract_display_name(url)
|
|
source_url = url
|
|
|
|
# Verify all fields are populated correctly
|
|
assert len(source_id) == 16, "Source ID should be 16 characters"
|
|
assert source_display_name == "GitHub - microsoft/typescript", \
|
|
f"Display name incorrect: {source_display_name}"
|
|
assert source_url == url, "Source URL should match original"
|
|
|
|
# Simulate database record
|
|
source_record = {
|
|
'source_id': source_id,
|
|
'source_url': source_url,
|
|
'source_display_name': source_display_name,
|
|
'title': None, # Generated later
|
|
'summary': None, # Generated later
|
|
'metadata': {}
|
|
}
|
|
|
|
# Verify record structure
|
|
assert 'source_id' in source_record
|
|
assert 'source_url' in source_record
|
|
assert 'source_display_name' in source_record
|
|
|
|
def test_backward_compatibility(self):
|
|
"""Test that the system handles existing sources gracefully."""
|
|
handler = URLHandler()
|
|
|
|
# Simulate an existing source with old-style source_id
|
|
existing_source = {
|
|
'source_id': 'github.com', # Old style - just domain
|
|
'source_url': None, # Not populated in old system
|
|
'source_display_name': None, # Not populated in old system
|
|
}
|
|
|
|
# The migration should handle this by backfilling
|
|
# source_url and source_display_name with source_id value
|
|
migrated_source = {
|
|
'source_id': 'github.com',
|
|
'source_url': 'github.com', # Backfilled
|
|
'source_display_name': 'github.com', # Backfilled
|
|
}
|
|
|
|
assert migrated_source['source_url'] is not None
|
|
assert migrated_source['source_display_name'] is not None |