mirror of
https://github.com/coleam00/Archon.git
synced 2025-12-24 02:39:17 -05:00
Merge main into feature/automatic-discovery-llms-sitemap-430
Resolved merge conflicts by integrating features from both branches: - Added page_storage_ops service initialization from main - Merged link text extraction with discovery mode features - Preserved discovery single-file mode and domain filtering - Maintained link text fallbacks for title extraction 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
195
python/tests/server/services/test_llms_full_parser.py
Normal file
195
python/tests/server/services/test_llms_full_parser.py
Normal file
@@ -0,0 +1,195 @@
|
||||
"""
|
||||
Tests for LLMs-full.txt Section Parser
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from src.server.services.crawling.helpers.llms_full_parser import (
|
||||
create_section_slug,
|
||||
create_section_url,
|
||||
parse_llms_full_sections,
|
||||
)
|
||||
|
||||
|
||||
def test_create_section_slug():
    """Check that H1 headings are turned into the expected URL slugs."""
    # Each pair is (heading given to create_section_slug, slug it must return).
    heading_to_slug = [
        ("# Core Concepts", "core-concepts"),
        ("# Getting Started!", "getting-started"),
        ("# API Reference (v2)", "api-reference-v2"),
        ("# Hello World", "hello-world"),
        ("# Spaces ", "spaces"),
    ]
    for heading, expected_slug in heading_to_slug:
        assert create_section_slug(heading) == expected_slug
|
||||
|
||||
|
||||
def test_create_section_url():
    """Check that a section URL is the base URL plus a ``#section-<order>-<slug>`` anchor."""
    source_url = "https://example.com/llms-full.txt"

    # Each entry is (heading, section order, URL expected from create_section_url).
    cases = (
        ("# Core Concepts", 0, "https://example.com/llms-full.txt#section-0-core-concepts"),
        ("# Getting Started", 1, "https://example.com/llms-full.txt#section-1-getting-started"),
    )
    for heading, order, expected_url in cases:
        assert create_section_url(source_url, heading, order) == expected_url
|
||||
|
||||
|
||||
def test_parse_single_section():
    """Parse a document with one H1 heading and inspect the single resulting section."""
    document = """# Core Concepts
Claude is an AI assistant built by Anthropic.
It can help with various tasks.
"""
    parsed = parse_llms_full_sections(document, "https://example.com/llms-full.txt")

    assert len(parsed) == 1

    section = parsed[0]
    # The title keeps the leading "# " marker; the URL carries order and slug.
    assert section.section_title == "# Core Concepts"
    assert section.section_order == 0
    assert section.url == "https://example.com/llms-full.txt#section-0-core-concepts"
    assert "Claude is an AI assistant" in section.content
    assert section.word_count > 0
|
||||
|
||||
|
||||
def test_parse_multiple_sections():
    """Parse a document with three H1 headings and check every resulting section.

    Verifies the section count, and that titles, ``section_order`` values and
    synthetic URLs follow the order in which the headings appear.
    """
    document = """# Core Concepts
Claude is an AI assistant built by Anthropic that can help with various tasks.
It uses advanced language models to understand and respond to queries.
This section provides an overview of the core concepts and capabilities.

# Getting Started
To get started with Claude, you'll need to create an account and obtain API credentials.
Follow the setup instructions and configure your development environment properly.
This will enable you to make your first API calls and start building applications.

# API Reference
The API uses REST principles and supports standard HTTP methods like GET, POST, PUT, and DELETE.
Authentication is handled through API keys that should be kept secure at all times.
Comprehensive documentation is available for all endpoints and response formats.
"""
    source_url = "https://example.com/llms-full.txt"
    parsed = parse_llms_full_sections(document, source_url)

    assert len(parsed) == 3

    # Titles are checked first, then ordering, then URLs.
    expected_titles = ["# Core Concepts", "# Getting Started", "# API Reference"]
    for index, title in enumerate(expected_titles):
        assert parsed[index].section_title == title

    for index in range(3):
        assert parsed[index].section_order == index

    expected_urls = [
        "https://example.com/llms-full.txt#section-0-core-concepts",
        "https://example.com/llms-full.txt#section-1-getting-started",
        "https://example.com/llms-full.txt#section-2-api-reference",
    ]
    for index, url in enumerate(expected_urls):
        assert parsed[index].url == url
|
||||
|
||||
|
||||
def test_no_h1_headers():
    """Check that content without any H1 heading yields one "Full Document" section."""
    source_url = "https://example.com/llms-full.txt"
    document = """This is some documentation.
It has no H1 headers.
Just regular content.
"""
    parsed = parse_llms_full_sections(document, source_url)

    assert len(parsed) == 1

    whole_document = parsed[0]
    assert whole_document.section_title == "Full Document"
    # No anchor is expected here: the URL is the plain base URL.
    assert whole_document.url == "https://example.com/llms-full.txt"
    assert "This is some documentation" in whole_document.content
|
||||
|
||||
|
||||
def test_h2_not_treated_as_section():
    """Check that ``##`` headings stay inside their H1 section instead of splitting it."""
    document = """# Main Section
This is the main section.

## Subsection
This is a subsection.

## Another Subsection
This is another subsection.
"""
    parsed = parse_llms_full_sections(document, "https://example.com/llms-full.txt")

    assert len(parsed) == 1

    main_section = parsed[0]
    assert main_section.section_title == "# Main Section"
    # Both H2 headings must appear verbatim in the single section's content.
    assert "## Subsection" in main_section.content
    assert "## Another Subsection" in main_section.content
|
||||
|
||||
|
||||
def test_empty_sections_skipped():
    """Check that a bare ``#`` heading with no body does not produce a section."""
    document = """# Section 1
This is the first section with enough content to prevent automatic combination.
It contains multiple sentences and provides substantial information for testing purposes.
The section has several lines to ensure it exceeds the minimum character threshold.

#

# Section 2
This is the second section with enough content to prevent automatic combination.
It also contains multiple sentences and provides substantial information for testing.
The section has several lines to ensure it exceeds the minimum character threshold.
"""
    source_url = "https://example.com/llms-full.txt"
    parsed = parse_llms_full_sections(document, source_url)

    # The empty heading in the middle is dropped, leaving the two real sections.
    assert len(parsed) == 2
    first, second = parsed[0], parsed[1]
    assert first.section_title == "# Section 1"
    assert second.section_title == "# Section 2"
|
||||
|
||||
|
||||
def test_consecutive_h1_headers():
    """Check that two H1 sections with no blank line between them are both parsed."""
    document = """# Section 1
The first section contains enough content to prevent automatic combination with subsequent sections.
It has multiple sentences and provides substantial information for proper testing functionality.
This ensures that the section exceeds the minimum character threshold requirement.
# Section 2
This section also has enough content to prevent automatic combination with the previous section.
It contains multiple sentences and provides substantial information for proper testing.
The content here ensures that the section exceeds the minimum character threshold.
"""
    source_url = "https://example.com/llms-full.txt"
    parsed = parse_llms_full_sections(document, source_url)

    # Each heading yields its own section, holding its own body text.
    assert len(parsed) == 2
    assert parsed[0].section_title == "# Section 1"
    assert parsed[1].section_title == "# Section 2"

    first_body = parsed[0].content
    assert "The first section contains enough content" in first_body
    second_body = parsed[1].content
    assert "This section also has enough content" in second_body
|
||||
|
||||
|
||||
def test_word_count_calculation():
    """Check that a section's ``word_count`` exceeds the ten words of its body line."""
    document = """# Test Section
This is a test section with exactly ten words here.
"""
    parsed = parse_llms_full_sections(document, "https://example.com/llms-full.txt")

    assert len(parsed) == 1

    # The count is expected to be above 10 because the H1 heading is counted too.
    counted_words = parsed[0].word_count
    assert counted_words > 10
|
||||
|
||||
|
||||
def test_empty_content():
    """Check that an empty string produces no sections at all."""
    source_url = "https://example.com/llms-full.txt"
    parsed = parse_llms_full_sections("", source_url)

    assert len(parsed) == 0
|
||||
|
||||
|
||||
def test_whitespace_only_content():
    """Check that content made up solely of whitespace produces no sections."""
    source_url = "https://example.com/llms-full.txt"
    blank_document = """


"""
    parsed = parse_llms_full_sections(blank_document, source_url)

    assert len(parsed) == 0
|
||||
@@ -111,8 +111,8 @@ class TestCodeExtractionSourceId:
|
||||
assert args[2] == source_id
|
||||
assert args[3] is None
|
||||
assert args[4] is None
|
||||
if len(args) > 5:
|
||||
assert args[5] is None
|
||||
assert args[5] is None
|
||||
assert args[6] is None
|
||||
assert result == 5
|
||||
|
||||
@pytest.mark.asyncio
|
||||
|
||||
@@ -119,8 +119,8 @@ class TestSourceRaceCondition:
|
||||
assert "upsert" in methods_called, "Should use upsert for new sources"
|
||||
assert "insert" not in methods_called, "Should not use insert to avoid race conditions"
|
||||
|
||||
def test_existing_source_uses_update(self):
|
||||
"""Test that existing sources still use UPDATE (not affected by change)."""
|
||||
def test_existing_source_uses_upsert(self):
|
||||
"""Test that existing sources use UPSERT to handle race conditions."""
|
||||
mock_client = Mock()
|
||||
|
||||
methods_called = []
|
||||
@@ -158,9 +158,9 @@ class TestSourceRaceCondition:
|
||||
))
|
||||
loop.close()
|
||||
|
||||
# Should use update for existing sources
|
||||
assert "update" in methods_called, "Should use update for existing sources"
|
||||
assert "upsert" not in methods_called, "Should not use upsert for existing sources"
|
||||
# Should use upsert for existing sources to handle race conditions
|
||||
assert "upsert" in methods_called, "Should use upsert for existing sources"
|
||||
assert "update" not in methods_called, "Should not use update (upsert handles race conditions)"
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_concurrent_creation(self):
|
||||
|
||||
Reference in New Issue
Block a user