Merge main into feature/automatic-discovery-llms-sitemap-430

Resolved merge conflicts by integrating features from both branches:
- Added page_storage_ops service initialization from main
- Merged link text extraction with discovery mode features
- Preserved discovery single-file mode and domain filtering
- Maintained link text fallbacks for title extraction

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
leex279
2025-10-11 09:31:24 +02:00
123 changed files with 12523 additions and 3795 deletions

View File

@@ -0,0 +1,195 @@
"""
Tests for LLMs-full.txt Section Parser
"""
import pytest
from src.server.services.crawling.helpers.llms_full_parser import (
create_section_slug,
create_section_url,
parse_llms_full_sections,
)
def test_create_section_slug():
    """Check the slug produced for a range of H1 heading strings."""
    # (heading passed in, slug expected back)
    heading_to_slug = [
        ("# Core Concepts", "core-concepts"),
        ("# Getting Started!", "getting-started"),
        ("# API Reference (v2)", "api-reference-v2"),
        ("# Hello World", "hello-world"),
        ("# Spaces ", "spaces"),
    ]
    for heading, expected_slug in heading_to_slug:
        assert create_section_slug(heading) == expected_slug
def test_create_section_url():
    """Check that section URLs are the base URL plus a '#section-<order>-<slug>' anchor."""
    document_url = "https://example.com/llms-full.txt"
    # (heading, section order, URL expected back)
    cases = [
        ("# Core Concepts", 0, "https://example.com/llms-full.txt#section-0-core-concepts"),
        ("# Getting Started", 1, "https://example.com/llms-full.txt#section-1-getting-started"),
    ]
    for heading, order, expected_url in cases:
        assert create_section_url(document_url, heading, order) == expected_url
def test_parse_single_section():
    """A document containing one H1 heading is parsed into exactly one section."""
    document_url = "https://example.com/llms-full.txt"
    document = """# Core Concepts
Claude is an AI assistant built by Anthropic.
It can help with various tasks.
"""
    parsed = parse_llms_full_sections(document, document_url)

    assert len(parsed) == 1
    only_section = parsed[0]
    assert only_section.section_title == "# Core Concepts"
    assert only_section.section_order == 0
    assert only_section.url == "https://example.com/llms-full.txt#section-0-core-concepts"
    assert "Claude is an AI assistant" in only_section.content
    assert only_section.word_count > 0
def test_parse_multiple_sections():
    """A document with three H1 headings is parsed into three ordered sections."""
    document_url = "https://example.com/llms-full.txt"
    document = """# Core Concepts
Claude is an AI assistant built by Anthropic that can help with various tasks.
It uses advanced language models to understand and respond to queries.
This section provides an overview of the core concepts and capabilities.
# Getting Started
To get started with Claude, you'll need to create an account and obtain API credentials.
Follow the setup instructions and configure your development environment properly.
This will enable you to make your first API calls and start building applications.
# API Reference
The API uses REST principles and supports standard HTTP methods like GET, POST, PUT, and DELETE.
Authentication is handled through API keys that should be kept secure at all times.
Comprehensive documentation is available for all endpoints and response formats.
"""
    parsed = parse_llms_full_sections(document, document_url)

    assert len(parsed) == 3
    # Titles, orders and URLs are each compared as a whole, in document order.
    assert [section.section_title for section in parsed] == [
        "# Core Concepts",
        "# Getting Started",
        "# API Reference",
    ]
    assert [section.section_order for section in parsed] == [0, 1, 2]
    assert [section.url for section in parsed] == [
        "https://example.com/llms-full.txt#section-0-core-concepts",
        "https://example.com/llms-full.txt#section-1-getting-started",
        "https://example.com/llms-full.txt#section-2-api-reference",
    ]
def test_no_h1_headers():
    """Content without any H1 heading comes back as a single 'Full Document' section."""
    document_url = "https://example.com/llms-full.txt"
    document = """This is some documentation.
It has no H1 headers.
Just regular content.
"""
    parsed = parse_llms_full_sections(document, document_url)

    assert len(parsed) == 1
    whole_document = parsed[0]
    assert whole_document.section_title == "Full Document"
    # No anchor is expected on the URL in this case.
    assert whole_document.url == "https://example.com/llms-full.txt"
    assert "This is some documentation" in whole_document.content
def test_h2_not_treated_as_section():
    """H2 headings (##) stay inside their parent H1 section instead of splitting it."""
    document_url = "https://example.com/llms-full.txt"
    document = """# Main Section
This is the main section.
## Subsection
This is a subsection.
## Another Subsection
This is another subsection.
"""
    parsed = parse_llms_full_sections(document, document_url)

    assert len(parsed) == 1
    main_section = parsed[0]
    assert main_section.section_title == "# Main Section"
    for subheading in ("## Subsection", "## Another Subsection"):
        assert subheading in main_section.content
def test_empty_sections_skipped():
    """A bare '#' line between two real sections does not produce a section of its own."""
    document_url = "https://example.com/llms-full.txt"
    document = """# Section 1
This is the first section with enough content to prevent automatic combination.
It contains multiple sentences and provides substantial information for testing purposes.
The section has several lines to ensure it exceeds the minimum character threshold.
#
# Section 2
This is the second section with enough content to prevent automatic combination.
It also contains multiple sentences and provides substantial information for testing.
The section has several lines to ensure it exceeds the minimum character threshold.
"""
    parsed = parse_llms_full_sections(document, document_url)

    # Only the two titled sections should remain; the empty one is dropped.
    assert len(parsed) == 2
    first, second = parsed[0], parsed[1]
    assert first.section_title == "# Section 1"
    assert second.section_title == "# Section 2"
def test_consecutive_h1_headers():
    """Two back-to-back H1 sections are both parsed and keep their own content."""
    document_url = "https://example.com/llms-full.txt"
    document = """# Section 1
The first section contains enough content to prevent automatic combination with subsequent sections.
It has multiple sentences and provides substantial information for proper testing functionality.
This ensures that the section exceeds the minimum character threshold requirement.
# Section 2
This section also has enough content to prevent automatic combination with the previous section.
It contains multiple sentences and provides substantial information for proper testing.
The content here ensures that the section exceeds the minimum character threshold.
"""
    parsed = parse_llms_full_sections(document, document_url)

    # Neither section may be merged into the other.
    assert len(parsed) == 2
    first, second = parsed[0], parsed[1]
    assert first.section_title == "# Section 1"
    assert second.section_title == "# Section 2"
    assert "The first section contains enough content" in first.content
    assert "This section also has enough content" in second.content
def test_word_count_calculation():
    """The section word count exceeds the ten words of the body line."""
    document_url = "https://example.com/llms-full.txt"
    document = """# Test Section
This is a test section with exactly ten words here.
"""
    parsed = parse_llms_full_sections(document, document_url)

    assert len(parsed) == 1
    # The heading is counted too, so the total must be above the body's ten words.
    counted_words = parsed[0].word_count
    assert counted_words > 10
def test_empty_content():
    """An empty string yields no sections."""
    document_url = "https://example.com/llms-full.txt"
    document = ""
    parsed = parse_llms_full_sections(document, document_url)
    assert len(parsed) == 0
def test_whitespace_only_content():
    """Content made up solely of whitespace yields no sections."""
    document_url = "https://example.com/llms-full.txt"
    document = """
"""
    parsed = parse_llms_full_sections(document, document_url)
    assert len(parsed) == 0

View File

@@ -111,8 +111,8 @@ class TestCodeExtractionSourceId:
assert args[2] == source_id
assert args[3] is None
assert args[4] is None
if len(args) > 5:
assert args[5] is None
assert args[5] is None
assert args[6] is None
assert result == 5
@pytest.mark.asyncio

View File

@@ -119,8 +119,8 @@ class TestSourceRaceCondition:
assert "upsert" in methods_called, "Should use upsert for new sources"
assert "insert" not in methods_called, "Should not use insert to avoid race conditions"
def test_existing_source_uses_update(self):
"""Test that existing sources still use UPDATE (not affected by change)."""
def test_existing_source_uses_upsert(self):
"""Test that existing sources use UPSERT to handle race conditions."""
mock_client = Mock()
methods_called = []
@@ -158,9 +158,9 @@ class TestSourceRaceCondition:
))
loop.close()
# Should use update for existing sources
assert "update" in methods_called, "Should use update for existing sources"
assert "upsert" not in methods_called, "Should not use upsert for existing sources"
# Should use upsert for existing sources to handle race conditions
assert "upsert" in methods_called, "Should use upsert for existing sources"
assert "update" not in methods_called, "Should not use update (upsert handles race conditions)"
@pytest.mark.asyncio
async def test_async_concurrent_creation(self):