Merge main into feature/automatic-discovery-llms-sitemap-430

- Resolved conflicts in progress_mapper.py to include discovery stage (3-4%)
- Resolved conflicts in crawling_service.py to maintain both discovery feature and main improvements
- Resolved conflicts in test_progress_mapper.py to include tests for discovery stage
- Kept all optimizations and improvements from main
- Maintained discovery feature functionality with proper integration
2025-12-30 21:49:30 -05:00 · 2025-09-20 09:27:36 +02:00
parent 77b047093c 37994191fc
commit 8072066ee6
265 changed files with 28898 additions and 12424 deletions
--- a/python/tests/progress_tracking/integration/test_crawl_orchestration_progress.py
+++ b/python/tests/progress_tracking/integration/test_crawl_orchestration_progress.py
@@ -168,17 +168,17 @@ class TestCrawlOrchestrationProgressIntegration:
        mapper = crawling_service.progress_mapper
        tracker = crawling_service.progress_tracker
        
-        # Test sequence of stage progressions with mapping
+        # Test sequence of stage progressions with mapping (updated for new ranges)
        test_stages = [
-            ("analyzing", 100, 2),      # Should map to ~2%
-            ("crawling", 100, 5),       # Should map to ~5% 
-            ("processing", 100, 8),     # Should map to ~8%
-            ("source_creation", 100, 10), # Should map to ~10%
-            ("document_storage", 25, 15), # 25% of 10-30% = 15%
-            ("document_storage", 50, 20), # 50% of 10-30% = 20%
-            ("document_storage", 100, 30), # 100% of 10-30% = 30%
-            ("code_extraction", 50, 62),  # 50% of 30-95% = 62.5% ≈ 62%
-            ("code_extraction", 100, 95), # 100% of 30-95% = 95%
+            ("analyzing", 100, 3),      # Should map to ~3%
+            ("crawling", 100, 15),      # Should map to ~15% 
+            ("processing", 100, 20),    # Should map to ~20%
+            ("source_creation", 100, 25), # Should map to ~25%
+            ("document_storage", 25, 29), # 25% of 25-40% = 28.75% ≈ 29%
+            ("document_storage", 50, 32), # 50% of 25-40% = 32.5% ≈ 32%
+            ("document_storage", 100, 40), # 100% of 25-40% = 40%
+            ("code_extraction", 50, 65),  # 50% of 40-90% = 65%
+            ("code_extraction", 100, 90), # 100% of 40-90% = 90%
            ("finalization", 100, 100),   # Should map to 100%
        ]
        
--- a/python/tests/progress_tracking/test_batch_progress_bug.py
+++ b/python/tests/progress_tracking/test_batch_progress_bug.py
@@ -0,0 +1,172 @@
+"""
+Test for batch progress bug where progress jumps to 100% prematurely.
+
+This test ensures that when document_storage completes (100% of its stage),
+the overall progress maps correctly to 40% and doesn't contaminate future stages.
+"""
+
+import asyncio
+from unittest.mock import AsyncMock, MagicMock, patch
+import pytest
+
+from src.server.services.crawling.crawling_service import CrawlingService
+from src.server.services.crawling.progress_mapper import ProgressMapper
+from src.server.utils.progress.progress_tracker import ProgressTracker
+
+
+class TestBatchProgressBug:
+    """Test that batch progress doesn't jump to 100% prematurely."""
+    
+    @pytest.mark.asyncio
+    async def test_document_storage_completion_maps_correctly(self):
+        """Test that document_storage at 100% maps to 40% overall, not 100%."""
+        
+        # Create a progress mapper
+        mapper = ProgressMapper()
+        
+        # Simulate document_storage progress
+        progress_values = []
+        
+        # Document storage progresses from 0 to 100%
+        for i in range(0, 101, 20):
+            mapped = mapper.map_progress("document_storage", i)
+            progress_values.append(mapped)
+            
+            # Document storage range is 25-40%
+            # So 0% -> 25%, 50% -> 32.5%, 100% -> 40%
+            if i == 0:
+                assert mapped == 25, f"document_storage at 0% should map to 25%, got {mapped}%"
+            elif i == 100:
+                assert mapped == 40, f"document_storage at 100% should map to 40%, got {mapped}%"
+            else:
+                assert 25 <= mapped <= 40, f"document_storage at {i}% should be between 25-40%, got {mapped}%"
+        
+        # Verify final state after document_storage completes
+        assert mapper.last_overall_progress == 40, "After document_storage completes, overall should be 40%"
+        
+        # Now start code_extraction at 0%
+        code_start = mapper.map_progress("code_extraction", 0)
+        assert code_start == 40, f"code_extraction at 0% should map to 40%, got {code_start}%"
+        
+        # Progress through code_extraction
+        code_mid = mapper.map_progress("code_extraction", 50)
+        assert code_mid == 65, f"code_extraction at 50% should map to 65%, got {code_mid}%"
+        
+        code_end = mapper.map_progress("code_extraction", 100)
+        assert code_end == 90, f"code_extraction at 100% should map to 90%, got {code_end}%"
+    
+    @pytest.mark.asyncio
+    async def test_progress_tracker_prevents_raw_value_contamination(self):
+        """Test that ProgressTracker doesn't allow raw progress values to contaminate state."""
+        
+        tracker = ProgressTracker("test-progress-123", "crawl")
+        
+        # Start tracking
+        await tracker.start({"url": "https://example.com"})
+        
+        # Simulate document_storage sending updates
+        await tracker.update("document_storage", 25, "Starting document storage")
+        assert tracker.state["progress"] == 25
+        
+        # Midway through
+        await tracker.update("document_storage", 32, "Processing batches")
+        assert tracker.state["progress"] == 32
+        
+        # Document storage completes (mapped to 40%)
+        await tracker.update("document_storage", 40, "Document storage complete")
+        assert tracker.state["progress"] == 40
+        
+        # Verify that logs also have correct progress
+        logs = tracker.state.get("logs", [])
+        if logs:
+            last_log = logs[-1]
+            assert last_log["progress"] == 40, f"Log should have progress=40, got {last_log['progress']}"
+        
+        # Start code_extraction at 40% (not 100%!)
+        await tracker.update("code_extraction", 40, "Starting code extraction")
+        assert tracker.state["progress"] == 40, "Progress should stay at 40% when code_extraction starts"
+        
+        # Progress through code_extraction
+        await tracker.update("code_extraction", 65, "Extracting code examples")
+        assert tracker.state["progress"] == 65
+        
+        # Verify protected fields aren't overridden via kwargs
+        await tracker.update("code_extraction", 70, "More extraction", raw_progress=100, fake_status="fake")
+        assert tracker.state["progress"] == 70, "Progress should remain at 70%"
+        assert tracker.state["status"] == "code_extraction", "Status should remain code_extraction"
+        # Verify that raw_progress doesn't override the actual progress
+        assert tracker.state.get("raw_progress") != 70, "raw_progress can be stored but shouldn't affect progress"
+    
+    @pytest.mark.asyncio
+    async def test_batch_processing_progress_sequence(self):
+        """Test realistic batch processing sequence to ensure no premature 100%."""
+        
+        mapper = ProgressMapper()
+        tracker = ProgressTracker("test-batch-123", "crawl")
+        
+        await tracker.start({"url": "https://example.com/sitemap.xml"})
+        
+        # Simulate crawling 20 pages
+        total_pages = 20
+        
+        # Crawling phase (4-15%; 3-4% belongs to the discovery stage)
+        for page in range(1, total_pages + 1):
+            progress = (page / total_pages) * 100
+            mapped = mapper.map_progress("crawling", progress)
+            await tracker.update("crawling", mapped, f"Crawled {page}/{total_pages} pages")
+            
+            # Should never exceed 15% during crawling
+            assert mapped <= 15, f"Crawling progress should not exceed 15%, got {mapped}%"
+        
+        # Document storage phase (25-40%) - process in 5 batches
+        total_batches = 5
+        for batch in range(1, total_batches + 1):
+            progress = (batch / total_batches) * 100
+            mapped = mapper.map_progress("document_storage", progress)
+            await tracker.update("document_storage", mapped, f"Batch {batch}/{total_batches}")
+            
+            # Should be between 25-40% during document storage
+            assert 25 <= mapped <= 40, f"Document storage should be 25-40%, got {mapped}%"
+            
+            # Specifically check batch 4/5 (80% of stage = ~37% overall)
+            if batch == 4:
+                assert mapped < 40, f"Batch 4/{total_batches} should not be at 40% yet, got {mapped}%"
+                assert mapped < 100, f"Batch 4/{total_batches} should NEVER be 100%, got {mapped}%"
+        
+        # After all document storage batches
+        final_doc_progress = tracker.state["progress"]
+        assert final_doc_progress == 40, f"After document storage, should be at 40%, got {final_doc_progress}%"
+        
+        # Code extraction phase (40-90%)
+        code_batches = 10
+        for batch in range(1, code_batches + 1):
+            progress = (batch / code_batches) * 100
+            mapped = mapper.map_progress("code_extraction", progress)
+            await tracker.update("code_extraction", mapped, f"Code batch {batch}/{code_batches}")
+            
+            # Should be between 40-90% during code extraction
+            assert 40 <= mapped <= 90, f"Code extraction should be 40-90%, got {mapped}%"
+        
+        # Finalization (90-100%)
+        finalize_mapped = mapper.map_progress("finalization", 50)
+        await tracker.update("finalization", finalize_mapped, "Finalizing")
+        assert 90 <= finalize_mapped <= 100, f"Finalization should be 90-100%, got {finalize_mapped}%"
+        
+        # Only at the very end should we reach 100%
+        complete_mapped = mapper.map_progress("completed", 100)
+        await tracker.update("completed", complete_mapped, "Completed")
+        assert complete_mapped == 100, "Only 'completed' stage should reach 100%"
+        
+        # Verify the entire sequence never jumped to 100% prematurely
+        # by checking the logs
+        logs = tracker.state.get("logs", [])
+        for i, log in enumerate(logs[:-1]):  # All except the last one
+            assert log["progress"] < 100, f"Log {i} shows premature 100%: {log}"
+        
+        # Only the last log should be 100%
+        if logs:
+            assert logs[-1]["progress"] == 100, "Final log should be 100%"
+
+
+if __name__ == "__main__":
+    raise SystemExit(pytest.main([__file__, "-v"]))  # pytest.main() returns an exit code, not a coroutine, so asyncio.run() must not wrap it
--- a/python/tests/progress_tracking/test_progress_mapper.py
+++ b/python/tests/progress_tracking/test_progress_mapper.py
@@ -1,4 +1,6 @@
-"""Unit tests for the ProgressMapper class."""
+"""
+Tests for ProgressMapper
+"""

 import pytest

@@ -6,215 +8,292 @@ from src.server.services.crawling.progress_mapper import ProgressMapper


 class TestProgressMapper:
-    """Test cases for ProgressMapper functionality."""
+    """Test suite for ProgressMapper"""

-    @pytest.fixture
-    def progress_mapper(self):
-        """Create a fresh ProgressMapper for each test."""
-        return ProgressMapper()
+    def test_initialization(self):
+        """Test ProgressMapper initialization"""
+        mapper = ProgressMapper()

-    def test_init_sets_initial_state(self, progress_mapper):
-        """Test that initialization sets correct initial state."""
-        assert progress_mapper.last_overall_progress == 0
-        assert progress_mapper.current_stage == "starting"
+        assert mapper.last_overall_progress == 0
+        assert mapper.current_stage == "starting"

-    def test_stage_ranges_are_valid(self, progress_mapper):
-        """Test that all stage ranges are valid and sequential."""
-        ranges = progress_mapper.STAGE_RANGES
+    def test_map_progress_basic(self):
+        """Test basic progress mapping"""
+        mapper = ProgressMapper()

-        # Test that ranges don't overlap (except for aliases)
-        crawl_stages = ["starting", "analyzing", "crawling", "processing",
-                       "source_creation", "document_storage", "code_extraction",
-                       "finalization", "completed"]
+        # Starting stage (0-1%)
+        progress = mapper.map_progress("starting", 50)
+        assert progress == 0  # 50% of 0-1 range

-        last_end = 0
-        for stage in crawl_stages[:-1]:  # Exclude completed which is (100, 100)
-            start, end = ranges[stage]
-            assert start >= last_end, f"Stage {stage} starts before previous stage ends"
-            assert end > start, f"Stage {stage} has invalid range: {start}-{end}"
-            last_end = end
+        # Analyzing stage (1-3%)
+        progress = mapper.map_progress("analyzing", 50)
+        assert progress == 2  # 1 + (50% of 2) = 2

-        # Test that code extraction gets the largest range (it's the longest)
-        code_start, code_end = ranges["code_extraction"]
-        code_range = code_end - code_start
+        # Discovery stage (3-4%) - NEW TEST FOR DISCOVERY FEATURE
+        progress = mapper.map_progress("discovery", 50)
+        assert progress == 3  # 3 + (50% of 1) = 3.5 -> 3

-        doc_start, doc_end = ranges["document_storage"]
-        doc_range = doc_end - doc_start
+        # Crawling stage (4-15%)
+        progress = mapper.map_progress("crawling", 50)
+        assert progress == 9  # 4 + (50% of 11) = 9.5 -> 9

-        assert code_range > doc_range, "Code extraction should have larger range than document storage"
+    def test_progress_never_goes_backwards(self):
+        """Test that progress never decreases"""
+        mapper = ProgressMapper()

-    def test_map_progress_basic_functionality(self, progress_mapper):
-        """Test basic progress mapping functionality."""
-        # Test crawling stage at 50%
-        result = progress_mapper.map_progress("crawling", 50.0)
+        # Move to 50% of crawling (4-15%) = 9.5 -> 9%
+        progress1 = mapper.map_progress("crawling", 50)
+        assert progress1 == 9

-        # Should be halfway between crawling range (2-5%)
-        expected = 2 + (50 / 100) * (5 - 2)  # 3.5%, rounded to 4
-        assert result == 4
+        # Try to go back to analyzing (1-3%) - should stay at 9%
+        progress2 = mapper.map_progress("analyzing", 100)
+        assert progress2 == 9  # Should not go backwards

-    def test_map_progress_document_storage(self, progress_mapper):
-        """Test progress mapping for document storage stage."""
-        # Test document storage at 25%
-        result = progress_mapper.map_progress("document_storage", 25.0)
+        # Can move forward to document_storage
+        progress3 = mapper.map_progress("document_storage", 50)
+        assert progress3 == 32  # 25 + (50% of 15) = 32.5 -> 32

-        # Should be 25% through document_storage range (10-30%)
-        expected = 10 + (25 / 100) * (30 - 10)  # 10 + 5 = 15
-        assert result == 15
+    def test_completion_handling(self):
+        """Test completion status handling"""
+        mapper = ProgressMapper()

-    def test_map_progress_code_extraction(self, progress_mapper):
-        """Test progress mapping for code extraction stage."""
-        # Test code extraction at 50%
-        result = progress_mapper.map_progress("code_extraction", 50.0)
+        # Jump straight to completed
+        progress = mapper.map_progress("completed", 0)
+        assert progress == 100

-        # Should be 50% through code_extraction range (30-95%)
-        expected = 30 + (50 / 100) * (95 - 30)  # 30 + 32.5 = 62.5, rounded to 62
-        assert result == 62
+        # Any percentage at completed should be 100
+        progress = mapper.map_progress("completed", 50)
+        assert progress == 100

-    def test_map_progress_never_goes_backwards(self, progress_mapper):
-        """Test that mapped progress never decreases."""
-        # Set initial progress to 50%
-        result1 = progress_mapper.map_progress("document_storage", 100.0)  # Should be 30%
-        assert result1 == 30
+        # Test alias 'complete'
+        mapper2 = ProgressMapper()
+        progress = mapper2.map_progress("complete", 0)
+        assert progress == 100

-        # Try to map a lower stage with lower progress
-        result2 = progress_mapper.map_progress("crawling", 50.0)  # Would normally be ~3.5%
+    def test_error_handling(self):
+        """Test error status handling - preserves last known progress"""
+        mapper = ProgressMapper()

-        # Should maintain higher progress
-        assert result2 == 30  # Stays at previous high value
+        # Error with no prior progress should return 0 (initial state)
+        progress = mapper.map_progress("error", 50)
+        assert progress == 0

-    def test_map_progress_clamping(self, progress_mapper):
-        """Test that stage progress is clamped to 0-100 range."""
-        # Test negative progress
-        result = progress_mapper.map_progress("crawling", -10.0)
-        expected = 3  # Start of crawling range (updated after discovery stage)
-        assert result == expected
+        # Set some progress first, then error should preserve it
+        mapper.map_progress("crawling", 50)  # Should map to somewhere in the crawling range
+        current_progress = mapper.last_overall_progress
+        error_progress = mapper.map_progress("error", 50)
+        assert error_progress == current_progress  # Should preserve the progress

-        # Test progress over 100
-        result = progress_mapper.map_progress("crawling", 150.0)
-        expected = 5  # End of crawling range
-        assert result == expected
+    def test_cancelled_handling(self):
+        """Test cancelled status handling - preserves last known progress"""
+        mapper = ProgressMapper()

-    def test_completion_always_returns_100(self, progress_mapper):
-        """Test that completion stages always return 100%."""
-        assert progress_mapper.map_progress("completed", 0) == 100
-        assert progress_mapper.map_progress("complete", 50) == 100
-        assert progress_mapper.map_progress("completed", 100) == 100
+        # Cancelled with no prior progress should return 0 (initial state)
+        progress = mapper.map_progress("cancelled", 50)
+        assert progress == 0

-    def test_error_returns_negative_one(self, progress_mapper):
-        """Test that error stage returns -1."""
-        assert progress_mapper.map_progress("error", 50) == -1
+        # Set some progress first, then cancelled should preserve it
+        mapper.map_progress("crawling", 75)  # Should map to somewhere in the crawling range
+        current_progress = mapper.last_overall_progress
+        cancelled_progress = mapper.map_progress("cancelled", 50)
+        assert cancelled_progress == current_progress  # Should preserve the progress
+
+    def test_unknown_stage(self):
+        """Test handling of unknown stages"""
+        mapper = ProgressMapper()

-    def test_unknown_stage_maintains_current_progress(self, progress_mapper):
-        """Test that unknown stages don't change progress."""
        # Set some initial progress
-        progress_mapper.map_progress("crawling", 50)
-        current = progress_mapper.last_overall_progress
+        mapper.map_progress("crawling", 50)
+        current = mapper.last_overall_progress

-        # Try unknown stage
-        result = progress_mapper.map_progress("unknown_stage", 75)
+        # Unknown stage should maintain current progress
+        progress = mapper.map_progress("unknown_stage", 50)
+        assert progress == current

-        # Should maintain current progress
-        assert result == current
+    def test_stage_ranges_with_discovery(self):
+        """Test all defined stage ranges including discovery"""
+        mapper = ProgressMapper()

-    def test_get_stage_range(self, progress_mapper):
-        """Test getting stage ranges."""
-        assert progress_mapper.get_stage_range("discovery") == (2, 3)    # New discovery stage
-        assert progress_mapper.get_stage_range("crawling") == (3, 5)     # Updated after discovery
-        assert progress_mapper.get_stage_range("document_storage") == (10, 30)
-        assert progress_mapper.get_stage_range("code_extraction") == (30, 95)
-        assert progress_mapper.get_stage_range("unknown") == (0, 100)  # Default
+        # Verify ranges are correctly defined with new balanced values
+        assert mapper.STAGE_RANGES["starting"] == (0, 1)
+        assert mapper.STAGE_RANGES["analyzing"] == (1, 3)
+        assert mapper.STAGE_RANGES["discovery"] == (3, 4)  # NEW DISCOVERY STAGE
+        assert mapper.STAGE_RANGES["crawling"] == (4, 15)
+        assert mapper.STAGE_RANGES["processing"] == (15, 20)
+        assert mapper.STAGE_RANGES["source_creation"] == (20, 25)
+        assert mapper.STAGE_RANGES["document_storage"] == (25, 40)
+        assert mapper.STAGE_RANGES["code_extraction"] == (40, 90)
+        assert mapper.STAGE_RANGES["finalization"] == (90, 100)
+        assert mapper.STAGE_RANGES["completed"] == (100, 100)

-    def test_calculate_stage_progress(self, progress_mapper):
-        """Test stage progress calculation from current/max values."""
-        # Test normal case
-        result = progress_mapper.calculate_stage_progress(25, 100)
-        assert result == 25.0
+        # Upload-specific stages
+        assert mapper.STAGE_RANGES["reading"] == (0, 5)
+        assert mapper.STAGE_RANGES["text_extraction"] == (5, 10)
+        assert mapper.STAGE_RANGES["chunking"] == (10, 15)
+        # Note: source_creation is shared between crawl and upload operations at (20, 25)
+        assert mapper.STAGE_RANGES["summarizing"] == (25, 35)
+        assert mapper.STAGE_RANGES["storing"] == (35, 100)

-        # Test division by zero protection
-        result = progress_mapper.calculate_stage_progress(10, 0)
-        assert result == 0.0
+    def test_calculate_stage_progress(self):
+        """Test calculating percentage within a stage"""
+        mapper = ProgressMapper()

-        # Test negative max protection
-        result = progress_mapper.calculate_stage_progress(10, -5)
-        assert result == 0.0
+        # 5 out of 10 = 50%
+        progress = mapper.calculate_stage_progress(5, 10)
+        assert progress == 50.0

-    def test_map_batch_progress(self, progress_mapper):
-        """Test batch progress mapping."""
-        # Test batch 3 of 6 in document_storage stage
-        result = progress_mapper.map_batch_progress("document_storage", 3, 6)
+        # 0 out of 10 = 0%
+        progress = mapper.calculate_stage_progress(0, 10)
+        assert progress == 0.0

-        # Should be (3-1)/6 = 33.3% through document_storage stage
-        # document_storage is 10-30%, so 33.3% of 20% = 6.67%, so 10 + 6.67 = 16.67 ≈ 17
-        assert result == 17
+        # 10 out of 10 = 100%
+        progress = mapper.calculate_stage_progress(10, 10)
+        assert progress == 100.0

-    def test_map_with_substage(self, progress_mapper):
-        """Test progress mapping with substage information."""
-        # For now, this should work the same as regular mapping
-        result = progress_mapper.map_with_substage("document_storage", "embeddings", 50.0)
-        expected = progress_mapper.map_progress("document_storage", 50.0)
-        assert result == expected
+        # Handle division by zero
+        progress = mapper.calculate_stage_progress(5, 0)
+        assert progress == 0.0
+
+    def test_map_batch_progress(self):
+        """Test batch progress mapping"""
+        mapper = ProgressMapper()
+
+        # Batch 1 of 5 in document_storage stage
+        progress = mapper.map_batch_progress("document_storage", 1, 5)
+        assert progress == 25  # Start of document_storage range (25-40)
+
+        # Batch 3 of 5
+        progress = mapper.map_batch_progress("document_storage", 3, 5)
+        assert progress == 31  # 40% through 25-40 range
+
+        # Batch 5 of 5
+        progress = mapper.map_batch_progress("document_storage", 5, 5)
+        assert progress == 37  # 80% through 25-40 range
+
+    def test_map_with_substage(self):
+        """Test mapping with substage information"""
+        mapper = ProgressMapper()
+
+        # Currently just uses main stage
+        progress = mapper.map_with_substage("document_storage", "embeddings", 50)
+        assert progress == 32  # 50% of 25-40 range = 32.5 -> 32
+
+    def test_reset(self):
+        """Test resetting the mapper"""
+        mapper = ProgressMapper()

-    def test_reset_functionality(self, progress_mapper):
-        """Test that reset() clears state."""
        # Set some progress
-        progress_mapper.map_progress("crawling", 50)
-        assert progress_mapper.last_overall_progress > 0
-        assert progress_mapper.current_stage != "starting"
+        mapper.map_progress("document_storage", 50)
+        assert mapper.last_overall_progress == 32  # 25 + (50% of 15) = 32.5 -> 32
+        assert mapper.current_stage == "document_storage"

        # Reset
-        progress_mapper.reset()
+        mapper.reset()
+        assert mapper.last_overall_progress == 0
+        assert mapper.current_stage == "starting"

-        # Should be back to initial state
-        assert progress_mapper.last_overall_progress == 0
-        assert progress_mapper.current_stage == "starting"
+    def test_get_current_stage(self):
+        """Test getting current stage"""
+        mapper = ProgressMapper()

-    def test_get_current_stage_and_progress(self, progress_mapper):
-        """Test getting current stage and progress."""
-        # Initial state
-        assert progress_mapper.get_current_stage() == "starting"
-        assert progress_mapper.get_current_progress() == 0
+        assert mapper.get_current_stage() == "starting"

-        # After mapping some progress
-        progress_mapper.map_progress("document_storage", 50)
-        assert progress_mapper.get_current_stage() == "document_storage"
-        assert progress_mapper.get_current_progress() == 20  # 50% of 10-30% range
+        mapper.map_progress("crawling", 50)
+        assert mapper.get_current_stage() == "crawling"

-    def test_realistic_crawl_sequence(self, progress_mapper):
-        """Test a realistic sequence of crawl progress updates."""
-        stages = [
-            ("starting", 0, 0),
-            ("analyzing", 100, 2),
-            ("crawling", 100, 5),
-            ("processing", 100, 8),
-            ("source_creation", 100, 10),
-            ("document_storage", 25, 15),  # 25% of storage
-            ("document_storage", 50, 20),  # 50% of storage
-            ("document_storage", 75, 25),  # 75% of storage
-            ("document_storage", 100, 30), # Complete storage
-            ("code_extraction", 25, 46),   # 25% of extraction
-            ("code_extraction", 50, 62),   # 50% of extraction
-            ("code_extraction", 100, 95),  # Complete extraction
-            ("finalization", 100, 100),    # Finalization
-            ("completed", 0, 100),         # Completion
-        ]
+        mapper.map_progress("code_extraction", 50)
+        assert mapper.get_current_stage() == "code_extraction"

-        progress_mapper.reset()
+    def test_get_current_progress(self):
+        """Test getting current progress"""
+        mapper = ProgressMapper()

-        for stage, stage_progress, expected_overall in stages:
-            result = progress_mapper.map_progress(stage, stage_progress)
-            assert result == expected_overall, f"Stage {stage} at {stage_progress}% should map to {expected_overall}%, got {result}%"
+        assert mapper.get_current_progress() == 0

-    def test_upload_stage_ranges(self, progress_mapper):
-        """Test upload-specific stage ranges."""
-        upload_stages = ["reading", "extracting", "chunking", "creating_source", "summarizing", "storing"]
+        mapper.map_progress("crawling", 50)
+        assert mapper.get_current_progress() == 9  # 4 + (50% of 11) = 9.5 -> 9

-        # Test that upload stages have valid ranges
-        last_end = 0
-        for stage in upload_stages:
-            start, end = progress_mapper.get_stage_range(stage)
-            assert start >= last_end, f"Upload stage {stage} overlaps with previous"
-            assert end > start, f"Upload stage {stage} has invalid range"
-            last_end = end
+        mapper.map_progress("code_extraction", 50)
+        assert mapper.get_current_progress() == 65  # 40 + (50% of 50) = 65

-        # Test that final upload stage reaches 100%
-        assert progress_mapper.get_stage_range("storing")[1] == 100
+    def test_get_stage_range(self):
+        """Test getting stage range"""
+        mapper = ProgressMapper()
+
+        assert mapper.get_stage_range("starting") == (0, 1)
+        assert mapper.get_stage_range("discovery") == (3, 4)  # Test discovery stage
+        assert mapper.get_stage_range("code_extraction") == (40, 90)
+        assert mapper.get_stage_range("unknown") == (0, 100)  # Default range
+
+    def test_realistic_crawl_sequence_with_discovery(self):
+        """Test a realistic crawl progress sequence including discovery"""
+        mapper = ProgressMapper()
+
+        # Starting
+        assert mapper.map_progress("starting", 0) == 0
+        assert mapper.map_progress("starting", 100) == 1
+
+        # Analyzing
+        assert mapper.map_progress("analyzing", 0) == 1
+        assert mapper.map_progress("analyzing", 100) == 3
+
+        # Discovery (NEW)
+        assert mapper.map_progress("discovery", 0) == 3
+        assert mapper.map_progress("discovery", 50) == 3  # 3 + (50% of 1) = 3.5 -> 3
+        assert mapper.map_progress("discovery", 100) == 4
+
+        # Crawling
+        assert mapper.map_progress("crawling", 0) == 4
+        assert mapper.map_progress("crawling", 33) == 7  # 4 + (33% of 11) = 7.63 -> 7 (this assertion expects truncation, not rounding)
+        progress_crawl_66 = mapper.map_progress("crawling", 66)
+        assert progress_crawl_66 in [11, 12]  # 4 + (66% of 11) = 11.26, could round to 11 or 12
+        assert mapper.map_progress("crawling", 100) == 15
+
+        # Processing
+        assert mapper.map_progress("processing", 0) == 15
+        assert mapper.map_progress("processing", 100) == 20
+
+        # Source creation
+        assert mapper.map_progress("source_creation", 0) == 20
+        assert mapper.map_progress("source_creation", 100) == 25
+
+        # Document storage
+        assert mapper.map_progress("document_storage", 0) == 25
+        assert mapper.map_progress("document_storage", 50) == 32  # 25 + (50% of 15) = 32.5 -> 32
+        assert mapper.map_progress("document_storage", 100) == 40
+
+        # Code extraction (longest phase)
+        assert mapper.map_progress("code_extraction", 0) == 40
+        progress_25 = mapper.map_progress("code_extraction", 25)
+        assert progress_25 in [52, 53]  # 40 + (25% of 50) = 52.5, could round to 52 or 53
+        assert mapper.map_progress("code_extraction", 50) == 65  # 40 + (50% of 50) = 65
+        progress_75 = mapper.map_progress("code_extraction", 75)
+        assert progress_75 in [77, 78]  # 40 + (75% of 50) = 77.5, could round to 77 or 78
+        assert mapper.map_progress("code_extraction", 100) == 90
+
+        # Finalization
+        assert mapper.map_progress("finalization", 0) == 90
+        assert mapper.map_progress("finalization", 100) == 100
+
+        # Completed
+        assert mapper.map_progress("completed", 0) == 100
+
+    def test_aliases_work_correctly(self):
+        """Test that stage aliases work correctly"""
+        mapper = ProgressMapper()
+
+        # Test code_storage alias for code_extraction
+        progress1 = mapper.map_progress("code_extraction", 50)
+        mapper2 = ProgressMapper()
+        progress2 = mapper2.map_progress("code_storage", 50)
+        assert progress1 == progress2
+
+        # Test extracting alias for code_extraction
+        mapper3 = ProgressMapper()
+        progress3 = mapper3.map_progress("extracting", 50)
+        assert progress1 == progress3
+
+        # Test complete alias for completed
+        mapper4 = ProgressMapper()
+        progress4 = mapper4.map_progress("complete", 0)
+        assert progress4 == 100
--- a/python/tests/progress_tracking/test_progress_models.py
+++ b/python/tests/progress_tracking/test_progress_models.py
@@ -4,12 +4,12 @@ import pytest
 from pydantic import ValidationError

 from src.server.models.progress_models import (
-    ProgressDetails,
    BaseProgressResponse,
    CrawlProgressResponse,
-    UploadProgressResponse,
+    ProgressDetails,
    ProjectCreationProgressResponse,
-    create_progress_response
+    UploadProgressResponse,
+    create_progress_response,
 )


@@ -25,7 +25,7 @@ class TestProgressDetails:
            total_batches=6,
            chunks_per_second=5.5
        )
-        
+
        assert details.current_chunk == 25
        assert details.total_chunks == 100
        assert details.current_batch == 3
@@ -41,7 +41,7 @@ class TestProgressDetails:
            totalBatches=6,
            chunksPerSecond=5.5
        )
-        
+
        assert details.current_chunk == 25
        assert details.total_chunks == 100
        assert details.current_batch == 3
@@ -55,9 +55,9 @@ class TestProgressDetails:
            total_chunks=100,
            chunks_per_second=2.5
        )
-        
+
        data = details.model_dump(by_alias=True)
-        
+
        assert "currentChunk" in data
        assert "totalChunks" in data
        assert "chunksPerSecond" in data
@@ -76,9 +76,9 @@ class TestBaseProgressResponse:
            progress=50.0,
            message="Processing..."
        )
-        
+
        assert response.progress_id == "test-123"
-        assert response.status == "running" 
+        assert response.status == "running"
        assert response.progress == 50.0
        assert response.message == "Processing..."

@@ -91,15 +91,15 @@ class TestBaseProgressResponse:
            progress=50.0
        )
        assert response.progress == 50.0
-        
+
        # Invalid progress - too high
        with pytest.raises(ValidationError):
            BaseProgressResponse(
                progress_id="test-123",
-                status="running", 
+                status="running",
                progress=150.0
            )
-        
+
        # Invalid progress - too low
        with pytest.raises(ValidationError):
            BaseProgressResponse(
@@ -118,7 +118,7 @@ class TestBaseProgressResponse:
            logs=["Starting", "Processing", "Almost done"]
        )
        assert response.logs == ["Starting", "Processing", "Almost done"]
-        
+
        # Test with single string
        response = BaseProgressResponse(
            progress_id="test-123",
@@ -127,7 +127,7 @@ class TestBaseProgressResponse:
            logs="Single log message"
        )
        assert response.logs == ["Single log message"]
-        
+
        # Test with list of dicts (log entries)
        response = BaseProgressResponse(
            progress_id="test-123",
@@ -149,7 +149,7 @@ class TestBaseProgressResponse:
            currentStep="processing",  # camelCase
            stepMessage="Working on it"  # camelCase
        )
-        
+
        assert response.progress_id == "test-123"
        assert response.current_step == "processing"
        assert response.step_message == "Working on it"
@@ -162,7 +162,7 @@ class TestCrawlProgressResponse:
        """Test creating crawl response with batch processing information."""
        response = CrawlProgressResponse(
            progress_id="crawl-123",
-            status="document_storage", 
+            status="document_storage",
            progress=45.0,
            message="Processing batch 3/6",
            total_pages=60,
@@ -173,7 +173,7 @@ class TestCrawlProgressResponse:
            chunks_in_batch=25,
            active_workers=4
        )
-        
+
        assert response.progress_id == "crawl-123"
        assert response.status == "document_storage"
        assert response.current_batch == 3
@@ -195,7 +195,7 @@ class TestCrawlProgressResponse:
            completed_summaries=30,
            total_summaries=40
        )
-        
+
        assert response.code_blocks_found == 150
        assert response.code_examples_stored == 120
        assert response.completed_documents == 45
@@ -207,10 +207,10 @@ class TestCrawlProgressResponse:
        """Test that only valid crawl statuses are accepted."""
        valid_statuses = [
            "starting", "analyzing", "crawling", "processing",
-            "source_creation", "document_storage", "code_extraction", 
-            "finalization", "completed", "failed", "cancelled"
+            "source_creation", "document_storage", "code_extraction", "code_storage",
+            "finalization", "completed", "failed", "cancelled", "stopping", "error"
        ]
-        
+
        for status in valid_statuses:
            response = CrawlProgressResponse(
                progress_id="test-123",
@@ -218,7 +218,7 @@ class TestCrawlProgressResponse:
                progress=50.0
            )
            assert response.status == status
-        
+
        # Invalid status should raise validation error
        with pytest.raises(ValidationError):
            CrawlProgressResponse(
@@ -240,7 +240,7 @@ class TestCrawlProgressResponse:
            totalBatches=6,  # camelCase
            currentBatch=3  # camelCase
        )
-        
+
        assert response.current_url == "https://example.com/page1"
        assert response.total_pages == 100
        assert response.processed_pages == 50
@@ -258,16 +258,16 @@ class TestCrawlProgressResponse:
            duration=123.45
        )
        assert response.duration == "123.45"
-        
+
        # Test with int
        response = CrawlProgressResponse(
            progress_id="test-123",
-            status="completed", 
+            status="completed",
            progress=100.0,
            duration=120
        )
        assert response.duration == "120"
-        
+
        # Test with None
        response = CrawlProgressResponse(
            progress_id="test-123",
@@ -293,7 +293,7 @@ class TestUploadProgressResponse:
            chunks_stored=400,
            word_count=5000
        )
-        
+
        assert response.progress_id == "upload-123"
        assert response.status == "storing"
        assert response.upload_type == "document"
@@ -305,11 +305,11 @@ class TestUploadProgressResponse:
    def test_upload_status_validation(self):
        """Test upload status validation."""
        valid_statuses = [
-            "starting", "reading", "extracting", "chunking",
-            "creating_source", "summarizing", "storing",
-            "completed", "failed", "cancelled"
+            "starting", "reading", "text_extraction", "chunking",
+            "source_creation", "summarizing", "storing",
+            "completed", "failed", "cancelled", "error"
        ]
-        
+
        for status in valid_statuses:
            response = UploadProgressResponse(
                progress_id="test-123",
@@ -319,6 +319,33 @@ class TestUploadProgressResponse:
            assert response.status == status


+class TestProjectCreationProgressResponse:
+    """Validation tests for the ProjectCreationProgressResponse model."""
+
+    def test_project_creation_status_validation(self):
+        """Accept every known project-creation status and reject an unknown one."""
+        accepted = [
+            "starting", "analyzing", "generating_prp", "creating_tasks",
+            "organizing", "completed", "failed", "error",
+        ]
+
+        # Each accepted status must come back from the model unchanged.
+        for candidate in accepted:
+            built = ProjectCreationProgressResponse(
+                progress_id="test-123", status=candidate, progress=50.0
+            )
+            assert built.status == candidate
+
+        # A status outside the accepted list must fail model validation.
+        rejected = {
+            "progress_id": "test-123",
+            "status": "invalid_status",
+            "progress": 50.0,
+        }
+        with pytest.raises(ValidationError):
+            ProjectCreationProgressResponse(**rejected)
+
+
 class TestProgressResponseFactory:
    """Test cases for create_progress_response factory function."""

@@ -334,9 +361,9 @@ class TestProgressResponseFactory:
            "total_pages": 60,
            "processed_pages": 60
        }
-        
+
        response = create_progress_response("crawl", progress_data)
-        
+
        assert isinstance(response, CrawlProgressResponse)
        assert response.progress_id == "crawl-123"
        assert response.status == "document_storage"
@@ -353,9 +380,9 @@ class TestProgressResponseFactory:
            "file_name": "document.pdf",
            "chunks_stored": 300
        }
-        
+
        response = create_progress_response("upload", progress_data)
-        
+
        assert isinstance(response, UploadProgressResponse)
        assert response.progress_id == "upload-123"
        assert response.status == "storing"
@@ -374,9 +401,9 @@ class TestProgressResponseFactory:
            "total_chunks": 300,
            "chunks_per_second": 5.5
        }
-        
+
        response = create_progress_response("crawl", progress_data)
-        
+
        assert response.details is not None
        assert response.details.current_batch == 3
        assert response.details.total_batches == 6
@@ -391,16 +418,16 @@ class TestProgressResponseFactory:
            "progress_id": "test-123",
            "progress": 50
        }
-        
+
        response = create_progress_response("crawl", progress_data)
-        assert response.status == "running"  # Default
-        
+        assert response.status == "starting"  # Default
+
        # Missing progress
        progress_data = {
            "progress_id": "test-123",
            "status": "processing"
        }
-        
+
        response = create_progress_response("crawl", progress_data)
        assert response.progress == 0  # Default

@@ -411,7 +438,7 @@ class TestProgressResponseFactory:
            "status": "processing",
            "progress": 50
        }
-        
+
        response = create_progress_response("unknown_type", progress_data)
        assert isinstance(response, BaseProgressResponse)
        assert not isinstance(response, CrawlProgressResponse)
@@ -420,13 +447,13 @@ class TestProgressResponseFactory:
        """Test that factory falls back to base response on validation errors."""
        # Create invalid data that would fail CrawlProgressResponse validation
        progress_data = {
-            "progress_id": "test-123", 
+            "progress_id": "test-123",
            "status": "invalid_crawl_status",  # Invalid status
            "progress": 50
        }
-        
+
        response = create_progress_response("crawl", progress_data)
-        
+
        # Should fall back to BaseProgressResponse
        assert isinstance(response, BaseProgressResponse)
-        assert response.progress_id == "test-123"
+        assert response.progress_id == "test-123"
--- a/python/tests/progress_tracking/test_progress_tracker.py
+++ b/python/tests/progress_tracking/test_progress_tracker.py
@@ -1,226 +1,226 @@
-"""Unit tests for the ProgressTracker class."""
+"""
+Tests for ProgressTracker
+"""

 import pytest
 from datetime import datetime
-from unittest.mock import patch

-from src.server.utils.progress.progress_tracker import ProgressTracker
+from src.server.utils.progress import ProgressTracker


 class TestProgressTracker:
-    """Test cases for ProgressTracker functionality."""
+    """Test suite for ProgressTracker"""

-    @pytest.fixture
-    def progress_tracker(self):
-        """Create a fresh ProgressTracker for each test."""
-        return ProgressTracker("test-progress-id", "crawl")
-
-    def test_init_creates_initial_state(self, progress_tracker):
-        """Test that initialization creates correct initial state."""
-        assert progress_tracker.progress_id == "test-progress-id"
-        assert progress_tracker.operation_type == "crawl"
-        assert progress_tracker.state["progress_id"] == "test-progress-id"
-        assert progress_tracker.state["type"] == "crawl"
-        assert progress_tracker.state["status"] == "initializing"
-        assert progress_tracker.state["progress"] == 0
-        assert isinstance(progress_tracker.state["logs"], list)
-        assert len(progress_tracker.state["logs"]) == 0
-
-    def test_get_progress_returns_state(self, progress_tracker):
-        """Test that get_progress returns the correct state."""
-        state = ProgressTracker.get_progress("test-progress-id")
-        assert state is not None
-        assert state["progress_id"] == "test-progress-id"
-        assert state["type"] == "crawl"
-
-    def test_clear_progress_removes_state(self, progress_tracker):
-        """Test that clear_progress removes the state from memory."""
-        # Verify state exists
-        assert ProgressTracker.get_progress("test-progress-id") is not None
+    def test_initialization(self):
+        """Test ProgressTracker initialization"""
+        progress_id = "test-123"
+        tracker = ProgressTracker(progress_id, operation_type="crawl")
        
-        # Clear progress
-        ProgressTracker.clear_progress("test-progress-id")
+        assert tracker.progress_id == progress_id
+        assert tracker.operation_type == "crawl"
+        assert tracker.state["status"] == "initializing"
+        assert tracker.state["progress"] == 0
+        assert "start_time" in tracker.state
+        
+    def test_get_progress(self):
+        """Verify that a tracker's state can be looked up by its progress ID."""
+        progress_id = "test-456"
+        ProgressTracker(progress_id, operation_type="upload")  # created only to register state
+
+        # The lookup should expose the ID and operation type given at creation.
+        retrieved = ProgressTracker.get_progress(progress_id)
+        assert retrieved is not None
+        assert retrieved["progress_id"] == progress_id
+        assert retrieved["type"] == "upload"
+        
+    def test_clear_progress(self):
+        """Verify that clear_progress() removes a tracker's stored state."""
+        tracked_id = "test-789"
+        ProgressTracker(tracked_id, operation_type="crawl")
+
+        # Creating the tracker must make its state retrievable by ID.
+        state_before = ProgressTracker.get_progress(tracked_id)
+        assert state_before is not None
+
+        # After clearing, the same lookup must come back empty.
+        ProgressTracker.clear_progress(tracked_id)
+        state_after = ProgressTracker.get_progress(tracked_id)
+        assert state_after is None
        
-        # Verify state is gone
-        assert ProgressTracker.get_progress("test-progress-id") is None
-
    @pytest.mark.asyncio
-    async def test_start_updates_status_and_time(self, progress_tracker):
-        """Test that start() updates status and start time."""
-        initial_data = {"test_key": "test_value"}
+    async def test_start(self):
+        """Test starting progress tracking"""
+        tracker = ProgressTracker("test-start", operation_type="crawl")
        
-        await progress_tracker.start(initial_data)
+        initial_data = {
+            "url": "https://example.com",
+            "crawl_type": "normal"
+        }
+        
+        await tracker.start(initial_data)
+        
+        assert tracker.state["status"] == "starting"
+        assert tracker.state["url"] == "https://example.com"
+        assert tracker.state["crawl_type"] == "normal"
        
-        assert progress_tracker.state["status"] == "starting"
-        assert "start_time" in progress_tracker.state
-        assert progress_tracker.state["test_key"] == "test_value"
-
    @pytest.mark.asyncio
-    async def test_update_progress_and_logs(self, progress_tracker):
-        """Test that update() correctly updates progress and adds logs."""
-        await progress_tracker.update(
+    async def test_update(self):
+        """Test updating progress"""
+        tracker = ProgressTracker("test-update", operation_type="crawl")
+        
+        await tracker.update(
            status="crawling",
-            progress=25,
-            log="Processing page 5/20",
-            total_pages=20,
-            processed_pages=5
+            progress=50,
+            log="Processing page 5/10",
+            current_url="https://example.com/page5"
        )
        
-        assert progress_tracker.state["status"] == "crawling"
-        assert progress_tracker.state["progress"] == 25
-        assert progress_tracker.state["log"] == "Processing page 5/20"
-        assert progress_tracker.state["total_pages"] == 20
-        assert progress_tracker.state["processed_pages"] == 5
+        assert tracker.state["status"] == "crawling"
+        assert tracker.state["progress"] == 50
+        assert tracker.state["log"] == "Processing page 5/10"
+        assert tracker.state["current_url"] == "https://example.com/page5"
+        assert len(tracker.state["logs"]) == 1
        
-        # Check log entry was added
-        assert len(progress_tracker.state["logs"]) == 1
-        log_entry = progress_tracker.state["logs"][0]
-        assert log_entry["message"] == "Processing page 5/20"
-        assert log_entry["status"] == "crawling"
-        assert log_entry["progress"] == 25
-
    @pytest.mark.asyncio
-    async def test_progress_never_goes_backwards(self, progress_tracker):
-        """Test that progress values cannot decrease."""
-        # Set initial progress
-        await progress_tracker.update("crawling", 50, "Halfway done")
-        assert progress_tracker.state["progress"] == 50
+    async def test_progress_never_goes_backwards(self):
+        """Test that progress never decreases"""
+        tracker = ProgressTracker("test-backwards", operation_type="crawl")
        
-        # Try to set lower progress
-        await progress_tracker.update("crawling", 30, "Should not decrease")
+        # Set progress to 50%
+        await tracker.update(status="crawling", progress=50, log="Half way")
+        assert tracker.state["progress"] == 50
+        
+        # Try to set it to 30% - should stay at 50%
+        await tracker.update(status="crawling", progress=30, log="Should not go back")
+        assert tracker.state["progress"] == 50  # Should not decrease
+        
+        # Can increase to 70%
+        await tracker.update(status="crawling", progress=70, log="Moving forward")
+        assert tracker.state["progress"] == 70
        
-        # Progress should remain at 50
-        assert progress_tracker.state["progress"] == 50
-        # But status and message should update
-        assert progress_tracker.state["log"] == "Should not decrease"
-
    @pytest.mark.asyncio
-    async def test_progress_clamped_to_0_100(self, progress_tracker):
-        """Test that progress values are clamped to 0-100 range."""
-        # Test negative progress
-        await progress_tracker.update("starting", -10, "Negative progress")
-        assert progress_tracker.state["progress"] == 0
+    async def test_complete(self):
+        """Verify that complete() finishes at 100% and merges the payload into state."""
+        tracker = ProgressTracker("test-complete", operation_type="crawl")
+        completion_payload = {
+            "chunks_stored": 100,
+            "source_id": "source-123",
+            "log": "Crawl completed successfully",
+        }
+        await tracker.complete(completion_payload)
+
+        assert tracker.state["status"] == "completed"
+        assert tracker.state["progress"] == 100
+        assert tracker.state["chunks_stored"] == 100
+        assert tracker.state["source_id"] == "source-123"
+        assert "end_time" in tracker.state
+        assert "duration" in tracker.state
        
-        # Test progress over 100
-        await progress_tracker.update("running", 150, "Over 100 progress")
-        assert progress_tracker.state["progress"] == 100
-
    @pytest.mark.asyncio
-    async def test_complete_sets_100_percent_and_duration(self, progress_tracker):
-        """Test that complete() sets progress to 100% and calculates duration."""
-        completion_data = {"chunks_stored": 500, "word_count": 10000}
+    async def test_error(self):
+        """Test marking progress as error"""
+        tracker = ProgressTracker("test-error", operation_type="crawl")
        
-        await progress_tracker.complete(completion_data)
-        
-        assert progress_tracker.state["status"] == "completed"
-        assert progress_tracker.state["progress"] == 100
-        assert progress_tracker.state["chunks_stored"] == 500
-        assert progress_tracker.state["word_count"] == 10000
-        assert "end_time" in progress_tracker.state
-        assert "duration" in progress_tracker.state
-        assert "duration_formatted" in progress_tracker.state
-
-    @pytest.mark.asyncio
-    async def test_error_sets_error_status(self, progress_tracker):
-        """Test that error() sets error status and details."""
-        error_details = {"error_code": 500, "component": "embedding_service"}
-        
-        await progress_tracker.error("Failed to create embeddings", error_details)
-        
-        assert progress_tracker.state["status"] == "error"
-        assert progress_tracker.state["error"] == "Failed to create embeddings"
-        assert progress_tracker.state["error_details"]["error_code"] == 500
-        assert "error_time" in progress_tracker.state
-
-    @pytest.mark.asyncio
-    async def test_update_batch_progress(self, progress_tracker):
-        """Test batch progress calculation and updates."""
-        await progress_tracker.update_batch_progress(
-            current_batch=3,
-            total_batches=6,
-            batch_size=25,
-            message="Processing batch 3 of 6"
+        await tracker.error(
+            "Failed to connect to URL",
+            error_details={"code": 404, "url": "https://example.com"}
        )
        
-        expected_progress = int((3 / 6) * 100)  # 50%
-        assert progress_tracker.state["progress"] == expected_progress
-        assert progress_tracker.state["status"] == "processing_batch"
-        assert progress_tracker.state["current_batch"] == 3
-        assert progress_tracker.state["total_batches"] == 6
-        assert progress_tracker.state["batch_size"] == 25
-
+        assert tracker.state["status"] == "error"
+        assert tracker.state["error"] == "Failed to connect to URL"
+        assert tracker.state["error_details"]["code"] == 404
+        assert "error_time" in tracker.state
+        
    @pytest.mark.asyncio
-    async def test_update_crawl_stats(self, progress_tracker):
-        """Test crawling statistics updates."""
-        await progress_tracker.update_crawl_stats(
-            processed_pages=15,
-            total_pages=30,
-            current_url="https://example.com/page15"
+    async def test_update_crawl_stats(self):
+        """Test updating crawl statistics"""
+        tracker = ProgressTracker("test-crawl-stats", operation_type="crawl")
+        
+        await tracker.update_crawl_stats(
+            processed_pages=5,
+            total_pages=10,
+            current_url="https://example.com/page5",
+            pages_found=15
        )
        
-        expected_progress = int((15 / 30) * 100)  # 50%
-        assert progress_tracker.state["progress"] == expected_progress
-        assert progress_tracker.state["status"] == "crawling"
-        assert progress_tracker.state["processed_pages"] == 15
-        assert progress_tracker.state["total_pages"] == 30
-        assert progress_tracker.state["current_url"] == "https://example.com/page15"
-        assert "Processing page 15/30: https://example.com/page15" in progress_tracker.state["log"]
-
+        assert tracker.state["status"] == "crawling"
+        assert tracker.state["progress"] == 50  # 5/10 = 50%
+        assert tracker.state["processed_pages"] == 5
+        assert tracker.state["total_pages"] == 10
+        assert tracker.state["current_url"] == "https://example.com/page5"
+        assert tracker.state["pages_found"] == 15
+        
    @pytest.mark.asyncio
-    async def test_update_storage_progress(self, progress_tracker):
-        """Test document storage progress updates."""
-        await progress_tracker.update_storage_progress(
-            chunks_stored=75,
+    async def test_update_storage_progress(self):
+        """Test updating storage progress"""
+        tracker = ProgressTracker("test-storage", operation_type="crawl")
+        
+        await tracker.update_storage_progress(
+            chunks_stored=25,
            total_chunks=100,
-            operation="storing embeddings"
+            operation="Storing embeddings",
+            word_count=5000,
+            embeddings_created=25
        )
        
-        expected_progress = int((75 / 100) * 100)  # 75%
-        assert progress_tracker.state["progress"] == expected_progress
-        assert progress_tracker.state["status"] == "document_storage"
-        assert progress_tracker.state["chunks_stored"] == 75
-        assert progress_tracker.state["total_chunks"] == 100
-        assert "storing embeddings: 75/100 chunks" in progress_tracker.state["log"]
-
-    def test_format_duration(self, progress_tracker):
-        """Test duration formatting for different time ranges."""
-        # Test seconds
-        formatted = progress_tracker._format_duration(45.5)
-        assert "45.5 seconds" in formatted
+        assert tracker.state["status"] == "document_storage"
+        assert tracker.state["progress"] == 25  # 25/100 = 25%
+        assert tracker.state["chunks_stored"] == 25
+        assert tracker.state["total_chunks"] == 100
+        assert tracker.state["word_count"] == 5000
+        assert tracker.state["embeddings_created"] == 25
        
-        # Test minutes
-        formatted = progress_tracker._format_duration(125.0)
-        assert "2.1 minutes" in formatted
+    @pytest.mark.asyncio
+    async def test_update_code_extraction_progress(self):
+        """Test updating code extraction progress"""
+        tracker = ProgressTracker("test-code", operation_type="crawl")
        
-        # Test hours
-        formatted = progress_tracker._format_duration(7200.0)
-        assert "2.0 hours" in formatted
-
-    def test_get_state_returns_copy(self, progress_tracker):
-        """Test that get_state returns a copy, not the original state."""
-        state_copy = progress_tracker.get_state()
+        await tracker.update_code_extraction_progress(
+            completed_summaries=3,
+            total_summaries=10,
+            code_blocks_found=15,
+            current_file="main.py"
+        )
        
-        # Modify the copy
-        state_copy["test_modification"] = "should not affect original"
+        assert tracker.state["status"] == "code_extraction"
+        assert tracker.state["progress"] == 30  # 3/10 = 30%
+        assert tracker.state["completed_summaries"] == 3
+        assert tracker.state["total_summaries"] == 10
+        assert tracker.state["code_blocks_found"] == 15
+        assert tracker.state["current_file"] == "main.py"
        
-        # Original state should be unchanged
-        assert "test_modification" not in progress_tracker.state
-
-    def test_multiple_trackers_independent(self):
-        """Test that multiple trackers maintain independent state."""
-        tracker1 = ProgressTracker("id-1", "crawl")
-        tracker2 = ProgressTracker("id-2", "upload")
+    @pytest.mark.asyncio
+    async def test_update_batch_progress(self):
+        """Test updating batch progress"""
+        tracker = ProgressTracker("test-batch", operation_type="upload")
        
-        # Verify they have different states
-        assert tracker1.progress_id != tracker2.progress_id
-        assert tracker1.state["progress_id"] != tracker2.state["progress_id"]
-        assert tracker1.state["type"] != tracker2.state["type"]
+        await tracker.update_batch_progress(
+            current_batch=3,
+            total_batches=5,
+            batch_size=100,
+            message="Processing batch 3 of 5"
+        )
        
-        # Verify they can be retrieved independently
-        state1 = ProgressTracker.get_progress("id-1")
-        state2 = ProgressTracker.get_progress("id-2")
+        assert tracker.state["status"] == "processing_batch"
+        assert tracker.state["progress"] == 60  # 3/5 = 60%
+        assert tracker.state["current_batch"] == 3
+        assert tracker.state["total_batches"] == 5
+        assert tracker.state["batch_size"] == 100
        
-        assert state1["progress_id"] == "id-1"
-        assert state2["progress_id"] == "id-2"
-        assert state1["type"] == "crawl"
-        assert state2["type"] == "upload"
+    def test_multiple_trackers(self):
+        """Verify that trackers with different IDs keep independent state."""
+        ProgressTracker("tracker-1", operation_type="crawl")
+        ProgressTracker("tracker-2", operation_type="upload")
+
+        # Creating the trackers registers both; each must be retrievable by its own ID
+        assert ProgressTracker.get_progress("tracker-1") is not None
+        assert ProgressTracker.get_progress("tracker-2") is not None
+
+        # They should have different types
+        assert ProgressTracker.get_progress("tracker-1")["type"] == "crawl"
+        assert ProgressTracker.get_progress("tracker-2")["type"] == "upload"
+
+        # Clearing one shouldn't affect the other
+        ProgressTracker.clear_progress("tracker-1")
+        assert ProgressTracker.get_progress("tracker-1") is None
+        assert ProgressTracker.get_progress("tracker-2") is not None