fix: Improve path detection and add progress validation

- Replace dot-based file detection with explicit extension checking in the discovery service to correctly handle versioned directories like /docs.v2
- Add comprehensive validation for the start_progress and end_progress parameters in crawl_markdown_file to ensure they are valid numeric values in the range [0, 100] with start_progress < end_progress
- Validation runs before any async work or progress reporting begins
- Clear error messages indicate which parameter is invalid and why

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-30 13:39:44 -05:00 · 2025-10-17 22:57:55 +02:00
parent cdf4323534
commit 8ab6c754fe
2 changed files with 44 additions and 5 deletions
--- a/python/src/server/services/crawling/discovery_service.py
+++ b/python/src/server/services/crawling/discovery_service.py
@@ -142,10 +142,23 @@ class DiscoveryService:
            # Get the directory path of the base URL
            parsed = urlparse(base_url)
            base_path = parsed.path.rstrip('/')
+
+            # Known file extensions - only treat as file if last segment has one of these
+            FILE_EXTENSIONS = {
+                '.html', '.htm', '.xml', '.json', '.txt', '.md', '.csv',
+                '.rss', '.yaml', '.yml', '.pdf', '.zip'
+            }
+
            # Extract directory (remove filename if present)
-            if '.' in base_path.split('/')[-1]:
+            last_segment = base_path.split('/')[-1] if base_path else ''
+            # Check if the last segment ends with a known file extension
+            has_file_extension = any(last_segment.lower().endswith(ext) for ext in FILE_EXTENSIONS)
+
+            if has_file_extension:
+                # Last segment is a file, strip it to get directory
                base_dir = '/'.join(base_path.split('/')[:-1])
            else:
+                # Last segment is a directory (e.g., /docs.v2)
                base_dir = base_path

            # Phase 1: Check llms files at ALL priority levels before checking sitemaps
--- a/python/src/server/services/crawling/strategies/single_page.py
+++ b/python/src/server/services/crawling/strategies/single_page.py
@@ -229,17 +229,43 @@ class SinglePageCrawlStrategy:
    ) -> list[dict[str, Any]]:
        """
        Crawl a .txt or markdown file with comprehensive error handling and progress reporting.
-        
+
        Args:
            url: URL of the text/markdown file
            transform_url_func: Function to transform URLs (e.g., GitHub URLs)
            progress_callback: Optional callback for progress updates
-            start_progress: Starting progress percentage
-            end_progress: Ending progress percentage
-            
+            start_progress: Starting progress percentage (must be 0-100)
+            end_progress: Ending progress percentage (must be 0-100 and > start_progress)
+
        Returns:
            List containing the crawled document
+
+        Raises:
+            ValueError: If start_progress or end_progress are invalid
        """
+        # Validate progress parameters before any async work or progress reporting
+        if not isinstance(start_progress, (int, float)) or not isinstance(end_progress, (int, float)):
+            raise ValueError(
+                f"start_progress and end_progress must be int or float, "
+                f"got start_progress={type(start_progress).__name__}, end_progress={type(end_progress).__name__}"
+            )
+
+        if not (0 <= start_progress <= 100):
+            raise ValueError(
+                f"start_progress must be in range [0, 100], got {start_progress}"
+            )
+
+        if not (0 <= end_progress <= 100):
+            raise ValueError(
+                f"end_progress must be in range [0, 100], got {end_progress}"
+            )
+
+        if start_progress >= end_progress:
+            raise ValueError(
+                f"start_progress must be less than end_progress, "
+                f"got start_progress={start_progress}, end_progress={end_progress}"
+            )
+
        try:
            # Transform GitHub URLs to raw content URLs if applicable
            original_url = url