From 8ab6c754fe964e158f7ea93bb4ed59fc8e7d5a6d Mon Sep 17 00:00:00 2001
From: leex279
Date: Fri, 17 Oct 2025 22:57:55 +0200
Subject: [PATCH] fix: Improve path detection and add progress validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace dot-based file detection with explicit extension checking in
  discovery service to correctly handle versioned directories like /docs.v2
- Add comprehensive validation for start_progress and end_progress
  parameters in crawl_markdown_file to ensure they are valid numeric
  values in range [0, 100] with start < end
- Validation runs before any async work or progress reporting begins
- Clear error messages indicate which parameter is invalid and why

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../services/crawling/discovery_service.py   | 15 +++++++-
 .../crawling/strategies/single_page.py       | 34 ++++++++++++++++---
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/python/src/server/services/crawling/discovery_service.py b/python/src/server/services/crawling/discovery_service.py
index 28ea2f5e..203b67df 100644
--- a/python/src/server/services/crawling/discovery_service.py
+++ b/python/src/server/services/crawling/discovery_service.py
@@ -142,10 +142,23 @@ class DiscoveryService:
         # Get the directory path of the base URL
         parsed = urlparse(base_url)
         base_path = parsed.path.rstrip('/')
+
+        # Known file extensions - only treat as file if last segment has one of these
+        FILE_EXTENSIONS = {
+            '.html', '.htm', '.xml', '.json', '.txt', '.md', '.csv',
+            '.rss', '.yaml', '.yml', '.pdf', '.zip'
+        }
+
         # Extract directory (remove filename if present)
-        if '.' in base_path.split('/')[-1]:
+        last_segment = base_path.split('/')[-1] if base_path else ''
+        # Check if the last segment ends with a known file extension
+        has_file_extension = any(last_segment.lower().endswith(ext) for ext in FILE_EXTENSIONS)
+
+        if has_file_extension:
+            # Last segment is a file, strip it to get directory
             base_dir = '/'.join(base_path.split('/')[:-1])
         else:
+            # Last segment is a directory (e.g., /docs.v2)
             base_dir = base_path
 
         # Phase 1: Check llms files at ALL priority levels before checking sitemaps
diff --git a/python/src/server/services/crawling/strategies/single_page.py b/python/src/server/services/crawling/strategies/single_page.py
index 58610d01..96ea5bb5 100644
--- a/python/src/server/services/crawling/strategies/single_page.py
+++ b/python/src/server/services/crawling/strategies/single_page.py
@@ -229,17 +229,43 @@ class SinglePageCrawlStrategy:
     ) -> list[dict[str, Any]]:
         """
         Crawl a .txt or markdown file with comprehensive error handling and progress reporting.
-
+
         Args:
             url: URL of the text/markdown file
             transform_url_func: Function to transform URLs (e.g., GitHub URLs)
             progress_callback: Optional callback for progress updates
-            start_progress: Starting progress percentage
-            end_progress: Ending progress percentage
-
+            start_progress: Starting progress percentage (must be 0-100)
+            end_progress: Ending progress percentage (must be 0-100 and > start_progress)
+
         Returns:
             List containing the crawled document
+
+        Raises:
+            ValueError: If start_progress or end_progress are invalid
         """
+        # Validate progress parameters before any async work or progress reporting
+        if not isinstance(start_progress, (int, float)) or not isinstance(end_progress, (int, float)):
+            raise ValueError(
+                f"start_progress and end_progress must be int or float, "
+                f"got start_progress={type(start_progress).__name__}, end_progress={type(end_progress).__name__}"
+            )
+
+        if not (0 <= start_progress <= 100):
+            raise ValueError(
+                f"start_progress must be in range [0, 100], got {start_progress}"
+            )
+
+        if not (0 <= end_progress <= 100):
+            raise ValueError(
+                f"end_progress must be in range [0, 100], got {end_progress}"
+            )
+
+        if start_progress >= end_progress:
+            raise ValueError(
+                f"start_progress must be less than end_progress, "
+                f"got start_progress={start_progress}, end_progress={end_progress}"
+            )
+
         try:
             # Transform GitHub URLs to raw content URLs if applicable
             original_url = url
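
Reviewer note (not part of the patch): a quick standalone sanity check of the new directory detection. The `base_dir_of` helper below is a hypothetical re-implementation of the patched logic for illustration, not code from the repository:

```python
from urllib.parse import urlparse

# Hypothetical re-implementation of the patched DiscoveryService logic.
FILE_EXTENSIONS = {
    '.html', '.htm', '.xml', '.json', '.txt', '.md', '.csv',
    '.rss', '.yaml', '.yml', '.pdf', '.zip'
}

def base_dir_of(base_url: str) -> str:
    """Strip the last path segment only if it ends with a known file extension."""
    base_path = urlparse(base_url).path.rstrip('/')
    last_segment = base_path.split('/')[-1] if base_path else ''
    has_file_extension = any(last_segment.lower().endswith(ext) for ext in FILE_EXTENSIONS)
    return '/'.join(base_path.split('/')[:-1]) if has_file_extension else base_path

# Versioned directories are no longer mistaken for files:
assert base_dir_of('https://example.com/docs.v2') == '/docs.v2'
# A real filename is still stripped to its parent directory:
assert base_dir_of('https://example.com/docs/index.html') == '/docs'
# The old dot-based check ('.' in last segment) would have truncated /docs.v2 to ''.
```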
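
A similar sketch for the fail-fast progress validation. `validate_progress_range` is a hypothetical extraction of the checks the patch inlines at the top of `crawl_markdown_file`, showing the order in which invalid inputs are rejected:

```python
def validate_progress_range(start_progress, end_progress) -> None:
    """Hypothetical extraction of the checks inlined in crawl_markdown_file."""
    # Type check runs first so range comparisons never see non-numeric values
    if not isinstance(start_progress, (int, float)) or not isinstance(end_progress, (int, float)):
        raise ValueError(
            f"start_progress and end_progress must be int or float, "
            f"got start_progress={type(start_progress).__name__}, "
            f"end_progress={type(end_progress).__name__}"
        )
    if not (0 <= start_progress <= 100):
        raise ValueError(f"start_progress must be in range [0, 100], got {start_progress}")
    if not (0 <= end_progress <= 100):
        raise ValueError(f"end_progress must be in range [0, 100], got {end_progress}")
    if start_progress >= end_progress:
        raise ValueError(
            f"start_progress must be less than end_progress, "
            f"got start_progress={start_progress}, end_progress={end_progress}"
        )

validate_progress_range(0, 100)      # OK: full range
validate_progress_range(25.0, 75.5)  # OK: floats are accepted
for bad in [("10", 90), (-1, 90), (10, 101), (90, 10), (50, 50)]:
    try:
        validate_progress_range(*bad)
    except ValueError as exc:
        print(f"{bad!r} rejected: {exc}")
```

One thing worth flagging in review: because `bool` is a subclass of `int` in Python, `True`/`False` would pass the `isinstance` check and be treated as 1/0; that may be acceptable, but it is worth a deliberate decision.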