From 8ab6c754fe964e158f7ea93bb4ed59fc8e7d5a6d Mon Sep 17 00:00:00 2001
From: leex279
Date: Fri, 17 Oct 2025 22:57:55 +0200
Subject: [PATCH] fix: Improve path detection and add progress validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace dot-based file detection with explicit extension checking in
  discovery service to correctly handle versioned directories like /docs.v2
- Add comprehensive validation for start_progress and end_progress
  parameters in crawl_markdown_file to ensure they are valid numeric
  values in range [0, 100] with start < end
- Validation runs before any async work or progress reporting begins
- Clear error messages indicate which parameter is invalid and why

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 .../services/crawling/discovery_service.py   | 15 +++++++-
 .../crawling/strategies/single_page.py       | 34 ++++++++++++++++---
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/python/src/server/services/crawling/discovery_service.py b/python/src/server/services/crawling/discovery_service.py
index 28ea2f5e..203b67df 100644
--- a/python/src/server/services/crawling/discovery_service.py
+++ b/python/src/server/services/crawling/discovery_service.py
@@ -142,10 +142,23 @@ class DiscoveryService:
         # Get the directory path of the base URL
         parsed = urlparse(base_url)
         base_path = parsed.path.rstrip('/')
+
+        # Known file extensions - only treat as file if last segment has one of these
+        FILE_EXTENSIONS = {
+            '.html', '.htm', '.xml', '.json', '.txt', '.md', '.csv',
+            '.rss', '.yaml', '.yml', '.pdf', '.zip'
+        }
+
         # Extract directory (remove filename if present)
-        if '.' in base_path.split('/')[-1]:
+        last_segment = base_path.split('/')[-1] if base_path else ''
+        # Check if the last segment ends with a known file extension
+        has_file_extension = any(last_segment.lower().endswith(ext) for ext in FILE_EXTENSIONS)
+
+        if has_file_extension:
+            # Last segment is a file, strip it to get directory
             base_dir = '/'.join(base_path.split('/')[:-1])
         else:
+            # Last segment is a directory (e.g., /docs.v2)
             base_dir = base_path
 
         # Phase 1: Check llms files at ALL priority levels before checking sitemaps
diff --git a/python/src/server/services/crawling/strategies/single_page.py b/python/src/server/services/crawling/strategies/single_page.py
index 58610d01..96ea5bb5 100644
--- a/python/src/server/services/crawling/strategies/single_page.py
+++ b/python/src/server/services/crawling/strategies/single_page.py
@@ -229,17 +229,43 @@ class SinglePageCrawlStrategy:
     ) -> list[dict[str, Any]]:
         """
         Crawl a .txt or markdown file with comprehensive error handling and progress reporting.
-
+
         Args:
             url: URL of the text/markdown file
             transform_url_func: Function to transform URLs (e.g., GitHub URLs)
             progress_callback: Optional callback for progress updates
-            start_progress: Starting progress percentage
-            end_progress: Ending progress percentage
-
+            start_progress: Starting progress percentage (must be 0-100)
+            end_progress: Ending progress percentage (must be 0-100 and > start_progress)
+
         Returns:
             List containing the crawled document
+
+        Raises:
+            ValueError: If start_progress or end_progress are invalid
         """
+        # Validate progress parameters before any async work or progress reporting
+        if not isinstance(start_progress, (int, float)) or not isinstance(end_progress, (int, float)):
+            raise ValueError(
+                f"start_progress and end_progress must be int or float, "
+                f"got start_progress={type(start_progress).__name__}, end_progress={type(end_progress).__name__}"
+            )
+
+        if not (0 <= start_progress <= 100):
+            raise ValueError(
+                f"start_progress must be in range [0, 100], got {start_progress}"
+            )
+
+        if not (0 <= end_progress <= 100):
+            raise ValueError(
+                f"end_progress must be in range [0, 100], got {end_progress}"
+            )
+
+        if start_progress >= end_progress:
+            raise ValueError(
+                f"start_progress must be less than end_progress, "
+                f"got start_progress={start_progress}, end_progress={end_progress}"
+            )
+
         try:
             # Transform GitHub URLs to raw content URLs if applicable
             original_url = url
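
Reviewer note (not part of the patch): a quick standalone sanity check of the new directory detection. The `base_dir_of` helper below is a hypothetical re-implementation of the patched logic for illustration, not code from the repository:

```python
from urllib.parse import urlparse

# Hypothetical re-implementation of the patched DiscoveryService logic.
FILE_EXTENSIONS = {
    '.html', '.htm', '.xml', '.json', '.txt', '.md', '.csv',
    '.rss', '.yaml', '.yml', '.pdf', '.zip'
}

def base_dir_of(base_url: str) -> str:
    """Strip the last path segment only if it ends with a known file extension."""
    base_path = urlparse(base_url).path.rstrip('/')
    last_segment = base_path.split('/')[-1] if base_path else ''
    has_file_extension = any(last_segment.lower().endswith(ext) for ext in FILE_EXTENSIONS)
    return '/'.join(base_path.split('/')[:-1]) if has_file_extension else base_path

# Versioned directories are no longer mistaken for files:
assert base_dir_of('https://example.com/docs.v2') == '/docs.v2'
# A real filename is still stripped to its parent directory:
assert base_dir_of('https://example.com/docs/index.html') == '/docs'
# The old dot-based check ('.' in last segment) would have truncated /docs.v2 to ''.
```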
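
A similar sketch for the fail-fast progress validation. `validate_progress_range` is a hypothetical extraction of the checks the patch inlines at the top of `crawl_markdown_file`, showing the order in which invalid inputs are rejected:

```python
def validate_progress_range(start_progress, end_progress) -> None:
    """Hypothetical extraction of the checks inlined in crawl_markdown_file."""
    # Type check runs first so range comparisons never see non-numeric values
    if not isinstance(start_progress, (int, float)) or not isinstance(end_progress, (int, float)):
        raise ValueError(
            f"start_progress and end_progress must be int or float, "
            f"got start_progress={type(start_progress).__name__}, "
            f"end_progress={type(end_progress).__name__}"
        )
    if not (0 <= start_progress <= 100):
        raise ValueError(f"start_progress must be in range [0, 100], got {start_progress}")
    if not (0 <= end_progress <= 100):
        raise ValueError(f"end_progress must be in range [0, 100], got {end_progress}")
    if start_progress >= end_progress:
        raise ValueError(
            f"start_progress must be less than end_progress, "
            f"got start_progress={start_progress}, end_progress={end_progress}"
        )

validate_progress_range(0, 100)      # OK: full range
validate_progress_range(25.0, 75.5)  # OK: floats are accepted
for bad in [("10", 90), (-1, 90), (10, 101), (90, 10), (50, 50)]:
    try:
        validate_progress_range(*bad)
    except ValueError as exc:
        print(f"{bad!r} rejected: {exc}")
```

One thing worth flagging in review: because `bool` is a subclass of `int` in Python, `True`/`False` would pass the `isinstance` check and be treated as 1/0; that may be acceptable, but it is worth a deliberate decision.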