fix: Improve path detection and add progress validation

- Replace dot-based file detection with an explicit check against known
  file extensions in the discovery service, so versioned directories
  like /docs.v2 are no longer mistaken for files
- Add comprehensive validation for start_progress and end_progress
  parameters in crawl_markdown_file to ensure they are valid
  numeric values in range [0, 100] with start < end
- Validation runs before any async work or progress reporting begins
- Clear error messages indicate which parameter is invalid and why

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
leex279
2025-10-17 22:57:55 +02:00
parent cdf4323534
commit 8ab6c754fe
2 changed files with 44 additions and 5 deletions

View File

@@ -142,10 +142,23 @@ class DiscoveryService:
# Get the directory path of the base URL
parsed = urlparse(base_url)
base_path = parsed.path.rstrip('/')
# Known file extensions - only treat as file if last segment has one of these
FILE_EXTENSIONS = {
'.html', '.htm', '.xml', '.json', '.txt', '.md', '.csv',
'.rss', '.yaml', '.yml', '.pdf', '.zip'
}
# Extract directory (remove filename if present)
if '.' in base_path.split('/')[-1]:
last_segment = base_path.split('/')[-1] if base_path else ''
# Check if the last segment ends with a known file extension
has_file_extension = any(last_segment.lower().endswith(ext) for ext in FILE_EXTENSIONS)
if has_file_extension:
# Last segment is a file, strip it to get directory
base_dir = '/'.join(base_path.split('/')[:-1])
else:
# Last segment is a directory (e.g., /docs.v2)
base_dir = base_path
# Phase 1: Check llms files at ALL priority levels before checking sitemaps

View File

@@ -229,17 +229,43 @@ class SinglePageCrawlStrategy:
) -> list[dict[str, Any]]:
"""
Crawl a .txt or markdown file with comprehensive error handling and progress reporting.
Args:
url: URL of the text/markdown file
transform_url_func: Function to transform URLs (e.g., GitHub URLs)
progress_callback: Optional callback for progress updates
start_progress: Starting progress percentage
end_progress: Ending progress percentage
start_progress: Starting progress percentage (must be 0-100)
end_progress: Ending progress percentage (must be 0-100 and > start_progress)
Returns:
List containing the crawled document
Raises:
ValueError: If start_progress or end_progress are invalid
"""
# Validate progress parameters before any async work or progress reporting
if not isinstance(start_progress, (int, float)) or not isinstance(end_progress, (int, float)):
raise ValueError(
f"start_progress and end_progress must be int or float, "
f"got start_progress={type(start_progress).__name__}, end_progress={type(end_progress).__name__}"
)
if not (0 <= start_progress <= 100):
raise ValueError(
f"start_progress must be in range [0, 100], got {start_progress}"
)
if not (0 <= end_progress <= 100):
raise ValueError(
f"end_progress must be in range [0, 100], got {end_progress}"
)
if start_progress >= end_progress:
raise ValueError(
f"start_progress must be less than end_progress, "
f"got start_progress={start_progress}, end_progress={end_progress}"
)
try:
# Transform GitHub URLs to raw content URLs if applicable
original_url = url