mirror of
https://github.com/coleam00/Archon.git
synced 2025-12-30 13:39:44 -05:00
fix: Improve path detection and add progress validation
- Replace dot-based file detection with explicit extension checking in discovery service to correctly handle versioned directories like /docs.v2
- Add comprehensive validation for start_progress and end_progress parameters in crawl_markdown_file to ensure they are valid numeric values in range [0, 100] with start < end
- Validation runs before any async work or progress reporting begins
- Clear error messages indicate which parameter is invalid and why

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -142,10 +142,23 @@ class DiscoveryService:
|
||||
# Get the directory path of the base URL
|
||||
parsed = urlparse(base_url)
|
||||
base_path = parsed.path.rstrip('/')
|
||||
|
||||
# Known file extensions - only treat as file if last segment has one of these
|
||||
FILE_EXTENSIONS = {
|
||||
'.html', '.htm', '.xml', '.json', '.txt', '.md', '.csv',
|
||||
'.rss', '.yaml', '.yml', '.pdf', '.zip'
|
||||
}
|
||||
|
||||
# Extract directory (remove filename if present)
|
||||
if '.' in base_path.split('/')[-1]:
|
||||
last_segment = base_path.split('/')[-1] if base_path else ''
|
||||
# Check if the last segment ends with a known file extension
|
||||
has_file_extension = any(last_segment.lower().endswith(ext) for ext in FILE_EXTENSIONS)
|
||||
|
||||
if has_file_extension:
|
||||
# Last segment is a file, strip it to get directory
|
||||
base_dir = '/'.join(base_path.split('/')[:-1])
|
||||
else:
|
||||
# Last segment is a directory (e.g., /docs.v2)
|
||||
base_dir = base_path
|
||||
|
||||
# Phase 1: Check llms files at ALL priority levels before checking sitemaps
|
||||
|
||||
@@ -229,17 +229,43 @@ class SinglePageCrawlStrategy:
|
||||
) -> list[dict[str, Any]]:
|
||||
"""
|
||||
Crawl a .txt or markdown file with comprehensive error handling and progress reporting.
|
||||
|
||||
|
||||
Args:
|
||||
url: URL of the text/markdown file
|
||||
transform_url_func: Function to transform URLs (e.g., GitHub URLs)
|
||||
progress_callback: Optional callback for progress updates
|
||||
start_progress: Starting progress percentage
|
||||
end_progress: Ending progress percentage
|
||||
|
||||
start_progress: Starting progress percentage (must be 0-100)
|
||||
end_progress: Ending progress percentage (must be 0-100 and > start_progress)
|
||||
|
||||
Returns:
|
||||
List containing the crawled document
|
||||
|
||||
Raises:
|
||||
ValueError: If start_progress or end_progress are invalid
|
||||
"""
|
||||
# Validate progress parameters before any async work or progress reporting
|
||||
if not isinstance(start_progress, (int, float)) or not isinstance(end_progress, (int, float)):
|
||||
raise ValueError(
|
||||
f"start_progress and end_progress must be int or float, "
|
||||
f"got start_progress={type(start_progress).__name__}, end_progress={type(end_progress).__name__}"
|
||||
)
|
||||
|
||||
if not (0 <= start_progress <= 100):
|
||||
raise ValueError(
|
||||
f"start_progress must be in range [0, 100], got {start_progress}"
|
||||
)
|
||||
|
||||
if not (0 <= end_progress <= 100):
|
||||
raise ValueError(
|
||||
f"end_progress must be in range [0, 100], got {end_progress}"
|
||||
)
|
||||
|
||||
if start_progress >= end_progress:
|
||||
raise ValueError(
|
||||
f"start_progress must be less than end_progress, "
|
||||
f"got start_progress={start_progress}, end_progress={end_progress}"
|
||||
)
|
||||
|
||||
try:
|
||||
# Transform GitHub URLs to raw content URLs if applicable
|
||||
original_url = url
|
||||
|
||||
Reference in New Issue
Block a user