The New Archon (Beta) - The Operating System for AI Coding Assistants!

2026-01-02 12:48:54 -05:00 · 2025-08-13 07:58:24 -05:00
parent 13e1fc6a0e
commit 59084036f6
603 changed files with 131376 additions and 417 deletions
--- a/python/src/server/services/crawling/strategies/recursive.py
+++ b/python/src/server/services/crawling/strategies/recursive.py
@@ -0,0 +1,215 @@
+"""
+Recursive Crawling Strategy
+
+Handles recursive crawling of websites by following internal links.
+"""
+import asyncio
+from typing import List, Dict, Any, Optional, Callable
+from urllib.parse import urldefrag
+
+from crawl4ai import CrawlerRunConfig, CacheMode, MemoryAdaptiveDispatcher
+from ....config.logfire_config import get_logger
+from ...credential_service import credential_service
+
+# Module-level logger, created through the project's logfire_config helper.
+logger = get_logger(__name__)
+
+
+class RecursiveCrawlStrategy:
+    """Strategy for recursive crawling of websites."""
+    
+    def __init__(self, crawler, markdown_generator):
+        """
+        Initialize recursive crawl strategy.
+        
+        Args:
+            crawler (AsyncWebCrawler): The Crawl4AI crawler instance for web crawling operations
+            markdown_generator (DefaultMarkdownGenerator): The markdown generator instance for converting HTML to markdown
+        """
+        self.crawler = crawler
+        self.markdown_generator = markdown_generator
+    
+    async def crawl_recursive_with_progress(
+        self,
+        start_urls: List[str],
+        transform_url_func: Callable[[str], str],
+        is_documentation_site_func: Callable[[str], bool],
+        max_depth: int = 3,
+        max_concurrent: int = None,
+        progress_callback: Optional[Callable] = None,
+        start_progress: int = 10,
+        end_progress: int = 60
+    ) -> List[Dict[str, Any]]:
+        """
+        Recursively crawl internal links from start URLs up to a maximum depth with progress reporting.
+        
+        Args:
+            start_urls: List of starting URLs
+            transform_url_func: Function to transform URLs (e.g., GitHub URLs)
+            is_documentation_site_func: Function to check if URL is a documentation site
+            max_depth: Maximum crawl depth
+            max_concurrent: Maximum concurrent crawls
+            progress_callback: Optional callback for progress updates
+            start_progress: Starting progress percentage
+            end_progress: Ending progress percentage
+            
+        Returns:
+            List of crawl results
+        """
+        if not self.crawler:
+            logger.error("No crawler instance available for recursive crawling")
+            if progress_callback:
+                await progress_callback('error', 0, 'Crawler not available')
+            return []
+        
+        # Load settings from database
+        try:
+            settings = await credential_service.get_credentials_by_category("rag_strategy")
+            batch_size = int(settings.get("CRAWL_BATCH_SIZE", "50"))
+            if max_concurrent is None:
+                max_concurrent = int(settings.get("CRAWL_MAX_CONCURRENT", "10"))
+            memory_threshold = float(settings.get("MEMORY_THRESHOLD_PERCENT", "80"))
+            check_interval = float(settings.get("DISPATCHER_CHECK_INTERVAL", "0.5"))
+        except Exception as e:
+            logger.warning(f"Failed to load crawl settings: {e}, using defaults")
+            batch_size = 50
+            if max_concurrent is None:
+                max_concurrent = 10
+            memory_threshold = 80.0
+            check_interval = 0.5
+            settings = {}  # Empty dict for defaults
+        
+        # Check if start URLs include documentation sites
+        has_doc_sites = any(is_documentation_site_func(url) for url in start_urls)
+        
+        if has_doc_sites:
+            logger.info("Detected documentation sites for recursive crawl, using enhanced configuration")
+            run_config = CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                stream=True,  # Enable streaming for faster parallel processing
+                markdown_generator=self.markdown_generator,
+                wait_for='body',
+                wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
+                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")),
+                delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "1.0")),
+                wait_for_images=False,  # Skip images for faster crawling
+                scan_full_page=True,  # Trigger lazy loading
+                exclude_all_images=False,
+                remove_overlay_elements=True,
+                process_iframes=True
+            )
+        else:
+            # Configuration for regular recursive crawling
+            run_config = CrawlerRunConfig(
+                cache_mode=CacheMode.BYPASS,
+                stream=True,  # Enable streaming
+                markdown_generator=self.markdown_generator,
+                wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
+                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "45000")),
+                delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "0.5")),
+                scan_full_page=True
+            )
+        
+        dispatcher = MemoryAdaptiveDispatcher(
+            memory_threshold_percent=memory_threshold,
+            check_interval=check_interval,
+            max_session_permit=max_concurrent
+        )
+        
+        async def report_progress(percentage: int, message: str, **kwargs):
+            """Helper to report progress if callback is available"""
+            if progress_callback:
+                # Add step information for multi-progress tracking
+                step_info = {
+                    'currentStep': message,
+                    'stepMessage': message,
+                    **kwargs
+                }
+                await progress_callback('crawling', percentage, message, **step_info)
+        
+        visited = set()
+        
+        def normalize_url(url):
+            return urldefrag(url)[0]
+        
+        current_urls = set([normalize_url(u) for u in start_urls])
+        results_all = []
+        total_processed = 0
+        
+        for depth in range(max_depth):
+            urls_to_crawl = [normalize_url(url) for url in current_urls if normalize_url(url) not in visited]
+            if not urls_to_crawl:
+                break
+            
+            # Calculate progress for this depth level
+            depth_start = start_progress + int((depth / max_depth) * (end_progress - start_progress) * 0.8)
+            depth_end = start_progress + int(((depth + 1) / max_depth) * (end_progress - start_progress) * 0.8)
+            
+            await report_progress(depth_start, f'Crawling depth {depth + 1}/{max_depth}: {len(urls_to_crawl)} URLs to process')
+            
+            # Use configured batch size for recursive crawling
+            next_level_urls = set()
+            depth_successful = 0
+            
+            for batch_idx in range(0, len(urls_to_crawl), batch_size):
+                batch_urls = urls_to_crawl[batch_idx:batch_idx + batch_size]
+                batch_end_idx = min(batch_idx + batch_size, len(urls_to_crawl))
+                
+                # Calculate progress for this batch within the depth
+                batch_progress = depth_start + int((batch_idx / len(urls_to_crawl)) * (depth_end - depth_start))
+                await report_progress(batch_progress,
+                                    f'Depth {depth + 1}: crawling URLs {batch_idx + 1}-{batch_end_idx} of {len(urls_to_crawl)}',
+                                    totalPages=total_processed + batch_idx,
+                                    processedPages=len(results_all))
+                
+                # Use arun_many for native parallel crawling with streaming
+                logger.info(f"Starting parallel crawl of {len(batch_urls)} URLs with arun_many")
+                batch_results = await self.crawler.arun_many(urls=batch_urls, config=run_config, dispatcher=dispatcher)
+                
+                # Handle streaming results from arun_many
+                i = 0
+                async for result in batch_results:
+                    # Map back to original URL if transformed
+                    original_url = result.url
+                    for orig_url in batch_urls:
+                        if transform_url_func(orig_url) == result.url:
+                            original_url = orig_url
+                            break
+                    
+                    norm_url = normalize_url(original_url)
+                    visited.add(norm_url)
+                    total_processed += 1
+                    
+                    if result.success and result.markdown:
+                        results_all.append({
+                            'url': original_url,
+                            'markdown': result.markdown,
+                            'html': result.html  # Always use raw HTML for code extraction
+                        })
+                        depth_successful += 1
+                        
+                        # Find internal links for next depth
+                        for link in result.links.get("internal", []):
+                            next_url = normalize_url(link["href"])
+                            if next_url not in visited:
+                                next_level_urls.add(next_url)
+                    else:
+                        logger.warning(f"Failed to crawl {original_url}: {getattr(result, 'error_message', 'Unknown error')}")
+                    
+                    # Report progress every few URLs
+                    current_idx = batch_idx + i + 1
+                    if current_idx % 5 == 0 or current_idx == len(urls_to_crawl):
+                        current_progress = depth_start + int((current_idx / len(urls_to_crawl)) * (depth_end - depth_start))
+                        await report_progress(current_progress,
+                                            f'Depth {depth + 1}: processed {current_idx}/{len(urls_to_crawl)} URLs ({depth_successful} successful)',
+                                            totalPages=total_processed,
+                                            processedPages=len(results_all))
+                    i += 1
+            
+            current_urls = next_level_urls
+            
+            # Report completion of this depth
+            await report_progress(depth_end,
+                                f'Depth {depth + 1} completed: {depth_successful} pages crawled, {len(next_level_urls)} URLs found for next depth')
+        
+        await report_progress(end_progress, f'Recursive crawling completed: {len(results_all)} total pages crawled across {max_depth} depth levels')
+        return results_all