fix: Address CodeRabbit critical issues for discovery service

- Fix progress regression: map crawl callback progress through ProgressMapper
  - Prevents UI progress bars from jumping backwards
  - Ensures consistent progress reporting across all stages

- Add same-domain filtering for discovered file link following
  - Discovery targets (e.g. llms.txt) may follow links, but only to the same domain
  - Prevents external crawling while preserving related AI guidance
  - Add _is_same_domain() method for domain comparison

- Fix filename filtering false positives with regex token matching
  - Replace substring 'full' check with token-aware regex pattern
  - Prevents excluding files like "carefully.txt" or "fully-featured.md"
  - Only excludes actual "full" variants like "llms-full.txt"

- Add llms-full.txt to URLHandler detection patterns
  - Adds support for the highest-priority discovery file format
  - Ensures proper file type detection for link following logic

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
leex279 committed 2025-09-08 11:18:49 +02:00
parent 43af7b747c · commit d2adc15be2
3 changed files with 59 additions and 10 deletions


@@ -135,15 +135,16 @@ class CrawlingService:
                 f"kwargs_keys={list(kwargs.keys())}"
             )
-            # Update progress via tracker (stores in memory for HTTP polling)
+            # Update progress via tracker (stage-mapped to avoid regressions)
+            mapped = self.progress_mapper.map_progress(base_status, progress)
             await self.progress_tracker.update(
                 status=base_status,
-                progress=progress,
+                progress=mapped,
                 log=message,
                 **kwargs
             )
             safe_logfire_info(
-                f"Updated crawl progress | progress_id={self.progress_id} | status={base_status} | progress={progress} | "
+                f"Updated crawl progress | progress_id={self.progress_id} | status={base_status} | progress={mapped} | "
                 f"total_pages={kwargs.get('total_pages', 'N/A')} | processed_pages={kwargs.get('processed_pages', 'N/A')}"
             )
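
For context, a minimal sketch of what a stage-aware mapper like ProgressMapper can do to keep reported progress monotonic. Only the map_progress(status, progress) call is visible in this diff; the stage ranges and clamping below are illustrative assumptions:

    # Hypothetical stage-aware mapper; the real stage ranges live elsewhere in the service
    class ProgressMapper:
        STAGE_RANGES = {
            "crawling": (0, 40),     # assumed stage boundaries
            "processing": (40, 80),
            "storing": (80, 100),
        }

        def __init__(self) -> None:
            self._last = 0  # highest overall progress reported so far

        def map_progress(self, status: str, progress: int) -> int:
            start, end = self.STAGE_RANGES.get(status, (0, 100))
            mapped = start + (end - start) * progress // 100
            # Never report a lower value than before, so UI bars cannot jump backwards
            self._last = max(self._last, mapped)
            return self._last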
@@ -379,9 +380,16 @@ class CrawlingService:
                     processed_pages=0
                 )
-                # Crawl only the discovered file
-                safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_urls[0]}")
-                crawl_results, crawl_type = await self._crawl_by_url_type(discovered_urls[0], request)
+                # Crawl only the discovered file with discovery context
+                discovered_url = discovered_urls[0]
+                safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_url}")
+
+                # Mark this as a discovery target for domain filtering
+                discovery_request = request.copy()
+                discovery_request["is_discovery_target"] = True
+                discovery_request["original_domain"] = self.url_handler.get_base_url(url)
+
+                crawl_results, crawl_type = await self._crawl_by_url_type(discovered_url, discovery_request)
             else:
                 # No discovery - crawl the main URL normally
@@ -580,6 +588,28 @@ class CrawlingService:
                 f"Unregistered orchestration service on error | progress_id={self.progress_id}"
             )

+    def _is_same_domain(self, url: str, base_domain: str) -> bool:
+        """
+        Check if a URL belongs to the same domain as the base domain.
+
+        Args:
+            url: URL to check
+            base_domain: Base domain URL to compare against
+
+        Returns:
+            True if the URL is from the same domain
+        """
+        try:
+            from urllib.parse import urlparse
+
+            url_domain = urlparse(url).netloc.lower()
+            base_netloc = urlparse(base_domain).netloc.lower()
+            return url_domain == base_netloc
+        except Exception:
+            # If parsing fails, be conservative and exclude the URL
+            return False
+
     def _is_self_link(self, link: str, base_url: str) -> bool:
         """
         Check if a link is a self-referential link to the base URL.
@@ -659,6 +689,19 @@ class CrawlingService:
             if self_filtered_count > 0:
                 logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")

+        # For discovery targets, only follow same-domain links
+        if extracted_links and request.get("is_discovery_target"):
+            original_domain = request.get("original_domain")
+            if original_domain:
+                original_count = len(extracted_links)
+                extracted_links = [
+                    link for link in extracted_links
+                    if self._is_same_domain(link, original_domain)
+                ]
+                domain_filtered_count = original_count - len(extracted_links)
+                if domain_filtered_count > 0:
+                    safe_logfire_info(f"Discovery mode: filtered out {domain_filtered_count} external links, keeping {len(extracted_links)} same-domain links")
+
         # Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
         if extracted_links:
             original_count = len(extracted_links)
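
To see what the new filter keeps and drops, here is the same netloc comparison as a standalone function (re-implemented for illustration). Exact netloc equality means subdomains and explicit ports are treated as external:

    from urllib.parse import urlparse

    def is_same_domain(url: str, base_domain: str) -> bool:
        try:
            return urlparse(url).netloc.lower() == urlparse(base_domain).netloc.lower()
        except Exception:
            return False

    assert is_same_domain("https://example.com/docs/page", "https://example.com")
    assert not is_same_domain("https://docs.example.com/page", "https://example.com")  # subdomain differs
    assert not is_same_domain("https://example.com:8080/page", "https://example.com")  # port differs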
@@ -667,6 +710,9 @@ class CrawlingService:
             if filtered_count > 0:
                 logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")

+        # Deduplicate to reduce redundant work
+        extracted_links = list(dict.fromkeys(extracted_links))
+
         if extracted_links:
             # Crawl the extracted links using batch crawling
             logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")


@@ -399,8 +399,9 @@ class URLHandler:
         # Pattern-based detection for variations, but exclude "full" variants
         # Only match files that are likely link collections, not complete content files
         if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):
-            # Exclude files with "full" in the name - these typically contain complete content, not just links
-            if 'full' not in filename:
+            # Exclude files with "full" as a standalone token (avoids false positives like "carefully.txt")
+            import re
+            if not re.search(r'(^|[._-])full([._-]|$)', filename):
                 # Match files that start with common link collection prefixes
                 base_patterns = ['llms', 'links', 'resources', 'references']
                 if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns):
@@ -410,7 +411,8 @@ class URLHandler:
         # Content-based detection if content is provided
         if content:
             # Never treat "full" variants as link collections to preserve single-page behavior
-            if 'full' in filename:
+            import re
+            if re.search(r'(^|[._-])full([._-]|$)', filename):
                 logger.info(f"Skipping content-based link-collection detection for full-content file: {filename}")
                 return False
             # Reuse extractor to avoid regex divergence and maintain consistency
@@ -622,7 +624,7 @@ class URLHandler:
             filename = path.split('/')[-1] if '/' in path else path

             # Check for llms file variants
-            llms_variants = ['llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown']
+            llms_variants = ['llms-full.txt', 'llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown']
             return filename in llms_variants
         except Exception as e:
             logger.warning(f"Error checking if URL is llms variant: {e}", exc_info=True)
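
The token-aware pattern can be sanity-checked in isolation: it only matches "full" when delimited by the start or end of the name or by '.', '_', or '-':

    import re

    FULL_TOKEN = re.compile(r'(^|[._-])full([._-]|$)')

    # Excluded: "full" appears as a delimited token
    assert FULL_TOKEN.search("llms-full.txt")
    assert FULL_TOKEN.search("full.md")
    # Kept: the old substring check ('full' in filename) would have excluded these
    assert not FULL_TOKEN.search("carefully.txt")
    assert not FULL_TOKEN.search("fully-featured.md")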


@@ -156,6 +156,7 @@ class TestURLHandler:
         handler = URLHandler()

         # All llms variants
+        assert handler.is_llms_variant("https://example.com/llms-full.txt") is True
         assert handler.is_llms_variant("https://example.com/llms.txt") is True
         assert handler.is_llms_variant("https://example.com/llms.md") is True
         assert handler.is_llms_variant("https://example.com/llms.mdx") is True