From d2adc15be2361c01fe23f0f8b9b7f4c5d97356f0 Mon Sep 17 00:00:00 2001
From: leex279
Date: Mon, 8 Sep 2025 11:18:49 +0200
Subject: [PATCH] fix: Address CodeRabbit critical issues for discovery service
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix progress regression: map crawl callback progress through ProgressMapper
  - Prevents UI progress bars from jumping backwards
  - Ensures consistent progress reporting across all stages
- Add same-domain filtering for discovered file link following
  - Discovery targets (llms.txt) can follow links, but only to the same domain
  - Prevents external crawling while preserving related AI guidance
  - Add _is_same_domain() method for domain comparison
- Fix filename filtering false positives with regex token matching
  - Replace the substring 'full' check with a token-aware regex pattern
  - Prevents excluding files like "helpfully.md" or "meaningfully.txt"
  - Only excludes actual "full" variants like "llms-full.txt"
- Add llms-full.txt to URLHandler detection patterns
  - Supports the highest-priority discovery file format
  - Ensures proper file type detection for the link-following logic

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
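
Notes (after the "---", so not part of the commit message):

For context on the progress fix: ProgressMapper is expected to translate
per-stage progress into that stage's slice of the overall range, so the
overall number can never move backwards across stages. A toy model of that
behaviour (the stage names and ranges below are invented for illustration;
the real mapper lives elsewhere in the codebase):

    # Hypothetical stage ranges: (start_pct, end_pct) of overall progress
    STAGE_RANGES = {"discovery": (0, 10), "crawling": (10, 60), "storing": (60, 100)}

    def map_progress(stage: str, stage_progress: float) -> float:
        # Scale 0-100 stage progress into the stage's overall window
        start, end = STAGE_RANGES[stage]
        return start + (end - start) * (stage_progress / 100.0)

    # 50% through crawling lands at 35% overall; once "storing" begins,
    # even 0% maps to 60%, so the bar cannot jump backwards across stages.
    assert map_progress("crawling", 50) == 35.0
    assert map_progress("storing", 0) == 60.0
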
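The token-aware "full" check is easy to spot-check in a REPL. A minimal
sketch against the exact pattern used in this patch (the filenames are
illustrative, not taken from the codebase):

    import re

    FULL_TOKEN = re.compile(r'(^|[._-])full([._-]|$)')

    # Excluded: "full" appears as a standalone token
    assert FULL_TOKEN.search("llms-full.txt")
    assert FULL_TOKEN.search("full.md")
    assert FULL_TOKEN.search("docs_full.txt")

    # Kept: "full" only appears inside a larger word
    assert not FULL_TOKEN.search("helpfully.md")
    assert not FULL_TOKEN.search("meaningfully.txt")
    assert not FULL_TOKEN.search("fullstack-guide.md")
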
 .../services/crawling/crawling_service.py     | 58 +++++++++++++++++--
 .../services/crawling/helpers/url_handler.py  | 10 ++--
 python/tests/test_url_handler.py              |  1 +
 3 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py
index 3faebf28..b19d1233 100644
--- a/python/src/server/services/crawling/crawling_service.py
+++ b/python/src/server/services/crawling/crawling_service.py
@@ -135,15 +135,16 @@ class CrawlingService:
                 f"kwargs_keys={list(kwargs.keys())}"
             )
 
-            # Update progress via tracker (stores in memory for HTTP polling)
+            # Update progress via tracker (stage-mapped to avoid regressions)
+            mapped = self.progress_mapper.map_progress(base_status, progress)
             await self.progress_tracker.update(
                 status=base_status,
-                progress=progress,
+                progress=mapped,
                 log=message,
                 **kwargs
             )
             safe_logfire_info(
-                f"Updated crawl progress | progress_id={self.progress_id} | status={base_status} | progress={progress} | "
+                f"Updated crawl progress | progress_id={self.progress_id} | status={base_status} | progress={mapped} | "
                 f"total_pages={kwargs.get('total_pages', 'N/A')} | processed_pages={kwargs.get('processed_pages', 'N/A')}"
             )
 
@@ -379,9 +380,16 @@ class CrawlingService:
                         processed_pages=0
                     )
 
-                    # Crawl only the discovered file
-                    safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_urls[0]}")
-                    crawl_results, crawl_type = await self._crawl_by_url_type(discovered_urls[0], request)
+                    # Crawl only the discovered file with discovery context
+                    discovered_url = discovered_urls[0]
+                    safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_url}")
+
+                    # Mark this as a discovery target for domain filtering
+                    discovery_request = request.copy()
+                    discovery_request["is_discovery_target"] = True
+                    discovery_request["original_domain"] = self.url_handler.get_base_url(url)
+
+                    crawl_results, crawl_type = await self._crawl_by_url_type(discovered_url, discovery_request)
 
                 else:
                     # No discovery - crawl the main URL normally
@@ -580,6 +588,28 @@ class CrawlingService:
                 f"Unregistered orchestration service on error | progress_id={self.progress_id}"
             )
 
+    def _is_same_domain(self, url: str, base_domain: str) -> bool:
+        """
+        Check if a URL belongs to the same domain as the base domain.
+
+        Args:
+            url: URL to check
+            base_domain: Base domain URL to compare against
+
+        Returns:
+            True if the URL is from the same domain
+        """
+        try:
+            from urllib.parse import urlparse
+
+            url_domain = urlparse(url).netloc.lower()
+            base_netloc = urlparse(base_domain).netloc.lower()
+
+            return url_domain == base_netloc
+        except Exception:
+            # If parsing fails, be conservative and exclude the URL
+            return False
+
     def _is_self_link(self, link: str, base_url: str) -> bool:
         """
         Check if a link is a self-referential link to the base URL.
@@ -659,6 +689,19 @@ class CrawlingService:
             if self_filtered_count > 0:
                 logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")
 
+            # For discovery targets, only follow same-domain links
+            if extracted_links and request.get("is_discovery_target"):
+                original_domain = request.get("original_domain")
+                if original_domain:
+                    original_count = len(extracted_links)
+                    extracted_links = [
+                        link for link in extracted_links
+                        if self._is_same_domain(link, original_domain)
+                    ]
+                    domain_filtered_count = original_count - len(extracted_links)
+                    if domain_filtered_count > 0:
+                        safe_logfire_info(f"Discovery mode: filtered out {domain_filtered_count} external links, keeping {len(extracted_links)} same-domain links")
+
             # Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
             if extracted_links:
                 original_count = len(extracted_links)
@@ -667,6 +710,9 @@ class CrawlingService:
                 if filtered_count > 0:
                     logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")
 
+            # Deduplicate to reduce redundant work
+            extracted_links = list(dict.fromkeys(extracted_links))
+
             if extracted_links:
                 # Crawl the extracted links using batch crawling
                 logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")

diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py
index b4bb0e58..fef5785a 100644
--- a/python/src/server/services/crawling/helpers/url_handler.py
+++ b/python/src/server/services/crawling/helpers/url_handler.py
@@ -399,8 +399,9 @@ class URLHandler:
         # Pattern-based detection for variations, but exclude "full" variants
         # Only match files that are likely link collections, not complete content files
         if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):
-            # Exclude files with "full" in the name - these typically contain complete content, not just links
-            if 'full' not in filename:
+            # Exclude files with "full" as a standalone token (avoids false positives like "helpfully.md")
+            import re
+            if not re.search(r'(^|[._-])full([._-]|$)', filename):
                 # Match files that start with common link collection prefixes
                 base_patterns = ['llms', 'links', 'resources', 'references']
                 if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns):
@@ -410,7 +411,8 @@ class URLHandler:
         # Content-based detection if content is provided
         if content:
             # Never treat "full" variants as link collections to preserve single-page behavior
-            if 'full' in filename:
+            import re
+            if re.search(r'(^|[._-])full([._-]|$)', filename):
                 logger.info(f"Skipping content-based link-collection detection for full-content file: {filename}")
                 return False
             # Reuse extractor to avoid regex divergence and maintain consistency
@@ -622,7 +624,7 @@ class URLHandler:
             filename = path.split('/')[-1] if '/' in path else path
 
             # Check for llms file variants
-            llms_variants = ['llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown']
+            llms_variants = ['llms-full.txt', 'llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown']
             return filename in llms_variants
         except Exception as e:
             logger.warning(f"Error checking if URL is llms variant: {e}", exc_info=True)

diff --git a/python/tests/test_url_handler.py b/python/tests/test_url_handler.py
index 33c80d9e..e466239f 100644
--- a/python/tests/test_url_handler.py
+++ b/python/tests/test_url_handler.py
@@ -156,6 +156,7 @@ class TestURLHandler:
         handler = URLHandler()
 
         # All llms variants
+        assert handler.is_llms_variant("https://example.com/llms-full.txt") is True
         assert handler.is_llms_variant("https://example.com/llms.txt") is True
         assert handler.is_llms_variant("https://example.com/llms.md") is True
         assert handler.is_llms_variant("https://example.com/llms.mdx") is True
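
A similar spot-check for the new same-domain filter. The standalone helper
below mirrors the netloc comparison from _is_same_domain() in this patch
(the function and example URLs are illustrative only):

    from urllib.parse import urlparse

    def same_domain(url: str, base: str) -> bool:
        # Exact, case-insensitive netloc comparison, as in the patch
        return urlparse(url).netloc.lower() == urlparse(base).netloc.lower()

    assert same_domain("https://example.com/guide.md", "https://example.com/llms.txt")
    assert not same_domain("https://other.org/llms.txt", "https://example.com")
    # netloc must match exactly, so subdomains count as external...
    assert not same_domain("https://docs.example.com/a.md", "https://example.com")
    # ...and so does the same host on a different port.
    assert not same_domain("https://example.com:8080/a.md", "https://example.com")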