From d2adc15be2361c01fe23f0f8b9b7f4c5d97356f0 Mon Sep 17 00:00:00 2001
From: leex279
Date: Mon, 8 Sep 2025 11:18:49 +0200
Subject: [PATCH] fix: Address CodeRabbit critical issues for discovery service
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix progress regression: map crawl callback progress through ProgressMapper
  - Prevents UI progress bars from jumping backwards
  - Ensures consistent progress reporting across all stages
- Add same-domain filtering for discovered file link following
  - Discovery targets (llms.txt) can follow links, but only to the same domain
  - Prevents external crawling while preserving related AI guidance
  - Add _is_same_domain() method for domain comparison
- Fix filename filtering false positives with regex token matching
  - Replace the substring 'full' check with a token-aware regex pattern
  - Prevents excluding files like "helpfully.md" or "meaningfully.txt"
  - Only excludes actual "full" variants like "llms-full.txt"
- Add llms-full.txt to URLHandler detection patterns
  - Supports the highest-priority discovery file format
  - Ensures proper file type detection for the link-following logic

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude
---
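
Notes (after the "---", so not part of the commit message):

For context on the progress fix: ProgressMapper is expected to translate
per-stage progress into that stage's slice of the overall range, so the
overall number can never move backwards across stages. A toy model of that
behaviour (the stage names and ranges below are invented for illustration;
the real mapper lives elsewhere in the codebase):

    # Hypothetical stage ranges: (start_pct, end_pct) of overall progress
    STAGE_RANGES = {"discovery": (0, 10), "crawling": (10, 60), "storing": (60, 100)}

    def map_progress(stage: str, stage_progress: float) -> float:
        # Scale 0-100 stage progress into the stage's overall window
        start, end = STAGE_RANGES[stage]
        return start + (end - start) * (stage_progress / 100.0)

    # 50% through crawling lands at 35% overall; once "storing" begins,
    # even 0% maps to 60%, so the bar cannot jump backwards across stages.
    assert map_progress("crawling", 50) == 35.0
    assert map_progress("storing", 0) == 60.0
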
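The token-aware "full" check is easy to spot-check in a REPL. A minimal
sketch against the exact pattern used in this patch (the filenames are
illustrative, not taken from the codebase):

    import re

    FULL_TOKEN = re.compile(r'(^|[._-])full([._-]|$)')

    # Excluded: "full" appears as a standalone token
    assert FULL_TOKEN.search("llms-full.txt")
    assert FULL_TOKEN.search("full.md")
    assert FULL_TOKEN.search("docs_full.txt")

    # Kept: "full" only appears inside a larger word
    assert not FULL_TOKEN.search("helpfully.md")
    assert not FULL_TOKEN.search("meaningfully.txt")
    assert not FULL_TOKEN.search("fullstack-guide.md")
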
 .../services/crawling/crawling_service.py     | 58 +++++++++++++++++--
 .../services/crawling/helpers/url_handler.py  | 10 ++--
 python/tests/test_url_handler.py              |  1 +
 3 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py
index 3faebf28..b19d1233 100644
--- a/python/src/server/services/crawling/crawling_service.py
+++ b/python/src/server/services/crawling/crawling_service.py
@@ -135,15 +135,16 @@ class CrawlingService:
                 f"kwargs_keys={list(kwargs.keys())}"
             )
 
-            # Update progress via tracker (stores in memory for HTTP polling)
+            # Update progress via tracker (stage-mapped to avoid regressions)
+            mapped = self.progress_mapper.map_progress(base_status, progress)
             await self.progress_tracker.update(
                 status=base_status,
-                progress=progress,
+                progress=mapped,
                 log=message,
                 **kwargs
             )
             safe_logfire_info(
-                f"Updated crawl progress | progress_id={self.progress_id} | status={base_status} | progress={progress} | "
+                f"Updated crawl progress | progress_id={self.progress_id} | status={base_status} | progress={mapped} | "
                 f"total_pages={kwargs.get('total_pages', 'N/A')} | processed_pages={kwargs.get('processed_pages', 'N/A')}"
             )
 
@@ -379,9 +380,16 @@ class CrawlingService:
                         processed_pages=0
                     )
 
-                    # Crawl only the discovered file
-                    safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_urls[0]}")
-                    crawl_results, crawl_type = await self._crawl_by_url_type(discovered_urls[0], request)
+                    # Crawl only the discovered file with discovery context
+                    discovered_url = discovered_urls[0]
+                    safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_url}")
+
+                    # Mark this as a discovery target for domain filtering
+                    discovery_request = request.copy()
+                    discovery_request["is_discovery_target"] = True
+                    discovery_request["original_domain"] = self.url_handler.get_base_url(url)
+
+                    crawl_results, crawl_type = await self._crawl_by_url_type(discovered_url, discovery_request)
 
                 else:
                     # No discovery - crawl the main URL normally
@@ -580,6 +588,28 @@ class CrawlingService:
                 f"Unregistered orchestration service on error | progress_id={self.progress_id}"
             )
 
+    def _is_same_domain(self, url: str, base_domain: str) -> bool:
+        """
+        Check if a URL belongs to the same domain as the base domain.
+
+        Args:
+            url: URL to check
+            base_domain: Base domain URL to compare against
+
+        Returns:
+            True if the URL is from the same domain
+        """
+        try:
+            from urllib.parse import urlparse
+
+            url_domain = urlparse(url).netloc.lower()
+            base_netloc = urlparse(base_domain).netloc.lower()
+
+            return url_domain == base_netloc
+        except Exception:
+            # If parsing fails, be conservative and exclude the URL
+            return False
+
     def _is_self_link(self, link: str, base_url: str) -> bool:
         """
         Check if a link is a self-referential link to the base URL.
@@ -659,6 +689,19 @@ class CrawlingService:
             if self_filtered_count > 0:
                 logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")
 
+            # For discovery targets, only follow same-domain links
+            if extracted_links and request.get("is_discovery_target"):
+                original_domain = request.get("original_domain")
+                if original_domain:
+                    original_count = len(extracted_links)
+                    extracted_links = [
+                        link for link in extracted_links
+                        if self._is_same_domain(link, original_domain)
+                    ]
+                    domain_filtered_count = original_count - len(extracted_links)
+                    if domain_filtered_count > 0:
+                        safe_logfire_info(f"Discovery mode: filtered out {domain_filtered_count} external links, keeping {len(extracted_links)} same-domain links")
+
             # Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
             if extracted_links:
                 original_count = len(extracted_links)
@@ -667,6 +710,9 @@ class CrawlingService:
                 if filtered_count > 0:
                     logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")
 
+            # Deduplicate to reduce redundant work
+            extracted_links = list(dict.fromkeys(extracted_links))
+
             if extracted_links:
                 # Crawl the extracted links using batch crawling
                 logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")

diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py
index b4bb0e58..fef5785a 100644
--- a/python/src/server/services/crawling/helpers/url_handler.py
+++ b/python/src/server/services/crawling/helpers/url_handler.py
@@ -399,8 +399,9 @@ class URLHandler:
         # Pattern-based detection for variations, but exclude "full" variants
         # Only match files that are likely link collections, not complete content files
         if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):
-            # Exclude files with "full" in the name - these typically contain complete content, not just links
-            if 'full' not in filename:
+            # Exclude files with "full" as a standalone token (avoids false positives like "helpfully.md")
+            import re
+            if not re.search(r'(^|[._-])full([._-]|$)', filename):
                 # Match files that start with common link collection prefixes
                 base_patterns = ['llms', 'links', 'resources', 'references']
                 if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns):
@@ -410,7 +411,8 @@ class URLHandler:
         # Content-based detection if content is provided
         if content:
             # Never treat "full" variants as link collections to preserve single-page behavior
-            if 'full' in filename:
+            import re
+            if re.search(r'(^|[._-])full([._-]|$)', filename):
                 logger.info(f"Skipping content-based link-collection detection for full-content file: {filename}")
                 return False
             # Reuse extractor to avoid regex divergence and maintain consistency
@@ -622,7 +624,7 @@ class URLHandler:
             filename = path.split('/')[-1] if '/' in path else path
 
             # Check for llms file variants
-            llms_variants = ['llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown']
+            llms_variants = ['llms-full.txt', 'llms.txt', 'llms.md', 'llms.mdx', 'llms.markdown']
             return filename in llms_variants
         except Exception as e:
             logger.warning(f"Error checking if URL is llms variant: {e}", exc_info=True)

diff --git a/python/tests/test_url_handler.py b/python/tests/test_url_handler.py
index 33c80d9e..e466239f 100644
--- a/python/tests/test_url_handler.py
+++ b/python/tests/test_url_handler.py
@@ -156,6 +156,7 @@ class TestURLHandler:
         handler = URLHandler()
 
         # All llms variants
+        assert handler.is_llms_variant("https://example.com/llms-full.txt") is True
         assert handler.is_llms_variant("https://example.com/llms.txt") is True
         assert handler.is_llms_variant("https://example.com/llms.md") is True
         assert handler.is_llms_variant("https://example.com/llms.mdx") is True
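
A similar spot-check for the new same-domain filter. The standalone helper
below mirrors the netloc comparison from _is_same_domain() in this patch
(the function and example URLs are illustrative only):

    from urllib.parse import urlparse

    def same_domain(url: str, base: str) -> bool:
        # Exact, case-insensitive netloc comparison, as in the patch
        return urlparse(url).netloc.lower() == urlparse(base).netloc.lower()

    assert same_domain("https://example.com/guide.md", "https://example.com/llms.txt")
    assert not same_domain("https://other.org/llms.txt", "https://example.com")
    # netloc must match exactly, so subdomains count as external...
    assert not same_domain("https://docs.example.com/a.md", "https://example.com")
    # ...and so does the same host on a different port.
    assert not same_domain("https://example.com:8080/a.md", "https://example.com")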