fix: Skip link extraction for discovery targets (single-file mode)

When a file is selected through discovery, it should be crawled as a single file, without following any of the links it contains. This preserves the efficiency gains of the discovery feature.

Changes:
- Skip link extraction when is_discovery_target is true for link collection files
- Return sitemap metadata without crawling URLs when is_discovery_target is true
- Add clear logging to indicate single-file mode is active

This ensures discovered files (llms.txt, sitemap.xml, etc.) are processed as single authoritative sources rather than starting recursive crawls, which aligns with the PR's objective of efficient single-file discovery and crawling.
2025-12-24 02:39:17 -05:00 · 2025-09-20 13:55:15 +02:00
parent c1677a9220
commit 597fc86c39
1 changed files with 22 additions and 1 deletions
--- a/python/src/server/services/crawling/crawling_service.py
+++ b/python/src/server/services/crawling/crawling_service.py
@@ -753,7 +753,14 @@ class CrawlingService:
            if crawl_results and len(crawl_results) > 0:
                content = crawl_results[0].get('markdown', '')
                if self.url_handler.is_link_collection_file(url, content):
-                    # Extract links from the content
+                    # If this file was selected by discovery, skip link extraction (single-file mode)
+                    if request.get("is_discovery_target"):
+                        logger.info(f"Discovery single-file mode: skipping link extraction for {url}")
+                        crawl_type = "discovery_single_file"
+                        logger.info(f"Discovery file crawling completed: {len(crawl_results)} result")
+                        return crawl_results, crawl_type
+
+                    # Extract links from the content for non-discovery files
                    extracted_links = self.url_handler.extract_markdown_links(content, url)

                    # Filter out self-referential links to avoid redundant crawling
@@ -838,6 +845,20 @@ class CrawlingService:
                "Detected sitemap, parsing URLs...",
                crawl_type=crawl_type
            )
+
+            # If this sitemap was selected by discovery, just return the sitemap itself (single-file mode)
+            if request.get("is_discovery_target"):
+                logger.info(f"Discovery single-file mode: returning sitemap itself without crawling URLs from {url}")
+                crawl_type = "discovery_sitemap"
+                # Return the sitemap file as the result
+                crawl_results = [{
+                    'url': url,
+                    'markdown': f"# Sitemap: {url}\n\nThis is a sitemap file discovered and returned in single-file mode.",
+                    'title': f"Sitemap - {self.url_handler.extract_display_name(url)}",
+                    'crawl_type': crawl_type
+                }]
+                return crawl_results, crawl_type
+
            sitemap_urls = self.parse_sitemap(url)

            if sitemap_urls: