diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py index 1e92ca14..c5eca80c 100644 --- a/python/src/server/services/crawling/crawling_service.py +++ b/python/src/server/services/crawling/crawling_service.py @@ -753,7 +753,14 @@ class CrawlingService: if crawl_results and len(crawl_results) > 0: content = crawl_results[0].get('markdown', '') if self.url_handler.is_link_collection_file(url, content): - # Extract links from the content + # If this file was selected by discovery, skip link extraction (single-file mode) + if request.get("is_discovery_target"): + logger.info(f"Discovery single-file mode: skipping link extraction for {url}") + crawl_type = "discovery_single_file" + logger.info(f"Discovery file crawling completed: {len(crawl_results)} result") + return crawl_results, crawl_type + + # Extract links from the content for non-discovery files extracted_links = self.url_handler.extract_markdown_links(content, url) # Filter out self-referential links to avoid redundant crawling @@ -838,6 +845,20 @@ class CrawlingService: "Detected sitemap, parsing URLs...", crawl_type=crawl_type ) + + # If this sitemap was selected by discovery, just return the sitemap itself (single-file mode) + if request.get("is_discovery_target"): + logger.info(f"Discovery single-file mode: returning sitemap itself without crawling URLs from {url}") + crawl_type = "discovery_sitemap" + # Return the sitemap file as the result + crawl_results = [{ + 'url': url, + 'markdown': f"# Sitemap: {url}\n\nThis is a sitemap file discovered and returned in single-file mode.", + 'title': f"Sitemap - {self.url_handler.extract_display_name(url)}", + 'crawl_type': crawl_type + }] + return crawl_results, crawl_type + sitemap_urls = self.parse_sitemap(url) if sitemap_urls: