fix: Skip link extraction for discovery targets (single-file mode)

When a file is selected through discovery, it should be crawled as a single file without
following any links contained within it. This preserves the efficiency gains of the
discovery feature.

Changes:
- For link collection files, skip link extraction when is_discovery_target is true
- Return a single placeholder result describing the sitemap itself, without parsing or crawling its URLs, when is_discovery_target is true
- Add clear logging to indicate single-file mode is active

This ensures discovered files (llms.txt, sitemap.xml, etc.) are processed as single
authoritative sources rather than starting recursive crawls, which aligns with the
PR's objective of efficient single-file discovery and crawling.
This commit is contained in:
leex279
2025-09-20 13:55:15 +02:00
parent c1677a9220
commit 597fc86c39

View File

@@ -753,7 +753,14 @@ class CrawlingService:
if crawl_results and len(crawl_results) > 0:
content = crawl_results[0].get('markdown', '')
if self.url_handler.is_link_collection_file(url, content):
# Extract links from the content
# If this file was selected by discovery, skip link extraction (single-file mode)
if request.get("is_discovery_target"):
logger.info(f"Discovery single-file mode: skipping link extraction for {url}")
crawl_type = "discovery_single_file"
logger.info(f"Discovery file crawling completed: {len(crawl_results)} result")
return crawl_results, crawl_type
# Extract links from the content for non-discovery files
extracted_links = self.url_handler.extract_markdown_links(content, url)
# Filter out self-referential links to avoid redundant crawling
@@ -838,6 +845,20 @@ class CrawlingService:
"Detected sitemap, parsing URLs...",
crawl_type=crawl_type
)
# If this sitemap was selected by discovery, just return the sitemap itself (single-file mode)
if request.get("is_discovery_target"):
logger.info(f"Discovery single-file mode: returning sitemap itself without crawling URLs from {url}")
crawl_type = "discovery_sitemap"
# Return the sitemap file as the result
crawl_results = [{
'url': url,
'markdown': f"# Sitemap: {url}\n\nThis is a sitemap file discovered and returned in single-file mode.",
'title': f"Sitemap - {self.url_handler.extract_display_name(url)}",
'crawl_type': crawl_type
}]
return crawl_results, crawl_type
sitemap_urls = self.parse_sitemap(url)
if sitemap_urls: