From c1677a9220943e2003f3f1c01780cf0031b86363 Mon Sep 17 00:00:00 2001
From: leex279 <thomas@thirty3.de>
Date: Sat, 20 Sep 2025 13:34:07 +0200
Subject: [PATCH] fix: Skip discovery when user provides direct discovery file
 URLs

When a user directly provides a URL to a discovery file (sitemap.xml, llms.txt, robots.txt, etc.),
the system now skips the discovery phase and uses the provided file directly.

This prevents unnecessary discovery attempts and respects the user's explicit choice.

Changes:
- Check if the URL is already a discovery target before running discovery
- Skip discovery for sitemap files, llms variants, robots.txt, well-known files, and any other .txt file
- Add logging to indicate when discovery is skipped

Example: when crawling 'xyz.com/sitemap.xml' directly, the system now uses that sitemap
instead of trying to discover a different file such as llms.txt.
---
 .../server/services/crawling/crawling_service.py   | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py
index cf0d1ba5..1e92ca14 100644
--- a/python/src/server/services/crawling/crawling_service.py
+++ b/python/src/server/services/crawling/crawling_service.py
@@ -339,7 +339,19 @@ class CrawlingService:
 
             # Discovery phase - find the single best related file
             discovered_urls = []
-            if request.get("auto_discovery", True):  # Default enabled
+            # Skip discovery if the URL itself is already a discovery target (sitemap, llms file, etc.)
+            is_already_discovery_target = (
+                self.url_handler.is_sitemap(url) or
+                self.url_handler.is_llms_variant(url) or
+                self.url_handler.is_robots_txt(url) or
+                self.url_handler.is_well_known_file(url) or
+                self.url_handler.is_txt(url)  # Also skip for any .txt file that user provides directly
+            )
+
+            if is_already_discovery_target:
+                safe_logfire_info(f"Skipping discovery - URL is already a discovery target file: {url}")
+
+            if request.get("auto_discovery", True) and not is_already_discovery_target:  # Default enabled, but skip if already a discovery file
                 await update_mapped_progress(
                     "discovery", 25, f"Discovering best related file for {url}", current_url=url
                 )