From 265c248c16e46f997dff5cb35b121464c80d977c Mon Sep 17 00:00:00 2001 From: leex279 Date: Sun, 7 Sep 2025 14:16:30 +0200 Subject: [PATCH] Fix sitemap URL detection to require .xml extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves issue where URLs containing 'sitemap' in path (like https://nx.dev/see-also/sitemap) were incorrectly treated as XML sitemaps, causing XML parsing errors. - Changed detection to require both .xml extension AND 'sitemap' in path - Fixes XML parsing error: "not well-formed (invalid token)" - Maintains compatibility with existing test cases - Now correctly identifies only actual XML sitemap files Fixes #607 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- python/src/server/services/crawling/helpers/url_handler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py index 33c75c57..97a9c5a5 100644 --- a/python/src/server/services/crawling/helpers/url_handler.py +++ b/python/src/server/services/crawling/helpers/url_handler.py @@ -29,7 +29,10 @@ class URLHandler: True if URL is a sitemap, False otherwise """ try: - return url.endswith("sitemap.xml") or "sitemap" in urlparse(url).path + parsed = urlparse(url) + path = parsed.path.lower() + # Only match URLs whose path ends with .xml and contains 'sitemap' + return path.endswith(".xml") and "sitemap" in path except Exception as e: logger.warning(f"Error checking if URL is sitemap: {e}") return False