diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py
index eaa9ab1b..ab8ccdad 100644
--- a/python/src/server/services/crawling/crawling_service.py
+++ b/python/src/server/services/crawling/crawling_service.py
@@ -20,11 +20,11 @@ from ..credential_service import credential_service
 # Import operations
 from .discovery_service import DiscoveryService
 from .document_storage_operations import DocumentStorageOperations
-from .page_storage_operations import PageStorageOperations
 from .helpers.site_config import SiteConfig
 
 # Import helpers
 from .helpers.url_handler import URLHandler
+from .page_storage_operations import PageStorageOperations
 from .progress_mapper import ProgressMapper
 from .strategies.batch import BatchCrawlStrategy
 from .strategies.recursive import RecursiveCrawlStrategy
@@ -413,18 +413,18 @@ class CrawlingService:
                 total_pages=total_urls_to_crawl,
                 processed_pages=0
             )
-            
+
             # Crawl only the discovered file with discovery context
             discovered_url = discovered_urls[0]
             safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_url}")
-            
+
             # Mark this as a discovery target for domain filtering
             discovery_request = request.copy()
             discovery_request["is_discovery_target"] = True
-            discovery_request["original_domain"] = self.url_handler.get_base_url(url)
-            
+            discovery_request["original_domain"] = self.url_handler.get_base_url(discovered_url)
+
             crawl_results, crawl_type = await self._crawl_by_url_type(discovered_url, discovery_request)
-            
+
         else:
             # No discovery - crawl the main URL normally
             total_urls_to_crawl = 1
@@ -433,7 +433,7 @@ class CrawlingService:
                 total_pages=total_urls_to_crawl,
                 processed_pages=0
             )
-            
+
             # Crawl the main URL
             safe_logfire_info(f"No discovery file found, crawling main URL: {url}")
             crawl_results, crawl_type = await self._crawl_by_url_type(url, request)
@@ -608,7 +608,7 @@ class CrawlingService:
             logger.error("Code extraction failed, continuing crawl without code examples", exc_info=True)
             safe_logfire_error(f"Code extraction failed | error={e}")
             code_examples_count = 0
-            
+
             # Report code extraction failure to progress tracker
             if self.progress_tracker:
                 await self.progress_tracker.update(
@@ -708,11 +708,11 @@ class CrawlingService:
     def _is_same_domain(self, url: str, base_domain: str) -> bool:
         """
         Check if a URL belongs to the same domain as the base domain.
-        
+
         Args:
             url: URL to check
             base_domain: Base domain URL to compare against
-        
+
         Returns:
             True if the URL is from the same domain
         """
@@ -842,7 +842,7 @@ class CrawlingService:
 
         if extracted_links_with_text:
             # Build mapping of URL -> link text for title fallback
-            url_to_link_text = {link: text for link, text in extracted_links_with_text}
+            url_to_link_text = dict(extracted_links_with_text)
             extracted_links = [link for link, _ in extracted_links_with_text]
 
             # For discovery targets, respect max_depth for same-domain links
diff --git a/python/src/server/services/crawling/discovery_service.py b/python/src/server/services/crawling/discovery_service.py
index 467f1673..778604a9 100644
--- a/python/src/server/services/crawling/discovery_service.py
+++ b/python/src/server/services/crawling/discovery_service.py
@@ -124,10 +124,10 @@ class DiscoveryService:
         """
         Main discovery orchestrator - selects ONE best file across all categories.
         All files contain similar AI/crawling guidance, so we only need the best one.
-        
+
         Args:
             base_url: Base URL to discover files for
-        
+
         Returns:
             Single best URL found, or None if no files discovered
         """
@@ -182,7 +182,7 @@ class DiscoveryService:
     def _discover_best_sitemap(self, base_url: str) -> str | None:
         """
         Discover the best available sitemap using priority-based selection.
-        
+
         Priority order:
         1. Sitemaps from robots.txt (highest priority - explicitly declared)
         2. Standard locations (sitemap_index.xml > sitemap-index.xml > sitemap.xml)
@@ -228,7 +228,7 @@ class DiscoveryService:
     def _discover_best_llms_file(self, base_url: str) -> str | None:
         """
         Discover the best available llms file using priority-based selection.
-        
+
         Priority order:
         1. Standard locations (llms-full.txt > llms.txt > llms.md > llms.mdx > llms.markdown)
         2. Common subdirectory variations (static, public, docs, assets)
@@ -270,7 +270,7 @@ class DiscoveryService:
                 return robots_url
         except Exception:
             logger.exception(f"Error discovering robots file for {base_url}")
-        
+
         return None
 
     def _check_url_exists(self, url: str) -> bool:
@@ -315,12 +315,17 @@ class DiscoveryService:
             content = self._read_response_with_limit(resp, robots_url)
 
             # Parse robots.txt content for sitemap directives
-            for line in content.splitlines():
-                line = line.strip().lower()
-                if line.startswith("sitemap:"):
-                    sitemap_url = line.split(":", 1)[1].strip()
-                    # Validate URL format before adding
-                    if sitemap_url and (sitemap_url.startswith('http://') or sitemap_url.startswith('https://')):
+            for raw_line in content.splitlines():
+                line = raw_line.strip()
+                if line.lower().startswith("sitemap:"):
+                    sitemap_value = line.split(":", 1)[1].strip()
+                    if sitemap_value:
+                        # Allow absolute and relative sitemap values
+                        if sitemap_value.lower().startswith(("http://", "https://")):
+                            sitemap_url = sitemap_value
+                        else:
+                            # Resolve relative path against base_url
+                            sitemap_url = urljoin(base_url, sitemap_value)
                         sitemaps.append(sitemap_url)
                         logger.info(f"Found sitemap in robots.txt: {sitemap_url}")
 
@@ -341,10 +346,10 @@ class DiscoveryService:
     def _check_standard_patterns(self, base_url: str) -> dict[str, list[str]]:
         """
         Check common file locations for discovery targets.
-        
+
         Args:
             base_url: Base URL to check standard locations for
-        
+
         Returns:
             Dictionary with file types and discovered URLs
         """
@@ -395,10 +400,10 @@ class DiscoveryService:
     def _parse_html_meta_tags(self, base_url: str) -> list[str]:
         """
         Extract sitemap references from HTML meta tags.
-        
+
         Args:
             base_url: Base URL to check HTML for meta tags
-        
+
         Returns:
             List of sitemap URLs found in HTML meta tags
         """
@@ -415,28 +420,36 @@ class DiscoveryService:
             # Read response with size limit
             content = self._read_response_with_limit(resp, base_url)
-            content = content.lower()
 
             # Look for sitemap meta tags or link elements
             import re
+            from urllib.parse import urlparse
 
-            # Check for <link rel="sitemap" href="...">
-            sitemap_link_pattern = r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']'
-            matches = re.findall(sitemap_link_pattern, content)
+            # Check for <link rel="sitemap" href="..."> (case-insensitive)
+            sitemap_link_pattern = re.compile(
+                r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']',
+                re.IGNORECASE
+            )
+            matches = sitemap_link_pattern.findall(content)
             for match in matches:
                 sitemap_url = urljoin(base_url, match)
-                sitemaps.append(sitemap_url)
-                logger.info(f"Found sitemap in HTML link tag: {sitemap_url}")
+                if urlparse(sitemap_url).scheme in ("http", "https"):
+                    sitemaps.append(sitemap_url)
+                    logger.info(f"Found sitemap in HTML link tag: {sitemap_url}")
 
-            # Check for <meta name="sitemap" content="...">
-            sitemap_meta_pattern = r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']'
-            matches = re.findall(sitemap_meta_pattern, content)
+            # Check for <meta name="sitemap" content="..."> (case-insensitive)
+            sitemap_meta_pattern = re.compile(
+                r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']',
+                re.IGNORECASE
+            )
+            matches = sitemap_meta_pattern.findall(content)
             for match in matches:
                 sitemap_url = urljoin(base_url, match)
-                sitemaps.append(sitemap_url)
-                logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}")
+                if urlparse(sitemap_url).scheme in ("http", "https"):
+                    sitemaps.append(sitemap_url)
+                    logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}")
 
         finally:
             resp.close()
@@ -454,10 +467,10 @@ class DiscoveryService:
     def _check_well_known_directory(self, base_url: str) -> list[str]:
        """
         Check .well-known/* files for discovery targets.
-        
+
         Args:
             base_url: Base URL to check .well-known directory for
-        
+
         Returns:
             List of URLs found in .well-known directory
         """
@@ -490,10 +503,10 @@ class DiscoveryService:
     def _try_common_variations(self, base_url: str) -> dict[str, list[str]]:
         """
         Try pattern variations for discovery targets.
-        
+
         Args:
             base_url: Base URL to try variations for
-        
+
         Returns:
             Dictionary with file types and discovered variation URLs
         """
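
The behavioral change in the robots.txt hunk (@@ -315,12 +315,17 @@) is that Sitemap directives are no longer lower-cased wholesale (which previously mangled case-sensitive sitemap paths) and relative values are now resolved against the base URL. Below is a minimal standalone sketch of that resolution behavior, kept outside the diff and using only the standard library; the function name and sample data are illustrative, not taken from the patch.

from urllib.parse import urljoin


def resolve_sitemap_directives(robots_content: str, base_url: str) -> list[str]:
    """Sketch of the updated Sitemap-directive handling: preserve case, allow relative values."""
    sitemaps: list[str] = []
    for raw_line in robots_content.splitlines():
        line = raw_line.strip()
        if not line.lower().startswith("sitemap:"):
            continue
        sitemap_value = line.split(":", 1)[1].strip()
        if not sitemap_value:
            continue
        if sitemap_value.lower().startswith(("http://", "https://")):
            # Absolute URL: keep exactly as declared, original case preserved
            sitemaps.append(sitemap_value)
        else:
            # Relative path: resolve against the site's base URL
            sitemaps.append(urljoin(base_url, sitemap_value))
    return sitemaps


robots_txt = "User-agent: *\nSitemap: /Sitemap_Index.xml\nSitemap: https://example.com/sitemap.xml\n"
print(resolve_sitemap_directives(robots_txt, "https://example.com/"))
# ['https://example.com/Sitemap_Index.xml', 'https://example.com/sitemap.xml']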