diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py
index eaa9ab1b..ab8ccdad 100644
--- a/python/src/server/services/crawling/crawling_service.py
+++ b/python/src/server/services/crawling/crawling_service.py
@@ -20,11 +20,11 @@ from ..credential_service import credential_service
 # Import operations
 from .discovery_service import DiscoveryService
 from .document_storage_operations import DocumentStorageOperations
-from .page_storage_operations import PageStorageOperations
 from .helpers.site_config import SiteConfig
 
 # Import helpers
 from .helpers.url_handler import URLHandler
+from .page_storage_operations import PageStorageOperations
 from .progress_mapper import ProgressMapper
 from .strategies.batch import BatchCrawlStrategy
 from .strategies.recursive import RecursiveCrawlStrategy
@@ -413,18 +413,18 @@ class CrawlingService:
                 total_pages=total_urls_to_crawl,
                 processed_pages=0
             )
-            
+
             # Crawl only the discovered file with discovery context
             discovered_url = discovered_urls[0]
             safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_url}")
-            
+
             # Mark this as a discovery target for domain filtering
             discovery_request = request.copy()
             discovery_request["is_discovery_target"] = True
-            discovery_request["original_domain"] = self.url_handler.get_base_url(url)
-            
+            discovery_request["original_domain"] = self.url_handler.get_base_url(discovered_url)
+
             crawl_results, crawl_type = await self._crawl_by_url_type(discovered_url, discovery_request)
-            
+
         else:
             # No discovery - crawl the main URL normally
             total_urls_to_crawl = 1
@@ -433,7 +433,7 @@ class CrawlingService:
                 total_pages=total_urls_to_crawl,
                 processed_pages=0
             )
-            
+
             # Crawl the main URL
             safe_logfire_info(f"No discovery file found, crawling main URL: {url}")
             crawl_results, crawl_type = await self._crawl_by_url_type(url, request)
@@ -608,7 +608,7 @@ class CrawlingService:
             logger.error("Code extraction failed, continuing crawl without code examples", exc_info=True)
             safe_logfire_error(f"Code extraction failed | error={e}")
             code_examples_count = 0
-            
+
             # Report code extraction failure to progress tracker
             if self.progress_tracker:
                 await self.progress_tracker.update(
@@ -708,11 +708,11 @@ class CrawlingService:
     def _is_same_domain(self, url: str, base_domain: str) -> bool:
         """
         Check if a URL belongs to the same domain as the base domain.
-        
+
         Args:
             url: URL to check
             base_domain: Base domain URL to compare against
-        
+
         Returns:
             True if the URL is from the same domain
         """
@@ -842,7 +842,7 @@ class CrawlingService:
 
         if extracted_links_with_text:
             # Build mapping of URL -> link text for title fallback
-            url_to_link_text = {link: text for link, text in extracted_links_with_text}
+            url_to_link_text = dict(extracted_links_with_text)
             extracted_links = [link for link, _ in extracted_links_with_text]
 
             # For discovery targets, respect max_depth for same-domain links
diff --git a/python/src/server/services/crawling/discovery_service.py b/python/src/server/services/crawling/discovery_service.py
index 467f1673..778604a9 100644
--- a/python/src/server/services/crawling/discovery_service.py
+++ b/python/src/server/services/crawling/discovery_service.py
@@ -124,10 +124,10 @@ class DiscoveryService:
         """
         Main discovery orchestrator - selects ONE best file across all categories.
         All files contain similar AI/crawling guidance, so we only need the best one.
-        
+
         Args:
             base_url: Base URL to discover files for
-        
+
         Returns:
             Single best URL found, or None if no files discovered
         """
@@ -182,7 +182,7 @@ class DiscoveryService:
     def _discover_best_sitemap(self, base_url: str) -> str | None:
         """
         Discover the best available sitemap using priority-based selection.
-        
+
         Priority order:
         1. Sitemaps from robots.txt (highest priority - explicitly declared)
         2. Standard locations (sitemap_index.xml > sitemap-index.xml > sitemap.xml)
@@ -228,7 +228,7 @@ class DiscoveryService:
     def _discover_best_llms_file(self, base_url: str) -> str | None:
         """
         Discover the best available llms file using priority-based selection.
-        
+
         Priority order:
         1. Standard locations (llms-full.txt > llms.txt > llms.md > llms.mdx > llms.markdown)
         2. Common subdirectory variations (static, public, docs, assets)
@@ -270,7 +270,7 @@ class DiscoveryService:
                 return robots_url
         except Exception:
             logger.exception(f"Error discovering robots file for {base_url}")
-        
+
         return None
 
     def _check_url_exists(self, url: str) -> bool:
@@ -315,12 +315,17 @@ class DiscoveryService:
             content = self._read_response_with_limit(resp, robots_url)
 
             # Parse robots.txt content for sitemap directives
-            for line in content.splitlines():
-                line = line.strip().lower()
-                if line.startswith("sitemap:"):
-                    sitemap_url = line.split(":", 1)[1].strip()
-                    # Validate URL format before adding
-                    if sitemap_url and (sitemap_url.startswith('http://') or sitemap_url.startswith('https://')):
+            for raw_line in content.splitlines():
+                line = raw_line.strip()
+                if line.lower().startswith("sitemap:"):
+                    sitemap_value = line.split(":", 1)[1].strip()
+                    if sitemap_value:
+                        # Allow absolute and relative sitemap values
+                        if sitemap_value.lower().startswith(("http://", "https://")):
+                            sitemap_url = sitemap_value
+                        else:
+                            # Resolve relative path against base_url
+                            sitemap_url = urljoin(base_url, sitemap_value)
                         sitemaps.append(sitemap_url)
                         logger.info(f"Found sitemap in robots.txt: {sitemap_url}")
 
@@ -341,10 +346,10 @@ class DiscoveryService:
     def _check_standard_patterns(self, base_url: str) -> dict[str, list[str]]:
         """
         Check common file locations for discovery targets.
-        
+
         Args:
             base_url: Base URL to check standard locations for
-        
+
         Returns:
             Dictionary with file types and discovered URLs
         """
@@ -395,10 +400,10 @@ class DiscoveryService:
     def _parse_html_meta_tags(self, base_url: str) -> list[str]:
         """
         Extract sitemap references from HTML meta tags.
-        
+
         Args:
             base_url: Base URL to check HTML for meta tags
-        
+
         Returns:
             List of sitemap URLs found in HTML meta tags
         """
@@ -415,28 +420,36 @@ class DiscoveryService:
             # Read response with size limit
             content = self._read_response_with_limit(resp, base_url)
-            content = content.lower()
 
             # Look for sitemap meta tags or link elements
             import re
+            from urllib.parse import urlparse
 
-            # Check for <link rel="sitemap" href="...">
-            sitemap_link_pattern = r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']'
-            matches = re.findall(sitemap_link_pattern, content)
+            # Check for <link rel="sitemap" href="..."> (case-insensitive)
+            sitemap_link_pattern = re.compile(
+                r'<link[^>]*rel=["\']sitemap["\'][^>]*href=["\']([^"\']+)["\']',
+                re.IGNORECASE
+            )
+            matches = sitemap_link_pattern.findall(content)
             for match in matches:
                 sitemap_url = urljoin(base_url, match)
-                sitemaps.append(sitemap_url)
-                logger.info(f"Found sitemap in HTML link tag: {sitemap_url}")
+                if urlparse(sitemap_url).scheme in ("http", "https"):
+                    sitemaps.append(sitemap_url)
+                    logger.info(f"Found sitemap in HTML link tag: {sitemap_url}")
 
-            # Check for <meta name="sitemap" content="...">
-            sitemap_meta_pattern = r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']'
-            matches = re.findall(sitemap_meta_pattern, content)
+            # Check for <meta name="sitemap" content="..."> (case-insensitive)
+            sitemap_meta_pattern = re.compile(
+                r'<meta[^>]*name=["\']sitemap["\'][^>]*content=["\']([^"\']+)["\']',
+                re.IGNORECASE
+            )
+            matches = sitemap_meta_pattern.findall(content)
             for match in matches:
                 sitemap_url = urljoin(base_url, match)
-                sitemaps.append(sitemap_url)
-                logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}")
+                if urlparse(sitemap_url).scheme in ("http", "https"):
+                    sitemaps.append(sitemap_url)
+                    logger.info(f"Found sitemap in HTML meta tag: {sitemap_url}")
 
         finally:
             resp.close()
@@ -454,10 +467,10 @@ class DiscoveryService:
     def _check_well_known_directory(self, base_url: str) -> list[str]:
        """
         Check .well-known/* files for discovery targets.
-        
+
         Args:
             base_url: Base URL to check .well-known directory for
-        
+
         Returns:
             List of URLs found in .well-known directory
         """
@@ -490,10 +503,10 @@ class DiscoveryService:
     def _try_common_variations(self, base_url: str) -> dict[str, list[str]]:
         """
         Try pattern variations for discovery targets.
-        
+
         Args:
             base_url: Base URL to try variations for
-        
+
         Returns:
             Dictionary with file types and discovered variation URLs
         """
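
The behavioral change in the robots.txt hunk (@@ -315,12 +315,17 @@) is that Sitemap directives are no longer lower-cased wholesale (which previously mangled case-sensitive sitemap paths) and relative values are now resolved against the base URL. Below is a minimal standalone sketch of that resolution behavior, kept outside the diff and using only the standard library; the function name and sample data are illustrative, not taken from the patch.

from urllib.parse import urljoin


def resolve_sitemap_directives(robots_content: str, base_url: str) -> list[str]:
    """Sketch of the updated Sitemap-directive handling: preserve case, allow relative values."""
    sitemaps: list[str] = []
    for raw_line in robots_content.splitlines():
        line = raw_line.strip()
        if not line.lower().startswith("sitemap:"):
            continue
        sitemap_value = line.split(":", 1)[1].strip()
        if not sitemap_value:
            continue
        if sitemap_value.lower().startswith(("http://", "https://")):
            # Absolute URL: keep exactly as declared, original case preserved
            sitemaps.append(sitemap_value)
        else:
            # Relative path: resolve against the site's base URL
            sitemaps.append(urljoin(base_url, sitemap_value))
    return sitemaps


robots_txt = "User-agent: *\nSitemap: /Sitemap_Index.xml\nSitemap: https://example.com/sitemap.xml\n"
print(resolve_sitemap_directives(robots_txt, "https://example.com/"))
# ['https://example.com/Sitemap_Index.xml', 'https://example.com/sitemap.xml']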