diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py
index 545b0995..cf0d1ba5 100644
--- a/python/src/server/services/crawling/crawling_service.py
+++ b/python/src/server/services/crawling/crawling_service.py
@@ -202,8 +202,6 @@ class CrawlingService:
         urls: list[str],
         max_concurrent: int | None = None,
         progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
-        start_progress: int = 15,
-        end_progress: int = 60,
     ) -> list[dict[str, Any]]:
         """Batch crawl multiple URLs in parallel."""
         return await self.batch_strategy.crawl_batch_with_progress(
@@ -212,8 +210,6 @@ class CrawlingService:
             self.site_config.is_documentation_site,
             max_concurrent,
             progress_callback,
-            start_progress,
-            end_progress,
             self._check_cancellation,  # Pass cancellation check
         )

@@ -223,8 +219,6 @@ class CrawlingService:
         max_depth: int = 3,
         max_concurrent: int | None = None,
         progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
-        start_progress: int = 10,
-        end_progress: int = 60,
     ) -> list[dict[str, Any]]:
         """Recursively crawl internal links from start URLs."""
         return await self.recursive_strategy.crawl_recursive_with_progress(
@@ -234,8 +228,6 @@ class CrawlingService:
             max_depth,
             max_concurrent,
             progress_callback,
-            start_progress,
-            end_progress,
             self._check_cancellation,  # Pass cancellation check
         )

@@ -799,8 +791,6 @@ class CrawlingService:
                     max_depth=max_depth - 1,  # Reduce depth since we're already 1 level deep
                     max_concurrent=request.get('max_concurrent'),
                     progress_callback=await self._create_crawl_progress_callback("crawling"),
-                    start_progress=10,
-                    end_progress=20,
                 )
             else:
                 # Depth limit reached, just crawl the immediate links without following further
@@ -809,8 +799,6 @@ class CrawlingService:
                     extracted_links,
                     max_concurrent=request.get('max_concurrent'),
                     progress_callback=await self._create_crawl_progress_callback("crawling"),
-                    start_progress=10,
-                    end_progress=20,
                 )
         else:
             # Use normal batch crawling for non-discovery targets
diff --git a/python/src/server/services/crawling/discovery_service.py b/python/src/server/services/crawling/discovery_service.py
index 6a3762a4..511b63a2 100644
--- a/python/src/server/services/crawling/discovery_service.py
+++ b/python/src/server/services/crawling/discovery_service.py
@@ -64,24 +64,24 @@ class DiscoveryService:

         # Check files in global priority order
         for filename in self.DISCOVERY_PRIORITY:
-            # Try root location first
-            file_url = urljoin(base_url, f"/{filename}")
+            # Try location relative to the given URL
+            file_url = urljoin(base_url, filename)
            if self._check_url_exists(file_url):
                 logger.info(f"Discovery found best file: {file_url}")
                 return file_url
-
+
             # For llms files, also try common subdirectories
             if filename.startswith('llms'):
                 for subdir in ["static", "public", "docs", "assets", "doc", "api"]:
-                    subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(subdir_url):
                         logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                         return subdir_url
-
+
             # For sitemap files, also try common subdirectories
             if filename.endswith('.xml') and not filename.startswith('.well-known'):
                 for subdir in ["sitemaps", "sitemap", "xml", "feed"]:
-                    subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(subdir_url):
                         logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                         return subdir_url
@@ -119,7 +119,7 @@ class DiscoveryService:

         # Priority 2: Check standard locations in priority order
         for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
-            sitemap_url = urljoin(base_url, f"/{filename}")
+            sitemap_url = urljoin(base_url, filename)
             if self._check_url_exists(sitemap_url):
                 return sitemap_url

@@ -127,7 +127,7 @@ class DiscoveryService:
         subdirs = ["sitemaps", "sitemap", "xml", "feed"]
         for subdir in subdirs:
             for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
-                sitemap_url = urljoin(base_url, f"/{subdir}/{filename}")
+                sitemap_url = urljoin(base_url, f"{subdir}/{filename}")
                 if self._check_url_exists(sitemap_url):
                     return sitemap_url

@@ -137,7 +137,7 @@ class DiscoveryService:
            return html_sitemaps[0]  # Use first sitemap from HTML

         # Priority 5: Check .well-known directory
-        well_known_sitemap = urljoin(base_url, "/.well-known/sitemap.xml")
+        well_known_sitemap = urljoin(base_url, ".well-known/sitemap.xml")
         if self._check_url_exists(well_known_sitemap):
             return well_known_sitemap

@@ -158,7 +158,7 @@ class DiscoveryService:
         try:
             # Priority 1: Check standard root locations in priority order
             for filename in self.DISCOVERY_TARGETS["llms_files"]:
-                llms_url = urljoin(base_url, f"/{filename}")
+                llms_url = urljoin(base_url, filename)
                 if self._check_url_exists(llms_url):
                     return llms_url

@@ -166,13 +166,13 @@ class DiscoveryService:
             subdirs = ["static", "public", "docs", "assets", "doc", "api"]
             for subdir in subdirs:
                 for filename in self.DISCOVERY_TARGETS["llms_files"]:
-                    llms_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    llms_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(llms_url):
                         return llms_url

             # Priority 3: Check .well-known directory variants
             for well_known_file in [".well-known/ai.txt", ".well-known/llms.txt"]:
-                well_known_url = urljoin(base_url, f"/{well_known_file}")
+                well_known_url = urljoin(base_url, well_known_file)
                 if self._check_url_exists(well_known_url):
                     return well_known_url

@@ -186,7 +186,7 @@ class DiscoveryService:
         Discover robots.txt file (always single file at root).
         """
         try:
-            robots_url = urljoin(base_url, "/robots.txt")
+            robots_url = urljoin(base_url, "robots.txt")
             if self._check_url_exists(robots_url):
                 return robots_url
         except Exception:
@@ -210,17 +210,18 @@ class DiscoveryService:
     def _parse_robots_txt(self, base_url: str) -> list[str]:
         """
         Extract sitemap URLs from robots.txt.
-
+
         Args:
             base_url: Base URL to check robots.txt for
-
+
         Returns:
             List of sitemap URLs found in robots.txt
         """

         sitemaps: list[str] = []
         try:
-            robots_url = urljoin(base_url, "/robots.txt")
+            # Use robots.txt relative to the given URL, not always root
+            robots_url = urljoin(base_url, "robots.txt")
             logger.info(f"Checking robots.txt at {robots_url}")

             resp = requests.get(robots_url, timeout=30)
@@ -272,7 +273,7 @@ class DiscoveryService:

         for target_type, filename in all_targets:
             try:
-                file_url = urljoin(base_url, f"/{filename}")
+                file_url = urljoin(base_url, filename)
                 resp = requests.get(file_url, timeout=30, allow_redirects=True)

                 if resp.status_code == 200:
@@ -361,7 +362,7 @@ class DiscoveryService:
         try:
             for filename in self.DISCOVERY_TARGETS["well_known_files"]:
                 try:
-                    file_url = urljoin(base_url, f"/{filename}")
+                    file_url = urljoin(base_url, filename)
                     resp = requests.get(file_url, timeout=30, allow_redirects=True)

                     if resp.status_code == 200:
@@ -401,7 +402,7 @@ class DiscoveryService:
         for subdir in subdirs:
             for llms_file in self.DISCOVERY_TARGETS["llms_files"]:
                 try:
-                    file_url = urljoin(base_url, f"/{subdir}/{llms_file}")
+                    file_url = urljoin(base_url, f"{subdir}/{llms_file}")
                     resp = requests.get(file_url, timeout=30, allow_redirects=True)

                     if resp.status_code == 200:
@@ -423,7 +424,7 @@ class DiscoveryService:

         for sitemap_path in sitemap_paths:
             try:
-                file_url = urljoin(base_url, f"/{sitemap_path}")
+                file_url = urljoin(base_url, sitemap_path)
                 resp = requests.get(file_url, timeout=30, allow_redirects=True)

                 if resp.status_code == 200:
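The discovery changes above all hinge on how `urllib.parse.urljoin` treats a leading slash: passing `"/llms.txt"` always resolves against the site root, while passing `"llms.txt"` stays relative to whatever base URL the caller supplied. A minimal sketch of that behaviour, using hypothetical URLs (not taken from the codebase):

```python
from urllib.parse import urljoin

base = "https://example.com/docs/"  # hypothetical crawl target with a sub-path

# Old pattern: the leading "/" discards the /docs/ prefix and checks the site root.
urljoin(base, "/llms.txt")        # -> "https://example.com/llms.txt"

# New pattern: no leading "/" keeps discovery relative to the given URL.
urljoin(base, "llms.txt")         # -> "https://example.com/docs/llms.txt"
urljoin(base, "static/llms.txt")  # -> "https://example.com/docs/static/llms.txt"

# Caveat: without a trailing slash the last path segment is treated as a
# document and replaced, which effectively falls back to root-style lookups.
urljoin("https://example.com/docs", "llms.txt")  # -> "https://example.com/llms.txt"
```

As the caveat shows, sub-path discovery only behaves as intended when the base URL ends in a slash, so callers presumably need to normalise the base URL (or pass a directory-style URL) before invoking the discovery methods.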