fix: respect the given URL path in discovery and fix method signature mismatches

Two critical fixes for the automatic discovery feature:

1. Discovery Service path handling:
   - Changed from always using the root domain (/) to respecting the given URL path (see the urljoin example below)
   - e.g., for 'supabase.com/docs', now checks 'supabase.com/docs/robots.txt'
   - Previously it incorrectly checked 'supabase.com/robots.txt'
   - Fixed all urljoin calls to use relative paths instead of absolute paths

2. Method signature mismatches:
   - Removed the start_progress and end_progress parameters from crawl_batch_with_progress
   - Removed the same parameters from crawl_recursive_with_progress
   - Fixed all calls to these methods to match the strategy implementations (see the sketch after the first file's diff)

These fixes ensure discovery works correctly for subdirectory URLs and prevent TypeError crashes during crawling.
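
For reference, the urllib.parse.urljoin behavior behind fix 1 (this is standard-library behavior; the trailing slash on the base URL is assumed to be handled by the service's own URL normalization, since a base like 'https://supabase.com/docs' without the slash would still resolve against the root):

from urllib.parse import urljoin

base_url = "https://supabase.com/docs/"  # assumes the service normalizes the trailing slash

# Old behavior (absolute path): the leading "/" discards the /docs prefix
print(urljoin(base_url, "/robots.txt"))           # https://supabase.com/robots.txt

# New behavior (relative path): resolved against the given URL path
print(urljoin(base_url, "robots.txt"))            # https://supabase.com/docs/robots.txt
print(urljoin(base_url, "sitemaps/sitemap.xml"))  # https://supabase.com/docs/sitemaps/sitemap.xml
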
leex279 committed 2025-09-20 13:06:41 +02:00
parent 0a2c43f6b4
commit 7f74aea476
2 changed files with 21 additions and 32 deletions


@@ -202,8 +202,6 @@ class CrawlingService:
         urls: list[str],
         max_concurrent: int | None = None,
         progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
-        start_progress: int = 15,
-        end_progress: int = 60,
     ) -> list[dict[str, Any]]:
         """Batch crawl multiple URLs in parallel."""
         return await self.batch_strategy.crawl_batch_with_progress(
@@ -212,8 +210,6 @@ class CrawlingService:
             self.site_config.is_documentation_site,
             max_concurrent,
             progress_callback,
-            start_progress,
-            end_progress,
             self._check_cancellation,  # Pass cancellation check
         )
@@ -223,8 +219,6 @@ class CrawlingService:
         max_depth: int = 3,
         max_concurrent: int | None = None,
         progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
-        start_progress: int = 10,
-        end_progress: int = 60,
     ) -> list[dict[str, Any]]:
         """Recursively crawl internal links from start URLs."""
         return await self.recursive_strategy.crawl_recursive_with_progress(
@@ -234,8 +228,6 @@ class CrawlingService:
             max_depth,
             max_concurrent,
             progress_callback,
-            start_progress,
-            end_progress,
             self._check_cancellation,  # Pass cancellation check
         )
@@ -799,8 +791,6 @@ class CrawlingService:
                     max_depth=max_depth - 1,  # Reduce depth since we're already 1 level deep
                     max_concurrent=request.get('max_concurrent'),
                     progress_callback=await self._create_crawl_progress_callback("crawling"),
-                    start_progress=10,
-                    end_progress=20,
                 )
             else:
                 # Depth limit reached, just crawl the immediate links without following further
@@ -809,8 +799,6 @@ class CrawlingService:
                     extracted_links,
                     max_concurrent=request.get('max_concurrent'),
                     progress_callback=await self._create_crawl_progress_callback("crawling"),
-                    start_progress=10,
-                    end_progress=20,
                 )
             else:
                 # Use normal batch crawling for non-discovery targets

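The TypeError mentioned in the commit message is the classic symptom of forwarding arguments a callee does not accept; the diff above removes start_progress/end_progress from the forwarded calls. A minimal, standalone sketch of that failure mode (the parameter list is a simplified, hypothetical stand-in, not the real strategy API):

import asyncio

# Hypothetical stand-in for a strategy method after the progress-range
# parameters were dropped (names and order are illustrative only).
async def crawl_batch_with_progress(urls, max_concurrent=None, progress_callback=None,
                                    cancellation_check=None):
    return [{"url": u, "success": True} for u in urls]

async def main():
    # Before the fix: the service wrapper still forwarded start_progress/end_progress,
    # so the call carries two extra positional arguments -> TypeError.
    try:
        await crawl_batch_with_progress(["https://example.com/docs"], 10, None, 15, 60, None)
    except TypeError as exc:
        print(f"signature mismatch: {exc}")

    # After the fix: the forwarded arguments match the strategy implementation.
    print(await crawl_batch_with_progress(["https://example.com/docs"], 10, None, None))

asyncio.run(main())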

@@ -64,24 +64,24 @@ class DiscoveryService:
         # Check files in global priority order
         for filename in self.DISCOVERY_PRIORITY:
-            # Try root location first
-            file_url = urljoin(base_url, f"/{filename}")
+            # Try location relative to the given URL
+            file_url = urljoin(base_url, filename)
             if self._check_url_exists(file_url):
                 logger.info(f"Discovery found best file: {file_url}")
                 return file_url
             # For llms files, also try common subdirectories
             if filename.startswith('llms'):
                 for subdir in ["static", "public", "docs", "assets", "doc", "api"]:
-                    subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(subdir_url):
                         logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                         return subdir_url
             # For sitemap files, also try common subdirectories
             if filename.endswith('.xml') and not filename.startswith('.well-known'):
                 for subdir in ["sitemaps", "sitemap", "xml", "feed"]:
-                    subdir_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    subdir_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(subdir_url):
                         logger.info(f"Discovery found best file in subdirectory: {subdir_url}")
                         return subdir_url
@@ -119,7 +119,7 @@ class DiscoveryService:
             # Priority 2: Check standard locations in priority order
             for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
-                sitemap_url = urljoin(base_url, f"/{filename}")
+                sitemap_url = urljoin(base_url, filename)
                 if self._check_url_exists(sitemap_url):
                     return sitemap_url
@@ -127,7 +127,7 @@ class DiscoveryService:
             subdirs = ["sitemaps", "sitemap", "xml", "feed"]
             for subdir in subdirs:
                 for filename in self.DISCOVERY_TARGETS["sitemap_files"]:
-                    sitemap_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    sitemap_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(sitemap_url):
                         return sitemap_url
@@ -137,7 +137,7 @@ class DiscoveryService:
                 return html_sitemaps[0]  # Use first sitemap from HTML
             # Priority 5: Check .well-known directory
-            well_known_sitemap = urljoin(base_url, "/.well-known/sitemap.xml")
+            well_known_sitemap = urljoin(base_url, ".well-known/sitemap.xml")
             if self._check_url_exists(well_known_sitemap):
                 return well_known_sitemap
@@ -158,7 +158,7 @@ class DiscoveryService:
         try:
             # Priority 1: Check standard root locations in priority order
             for filename in self.DISCOVERY_TARGETS["llms_files"]:
-                llms_url = urljoin(base_url, f"/{filename}")
+                llms_url = urljoin(base_url, filename)
                 if self._check_url_exists(llms_url):
                     return llms_url
@@ -166,13 +166,13 @@ class DiscoveryService:
             subdirs = ["static", "public", "docs", "assets", "doc", "api"]
             for subdir in subdirs:
                 for filename in self.DISCOVERY_TARGETS["llms_files"]:
-                    llms_url = urljoin(base_url, f"/{subdir}/{filename}")
+                    llms_url = urljoin(base_url, f"{subdir}/{filename}")
                     if self._check_url_exists(llms_url):
                         return llms_url
             # Priority 3: Check .well-known directory variants
             for well_known_file in [".well-known/ai.txt", ".well-known/llms.txt"]:
-                well_known_url = urljoin(base_url, f"/{well_known_file}")
+                well_known_url = urljoin(base_url, well_known_file)
                 if self._check_url_exists(well_known_url):
                     return well_known_url
@@ -186,7 +186,7 @@ class DiscoveryService:
        Discover robots.txt file (always single file at root).
        """
        try:
-            robots_url = urljoin(base_url, "/robots.txt")
+            robots_url = urljoin(base_url, "robots.txt")
            if self._check_url_exists(robots_url):
                return robots_url
        except Exception:
@@ -210,17 +210,18 @@ class DiscoveryService:
     def _parse_robots_txt(self, base_url: str) -> list[str]:
         """
         Extract sitemap URLs from robots.txt.
         Args:
             base_url: Base URL to check robots.txt for
         Returns:
             List of sitemap URLs found in robots.txt
         """
         sitemaps: list[str] = []
         try:
-            robots_url = urljoin(base_url, "/robots.txt")
+            # Use robots.txt relative to the given URL, not always root
+            robots_url = urljoin(base_url, "robots.txt")
             logger.info(f"Checking robots.txt at {robots_url}")
             resp = requests.get(robots_url, timeout=30)
@@ -272,7 +273,7 @@ class DiscoveryService:
         for target_type, filename in all_targets:
             try:
-                file_url = urljoin(base_url, f"/{filename}")
+                file_url = urljoin(base_url, filename)
                 resp = requests.get(file_url, timeout=30, allow_redirects=True)
                 if resp.status_code == 200:
@@ -361,7 +362,7 @@ class DiscoveryService:
         try:
             for filename in self.DISCOVERY_TARGETS["well_known_files"]:
                 try:
-                    file_url = urljoin(base_url, f"/{filename}")
+                    file_url = urljoin(base_url, filename)
                     resp = requests.get(file_url, timeout=30, allow_redirects=True)
                     if resp.status_code == 200:
@@ -401,7 +402,7 @@ class DiscoveryService:
         for subdir in subdirs:
             for llms_file in self.DISCOVERY_TARGETS["llms_files"]:
                 try:
-                    file_url = urljoin(base_url, f"/{subdir}/{llms_file}")
+                    file_url = urljoin(base_url, f"{subdir}/{llms_file}")
                     resp = requests.get(file_url, timeout=30, allow_redirects=True)
                     if resp.status_code == 200:
@@ -423,7 +424,7 @@ class DiscoveryService:
         for sitemap_path in sitemap_paths:
             try:
-                file_url = urljoin(base_url, f"/{sitemap_path}")
+                file_url = urljoin(base_url, sitemap_path)
                 resp = requests.get(file_url, timeout=30, allow_redirects=True)
                 if resp.status_code == 200: