Updating title extraction for llms.txt

Cole Medin
2025-10-10 18:16:03 -05:00
parent 4a9ed51cff
commit 77e9342c27
4 changed files with 75 additions and 28 deletions

View File

@@ -214,6 +214,7 @@ class CrawlingService:
         urls: list[str],
         max_concurrent: int | None = None,
         progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
+        link_text_fallbacks: dict[str, str] | None = None,
     ) -> list[dict[str, Any]]:
         """Batch crawl multiple URLs in parallel."""
         return await self.batch_strategy.crawl_batch_with_progress(
@@ -223,6 +224,7 @@ class CrawlingService:
             max_concurrent,
             progress_callback,
             self._check_cancellation,  # Pass cancellation check
+            link_text_fallbacks,  # Pass link text fallbacks
         )

     async def crawl_recursive_with_progress(
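The net effect of these two hunks is that callers of CrawlingService.crawl_batch_with_progress can now pass a URL-to-link-text mapping, which the service threads through to the batch strategy. A minimal usage sketch, assuming `service` is an already-constructed CrawlingService (the URLs and texts here are hypothetical):

    # Hypothetical call; only the link_text_fallbacks parameter is new.
    fallbacks = {
        "https://example.com/docs/intro": "Introduction",
        "https://example.com/docs/api": "API Reference",
    }
    results = await service.crawl_batch_with_progress(
        urls=list(fallbacks.keys()),
        link_text_fallbacks=fallbacks,  # consulted only when a page yields no usable <title>
    )
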
@@ -698,35 +700,40 @@ class CrawlingService:
         if crawl_results and len(crawl_results) > 0:
             content = crawl_results[0].get('markdown', '')

             if self.url_handler.is_link_collection_file(url, content):
-                # Extract links from the content
-                extracted_links = self.url_handler.extract_markdown_links(content, url)
+                # Extract links WITH text from the content
+                extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url)

                 # Filter out self-referential links to avoid redundant crawling
-                if extracted_links:
-                    original_count = len(extracted_links)
-                    extracted_links = [
-                        link for link in extracted_links
+                if extracted_links_with_text:
+                    original_count = len(extracted_links_with_text)
+                    extracted_links_with_text = [
+                        (link, text) for link, text in extracted_links_with_text
                         if not self._is_self_link(link, url)
                     ]
-                    self_filtered_count = original_count - len(extracted_links)
+                    self_filtered_count = original_count - len(extracted_links_with_text)
                     if self_filtered_count > 0:
                         logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")

                 # Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
-                if extracted_links:
-                    original_count = len(extracted_links)
-                    extracted_links = [link for link in extracted_links if not self.url_handler.is_binary_file(link)]
-                    filtered_count = original_count - len(extracted_links)
+                if extracted_links_with_text:
+                    original_count = len(extracted_links_with_text)
+                    extracted_links_with_text = [(link, text) for link, text in extracted_links_with_text if not self.url_handler.is_binary_file(link)]
+                    filtered_count = original_count - len(extracted_links_with_text)
                     if filtered_count > 0:
                         logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")

-                if extracted_links:
+                if extracted_links_with_text:
+                    # Build mapping of URL -> link text for title fallback
+                    url_to_link_text = {link: text for link, text in extracted_links_with_text}
+                    extracted_links = [link for link, _ in extracted_links_with_text]
+
                     # Crawl the extracted links using batch crawling
                     logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")
                     batch_results = await self.crawl_batch_with_progress(
                         extracted_links,
                         max_concurrent=request.get('max_concurrent'),  # None -> use DB settings
                         progress_callback=await self._create_crawl_progress_callback("crawling"),
+                        link_text_fallbacks=url_to_link_text,  # Pass link text for title fallback
                     )

                     # Combine original text file results with batch results
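
With this change the self-link and binary filters operate on (url, text) tuples end to end; bare URLs are peeled off only at the crawl step, while the texts survive in the fallback mapping. A self-contained sketch of that pipeline, with a stand-in predicate for the real URLHandler.is_binary_file check:

    # Stand-in predicate; the real code delegates to URLHandler.
    def is_binary(url: str) -> bool:
        return url.lower().endswith((".pdf", ".png", ".zip"))

    links = [("https://ex.com/a.pdf", "Spec PDF"), ("https://ex.com/guide", "Guide")]
    links = [(u, t) for u, t in links if not is_binary(u)]
    url_to_link_text = {u: t for u, t in links}  # title fallbacks, keyed by URL
    extracted_links = [u for u, _ in links]      # what actually gets crawled
    assert url_to_link_text == {"https://ex.com/guide": "Guide"}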

View File

@@ -282,21 +282,37 @@ class URLHandler:
     def extract_markdown_links(content: str, base_url: Optional[str] = None) -> List[str]:
         """
         Extract markdown-style links from text content.

         Args:
             content: Text content to extract links from
             base_url: Base URL to resolve relative links against

         Returns:
             List of absolute URLs found in the content
         """
+        # Extract with text and return only URLs for backward compatibility
+        links_with_text = URLHandler.extract_markdown_links_with_text(content, base_url)
+        return [url for url, _ in links_with_text]
+
+    @staticmethod
+    def extract_markdown_links_with_text(content: str, base_url: Optional[str] = None) -> List[tuple[str, str]]:
+        """
+        Extract markdown-style links from text content with their link text.
+
+        Args:
+            content: Text content to extract links from
+            base_url: Base URL to resolve relative links against
+
+        Returns:
+            List of (url, link_text) tuples
+        """
         try:
             if not content:
                 return []

             # Ultimate URL pattern with comprehensive format support:
             # 1) [text](url) - markdown links
             # 2) <https://...> - autolinks
             # 3) https://... - bare URLs with protocol
             # 4) //example.com - protocol-relative URLs
             # 5) www.example.com - scheme-less www URLs
@@ -317,7 +333,7 @@ class URLHandler:
                 cleaned = ''.join(c for c in cleaned if unicodedata.category(c) not in ('Cf', 'Cc'))
                 return cleaned

-            urls = []
+            links = []
             for match in re.finditer(combined_pattern, content):
                 url = (
                     match.group('md')
@@ -350,21 +366,24 @@ class URLHandler:
                 # Only include HTTP/HTTPS URLs
                 if url.startswith(('http://', 'https://')):
-                    urls.append(url)
+                    # Extract link text if available (from markdown links)
+                    link_text = match.group('text') if match.group('md') else ''
+                    link_text = link_text.strip() if link_text else ''
+                    links.append((url, link_text))

-            # Remove duplicates while preserving order
+            # Remove duplicates while preserving order (first occurrence wins)
             seen = set()
-            unique_urls = []
-            for url in urls:
+            unique_links = []
+            for url, text in links:
                 if url not in seen:
                     seen.add(url)
-                    unique_urls.append(url)
+                    unique_links.append((url, text))

-            logger.info(f"Extracted {len(unique_urls)} unique links from content")
-            return unique_urls
+            logger.info(f"Extracted {len(unique_links)} unique links from content")
+            return unique_links

         except Exception as e:
-            logger.error(f"Error extracting markdown links: {e}", exc_info=True)
+            logger.error(f"Error extracting markdown links with text: {e}", exc_info=True)
             return []

     @staticmethod
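
Because deduplication keeps the first occurrence, a URL that appears first as a markdown link and again later as a bare URL keeps its link text. An illustrative call, assuming the combined pattern matches these two basic forms (it should, per the comment block above):

    content = "[Quickstart](https://ex.com/start) mentioned again at https://ex.com/start"
    links = URLHandler.extract_markdown_links_with_text(content)
    # Expected: [("https://ex.com/start", "Quickstart")]
    # The bare duplicate is dropped; the markdown link text wins.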

View File

@@ -38,6 +38,7 @@ class BatchCrawlStrategy:
         max_concurrent: int | None = None,
         progress_callback: Callable[..., Awaitable[None]] | None = None,
         cancellation_check: Callable[[], None] | None = None,
+        link_text_fallbacks: dict[str, str] | None = None,
     ) -> list[dict[str, Any]]:
         """
         Batch crawl multiple URLs in parallel with progress reporting.
@@ -49,6 +50,7 @@ class BatchCrawlStrategy:
             max_concurrent: Maximum concurrent crawls
             progress_callback: Optional callback for progress updates
             cancellation_check: Optional function to check for cancellation
+            link_text_fallbacks: Optional dict mapping URLs to link text for title fallback

         Returns:
             List of crawl results
@@ -247,6 +249,12 @@ class BatchCrawlStrategy:
                 if extracted_title:
                     title = extracted_title

+            # Fallback to link text if HTML title extraction failed
+            if title == "Untitled" and link_text_fallbacks:
+                fallback_text = link_text_fallbacks.get(original_url, "")
+                if fallback_text:
+                    title = fallback_text
+
             successful_results.append({
                 "url": original_url,
                 "markdown": result.markdown.fit_markdown,

View File

@@ -277,10 +277,23 @@ class RecursiveCrawlStrategy:
                     total_processed += 1

                     if result.success and result.markdown and result.markdown.fit_markdown:
+                        # Extract title from HTML <title> tag
+                        title = "Untitled"
+                        if result.html:
+                            import re
+                            title_match = re.search(r'<title[^>]*>(.*?)</title>', result.html, re.IGNORECASE | re.DOTALL)
+                            if title_match:
+                                extracted_title = title_match.group(1).strip()
+                                # Clean up HTML entities
+                                extracted_title = extracted_title.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"')
+                                if extracted_title:
+                                    title = extracted_title
+
                         results_all.append({
                             "url": original_url,
                             "markdown": result.markdown.fit_markdown,
                             "html": result.html,  # Always use raw HTML for code extraction
+                            "title": title,
                         })
                         depth_successful += 1
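
The entity cleanup here handles only the four most common escapes. Should broader coverage ever be needed, the standard library's html.unescape handles the general case; a possible refactor sketch (not part of this commit), which would also hoist the import and compile the pattern once:

    import html
    import re

    TITLE_RE = re.compile(r'<title[^>]*>(.*?)</title>', re.IGNORECASE | re.DOTALL)

    def extract_title(raw_html: str) -> str:
        # Fall back to "Untitled" when no <title> tag is present or it is empty.
        match = TITLE_RE.search(raw_html)
        if not match:
            return "Untitled"
        title = html.unescape(match.group(1).strip())
        return title or "Untitled"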