diff --git a/python/src/server/services/crawling/crawling_service.py b/python/src/server/services/crawling/crawling_service.py
index 7a68030e..745f7d93 100644
--- a/python/src/server/services/crawling/crawling_service.py
+++ b/python/src/server/services/crawling/crawling_service.py
@@ -214,6 +214,7 @@ class CrawlingService:
         urls: list[str],
         max_concurrent: int | None = None,
         progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
+        link_text_fallbacks: dict[str, str] | None = None,
     ) -> list[dict[str, Any]]:
         """Batch crawl multiple URLs in parallel."""
         return await self.batch_strategy.crawl_batch_with_progress(
@@ -223,6 +224,7 @@ class CrawlingService:
             max_concurrent,
             progress_callback,
             self._check_cancellation,  # Pass cancellation check
+            link_text_fallbacks,  # Pass link text fallbacks
         )

     async def crawl_recursive_with_progress(
@@ -698,35 +700,40 @@ class CrawlingService:
             if crawl_results and len(crawl_results) > 0:
                 content = crawl_results[0].get('markdown', '')
                 if self.url_handler.is_link_collection_file(url, content):
-                    # Extract links from the content
-                    extracted_links = self.url_handler.extract_markdown_links(content, url)
+                    # Extract links WITH text from the content
+                    extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url)

                     # Filter out self-referential links to avoid redundant crawling
-                    if extracted_links:
-                        original_count = len(extracted_links)
-                        extracted_links = [
-                            link for link in extracted_links
+                    if extracted_links_with_text:
+                        original_count = len(extracted_links_with_text)
+                        extracted_links_with_text = [
+                            (link, text) for link, text in extracted_links_with_text
                             if not self._is_self_link(link, url)
                         ]
-                        self_filtered_count = original_count - len(extracted_links)
+                        self_filtered_count = original_count - len(extracted_links_with_text)
                         if self_filtered_count > 0:
                             logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")

                     # Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
-                    if extracted_links:
-                        original_count = len(extracted_links)
-                        extracted_links = [link for link in extracted_links if not self.url_handler.is_binary_file(link)]
-                        filtered_count = original_count - len(extracted_links)
+                    if extracted_links_with_text:
+                        original_count = len(extracted_links_with_text)
+                        extracted_links_with_text = [(link, text) for link, text in extracted_links_with_text if not self.url_handler.is_binary_file(link)]
+                        filtered_count = original_count - len(extracted_links_with_text)
                         if filtered_count > 0:
                             logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")

-                    if extracted_links:
+                    if extracted_links_with_text:
+                        # Build mapping of URL -> link text for title fallback
+                        url_to_link_text = {link: text for link, text in extracted_links_with_text}
+                        extracted_links = [link for link, _ in extracted_links_with_text]
+
                         # Crawl the extracted links using batch crawling
                         logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")
                         batch_results = await self.crawl_batch_with_progress(
                             extracted_links,
                             max_concurrent=request.get('max_concurrent'),  # None -> use DB settings
                             progress_callback=await self._create_crawl_progress_callback("crawling"),
+                            link_text_fallbacks=url_to_link_text,  # Pass link text for title fallback
                         )

                         # Combine original text file results with batch results
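
How the new `link_text_fallbacks` plumbing is meant to flow, as a minimal standalone sketch (not taken from the patch; the sample URLs and link texts are made up, and only the shapes of `extracted_links_with_text`, `url_to_link_text`, and `extracted_links` mirror the code above):

```python
# Hypothetical extraction result: (url, link_text) tuples from a link-collection page
extracted_links_with_text = [
    ("https://example.com/guide", "Getting Started Guide"),
    ("https://example.com/api", ""),  # autolink or bare URL: no link text available
]

# URL -> link text, used later as a title fallback when a page has no usable <title>
url_to_link_text = {link: text for link, text in extracted_links_with_text}
# Plain URL list keeps the existing batch-crawl call shape unchanged
extracted_links = [link for link, _ in extracted_links_with_text]

assert url_to_link_text["https://example.com/guide"] == "Getting Started Guide"
assert extracted_links == ["https://example.com/guide", "https://example.com/api"]
```
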
diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py
index 97a9c5a5..3cf0f1dc 100644
--- a/python/src/server/services/crawling/helpers/url_handler.py
+++ b/python/src/server/services/crawling/helpers/url_handler.py
@@ -282,21 +282,37 @@ class URLHandler:
     def extract_markdown_links(content: str, base_url: Optional[str] = None) -> List[str]:
         """
         Extract markdown-style links from text content.
-
+
         Args:
             content: Text content to extract links from
             base_url: Base URL to resolve relative links against
-
+
         Returns:
             List of absolute URLs found in the content
         """
+        # Extract with text and return only URLs for backward compatibility
+        links_with_text = URLHandler.extract_markdown_links_with_text(content, base_url)
+        return [url for url, _ in links_with_text]
+
+    @staticmethod
+    def extract_markdown_links_with_text(content: str, base_url: Optional[str] = None) -> List[tuple[str, str]]:
+        """
+        Extract markdown-style links from text content with their link text.
+
+        Args:
+            content: Text content to extract links from
+            base_url: Base URL to resolve relative links against
+
+        Returns:
+            List of (url, link_text) tuples
+        """
         try:
             if not content:
                 return []
-
+
             # Ultimate URL pattern with comprehensive format support:
             # 1) [text](url) - markdown links
-            # 2) <url> - autolinks
+            # 2) <url> - autolinks
             # 3) https://... - bare URLs with protocol
             # 4) //example.com - protocol-relative URLs
             # 5) www.example.com - scheme-less www URLs
@@ -317,7 +333,7 @@ class URLHandler:
                 cleaned = ''.join(c for c in cleaned if unicodedata.category(c) not in ('Cf', 'Cc'))
                 return cleaned

-            urls = []
+            links = []
             for match in re.finditer(combined_pattern, content):
                 url = (
                     match.group('md')
@@ -350,21 +366,24 @@ class URLHandler:

                 # Only include HTTP/HTTPS URLs
                 if url.startswith(('http://', 'https://')):
-                    urls.append(url)
-
-            # Remove duplicates while preserving order
+                    # Extract link text if available (from markdown links)
+                    link_text = match.group('text') if match.group('md') else ''
+                    link_text = link_text.strip() if link_text else ''
+                    links.append((url, link_text))
+
+            # Remove duplicates while preserving order (first occurrence wins)
             seen = set()
-            unique_urls = []
-            for url in urls:
+            unique_links = []
+            for url, text in links:
                 if url not in seen:
                     seen.add(url)
-                    unique_urls.append(url)
-
-            logger.info(f"Extracted {len(unique_urls)} unique links from content")
-            return unique_urls
-
+                    unique_links.append((url, text))
+
+            logger.info(f"Extracted {len(unique_links)} unique links from content")
+            return unique_links
+
         except Exception as e:
-            logger.error(f"Error extracting markdown links: {e}", exc_info=True)
+            logger.error(f"Error extracting markdown links with text: {e}", exc_info=True)
             return []

     @staticmethod
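
An illustrative expectation for the new `extract_markdown_links_with_text` helper. The import path and the sample markdown are guesses based on the repository layout, not taken from the patch; the expected output follows from the logic above (markdown links keep their text, autolinks and bare URLs get an empty string, duplicates keep the first occurrence):

```python
from src.server.services.crawling.helpers.url_handler import URLHandler  # assumed import path

content = """
- [Quickstart](https://example.com/docs/quickstart)
- <https://example.com/changelog>
- https://example.com/blog/post
- [Quickstart again](https://example.com/docs/quickstart)
"""

links = URLHandler.extract_markdown_links_with_text(content)
# Expected:
# [("https://example.com/docs/quickstart", "Quickstart"),
#  ("https://example.com/changelog", ""),
#  ("https://example.com/blog/post", "")]
print(links)
```
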
diff --git a/python/src/server/services/crawling/strategies/batch.py b/python/src/server/services/crawling/strategies/batch.py
index 6a318879..16aea020 100644
--- a/python/src/server/services/crawling/strategies/batch.py
+++ b/python/src/server/services/crawling/strategies/batch.py
@@ -38,6 +38,7 @@ class BatchCrawlStrategy:
         max_concurrent: int | None = None,
         progress_callback: Callable[..., Awaitable[None]] | None = None,
         cancellation_check: Callable[[], None] | None = None,
+        link_text_fallbacks: dict[str, str] | None = None,
     ) -> list[dict[str, Any]]:
         """
         Batch crawl multiple URLs in parallel with progress reporting.
@@ -49,6 +50,7 @@ class BatchCrawlStrategy:
            max_concurrent: Maximum concurrent crawls
            progress_callback: Optional callback for progress updates
            cancellation_check: Optional function to check for cancellation
+           link_text_fallbacks: Optional dict mapping URLs to link text for title fallback

        Returns:
            List of crawl results
@@ -247,6 +249,12 @@ class BatchCrawlStrategy:
                    if extracted_title:
                        title = extracted_title

+                # Fallback to link text if HTML title extraction failed
+                if title == "Untitled" and link_text_fallbacks:
+                    fallback_text = link_text_fallbacks.get(original_url, "")
+                    if fallback_text:
+                        title = fallback_text
+
                 successful_results.append({
                     "url": original_url,
                     "markdown": result.markdown.fit_markdown,
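
The fallback rule added to batch.py, restated as a self-contained sketch so it can be exercised outside the strategy class (the `resolve_title` helper is hypothetical and exists only for this example; the decision logic mirrors the hunk above):

```python
def resolve_title(extracted_title: str | None, original_url: str,
                  link_text_fallbacks: dict[str, str] | None) -> str:
    # Prefer the <title> extracted from the page, if any
    title = extracted_title or "Untitled"
    # Only reach for the link text when title extraction produced nothing
    if title == "Untitled" and link_text_fallbacks:
        fallback_text = link_text_fallbacks.get(original_url, "")
        if fallback_text:
            title = fallback_text
    return title

fallbacks = {"https://example.com/guide": "Getting Started Guide"}
print(resolve_title(None, "https://example.com/guide", fallbacks))          # Getting Started Guide
print(resolve_title("Real Title", "https://example.com/guide", fallbacks))  # Real Title
print(resolve_title(None, "https://example.com/other", fallbacks))          # Untitled
```
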
diff --git a/python/src/server/services/crawling/strategies/recursive.py b/python/src/server/services/crawling/strategies/recursive.py
index d13b51d4..3cdee750 100644
--- a/python/src/server/services/crawling/strategies/recursive.py
+++ b/python/src/server/services/crawling/strategies/recursive.py
@@ -277,10 +277,23 @@ class RecursiveCrawlStrategy:
                 total_processed += 1

                 if result.success and result.markdown and result.markdown.fit_markdown:
+                    # Extract title from HTML <title> tag
+                    title = "Untitled"
+                    if result.html:
+                        import re
+                        title_match = re.search(r'<title[^>]*>(.*?)</title>', result.html, re.IGNORECASE | re.DOTALL)
+                        if title_match:
+                            extracted_title = title_match.group(1).strip()
+                            # Clean up HTML entities
+                            extracted_title = extracted_title.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"')
+                            if extracted_title:
+                                title = extracted_title
+
                     results_all.append({
                         "url": original_url,
                         "markdown": result.markdown.fit_markdown,
                         "html": result.html,  # Always use raw HTML for code extraction
+                        "title": title,
                     })
                     depth_successful += 1
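
A standalone sketch of the `<title>` extraction added to recursive.py. It swaps the patch's four chained `replace()` calls for `html.unescape`, which handles the same entities plus the rest of the named and numeric set; treat that as an alternative worth considering, not what the patch actually does:

```python
import html
import re

def extract_title(raw_html: str) -> str:
    # Default used when no <title> tag is found or it is empty
    title = "Untitled"
    if raw_html:
        match = re.search(r'<title[^>]*>(.*?)</title>', raw_html, re.IGNORECASE | re.DOTALL)
        if match:
            # Unescape entities such as &amp;, &lt;, &gt;, &quot;
            extracted = html.unescape(match.group(1).strip())
            if extracted:
                title = extracted
    return title

print(extract_title("<html><head><title>Docs &amp; Guides</title></head></html>"))  # Docs & Guides
print(extract_title("<html><head></head><body>No title</body></html>"))             # Untitled
```
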