Updating title extraction for llms.txt

Cole Medin
2025-10-10 18:16:03 -05:00
parent 4a9ed51cff
commit 77e9342c27
4 changed files with 75 additions and 28 deletions

View File

@@ -214,6 +214,7 @@ class CrawlingService:
         urls: list[str],
         max_concurrent: int | None = None,
         progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
+        link_text_fallbacks: dict[str, str] | None = None,
     ) -> list[dict[str, Any]]:
         """Batch crawl multiple URLs in parallel."""
         return await self.batch_strategy.crawl_batch_with_progress(
@@ -223,6 +224,7 @@ class CrawlingService:
             max_concurrent,
             progress_callback,
             self._check_cancellation,  # Pass cancellation check
+            link_text_fallbacks,  # Pass link text fallbacks
         )

     async def crawl_recursive_with_progress(
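The net effect of these two hunks is that callers of CrawlingService.crawl_batch_with_progress can now pass a URL-to-link-text mapping, which the service threads through to the batch strategy. A minimal usage sketch, assuming `service` is an already-constructed CrawlingService (the URLs and texts here are hypothetical):

    # Hypothetical call; only the link_text_fallbacks parameter is new.
    fallbacks = {
        "https://example.com/docs/intro": "Introduction",
        "https://example.com/docs/api": "API Reference",
    }
    results = await service.crawl_batch_with_progress(
        urls=list(fallbacks.keys()),
        link_text_fallbacks=fallbacks,  # consulted only when a page yields no usable <title>
    )
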
@@ -698,35 +700,40 @@ class CrawlingService:
         if crawl_results and len(crawl_results) > 0:
             content = crawl_results[0].get('markdown', '')

             if self.url_handler.is_link_collection_file(url, content):
-                # Extract links from the content
-                extracted_links = self.url_handler.extract_markdown_links(content, url)
+                # Extract links WITH text from the content
+                extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url)

                 # Filter out self-referential links to avoid redundant crawling
-                if extracted_links:
-                    original_count = len(extracted_links)
-                    extracted_links = [
-                        link for link in extracted_links
+                if extracted_links_with_text:
+                    original_count = len(extracted_links_with_text)
+                    extracted_links_with_text = [
+                        (link, text) for link, text in extracted_links_with_text
                         if not self._is_self_link(link, url)
                     ]
-                    self_filtered_count = original_count - len(extracted_links)
+                    self_filtered_count = original_count - len(extracted_links_with_text)
                     if self_filtered_count > 0:
                         logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")

                 # Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
-                if extracted_links:
-                    original_count = len(extracted_links)
-                    extracted_links = [link for link in extracted_links if not self.url_handler.is_binary_file(link)]
-                    filtered_count = original_count - len(extracted_links)
+                if extracted_links_with_text:
+                    original_count = len(extracted_links_with_text)
+                    extracted_links_with_text = [(link, text) for link, text in extracted_links_with_text if not self.url_handler.is_binary_file(link)]
+                    filtered_count = original_count - len(extracted_links_with_text)
                     if filtered_count > 0:
                         logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")

-                if extracted_links:
+                if extracted_links_with_text:
+                    # Build mapping of URL -> link text for title fallback
+                    url_to_link_text = {link: text for link, text in extracted_links_with_text}
+                    extracted_links = [link for link, _ in extracted_links_with_text]
+
                     # Crawl the extracted links using batch crawling
                     logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")
                     batch_results = await self.crawl_batch_with_progress(
                         extracted_links,
                         max_concurrent=request.get('max_concurrent'),  # None -> use DB settings
                         progress_callback=await self._create_crawl_progress_callback("crawling"),
+                        link_text_fallbacks=url_to_link_text,  # Pass link text for title fallback
                     )

                     # Combine original text file results with batch results
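
With this change the self-link and binary filters operate on (url, text) tuples end to end; bare URLs are peeled off only at the crawl step, while the texts survive in the fallback mapping. A self-contained sketch of that pipeline, with a stand-in predicate for the real URLHandler.is_binary_file check:

    # Stand-in predicate; the real code delegates to URLHandler.
    def is_binary(url: str) -> bool:
        return url.lower().endswith((".pdf", ".png", ".zip"))

    links = [("https://ex.com/a.pdf", "Spec PDF"), ("https://ex.com/guide", "Guide")]
    links = [(u, t) for u, t in links if not is_binary(u)]
    url_to_link_text = {u: t for u, t in links}  # title fallbacks, keyed by URL
    extracted_links = [u for u, _ in links]      # what actually gets crawled
    assert url_to_link_text == {"https://ex.com/guide": "Guide"}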

View File

@@ -282,21 +282,37 @@ class URLHandler:
     def extract_markdown_links(content: str, base_url: Optional[str] = None) -> List[str]:
         """
         Extract markdown-style links from text content.

         Args:
             content: Text content to extract links from
             base_url: Base URL to resolve relative links against

         Returns:
             List of absolute URLs found in the content
         """
+        # Extract with text and return only URLs for backward compatibility
+        links_with_text = URLHandler.extract_markdown_links_with_text(content, base_url)
+        return [url for url, _ in links_with_text]
+
+    @staticmethod
+    def extract_markdown_links_with_text(content: str, base_url: Optional[str] = None) -> List[tuple[str, str]]:
+        """
+        Extract markdown-style links from text content with their link text.
+
+        Args:
+            content: Text content to extract links from
+            base_url: Base URL to resolve relative links against
+
+        Returns:
+            List of (url, link_text) tuples
+        """
         try:
             if not content:
                 return []

             # Ultimate URL pattern with comprehensive format support:
             # 1) [text](url) - markdown links
             # 2) <https://...> - autolinks
             # 3) https://... - bare URLs with protocol
             # 4) //example.com - protocol-relative URLs
             # 5) www.example.com - scheme-less www URLs
@@ -317,7 +333,7 @@ class URLHandler:
                 cleaned = ''.join(c for c in cleaned if unicodedata.category(c) not in ('Cf', 'Cc'))
                 return cleaned

-            urls = []
+            links = []
             for match in re.finditer(combined_pattern, content):
                 url = (
                     match.group('md')
@@ -350,21 +366,24 @@ class URLHandler:
                 # Only include HTTP/HTTPS URLs
                 if url.startswith(('http://', 'https://')):
-                    urls.append(url)
+                    # Extract link text if available (from markdown links)
+                    link_text = match.group('text') if match.group('md') else ''
+                    link_text = link_text.strip() if link_text else ''
+                    links.append((url, link_text))

-            # Remove duplicates while preserving order
+            # Remove duplicates while preserving order (first occurrence wins)
             seen = set()
-            unique_urls = []
-            for url in urls:
+            unique_links = []
+            for url, text in links:
                 if url not in seen:
                     seen.add(url)
-                    unique_urls.append(url)
+                    unique_links.append((url, text))

-            logger.info(f"Extracted {len(unique_urls)} unique links from content")
-            return unique_urls
+            logger.info(f"Extracted {len(unique_links)} unique links from content")
+            return unique_links

         except Exception as e:
-            logger.error(f"Error extracting markdown links: {e}", exc_info=True)
+            logger.error(f"Error extracting markdown links with text: {e}", exc_info=True)
             return []

     @staticmethod
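
Because deduplication keeps the first occurrence, a URL that appears first as a markdown link and again later as a bare URL keeps its link text. An illustrative call, assuming the combined pattern matches these two basic forms (it should, per the comment block above):

    content = "[Quickstart](https://ex.com/start) mentioned again at https://ex.com/start"
    links = URLHandler.extract_markdown_links_with_text(content)
    # Expected: [("https://ex.com/start", "Quickstart")]
    # The bare duplicate is dropped; the markdown link text wins.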

View File

@@ -38,6 +38,7 @@ class BatchCrawlStrategy:
         max_concurrent: int | None = None,
         progress_callback: Callable[..., Awaitable[None]] | None = None,
         cancellation_check: Callable[[], None] | None = None,
+        link_text_fallbacks: dict[str, str] | None = None,
     ) -> list[dict[str, Any]]:
         """
         Batch crawl multiple URLs in parallel with progress reporting.
@@ -49,6 +50,7 @@ class BatchCrawlStrategy:
             max_concurrent: Maximum concurrent crawls
             progress_callback: Optional callback for progress updates
             cancellation_check: Optional function to check for cancellation
+            link_text_fallbacks: Optional dict mapping URLs to link text for title fallback

         Returns:
             List of crawl results
@@ -247,6 +249,12 @@ class BatchCrawlStrategy:
                 if extracted_title:
                     title = extracted_title

+            # Fallback to link text if HTML title extraction failed
+            if title == "Untitled" and link_text_fallbacks:
+                fallback_text = link_text_fallbacks.get(original_url, "")
+                if fallback_text:
+                    title = fallback_text
+
             successful_results.append({
                 "url": original_url,
                 "markdown": result.markdown.fit_markdown,

View File

@@ -277,10 +277,23 @@ class RecursiveCrawlStrategy:
                     total_processed += 1

                     if result.success and result.markdown and result.markdown.fit_markdown:
+                        # Extract title from HTML <title> tag
+                        title = "Untitled"
+                        if result.html:
+                            import re
+                            title_match = re.search(r'<title[^>]*>(.*?)</title>', result.html, re.IGNORECASE | re.DOTALL)
+                            if title_match:
+                                extracted_title = title_match.group(1).strip()
+                                # Clean up HTML entities
+                                extracted_title = extracted_title.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"')
+                                if extracted_title:
+                                    title = extracted_title
+
                         results_all.append({
                             "url": original_url,
                             "markdown": result.markdown.fit_markdown,
                             "html": result.html,  # Always use raw HTML for code extraction
+                            "title": title,
                         })
                         depth_successful += 1
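
The entity cleanup here handles only the four most common escapes. Should broader coverage ever be needed, the standard library's html.unescape handles the general case; a possible refactor sketch (not part of this commit), which would also hoist the import and compile the pattern once:

    import html
    import re

    TITLE_RE = re.compile(r'<title[^>]*>(.*?)</title>', re.IGNORECASE | re.DOTALL)

    def extract_title(raw_html: str) -> str:
        # Fall back to "Untitled" when no <title> tag is present or it is empty.
        match = TITLE_RE.search(raw_html)
        if not match:
            return "Untitled"
        title = html.unescape(match.group(1).strip())
        return title or "Untitled"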