Updating title extraction for llms.txt
@@ -214,6 +214,7 @@ class CrawlingService:
         urls: list[str],
         max_concurrent: int | None = None,
         progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
+        link_text_fallbacks: dict[str, str] | None = None,
     ) -> list[dict[str, Any]]:
         """Batch crawl multiple URLs in parallel."""
         return await self.batch_strategy.crawl_batch_with_progress(
@@ -223,6 +224,7 @@ class CrawlingService:
             max_concurrent,
             progress_callback,
             self._check_cancellation, # Pass cancellation check
+            link_text_fallbacks, # Pass link text fallbacks
         )

     async def crawl_recursive_with_progress(
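A hedged sketch of the new call path, assuming a CrawlingService instance named service (the instance name and the exact result shape are assumptions, not shown in this diff):

    # Hypothetical caller; link_text_fallbacks maps each URL to the anchor
    # text it was discovered under, consulted only when <title> extraction fails.
    results = await service.crawl_batch_with_progress(
        urls=["https://example.com/docs/a", "https://example.com/docs/b"],
        link_text_fallbacks={"https://example.com/docs/a": "Getting Started"},
    )
    for result in results:
        print(result["url"], result.get("title"))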
@@ -698,35 +700,40 @@ class CrawlingService:
         if crawl_results and len(crawl_results) > 0:
             content = crawl_results[0].get('markdown', '')
             if self.url_handler.is_link_collection_file(url, content):
-                # Extract links from the content
-                extracted_links = self.url_handler.extract_markdown_links(content, url)
+                # Extract links WITH text from the content
+                extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url)

                 # Filter out self-referential links to avoid redundant crawling
-                if extracted_links:
-                    original_count = len(extracted_links)
-                    extracted_links = [
-                        link for link in extracted_links
+                if extracted_links_with_text:
+                    original_count = len(extracted_links_with_text)
+                    extracted_links_with_text = [
+                        (link, text) for link, text in extracted_links_with_text
                         if not self._is_self_link(link, url)
                     ]
-                    self_filtered_count = original_count - len(extracted_links)
+                    self_filtered_count = original_count - len(extracted_links_with_text)
                     if self_filtered_count > 0:
                         logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")

                 # Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
-                if extracted_links:
-                    original_count = len(extracted_links)
-                    extracted_links = [link for link in extracted_links if not self.url_handler.is_binary_file(link)]
-                    filtered_count = original_count - len(extracted_links)
+                if extracted_links_with_text:
+                    original_count = len(extracted_links_with_text)
+                    extracted_links_with_text = [(link, text) for link, text in extracted_links_with_text if not self.url_handler.is_binary_file(link)]
+                    filtered_count = original_count - len(extracted_links_with_text)
                     if filtered_count > 0:
                         logger.info(f"Filtered out {filtered_count} binary files from {original_count} extracted links")

-                if extracted_links:
+                if extracted_links_with_text:
+                    # Build mapping of URL -> link text for title fallback
+                    url_to_link_text = {link: text for link, text in extracted_links_with_text}
+                    extracted_links = [link for link, _ in extracted_links_with_text]
+
                     # Crawl the extracted links using batch crawling
                     logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")
                     batch_results = await self.crawl_batch_with_progress(
                         extracted_links,
                         max_concurrent=request.get('max_concurrent'), # None -> use DB settings
                         progress_callback=await self._create_crawl_progress_callback("crawling"),
+                        link_text_fallbacks=url_to_link_text, # Pass link text for title fallback
                     )

                     # Combine original text file results with batch results
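The net effect of this hunk is that the service now carries (url, text) pairs end to end. A minimal sketch of the mapping step with invented data:

    # The extractor already de-duplicates URLs, so the dict comprehension is lossless.
    extracted_links_with_text = [
        ("https://example.com/docs/intro", "Introduction"),
        ("https://example.com/docs/api", "API Reference"),
    ]
    url_to_link_text = {link: text for link, text in extracted_links_with_text}
    extracted_links = [link for link, _ in extracted_links_with_text]
    assert url_to_link_text["https://example.com/docs/api"] == "API Reference"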
@@ -282,21 +282,37 @@ class URLHandler:
     def extract_markdown_links(content: str, base_url: Optional[str] = None) -> List[str]:
         """
         Extract markdown-style links from text content.

         Args:
             content: Text content to extract links from
             base_url: Base URL to resolve relative links against

         Returns:
             List of absolute URLs found in the content
         """
+        # Extract with text and return only URLs for backward compatibility
+        links_with_text = URLHandler.extract_markdown_links_with_text(content, base_url)
+        return [url for url, _ in links_with_text]
+
+    @staticmethod
+    def extract_markdown_links_with_text(content: str, base_url: Optional[str] = None) -> List[tuple[str, str]]:
+        """
+        Extract markdown-style links from text content with their link text.
+
+        Args:
+            content: Text content to extract links from
+            base_url: Base URL to resolve relative links against
+
+        Returns:
+            List of (url, link_text) tuples
+        """
         try:
             if not content:
                 return []

             # Ultimate URL pattern with comprehensive format support:
             # 1) [text](url) - markdown links
             # 2) <https://...> - autolinks
             # 3) https://... - bare URLs with protocol
             # 4) //example.com - protocol-relative URLs
             # 5) www.example.com - scheme-less www URLs
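A hedged usage sketch of the two extractors declared above; the sample content is invented, and resolution of the relative path against base_url is assumed from the docstring. Autolinks carry no link text, so they fall back to "":

    content = "[Quickstart](/docs/quickstart) and <https://example.com/changelog>"
    pairs = URLHandler.extract_markdown_links_with_text(content, base_url="https://example.com")
    # -> [("https://example.com/docs/quickstart", "Quickstart"),
    #     ("https://example.com/changelog", "")]
    urls = URLHandler.extract_markdown_links(content, base_url="https://example.com")
    # -> same URLs with the text dropped, preserving the old return type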
@@ -317,7 +333,7 @@ class URLHandler:
                 cleaned = ''.join(c for c in cleaned if unicodedata.category(c) not in ('Cf', 'Cc'))
                 return cleaned

-            urls = []
+            links = []
             for match in re.finditer(combined_pattern, content):
                 url = (
                     match.group('md')
@@ -350,21 +366,24 @@ class URLHandler:

                 # Only include HTTP/HTTPS URLs
                 if url.startswith(('http://', 'https://')):
-                    urls.append(url)
+                    # Extract link text if available (from markdown links)
+                    link_text = match.group('text') if match.group('md') else ''
+                    link_text = link_text.strip() if link_text else ''
+                    links.append((url, link_text))

-            # Remove duplicates while preserving order
+            # Remove duplicates while preserving order (first occurrence wins)
             seen = set()
-            unique_urls = []
-            for url in urls:
+            unique_links = []
+            for url, text in links:
                 if url not in seen:
                     seen.add(url)
-                    unique_urls.append(url)
+                    unique_links.append((url, text))

-            logger.info(f"Extracted {len(unique_urls)} unique links from content")
-            return unique_urls
+            logger.info(f"Extracted {len(unique_links)} unique links from content")
+            return unique_links

         except Exception as e:
-            logger.error(f"Error extracting markdown links: {e}", exc_info=True)
+            logger.error(f"Error extracting markdown links with text: {e}", exc_info=True)
             return []

     @staticmethod
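A standalone sketch of the order-preserving de-duplication above, with invented pairs. Dedup keys on the URL alone, so the first link text seen for a URL wins:

    links = [
        ("https://a.example/x", "Intro"),
        ("https://a.example/y", ""),
        ("https://a.example/x", "Later duplicate"),
    ]
    seen = set()
    unique_links = []
    for url, text in links:
        if url not in seen:
            seen.add(url)
            unique_links.append((url, text))
    assert unique_links == [("https://a.example/x", "Intro"), ("https://a.example/y", "")]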
@@ -38,6 +38,7 @@ class BatchCrawlStrategy:
         max_concurrent: int | None = None,
         progress_callback: Callable[..., Awaitable[None]] | None = None,
         cancellation_check: Callable[[], None] | None = None,
+        link_text_fallbacks: dict[str, str] | None = None,
     ) -> list[dict[str, Any]]:
         """
         Batch crawl multiple URLs in parallel with progress reporting.
@@ -49,6 +50,7 @@ class BatchCrawlStrategy:
             max_concurrent: Maximum concurrent crawls
             progress_callback: Optional callback for progress updates
             cancellation_check: Optional function to check for cancellation
+            link_text_fallbacks: Optional dict mapping URLs to link text for title fallback

         Returns:
             List of crawl results
@@ -247,6 +249,12 @@ class BatchCrawlStrategy:
                     if extracted_title:
                         title = extracted_title

+                # Fallback to link text if HTML title extraction failed
+                if title == "Untitled" and link_text_fallbacks:
+                    fallback_text = link_text_fallbacks.get(original_url, "")
+                    if fallback_text:
+                        title = fallback_text
+
                 successful_results.append({
                     "url": original_url,
                     "markdown": result.markdown.fit_markdown,
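The fallback chain in isolation (all values invented): an extracted <title> always wins, and the anchor text is consulted only when the page yielded no usable title:

    title = "Untitled"  # i.e., <title> extraction found nothing
    link_text_fallbacks = {"https://example.com/guide": "User Guide"}
    original_url = "https://example.com/guide"
    if title == "Untitled" and link_text_fallbacks:
        fallback_text = link_text_fallbacks.get(original_url, "")
        if fallback_text:
            title = fallback_text  # -> "User Guide"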
@@ -277,10 +277,23 @@ class RecursiveCrawlStrategy:
                 total_processed += 1

                 if result.success and result.markdown and result.markdown.fit_markdown:
+                    # Extract title from HTML <title> tag
+                    title = "Untitled"
+                    if result.html:
+                        import re
+                        title_match = re.search(r'<title[^>]*>(.*?)</title>', result.html, re.IGNORECASE | re.DOTALL)
+                        if title_match:
+                            extracted_title = title_match.group(1).strip()
+                            # Clean up HTML entities
+                            extracted_title = extracted_title.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"')
+                            if extracted_title:
+                                title = extracted_title
+
                     results_all.append({
                         "url": original_url,
                         "markdown": result.markdown.fit_markdown,
                         "html": result.html, # Always use raw HTML for code extraction
+                        "title": title,
                     })
                     depth_successful += 1

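The same title extraction as a self-contained sketch (html_doc is made up). The manual cleanup handles only four entities; the standard library's html.unescape would be a broader alternative:

    import re

    html_doc = "<html><head><title>Tom &amp; Jerry &lt;Docs&gt;</title></head></html>"
    title = "Untitled"
    match = re.search(r'<title[^>]*>(.*?)</title>', html_doc, re.IGNORECASE | re.DOTALL)
    if match:
        extracted = match.group(1).strip()
        # Same four replacements as the diff above
        extracted = extracted.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"')
        if extracted:
            title = extracted
    print(title)  # Tom & Jerry <Docs>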