Mirror of https://github.com/coleam00/Archon.git (synced 2026-01-04 21:58:47 -05:00)

Merge branch 'ui-changes-backup' into merge-ui-socket-fixes
@@ -316,8 +316,8 @@ def register_task_tools(mcp: FastMCP):

        Args:
            task_id: UUID of the task to update
            title: New title (optional)
            description: New description (optional)
            title: New task title (optional)
            description: New task description (optional)
            status: New status - "todo" | "doing" | "review" | "done" (optional)
            assignee: New assignee (optional)
            task_order: New priority order (optional)

@@ -358,7 +358,7 @@ def register_task_tools(mcp: FastMCP):

        if not update_fields:
            return MCPErrorFormatter.format_error(
                error_type="validation_error",
                message="No fields to update",
                message="No fields provided to update",
                suggestion="Provide at least one field to update",
            )
@@ -18,9 +18,7 @@ from pydantic import BaseModel

logger = logging.getLogger(__name__)

# Import Socket.IO instance
from ..socketio_app import get_socketio_instance

sio = get_socketio_instance()
from ..socketio_app import sio

# Create router
router = APIRouter(prefix="/api/agent-chat", tags=["agent-chat"])
File diff suppressed because it is too large
@@ -8,12 +8,10 @@ No other modules should import from this file.

import asyncio

from ..config.logfire_config import get_logger
from ..socketio_app import get_socketio_instance
from ..socketio_app import sio

logger = get_logger(__name__)

# Get Socket.IO instance
sio = get_socketio_instance()


# Core broadcast functions
@@ -13,13 +13,10 @@ from ..config.logfire_config import get_logger

from ..services.background_task_manager import get_task_manager
from ..services.projects.project_service import ProjectService
from ..services.projects.source_linking_service import SourceLinkingService
from ..socketio_app import get_socketio_instance
from ..socketio_app import sio

logger = get_logger(__name__)

# Get Socket.IO instance
sio = get_socketio_instance()
logger.info(f"🔗 [SOCKETIO] Socket.IO instance ID: {id(sio)}")

# Rate limiting for Socket.IO broadcasts
_last_broadcast_times: dict[str, float] = {}
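The common thread in the hunks above is replacing per-module get_socketio_instance() calls with a single sio object imported from socketio_app, so every router and service shares one Socket.IO server. A minimal sketch of what such a module-level singleton can look like, assuming the python-socketio library; everything beyond the name sio is illustrative rather than taken from this repository:

# socketio_app.py (sketch): build one AsyncServer at import time so that
# `from ..socketio_app import sio` yields the same instance everywhere.
import socketio

sio = socketio.AsyncServer(async_mode="asgi", cors_allowed_origins="*")

def get_socketio_instance() -> socketio.AsyncServer:
    # Kept only for backwards compatibility; returns the module-level singleton.
    return sio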
@@ -217,14 +217,21 @@ class CodeExtractionService:

        Returns:
            List of code blocks with metadata
        """
        import asyncio
        import time

        # Progress will be reported during the loop below

        all_code_blocks = []
        total_docs = len(crawl_results)
        completed_docs = 0

        # PERFORMANCE: Track extraction time per document
        MAX_EXTRACTION_TIME_PER_DOC = 5.0  # 5 seconds max per document

        for doc in crawl_results:
            try:
                doc_start_time = time.time()
                source_url = doc["url"]
                html_content = doc.get("html", "")
                md = doc.get("markdown", "")
@@ -234,9 +241,7 @@ class CodeExtractionService:

                    f"Document content check | url={source_url} | has_html={bool(html_content)} | has_markdown={bool(md)} | html_len={len(html_content) if html_content else 0} | md_len={len(md) if md else 0}"
                )

                # Get dynamic minimum length based on document context
                # Extract some context from the document for analysis
                doc_context = md[:1000] if md else html_content[:1000] if html_content else ""
                # Dynamic minimum length is handled inside the extraction methods

                # Check markdown first to see if it has code blocks
                if md:
@@ -287,15 +292,32 @@ class CodeExtractionService:

                # If not a text file or no code blocks found, try HTML extraction first
                if len(code_blocks) == 0 and html_content and not is_text_file:
                    safe_logfire_info(
                        f"Trying HTML extraction first | url={source_url} | html_length={len(html_content)}"
                    )
                    html_code_blocks = await self._extract_html_code_blocks(html_content)
                    if html_code_blocks:
                        code_blocks = html_code_blocks
                    # PERFORMANCE: Check if we've already spent too much time on this document
                    elapsed_time = time.time() - doc_start_time
                    if elapsed_time > MAX_EXTRACTION_TIME_PER_DOC:
                        safe_logfire_info(
                            f"Found {len(code_blocks)} code blocks from HTML | url={source_url}"
                            f"⏱️ Skipping HTML extraction for {source_url} - already spent {elapsed_time:.1f}s"
                        )
                    else:
                        safe_logfire_info(
                            f"Trying HTML extraction first | url={source_url} | html_length={len(html_content)}"
                        )
                        # Create a timeout for HTML extraction
                        remaining_time = MAX_EXTRACTION_TIME_PER_DOC - elapsed_time
                        try:
                            html_code_blocks = await asyncio.wait_for(
                                self._extract_html_code_blocks(html_content, source_url),
                                timeout=remaining_time
                            )
                            if html_code_blocks:
                                code_blocks = html_code_blocks
                                safe_logfire_info(
                                    f"Found {len(code_blocks)} code blocks from HTML | url={source_url}"
                                )
                        except asyncio.TimeoutError:
                            safe_logfire_info(
                                f"⏱️ HTML extraction timed out after {remaining_time:.1f}s for {source_url}"
                            )

                # If still no code blocks, try markdown extraction as fallback
                if len(code_blocks) == 0 and md and "```" in md:
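The block above wraps HTML extraction in a per-document time budget. A standalone sketch of the same pattern, assuming an extraction coroutine passed in by the caller; the function and parameter names here are illustrative:

import asyncio
import time

MAX_EXTRACTION_TIME_PER_DOC = 5.0  # same per-document budget used above

async def extract_with_budget(extract_coro, html: str, url: str, started_at: float) -> list:
    """Run an HTML-extraction coroutine only while the document's time budget lasts."""
    elapsed = time.time() - started_at
    if elapsed > MAX_EXTRACTION_TIME_PER_DOC:
        return []  # budget already spent on earlier steps; skip HTML extraction
    try:
        # Bound the remaining work so one slow document cannot stall the whole crawl.
        return await asyncio.wait_for(extract_coro(html, url), timeout=MAX_EXTRACTION_TIME_PER_DOC - elapsed)
    except asyncio.TimeoutError:
        return []  # treat a timeout as "no code blocks found" and move on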
@@ -322,6 +344,14 @@ class CodeExtractionService:

                # Update progress only after completing document extraction
                completed_docs += 1
                extraction_time = time.time() - doc_start_time
                if extraction_time > 2.0:  # Log slow extractions
                    safe_logfire_info(
                        f"⏱️ Document extraction took {extraction_time:.1f}s | url={source_url} | "
                        f"html_size={len(html_content) if html_content else 0} | "
                        f"blocks_found={len([b for b in all_code_blocks if b['source_url'] == source_url])}"
                    )

                if progress_callback and total_docs > 0:
                    # Calculate progress within the specified range
                    raw_progress = completed_docs / total_docs
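The raw_progress calculation above feeds a stage-local fraction into an overall progress window. A small self-contained illustration of that mapping; map_progress is an illustrative name, not a function from the repository:

def map_progress(completed: int, total: int, start: int, end: int) -> int:
    """Map completed/total into the [start, end] slice of the overall progress bar."""
    if total <= 0:
        return start
    raw = completed / total  # 0.0 .. 1.0 within this stage
    return start + int(raw * (end - start))

# A stage that owns the 20-60 window reports 40 when it is half done.
assert map_progress(5, 10, 20, 60) == 40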
@@ -343,13 +373,14 @@ class CodeExtractionService:

        return all_code_blocks

    async def _extract_html_code_blocks(self, content: str) -> list[dict[str, Any]]:
    async def _extract_html_code_blocks(self, content: str, source_url: str = "") -> list[dict[str, Any]]:
        """
        Extract code blocks from HTML patterns in content.
        This is a fallback when markdown conversion didn't preserve code blocks.

        Args:
            content: The content to search for HTML code patterns
            source_url: The URL of the document being processed
            min_length: Minimum length for code blocks

        Returns:

@@ -359,6 +390,20 @@ class CodeExtractionService:

        # Add detailed logging
        safe_logfire_info(f"Processing HTML of length {len(content)} for code extraction")

        # PERFORMANCE OPTIMIZATION: Skip extremely large HTML files or chunk them
        MAX_HTML_SIZE = 1_000_000  # 1MB limit for single-pass processing (increased from 500KB)
        if len(content) > MAX_HTML_SIZE:
            safe_logfire_info(
                f"⚠️ HTML content is very large ({len(content)} bytes). "
                f"Limiting to first {MAX_HTML_SIZE} bytes to prevent timeout."
            )
            # For very large files, focus on the first portion where code examples are likely to be
            content = content[:MAX_HTML_SIZE]
            # Try to find a good cutoff point (end of a tag)
            last_tag_end = content.rfind('>')
            if last_tag_end > MAX_HTML_SIZE - 1000:
                content = content[:last_tag_end + 1]

        # Check if we have actual content
        if len(content) < 1000:
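The truncation step above caps very large HTML and then backs up to the last complete tag so the regex patterns never see a half-open tag. A compact standalone sketch of that step; cap_html is an illustrative name:

MAX_HTML_SIZE = 1_000_000  # 1 MB cap, matching the change above

def cap_html(content: str, max_size: int = MAX_HTML_SIZE) -> str:
    """Truncate oversized HTML and trim back to the last complete tag."""
    if len(content) <= max_size:
        return content
    truncated = content[:max_size]
    last_tag_end = truncated.rfind('>')
    # Only back up if a tag boundary exists near the cut; otherwise keep the hard cut.
    if last_tag_end > max_size - 1000:
        truncated = truncated[:last_tag_end + 1]
    return truncated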
@@ -510,9 +555,71 @@ class CodeExtractionService:

            ),
        ]

        for pattern_tuple in patterns:
        # PERFORMANCE: Early exit checks to avoid unnecessary regex processing
        # Check more content (20KB instead of 5KB) and add URL-based exceptions
        check_size = min(20000, len(content))  # Check first 20KB or entire content if smaller
        has_code_indicators = any(indicator in content[:check_size] for indicator in
            ['<pre', '<code', 'language-', 'hljs', 'prism', 'shiki', 'highlight'])

        # Never skip certain documentation sites that we know have code
        is_known_code_site = any(domain in source_url.lower() for domain in
            ['milkdown', 'github.com', 'gitlab', 'docs.', 'dev.', 'api.'])

        if not has_code_indicators and not is_known_code_site:
            safe_logfire_info(f"No code indicators found in first {check_size} chars and not a known code site, skipping HTML extraction | url={source_url}")
            return []

        if is_known_code_site and not has_code_indicators:
            safe_logfire_info(f"Known code site but no indicators in first {check_size} chars, continuing anyway | url={source_url}")

        # PERFORMANCE: Limit number of patterns to check based on detected libraries
        patterns_to_check = []
        content_lower = content[:10000].lower()  # Check first 10KB for library detection

        # Selectively add patterns based on what's detected
        if 'milkdown' in content_lower:
            patterns_to_check.extend([p for p in patterns if 'milkdown' in p[1]])
        if 'monaco' in content_lower:
            patterns_to_check.extend([p for p in patterns if 'monaco' in p[1]])
        if 'codemirror' in content_lower or 'cm-' in content_lower:
            patterns_to_check.extend([p for p in patterns if 'codemirror' in p[1]])
        if 'prism' in content_lower:
            patterns_to_check.extend([p for p in patterns if 'prism' in p[1]])
        if 'hljs' in content_lower or 'highlight' in content_lower:
            patterns_to_check.extend([p for p in patterns if 'hljs' in p[1] or 'highlight' in p[1]])
        if 'shiki' in content_lower or 'astro' in content_lower:
            patterns_to_check.extend([p for p in patterns if 'shiki' in p[1] or 'astro' in p[1]])

        # Always include standard patterns as fallback (get ALL standard/generic patterns, not just last 5)
        standard_patterns = [p for p in patterns if any(tag in p[1] for tag in ['standard', 'generic', 'prism', 'hljs'])]
        patterns_to_check.extend(standard_patterns)

        # Remove duplicates while preserving order
        seen = set()
        unique_patterns = []
        for p in patterns_to_check:
            if p[1] not in seen:
                unique_patterns.append(p)
                seen.add(p[1])
        patterns_to_check = unique_patterns

        # If we have very few patterns and it's a known code site, add more generic patterns
        if len(patterns_to_check) < 5 and is_known_code_site:
            safe_logfire_info(f"Known code site with few patterns ({len(patterns_to_check)}), adding more generic patterns")
            patterns_to_check = patterns  # Use all patterns for known code sites

        safe_logfire_info(f"Checking {len(patterns_to_check)} relevant patterns out of {len(patterns)} total")

        for pattern_tuple in patterns_to_check:
            pattern_str, source_type = pattern_tuple
            matches = list(re.finditer(pattern_str, content, re.DOTALL | re.IGNORECASE))

            # PERFORMANCE: Use re.finditer with smaller chunks for very long content
            # Only use DOTALL for patterns that really need it (multi-line blocks)
            flags = re.IGNORECASE
            if 'monaco' in source_type or 'codemirror' in source_type:
                flags |= re.DOTALL  # These need DOTALL for multi-line matching

            matches = list(re.finditer(pattern_str, content, flags))

            # Log pattern matches for Milkdown patterns and CodeMirror
            if matches and (
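A condensed sketch of the library-aware prefilter introduced above. The (regex, source_type) pairs, the marker table, and select_patterns itself are illustrative; the real pattern strings live in the service and are not reproduced here:

def select_patterns(content: str, patterns: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Keep only the (regex, source_type) pairs whose library markers appear in the page head."""
    head = content[:10000].lower()
    markers = {
        "milkdown": ("milkdown",),
        "monaco": ("monaco",),
        "codemirror": ("codemirror", "cm-"),
        "prism": ("prism",),
        "hljs": ("hljs", "highlight"),
        "shiki": ("shiki", "astro"),
    }
    selected = [
        p for p in patterns
        if any(tag in p[1] and any(m in head for m in ms) for tag, ms in markers.items())
    ]
    # Always keep generic fallbacks, then de-duplicate by source_type while preserving order.
    selected += [p for p in patterns if "standard" in p[1] or "generic" in p[1]]
    seen: set[str] = set()
    return [p for p in selected if not (p[1] in seen or seen.add(p[1]))]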
@@ -433,6 +433,9 @@ class CrawlingService:

            )

            # Complete - send both the progress update and completion event
            # CRITICAL: This is the ONLY place that should send status="completed"!
            # All crawl strategies (batch, recursive, etc.) should use "finished" or other words.
            # The frontend disconnects when it sees status="completed", so this must be the final step.
            await update_mapped_progress(
                "completed",
                100,
@@ -73,7 +73,8 @@ class BatchCrawlStrategy:

        except Exception as e:
            # For non-critical errors (e.g., network issues), use defaults but log prominently
            logger.error(
                f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True
                f"Failed to load crawl settings from database: {e}, using defaults",
                exc_info=True
            )
            batch_size = 50
        if max_concurrent is None:
@@ -98,101 +99,93 @@ class BatchCrawlStrategy:

                wait_for_images=False,  # Skip images for faster crawling
                scan_full_page=True,  # Trigger lazy loading
                exclude_all_images=False,
                remove_overlay_elements=True,
                process_iframes=True,
            )
        else:
            # Configuration for regular batch crawling
            # Regular sites use standard configuration
            crawl_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                stream=True,  # Enable streaming
                stream=True,
                markdown_generator=self.markdown_generator,
                wait_until=settings.get("CRAWL_WAIT_STRATEGY", "domcontentloaded"),
                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "45000")),
                page_timeout=int(settings.get("CRAWL_PAGE_TIMEOUT", "30000")),
                delay_before_return_html=float(settings.get("CRAWL_DELAY_BEFORE_HTML", "0.5")),
                scan_full_page=True,
                wait_for_images=False,
                scan_full_page=False,  # Don't scan full page for non-doc sites
                exclude_all_images=False,
            )

        # Transform URLs if needed
        processed_urls = [transform_url_func(url) for url in urls]

        # Create memory adaptive dispatcher
        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=memory_threshold,
            max_sessions=max_concurrent,
            memory_threshold_mb=memory_threshold,
            check_interval=check_interval,
            max_session_permit=max_concurrent,
        )

        async def report_progress(percentage: int, message: str, **kwargs):
            """Helper to report progress if callback is available"""
        # Crawl URLs in batches using arun_many
        results = []
        total_urls = len(processed_urls)

        for batch_start in range(0, total_urls, batch_size):
            batch_end = min(batch_start + batch_size, total_urls)
            batch = processed_urls[batch_start:batch_end]

            # Calculate progress for this batch
            if progress_callback:
                step_info = {"currentStep": message, "stepMessage": message, **kwargs}
                await progress_callback("crawling", percentage, message, step_info=step_info)

        total_urls = len(urls)
        await report_progress(start_progress, f"Starting to crawl {total_urls} URLs...")

        # Use configured batch size
        successful_results = []
        processed = 0

        # Transform all URLs at the beginning
        url_mapping = {}  # Map transformed URLs back to original
        transformed_urls = []
        for url in urls:
            transformed = transform_url_func(url)
            transformed_urls.append(transformed)
            url_mapping[transformed] = url

        for i in range(0, total_urls, batch_size):
            batch_urls = transformed_urls[i : i + batch_size]
            batch_start = i
            batch_end = min(i + batch_size, total_urls)

            # Report batch start with smooth progress
            progress_percentage = start_progress + int(
                (i / total_urls) * (end_progress - start_progress)
            )
            await report_progress(
                progress_percentage,
                f"Processing batch {batch_start + 1}-{batch_end} of {total_urls} URLs...",
            )

            # Crawl this batch using arun_many with streaming
            logger.info(
                f"Starting parallel crawl of batch {batch_start + 1}-{batch_end} ({len(batch_urls)} URLs)"
            )
            batch_results = await self.crawler.arun_many(
                urls=batch_urls, config=crawl_config, dispatcher=dispatcher
            )

            # Handle streaming results
            async for result in batch_results:
                processed += 1
                if result.success and result.markdown:
                    # Map back to original URL
                    original_url = url_mapping.get(result.url, result.url)
                    successful_results.append({
                        "url": original_url,
                        "markdown": result.markdown,
                        "html": result.html,  # Use raw HTML
                    })
                else:
                    logger.warning(
                        f"Failed to crawl {result.url}: {getattr(result, 'error_message', 'Unknown error')}"
                    )

                # Report individual URL progress with smooth increments
                progress_percentage = start_progress + int(
                    (processed / total_urls) * (end_progress - start_progress)
                batch_progress = start_progress + ((batch_start / total_urls) * (end_progress - start_progress))
                await progress_callback(
                    "batch_crawling",
                    int(batch_progress),
                    f"Crawling batch {batch_start // batch_size + 1} ({batch_start + 1}-{batch_end}/{total_urls} URLs)"
                )
                # Report more frequently for smoother progress
                if (
                    processed % 5 == 0 or processed == total_urls
                ):  # Report every 5 URLs or at the end
                    await report_progress(
                        progress_percentage,
                        f"Crawled {processed}/{total_urls} pages ({len(successful_results)} successful)",
                    )

        await report_progress(
            end_progress,
            f"Batch crawling completed: {len(successful_results)}/{total_urls} pages successful",
        )
        return successful_results
            # Run batch crawl
            try:
                batch_results = await self.crawler.arun_many(
                    batch,
                    config=crawl_config,
                    dispatcher=dispatcher
                )

                # Process results
                for result in batch_results:
                    if result.success:
                        results.append({
                            "url": result.url,
                            "markdown": result.markdown_v2.raw_markdown if result.markdown_v2 else "",
                            "success": True,
                            "metadata": result.extracted_content if hasattr(result, 'extracted_content') else {}
                        })
                    else:
                        logger.warning(f"Failed to crawl {result.url}: {result.error_message}")
                        results.append({
                            "url": result.url,
                            "markdown": "",
                            "success": False,
                            "error": result.error_message
                        })

            except Exception as e:
                logger.error(f"Batch crawl error: {e}", exc_info=True)
                # Add failed results for this batch
                for url in batch:
                    results.append({
                        "url": url,
                        "markdown": "",
                        "success": False,
                        "error": str(e)
                    })

            # Update progress after batch completion
            # IMPORTANT: Use "finished" not "completed" - only the final orchestrator should send "completed"
            if progress_callback:
                batch_progress = start_progress + ((batch_end / total_urls) * (end_progress - start_progress))
                await progress_callback(
                    "batch_crawling",
                    int(batch_progress),
                    f"Finished batch {batch_start // batch_size + 1}"
                )

        return results
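The rewritten batch path transforms URLs before crawling and uses a dict to map each result back to its original URL. A minimal illustration of that bookkeeping; crawl_plan and the https transform are made up for the example:

def crawl_plan(urls: list[str], transform_url) -> tuple[list[str], dict[str, str]]:
    """Build the transformed URL list plus a reverse map so results keep their original URLs."""
    url_mapping: dict[str, str] = {}
    transformed: list[str] = []
    for url in urls:
        t = transform_url(url)
        transformed.append(t)
        url_mapping[t] = url
    return transformed, url_mapping

transformed, mapping = crawl_plan(
    ["http://docs.example.com/a", "http://docs.example.com/b"],
    lambda u: u.replace("http://", "https://", 1),
)
assert mapping["https://docs.example.com/a"] == "http://docs.example.com/a"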
@@ -60,7 +60,7 @@ class RecursiveCrawlStrategy:

        if not self.crawler:
            logger.error("No crawler instance available for recursive crawling")
            if progress_callback:
                await progress_callback("error", 0, "Crawler not available")
                await progress_callback("error", 0, "Crawler not available", step_info={"currentStep": "error", "stepMessage": "Crawler not available"})
            return []

        # Load settings from database - fail fast on configuration errors

@@ -78,7 +78,8 @@ class RecursiveCrawlStrategy:

        except Exception as e:
            # For non-critical errors (e.g., network issues), use defaults but log prominently
            logger.error(
                f"Failed to load crawl settings from database: {e}, using defaults", exc_info=True
                f"Failed to load crawl settings from database: {e}, using defaults",
                exc_info=True
            )
            batch_size = 50
        if max_concurrent is None:

@@ -126,11 +127,19 @@ class RecursiveCrawlStrategy:

        )

        async def report_progress(percentage: int, message: str, **kwargs):
            """Helper to report progress if callback is available"""
            """Helper to report progress if callback is available

            IMPORTANT: Never use "complete" or "completed" in messages here!
            This is just an intermediate step in the overall crawl process.
            Only the final orchestrator should send "completed" status.
            """
            if progress_callback:
                # Add step information for multi-progress tracking
                step_info = {"currentStep": message, "stepMessage": message, **kwargs}
                await progress_callback("crawling", percentage, message, **step_info)
                step_info = {
                    "currentStep": message,
                    "stepMessage": message
                }
                await progress_callback("crawling", percentage, message, step_info=step_info, **kwargs)

        visited = set()

@@ -169,14 +178,6 @@ class RecursiveCrawlStrategy:

            batch_urls = urls_to_crawl[batch_idx : batch_idx + batch_size]
            batch_end_idx = min(batch_idx + batch_size, len(urls_to_crawl))

            # Transform URLs and create mapping for this batch
            url_mapping = {}
            transformed_batch_urls = []
            for url in batch_urls:
                transformed = transform_url_func(url)
                transformed_batch_urls.append(transformed)
                url_mapping[transformed] = url

            # Calculate progress for this batch within the depth
            batch_progress = depth_start + int(
                (batch_idx / len(urls_to_crawl)) * (depth_end - depth_start)

@@ -191,14 +192,20 @@ class RecursiveCrawlStrategy:

            # Use arun_many for native parallel crawling with streaming
            logger.info(f"Starting parallel crawl of {len(batch_urls)} URLs with arun_many")
            batch_results = await self.crawler.arun_many(
                urls=transformed_batch_urls, config=run_config, dispatcher=dispatcher
                urls=batch_urls,
                config=run_config,
                dispatcher=dispatcher
            )

            # Handle streaming results from arun_many
            i = 0
            async for result in batch_results:
                # Map back to original URL using the mapping dict
                original_url = url_mapping.get(result.url, result.url)
                # Map back to original URL if transformed
                original_url = result.url
                for orig_url in batch_urls:
                    if transform_url_func(orig_url) == result.url:
                        original_url = orig_url
                        break

                norm_url = normalize_url(original_url)
                visited.add(norm_url)

@@ -213,14 +220,14 @@ class RecursiveCrawlStrategy:

                    depth_successful += 1

                    # Find internal links for next depth
                    links = getattr(result, "links", {}) or {}
                    for link in links.get("internal", []):
                    for link in result.links.get("internal", []):
                        next_url = normalize_url(link["href"])
                        # Skip binary files and already visited URLs
                        is_binary = self.url_handler.is_binary_file(next_url)
                        if next_url not in visited and not is_binary:
                        if next_url not in visited and not self.url_handler.is_binary_file(
                            next_url
                        ):
                            next_level_urls.add(next_url)
                        elif is_binary:
                        elif self.url_handler.is_binary_file(next_url):
                            logger.debug(f"Skipping binary file from crawl queue: {next_url}")
                else:
                    logger.warning(

@@ -243,14 +250,16 @@ class RecursiveCrawlStrategy:

            current_urls = next_level_urls

            # Report completion of this depth
            # Report completion of this depth - IMPORTANT: Use "finished" not "completed"!
            await report_progress(
                depth_end,
                f"Depth {depth + 1} completed: {depth_successful} pages crawled, {len(next_level_urls)} URLs found for next depth",
                f"Depth {depth + 1} finished: {depth_successful} pages crawled, {len(next_level_urls)} URLs found for next depth",
            )

        # IMPORTANT: Use "finished" not "complete" - only the final orchestrator should say "completed"
        await report_progress(
            end_progress,
            f"Recursive crawling completed: {len(results_all)} total pages crawled across {max_depth} depth levels",
            f"Recursive crawl finished: {len(results_all)} pages successfully crawled",
        )
        return results_all

        return results_all
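Several hunks above enforce one contract: crawl strategies report "finished" while only the orchestrator ever sends status="completed", because the frontend disconnects as soon as it sees "completed". A minimal sketch of a guard built on that assumption; safe_progress and RESERVED_FINAL_STATUS are illustrative names, not part of this codebase:

RESERVED_FINAL_STATUS = "completed"  # only the final orchestrator may emit this

async def safe_progress(progress_callback, status: str, percentage: int, message: str) -> None:
    """Forward strategy progress without ever claiming the orchestrator-only final status."""
    if progress_callback is None:
        return
    if status == RESERVED_FINAL_STATUS:
        status = "finished"  # keep the frontend's socket open until the orchestrator really finishes
    step_info = {"currentStep": message, "stepMessage": message}
    await progress_callback(status, percentage, message, step_info=step_info)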
@@ -1,472 +1,473 @@
|
||||
"""
|
||||
Knowledge Item Service
|
||||
|
||||
Handles all knowledge item CRUD operations and data transformations.
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from ...config.logfire_config import safe_logfire_error, safe_logfire_info
|
||||
|
||||
|
||||
class KnowledgeItemService:
|
||||
"""
|
||||
Service for managing knowledge items including listing, filtering, updating, and deletion.
|
||||
"""
|
||||
|
||||
def __init__(self, supabase_client):
|
||||
"""
|
||||
Initialize the knowledge item service.
|
||||
|
||||
Args:
|
||||
supabase_client: The Supabase client for database operations
|
||||
"""
|
||||
self.supabase = supabase_client
|
||||
|
||||
async def list_items(
|
||||
self,
|
||||
page: int = 1,
|
||||
per_page: int = 20,
|
||||
knowledge_type: str | None = None,
|
||||
search: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
List knowledge items with pagination and filtering.
|
||||
|
||||
Args:
|
||||
page: Page number (1-based)
|
||||
per_page: Items per page
|
||||
knowledge_type: Filter by knowledge type
|
||||
search: Search term for filtering
|
||||
|
||||
Returns:
|
||||
Dict containing items, pagination info, and total count
|
||||
"""
|
||||
try:
|
||||
# Build the query with filters at database level for better performance
|
||||
query = self.supabase.from_("archon_sources").select("*")
|
||||
|
||||
# Apply knowledge type filter at database level if provided
|
||||
if knowledge_type:
|
||||
query = query.eq("metadata->>knowledge_type", knowledge_type)
|
||||
|
||||
# Apply search filter at database level if provided
|
||||
if search:
|
||||
search_pattern = f"%{search}%"
|
||||
query = query.or_(
|
||||
f"title.ilike.{search_pattern},summary.ilike.{search_pattern},source_id.ilike.{search_pattern}"
|
||||
)
|
||||
|
||||
# Get total count before pagination
|
||||
# Clone the query for counting
|
||||
count_query = self.supabase.from_("archon_sources").select(
|
||||
"*", count="exact", head=True
|
||||
)
|
||||
|
||||
# Apply same filters to count query
|
||||
if knowledge_type:
|
||||
count_query = count_query.eq("metadata->>knowledge_type", knowledge_type)
|
||||
|
||||
if search:
|
||||
search_pattern = f"%{search}%"
|
||||
count_query = count_query.or_(
|
||||
f"title.ilike.{search_pattern},summary.ilike.{search_pattern},source_id.ilike.{search_pattern}"
|
||||
)
|
||||
|
||||
count_result = count_query.execute()
|
||||
total = count_result.count if hasattr(count_result, "count") else 0
|
||||
|
||||
# Apply pagination at database level
|
||||
start_idx = (page - 1) * per_page
|
||||
query = query.range(start_idx, start_idx + per_page - 1)
|
||||
|
||||
# Execute query
|
||||
result = query.execute()
|
||||
sources = result.data if result.data else []
|
||||
|
||||
# Get source IDs for batch queries
|
||||
source_ids = [source["source_id"] for source in sources]
|
||||
|
||||
# Debug log source IDs
|
||||
safe_logfire_info(f"Source IDs for batch query: {source_ids}")
|
||||
|
||||
# Batch fetch related data to avoid N+1 queries
|
||||
first_urls = {}
|
||||
code_example_counts = {}
|
||||
chunk_counts = {}
|
||||
|
||||
if source_ids:
|
||||
# Batch fetch first URLs
|
||||
urls_result = (
|
||||
self.supabase.from_("archon_crawled_pages")
|
||||
.select("source_id, url")
|
||||
.in_("source_id", source_ids)
|
||||
.execute()
|
||||
)
|
||||
|
||||
# Group URLs by source_id (take first one for each)
|
||||
for item in urls_result.data or []:
|
||||
if item["source_id"] not in first_urls:
|
||||
first_urls[item["source_id"]] = item["url"]
|
||||
|
||||
# Get code example counts per source - NO CONTENT, just counts!
|
||||
# Fetch counts individually for each source
|
||||
for source_id in source_ids:
|
||||
count_result = (
|
||||
self.supabase.from_("archon_code_examples")
|
||||
.select("id", count="exact", head=True)
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
code_example_counts[source_id] = (
|
||||
count_result.count if hasattr(count_result, "count") else 0
|
||||
)
|
||||
|
||||
# Ensure all sources have a count (default to 0)
|
||||
for source_id in source_ids:
|
||||
if source_id not in code_example_counts:
|
||||
code_example_counts[source_id] = 0
|
||||
chunk_counts[source_id] = 0 # Default to 0 to avoid timeout
|
||||
|
||||
safe_logfire_info(f"Code example counts: {code_example_counts}")
|
||||
|
||||
# Transform sources to items with batched data
|
||||
items = []
|
||||
for source in sources:
|
||||
source_id = source["source_id"]
|
||||
source_metadata = source.get("metadata", {})
|
||||
|
||||
# Use batched data instead of individual queries
|
||||
first_page_url = first_urls.get(source_id, f"source://{source_id}")
|
||||
code_examples_count = code_example_counts.get(source_id, 0)
|
||||
chunks_count = chunk_counts.get(source_id, 0)
|
||||
|
||||
# Determine source type
|
||||
source_type = self._determine_source_type(source_metadata, first_page_url)
|
||||
|
||||
item = {
|
||||
"id": source_id,
|
||||
"title": source.get("title", source.get("summary", "Untitled")),
|
||||
"url": first_page_url,
|
||||
"source_id": source_id,
|
||||
"code_examples": [{"count": code_examples_count}]
|
||||
if code_examples_count > 0
|
||||
else [], # Minimal array just for count display
|
||||
"metadata": {
|
||||
"knowledge_type": source_metadata.get("knowledge_type", "technical"),
|
||||
"tags": source_metadata.get("tags", []),
|
||||
"source_type": source_type,
|
||||
"status": "active",
|
||||
"description": source_metadata.get(
|
||||
"description", source.get("summary", "")
|
||||
),
|
||||
"chunks_count": chunks_count,
|
||||
"word_count": source.get("total_word_count", 0),
|
||||
"estimated_pages": round(source.get("total_word_count", 0) / 250, 1),
|
||||
"pages_tooltip": f"{round(source.get('total_word_count', 0) / 250, 1)} pages (≈ {source.get('total_word_count', 0):,} words)",
|
||||
"last_scraped": source.get("updated_at"),
|
||||
"file_name": source_metadata.get("file_name"),
|
||||
"file_type": source_metadata.get("file_type"),
|
||||
"update_frequency": source_metadata.get("update_frequency", 7),
|
||||
"code_examples_count": code_examples_count,
|
||||
**source_metadata,
|
||||
},
|
||||
"created_at": source.get("created_at"),
|
||||
"updated_at": source.get("updated_at"),
|
||||
}
|
||||
items.append(item)
|
||||
|
||||
safe_logfire_info(
|
||||
f"Knowledge items retrieved | total={total} | page={page} | filtered_count={len(items)}"
|
||||
)
|
||||
|
||||
return {
|
||||
"items": items,
|
||||
"total": total,
|
||||
"page": page,
|
||||
"per_page": per_page,
|
||||
"pages": (total + per_page - 1) // per_page,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Failed to list knowledge items | error={str(e)}")
|
||||
raise
|
||||
|
||||
async def get_item(self, source_id: str) -> dict[str, Any] | None:
|
||||
"""
|
||||
Get a single knowledge item by source ID.
|
||||
|
||||
Args:
|
||||
source_id: The source ID to retrieve
|
||||
|
||||
Returns:
|
||||
Knowledge item dict or None if not found
|
||||
"""
|
||||
try:
|
||||
safe_logfire_info(f"Getting knowledge item | source_id={source_id}")
|
||||
|
||||
# Get the source record
|
||||
result = (
|
||||
self.supabase.from_("archon_sources")
|
||||
.select("*")
|
||||
.eq("source_id", source_id)
|
||||
.single()
|
||||
.execute()
|
||||
)
|
||||
|
||||
if not result.data:
|
||||
return None
|
||||
|
||||
# Transform the source to item format
|
||||
item = await self._transform_source_to_item(result.data)
|
||||
return item
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(
|
||||
f"Failed to get knowledge item | error={str(e)} | source_id={source_id}"
|
||||
)
|
||||
return None
|
||||
|
||||
async def update_item(
|
||||
self, source_id: str, updates: dict[str, Any]
|
||||
) -> tuple[bool, dict[str, Any]]:
|
||||
"""
|
||||
Update a knowledge item's metadata.
|
||||
|
||||
Args:
|
||||
source_id: The source ID to update
|
||||
updates: Dictionary of fields to update
|
||||
|
||||
Returns:
|
||||
Tuple of (success, result)
|
||||
"""
|
||||
try:
|
||||
safe_logfire_info(
|
||||
f"Updating knowledge item | source_id={source_id} | updates={updates}"
|
||||
)
|
||||
|
||||
# Prepare update data
|
||||
update_data = {}
|
||||
|
||||
# Handle title updates
|
||||
if "title" in updates:
|
||||
update_data["title"] = updates["title"]
|
||||
|
||||
# Handle metadata updates
|
||||
metadata_fields = [
|
||||
"description",
|
||||
"knowledge_type",
|
||||
"tags",
|
||||
"status",
|
||||
"update_frequency",
|
||||
"group_name",
|
||||
]
|
||||
metadata_updates = {k: v for k, v in updates.items() if k in metadata_fields}
|
||||
|
||||
if metadata_updates:
|
||||
# Get current metadata
|
||||
current_response = (
|
||||
self.supabase.table("archon_sources")
|
||||
.select("metadata")
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
if current_response.data:
|
||||
current_metadata = current_response.data[0].get("metadata", {})
|
||||
current_metadata.update(metadata_updates)
|
||||
update_data["metadata"] = current_metadata
|
||||
else:
|
||||
update_data["metadata"] = metadata_updates
|
||||
|
||||
# Perform the update
|
||||
result = (
|
||||
self.supabase.table("archon_sources")
|
||||
.update(update_data)
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
|
||||
if result.data:
|
||||
safe_logfire_info(f"Knowledge item updated successfully | source_id={source_id}")
|
||||
return True, {
|
||||
"success": True,
|
||||
"message": f"Successfully updated knowledge item {source_id}",
|
||||
"source_id": source_id,
|
||||
}
|
||||
else:
|
||||
safe_logfire_error(f"Knowledge item not found | source_id={source_id}")
|
||||
return False, {"error": f"Knowledge item {source_id} not found"}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(
|
||||
f"Failed to update knowledge item | error={str(e)} | source_id={source_id}"
|
||||
)
|
||||
return False, {"error": str(e)}
|
||||
|
||||
async def get_available_sources(self) -> dict[str, Any]:
|
||||
"""
|
||||
Get all available sources with their details.
|
||||
|
||||
Returns:
|
||||
Dict containing sources list and count
|
||||
"""
|
||||
try:
|
||||
# Query the sources table
|
||||
result = self.supabase.from_("archon_sources").select("*").order("source_id").execute()
|
||||
|
||||
# Format the sources
|
||||
sources = []
|
||||
if result.data:
|
||||
for source in result.data:
|
||||
sources.append({
|
||||
"source_id": source.get("source_id"),
|
||||
"title": source.get("title", source.get("summary", "Untitled")),
|
||||
"summary": source.get("summary"),
|
||||
"metadata": source.get("metadata", {}),
|
||||
"total_words": source.get("total_words", source.get("total_word_count", 0)),
|
||||
"update_frequency": source.get("update_frequency", 7),
|
||||
"created_at": source.get("created_at"),
|
||||
"updated_at": source.get("updated_at", source.get("created_at")),
|
||||
})
|
||||
|
||||
return {"success": True, "sources": sources, "count": len(sources)}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Failed to get available sources | error={str(e)}")
|
||||
return {"success": False, "error": str(e), "sources": [], "count": 0}
|
||||
|
||||
async def _get_all_sources(self) -> list[dict[str, Any]]:
|
||||
"""Get all sources from the database."""
|
||||
result = await self.get_available_sources()
|
||||
return result.get("sources", [])
|
||||
|
||||
async def _transform_source_to_item(self, source: dict[str, Any]) -> dict[str, Any]:
|
||||
"""
|
||||
Transform a source record into a knowledge item with enriched data.
|
||||
|
||||
Args:
|
||||
source: The source record from database
|
||||
|
||||
Returns:
|
||||
Transformed knowledge item
|
||||
"""
|
||||
source_metadata = source.get("metadata", {})
|
||||
source_id = source["source_id"]
|
||||
|
||||
# Get first page URL
|
||||
first_page_url = await self._get_first_page_url(source_id)
|
||||
|
||||
# Determine source type
|
||||
source_type = self._determine_source_type(source_metadata, first_page_url)
|
||||
|
||||
# Get code examples
|
||||
code_examples = await self._get_code_examples(source_id)
|
||||
|
||||
return {
|
||||
"id": source_id,
|
||||
"title": source.get("title", source.get("summary", "Untitled")),
|
||||
"url": first_page_url,
|
||||
"source_id": source_id,
|
||||
"code_examples": code_examples,
|
||||
"metadata": {
|
||||
# Spread source_metadata first, then override with computed values
|
||||
**source_metadata,
|
||||
"knowledge_type": source_metadata.get("knowledge_type", "technical"),
|
||||
"tags": source_metadata.get("tags", []),
|
||||
"source_type": source_type, # This should be the correctly determined source_type
|
||||
"status": "active",
|
||||
"description": source_metadata.get("description", source.get("summary", "")),
|
||||
"chunks_count": await self._get_chunks_count(source_id), # Get actual chunk count
|
||||
"word_count": source.get("total_words", 0),
|
||||
"estimated_pages": round(
|
||||
source.get("total_words", 0) / 250, 1
|
||||
), # Average book page = 250 words
|
||||
"pages_tooltip": f"{round(source.get('total_words', 0) / 250, 1)} pages (≈ {source.get('total_words', 0):,} words)",
|
||||
"last_scraped": source.get("updated_at"),
|
||||
"file_name": source_metadata.get("file_name"),
|
||||
"file_type": source_metadata.get("file_type"),
|
||||
"update_frequency": source.get("update_frequency", 7),
|
||||
"code_examples_count": len(code_examples),
|
||||
},
|
||||
"created_at": source.get("created_at"),
|
||||
"updated_at": source.get("updated_at"),
|
||||
}
|
||||
|
||||
async def _get_first_page_url(self, source_id: str) -> str:
|
||||
"""Get the first page URL for a source."""
|
||||
try:
|
||||
pages_response = (
|
||||
self.supabase.from_("archon_crawled_pages")
|
||||
.select("url")
|
||||
.eq("source_id", source_id)
|
||||
.limit(1)
|
||||
.execute()
|
||||
)
|
||||
|
||||
if pages_response.data:
|
||||
return pages_response.data[0].get("url", f"source://{source_id}")
|
||||
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return f"source://{source_id}"
|
||||
|
||||
async def _get_code_examples(self, source_id: str) -> list[dict[str, Any]]:
|
||||
"""Get code examples for a source."""
|
||||
try:
|
||||
code_examples_response = (
|
||||
self.supabase.from_("archon_code_examples")
|
||||
.select("id, content, summary, metadata")
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
|
||||
return code_examples_response.data if code_examples_response.data else []
|
||||
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
def _determine_source_type(self, metadata: dict[str, Any], url: str) -> str:
|
||||
"""Determine the source type from metadata or URL pattern."""
|
||||
stored_source_type = metadata.get("source_type")
|
||||
if stored_source_type:
|
||||
return stored_source_type
|
||||
|
||||
# Legacy fallback - check URL pattern
|
||||
return "file" if url.startswith("file://") else "url"
|
||||
|
||||
def _filter_by_search(self, items: list[dict[str, Any]], search: str) -> list[dict[str, Any]]:
|
||||
"""Filter items by search term."""
|
||||
search_lower = search.lower()
|
||||
return [
|
||||
item
|
||||
for item in items
|
||||
if search_lower in item["title"].lower()
|
||||
or search_lower in item["metadata"].get("description", "").lower()
|
||||
or any(search_lower in tag.lower() for tag in item["metadata"].get("tags", []))
|
||||
]
|
||||
|
||||
def _filter_by_knowledge_type(
|
||||
self, items: list[dict[str, Any]], knowledge_type: str
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Filter items by knowledge type."""
|
||||
return [item for item in items if item["metadata"].get("knowledge_type") == knowledge_type]
|
||||
|
||||
async def _get_chunks_count(self, source_id: str) -> int:
|
||||
"""Get the actual number of chunks for a source."""
|
||||
try:
|
||||
# Count the actual rows in crawled_pages for this source
|
||||
result = (
|
||||
self.supabase.table("archon_crawled_pages")
|
||||
.select("*", count="exact")
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
|
||||
# Return the count of pages (chunks)
|
||||
return result.count if result.count else 0
|
||||
|
||||
except Exception as e:
|
||||
# If we can't get chunk count, return 0
|
||||
safe_logfire_info(f"Failed to get chunk count for {source_id}: {e}")
|
||||
return 0
|
||||
"""
|
||||
Knowledge Item Service
|
||||
|
||||
Handles all knowledge item CRUD operations and data transformations.
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from ...config.logfire_config import safe_logfire_error, safe_logfire_info
|
||||
|
||||
|
||||
class KnowledgeItemService:
|
||||
"""
|
||||
Service for managing knowledge items including listing, filtering, updating, and deletion.
|
||||
"""
|
||||
|
||||
def __init__(self, supabase_client):
|
||||
"""
|
||||
Initialize the knowledge item service.
|
||||
|
||||
Args:
|
||||
supabase_client: The Supabase client for database operations
|
||||
"""
|
||||
self.supabase = supabase_client
|
||||
|
||||
async def list_items(
|
||||
self,
|
||||
page: int = 1,
|
||||
per_page: int = 20,
|
||||
knowledge_type: str | None = None,
|
||||
search: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
List knowledge items with pagination and filtering.
|
||||
|
||||
Args:
|
||||
page: Page number (1-based)
|
||||
per_page: Items per page
|
||||
knowledge_type: Filter by knowledge type
|
||||
search: Search term for filtering
|
||||
|
||||
Returns:
|
||||
Dict containing items, pagination info, and total count
|
||||
"""
|
||||
try:
|
||||
# Build the query with filters at database level for better performance
|
||||
query = self.supabase.from_("archon_sources").select("*")
|
||||
|
||||
# Apply knowledge type filter at database level if provided
|
||||
if knowledge_type:
|
||||
query = query.eq("metadata->>knowledge_type", knowledge_type)
|
||||
|
||||
# Apply search filter at database level if provided
|
||||
if search:
|
||||
search_pattern = f"%{search}%"
|
||||
query = query.or_(
|
||||
f"title.ilike.{search_pattern},summary.ilike.{search_pattern},source_id.ilike.{search_pattern}"
|
||||
)
|
||||
|
||||
# Get total count before pagination
|
||||
# Clone the query for counting
|
||||
count_query = self.supabase.from_("archon_sources").select(
|
||||
"*", count="exact", head=True
|
||||
)
|
||||
|
||||
# Apply same filters to count query
|
||||
if knowledge_type:
|
||||
count_query = count_query.eq("metadata->>knowledge_type", knowledge_type)
|
||||
|
||||
if search:
|
||||
search_pattern = f"%{search}%"
|
||||
count_query = count_query.or_(
|
||||
f"title.ilike.{search_pattern},summary.ilike.{search_pattern},source_id.ilike.{search_pattern}"
|
||||
)
|
||||
|
||||
count_result = count_query.execute()
|
||||
total = count_result.count if hasattr(count_result, "count") else 0
|
||||
|
||||
# Apply pagination at database level
|
||||
start_idx = (page - 1) * per_page
|
||||
query = query.range(start_idx, start_idx + per_page - 1)
|
||||
|
||||
# Execute query
|
||||
result = query.execute()
|
||||
sources = result.data if result.data else []
|
||||
|
||||
# Get source IDs for batch queries
|
||||
source_ids = [source["source_id"] for source in sources]
|
||||
|
||||
# Debug log source IDs
|
||||
safe_logfire_info(f"Source IDs for batch query: {source_ids}")
|
||||
|
||||
# Batch fetch related data to avoid N+1 queries
|
||||
first_urls = {}
|
||||
code_example_counts = {}
|
||||
chunk_counts = {}
|
||||
|
||||
if source_ids:
|
||||
# Batch fetch first URLs
|
||||
urls_result = (
|
||||
self.supabase.from_("archon_crawled_pages")
|
||||
.select("source_id, url")
|
||||
.in_("source_id", source_ids)
|
||||
.execute()
|
||||
)
|
||||
|
||||
# Group URLs by source_id (take first one for each)
|
||||
for item in urls_result.data or []:
|
||||
if item["source_id"] not in first_urls:
|
||||
first_urls[item["source_id"]] = item["url"]
|
||||
|
||||
# Get code example counts per source - NO CONTENT, just counts!
|
||||
# Fetch counts individually for each source
|
||||
for source_id in source_ids:
|
||||
count_result = (
|
||||
self.supabase.from_("archon_code_examples")
|
||||
.select("id", count="exact", head=True)
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
code_example_counts[source_id] = (
|
||||
count_result.count if hasattr(count_result, "count") else 0
|
||||
)
|
||||
|
||||
# Ensure all sources have a count (default to 0)
|
||||
for source_id in source_ids:
|
||||
if source_id not in code_example_counts:
|
||||
code_example_counts[source_id] = 0
|
||||
chunk_counts[source_id] = 0 # Default to 0 to avoid timeout
|
||||
|
||||
safe_logfire_info("Code example counts", code_counts=code_example_counts)
|
||||
|
||||
# Transform sources to items with batched data
|
||||
items = []
|
||||
for source in sources:
|
||||
source_id = source["source_id"]
|
||||
source_metadata = source.get("metadata", {})
|
||||
|
||||
# Use batched data instead of individual queries
|
||||
first_page_url = first_urls.get(source_id, f"source://{source_id}")
|
||||
# Use original crawl URL instead of first page URL
|
||||
original_url = source_metadata.get("original_url") or first_page_url
|
||||
code_examples_count = code_example_counts.get(source_id, 0)
|
||||
chunks_count = chunk_counts.get(source_id, 0)
|
||||
|
||||
# Determine source type
|
||||
source_type = self._determine_source_type(source_metadata, original_url)
|
||||
|
||||
item = {
|
||||
"id": source_id,
|
||||
"title": source.get("title", source.get("summary", "Untitled")),
|
||||
"url": original_url,
|
||||
"source_id": source_id,
|
||||
"code_examples": [{"count": code_examples_count}]
|
||||
if code_examples_count > 0
|
||||
else [], # Minimal array just for count display
|
||||
"metadata": {
|
||||
"knowledge_type": source_metadata.get("knowledge_type", "technical"),
|
||||
"tags": source_metadata.get("tags", []),
|
||||
"source_type": source_type,
|
||||
"status": "active",
|
||||
"description": source_metadata.get(
|
||||
"description", source.get("summary", "")
|
||||
),
|
||||
"chunks_count": chunks_count,
|
||||
"word_count": source.get("total_word_count", 0),
|
||||
"estimated_pages": round(source.get("total_word_count", 0) / 250, 1),
|
||||
"pages_tooltip": f"{round(source.get('total_word_count', 0) / 250, 1)} pages (≈ {source.get('total_word_count', 0):,} words)",
|
||||
"last_scraped": source.get("updated_at"),
|
||||
"file_name": source_metadata.get("file_name"),
|
||||
"file_type": source_metadata.get("file_type"),
|
||||
"update_frequency": source_metadata.get("update_frequency", 7),
|
||||
"code_examples_count": code_examples_count,
|
||||
**source_metadata,
|
||||
},
|
||||
"created_at": source.get("created_at"),
|
||||
"updated_at": source.get("updated_at"),
|
||||
}
|
||||
items.append(item)
|
||||
|
||||
safe_logfire_info(
|
||||
f"Knowledge items retrieved | total={total} | page={page} | filtered_count={len(items)}"
|
||||
)
|
||||
|
||||
return {
|
||||
"items": items,
|
||||
"total": total,
|
||||
"page": page,
|
||||
"per_page": per_page,
|
||||
"pages": (total + per_page - 1) // per_page,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Failed to list knowledge items | error={str(e)}")
|
||||
raise
|
||||
|
||||
async def get_item(self, source_id: str) -> dict[str, Any] | None:
|
||||
"""
|
||||
Get a single knowledge item by source ID.
|
||||
|
||||
Args:
|
||||
source_id: The source ID to retrieve
|
||||
|
||||
Returns:
|
||||
Knowledge item dict or None if not found
|
||||
"""
|
||||
try:
|
||||
safe_logfire_info(f"Getting knowledge item | source_id={source_id}")
|
||||
|
||||
# Get the source record
|
||||
result = (
|
||||
self.supabase.from_("archon_sources")
|
||||
.select("*")
|
||||
.eq("source_id", source_id)
|
||||
.single()
|
||||
.execute()
|
||||
)
|
||||
|
||||
if not result.data:
|
||||
return None
|
||||
|
||||
# Transform the source to item format
|
||||
item = await self._transform_source_to_item(result.data)
|
||||
return item
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(
|
||||
f"Failed to get knowledge item | error={str(e)} | source_id={source_id}"
|
||||
)
|
||||
return None
|
||||
|
||||
async def update_item(
|
||||
self, source_id: str, updates: dict[str, Any]
|
||||
) -> tuple[bool, dict[str, Any]]:
|
||||
"""
|
||||
Update a knowledge item's metadata.
|
||||
|
||||
Args:
|
||||
source_id: The source ID to update
|
||||
updates: Dictionary of fields to update
|
||||
|
||||
Returns:
|
||||
Tuple of (success, result)
|
||||
"""
|
||||
try:
|
||||
safe_logfire_info(
|
||||
f"Updating knowledge item | source_id={source_id} | updates={updates}"
|
||||
)
|
||||
|
||||
# Prepare update data
|
||||
update_data = {}
|
||||
|
||||
# Handle title updates
|
||||
if "title" in updates:
|
||||
update_data["title"] = updates["title"]
|
||||
|
||||
# Handle metadata updates
|
||||
metadata_fields = [
|
||||
"description",
|
||||
"knowledge_type",
|
||||
"tags",
|
||||
"status",
|
||||
"update_frequency",
|
||||
"group_name",
|
||||
]
|
||||
metadata_updates = {k: v for k, v in updates.items() if k in metadata_fields}
|
||||
|
||||
if metadata_updates:
|
||||
# Get current metadata
|
||||
current_response = (
|
||||
self.supabase.table("archon_sources")
|
||||
.select("metadata")
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
if current_response.data:
|
||||
current_metadata = current_response.data[0].get("metadata", {})
|
||||
current_metadata.update(metadata_updates)
|
||||
update_data["metadata"] = current_metadata
|
||||
else:
|
||||
update_data["metadata"] = metadata_updates
|
||||
|
||||
# Perform the update
|
||||
result = (
|
||||
self.supabase.table("archon_sources")
|
||||
.update(update_data)
|
||||
.eq("source_id", source_id)
|
||||
.execute()
|
||||
)
|
||||
|
||||
if result.data:
|
||||
safe_logfire_info(f"Knowledge item updated successfully | source_id={source_id}")
|
||||
return True, {
|
||||
"success": True,
|
||||
"message": f"Successfully updated knowledge item {source_id}",
|
||||
"source_id": source_id,
|
||||
}
|
||||
else:
|
||||
safe_logfire_error(f"Knowledge item not found | source_id={source_id}")
|
||||
return False, {"error": f"Knowledge item {source_id} not found"}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(
|
||||
f"Failed to update knowledge item | error={str(e)} | source_id={source_id}"
|
||||
)
|
||||
return False, {"error": str(e)}
|
||||
|
||||
async def get_available_sources(self) -> dict[str, Any]:
|
||||
"""
|
||||
Get all available sources with their details.
|
||||
|
||||
Returns:
|
||||
Dict containing sources list and count
|
||||
"""
|
||||
try:
|
||||
# Query the sources table
|
||||
result = self.supabase.from_("archon_sources").select("*").order("source_id").execute()
|
||||
|
||||
# Format the sources
|
||||
sources = []
|
||||
if result.data:
|
||||
for source in result.data:
|
||||
sources.append({
|
||||
"source_id": source.get("source_id"),
|
||||
"title": source.get("title", source.get("summary", "Untitled")),
|
||||
"summary": source.get("summary"),
|
||||
"metadata": source.get("metadata", {}),
|
||||
"total_words": source.get("total_words", source.get("total_word_count", 0)),
|
||||
"update_frequency": source.get("update_frequency", 7),
|
||||
"created_at": source.get("created_at"),
|
||||
"updated_at": source.get("updated_at", source.get("created_at")),
|
||||
})
|
||||
|
||||
return {"success": True, "sources": sources, "count": len(sources)}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Failed to get available sources | error={str(e)}")
|
||||
return {"success": False, "error": str(e), "sources": [], "count": 0}
|
||||
|
||||
async def _get_all_sources(self) -> list[dict[str, Any]]:
|
||||
"""Get all sources from the database."""
|
||||
result = await self.get_available_sources()
|
||||
return result.get("sources", [])
|
||||
|
||||
async def _transform_source_to_item(self, source: dict[str, Any]) -> dict[str, Any]:
|
||||
"""
|
||||
Transform a source record into a knowledge item with enriched data.
|
||||
|
||||
Args:
|
||||
source: The source record from database
|
||||
|
||||
Returns:
|
||||
Transformed knowledge item
|
||||
"""
|
||||
source_metadata = source.get("metadata", {})
|
||||
source_id = source["source_id"]
|
||||
|
||||
# Get first page URL
|
||||
first_page_url = await self._get_first_page_url(source_id)
|
||||
|
||||
# Determine source type
|
||||
        source_type = self._determine_source_type(source_metadata, first_page_url)

        # Get code examples
        code_examples = await self._get_code_examples(source_id)

        return {
            "id": source_id,
            "title": source.get("title", source.get("summary", "Untitled")),
            "url": first_page_url,
            "source_id": source_id,
            "code_examples": code_examples,
            "metadata": {
                "knowledge_type": source_metadata.get("knowledge_type", "technical"),
                "tags": source_metadata.get("tags", []),
                "source_type": source_type,
                "status": "active",
                "description": source_metadata.get("description", source.get("summary", "")),
                "chunks_count": await self._get_chunks_count(source_id),  # Get actual chunk count
                "word_count": source.get("total_words", 0),
                "estimated_pages": round(
                    source.get("total_words", 0) / 250, 1
                ),  # Average book page = 250 words
                "pages_tooltip": f"{round(source.get('total_words', 0) / 250, 1)} pages (≈ {source.get('total_words', 0):,} words)",
                "last_scraped": source.get("updated_at"),
                "file_name": source_metadata.get("file_name"),
                "file_type": source_metadata.get("file_type"),
                "update_frequency": source.get("update_frequency", 7),
                "code_examples_count": len(code_examples),
                **source_metadata,
            },
            "created_at": source.get("created_at"),
            "updated_at": source.get("updated_at"),
        }

    async def _get_first_page_url(self, source_id: str) -> str:
        """Get the first page URL for a source."""
        try:
            pages_response = (
                self.supabase.from_("archon_crawled_pages")
                .select("url")
                .eq("source_id", source_id)
                .limit(1)
                .execute()
            )

            if pages_response.data:
                return pages_response.data[0].get("url", f"source://{source_id}")

        except Exception:
            pass

        return f"source://{source_id}"

    async def _get_code_examples(self, source_id: str) -> list[dict[str, Any]]:
        """Get code examples for a source."""
        try:
            code_examples_response = (
                self.supabase.from_("archon_code_examples")
                .select("id, content, summary, metadata")
                .eq("source_id", source_id)
                .execute()
            )

            return code_examples_response.data if code_examples_response.data else []

        except Exception:
            return []

    def _determine_source_type(self, metadata: dict[str, Any], url: str) -> str:
        """Determine the source type from metadata or URL pattern."""
        stored_source_type = metadata.get("source_type")
        if stored_source_type:
            return stored_source_type

        # Legacy fallback - check URL pattern
        return "file" if url.startswith("file://") else "url"

    def _filter_by_search(self, items: list[dict[str, Any]], search: str) -> list[dict[str, Any]]:
        """Filter items by search term."""
        search_lower = search.lower()
        return [
            item
            for item in items
            if search_lower in item["title"].lower()
            or search_lower in item["metadata"].get("description", "").lower()
            or any(search_lower in tag.lower() for tag in item["metadata"].get("tags", []))
        ]

    def _filter_by_knowledge_type(
        self, items: list[dict[str, Any]], knowledge_type: str
    ) -> list[dict[str, Any]]:
        """Filter items by knowledge type."""
        return [item for item in items if item["metadata"].get("knowledge_type") == knowledge_type]

    async def _get_chunks_count(self, source_id: str) -> int:
        """Get the actual number of chunks for a source."""
        try:
            # Count the actual rows in crawled_pages for this source
            result = (
                self.supabase.table("archon_crawled_pages")
                .select("*", count="exact")
                .eq("source_id", source_id)
                .execute()
            )

            # Return the count of pages (chunks)
            return result.count if result.count else 0

        except Exception as e:
            # If we can't get chunk count, return 0
            safe_logfire_info(f"Failed to get chunk count for {source_id}: {e}")
            return 0
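A minimal standalone sketch (not part of the diff) of the exact-count query pattern that _get_chunks_count relies on. The table name and query shape come from the code above; the client setup, the count_chunks helper name, and the environment variable names are illustrative assumptions.

import os

from supabase import Client, create_client


def count_chunks(supabase: Client, source_id: str) -> int:
    """Return the number of crawled-page rows (chunks) for a source, or 0 on failure."""
    try:
        result = (
            supabase.table("archon_crawled_pages")
            .select("*", count="exact")  # ask PostgREST for an exact row count
            .eq("source_id", source_id)
            .execute()
        )
        return result.count or 0
    except Exception:
        return 0


if __name__ == "__main__":
    # Env var names are placeholders; adjust to your deployment.
    client = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_SERVICE_KEY"])
    print(count_chunks(client, "example-source-id"))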
@@ -11,13 +11,10 @@ from datetime import datetime
from typing import Any

from ...config.logfire_config import get_logger
from ...socketio_app import get_socketio_instance
from ...socketio_app import sio

logger = get_logger(__name__)

# Get Socket.IO instance
sio = get_socketio_instance()
logger.info(f"🔗 [PROGRESS] Socket.IO instance ID: {id(sio)}")


class ProgressService:
@@ -17,9 +17,7 @@ logger = get_logger(__name__)

# Import Socket.IO instance directly to avoid circular imports
try:
    from ...socketio_app import get_socketio_instance

    _sio = get_socketio_instance()
    from ...socketio_app import sio as _sio
    _broadcast_available = True
    logger.info("✅ Socket.IO broadcasting is AVAILABLE - real-time updates enabled")
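The two import hunks above replace the get_socketio_instance() accessor with a direct import of the module-level server. A small sketch of that pattern, assuming the module layout shown in this diff; the broadcast_progress helper and its event name are illustrative, not part of the change.

import socketio

# socketio_app.py exposes a single module-level server object
sio = socketio.AsyncServer(async_mode="asgi", cors_allowed_origins="*")


# A consumer module would instead do: from ..socketio_app import sio
async def broadcast_progress(progress_id: str, data: dict) -> None:
    """Emit a progress update to every connected client (illustrative event name)."""
    await sio.emit("progress_update", {"id": progress_id, **data})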
@@ -870,14 +870,17 @@ async def add_code_examples_to_supabase(

        # Prepare batch data - only for successful embeddings
        batch_data = []
        used_indices = set()  # Track which indices have been mapped to prevent duplicates

        for j, (embedding, text) in enumerate(
            zip(valid_embeddings, successful_texts, strict=False)
        ):
            # Find the original index
            # Find the original index (skip already used indices)
            orig_idx = None
            for k, orig_text in enumerate(batch_texts):
                if orig_text == text:
                if orig_text == text and k not in used_indices:
                    orig_idx = k
                    used_indices.add(k)  # Mark this index as used
                    break

            if orig_idx is None:
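A standalone sketch (outside the diff) of the de-duplicated index mapping this hunk introduces: when the same text appears more than once in a batch, each successful embedding must map back to a distinct original row, so indices that have already been claimed are skipped. The helper name is illustrative.

def map_to_original_indices(batch_texts: list[str], successful_texts: list[str]) -> list[int | None]:
    """Map each successful text back to a distinct index in the original batch."""
    used_indices: set[int] = set()
    mapping: list[int | None] = []
    for text in successful_texts:
        orig_idx = None
        for k, orig_text in enumerate(batch_texts):
            if orig_text == text and k not in used_indices:
                orig_idx = k
                used_indices.add(k)  # claim this slot so duplicates map to different rows
                break
        mapping.append(orig_idx)
    return mapping


# With a duplicated text, the two matches map to indices 0 and 2 rather than 0 twice.
assert map_to_original_indices(["a", "b", "a"], ["a", "a"]) == [0, 2]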
@@ -266,20 +266,23 @@ async def add_documents_to_supabase(
                search_logger.warning(
                    f"Skipping batch {batch_num} - no successful embeddings created"
                )
                completed_batches += 1
                # Don't increment completed_batches when skipping - this causes progress to jump
                continue

            # Prepare batch data - only for successful embeddings
            batch_data = []
            used_indices = set()  # Track which indices have been mapped to prevent duplicates

            # Map successful texts back to their original indices
            for j, (embedding, text) in enumerate(
                zip(batch_embeddings, successful_texts, strict=False)
            ):
                # Find the original index of this text
                # Find the original index of this text (skip already used indices)
                orig_idx = None
                for idx, orig_text in enumerate(contextual_contents):
                    if orig_text == text:
                    if orig_text == text and idx not in used_indices:
                        orig_idx = idx
                        used_indices.add(idx)  # Mark this index as used
                        break

                if orig_idx is None:

@@ -370,6 +373,9 @@ async def add_documents_to_supabase(
                search_logger.info(
                    f"Individual inserts: {successful_inserts}/{len(batch_data)} successful"
                )
                # Even if we had to fall back to individual inserts, count this batch as processed
                if successful_inserts > 0:
                    completed_batches += 1

            # Minimal delay between batches to prevent overwhelming
            if i + batch_size < len(contents):
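A hedged sketch of the progress arithmetic behind the completed_batches change above. It assumes the reported fraction is completed_batches / total_batches; under that assumption, counting a skipped batch as completed makes the progress bar jump ahead of the rows actually stored.

def progress_percent(completed_batches: int, total_batches: int) -> float:
    """Percentage of batches reported as done (assumed formula, for illustration)."""
    if total_batches <= 0:
        return 0.0
    return round(100.0 * completed_batches / total_batches, 1)


# 10 batches, 3 stored so far and 1 skipped: counting the skip would report 40% done
# while only 3 batches of rows exist; leaving it out reports 30%.
assert progress_percent(3, 10) == 30.0
assert progress_percent(4, 10) == 40.0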
@@ -84,17 +84,10 @@ class RateLimiter:
        self.semaphore = asyncio.Semaphore(config.max_concurrent)
        self._lock = asyncio.Lock()

    async def acquire(self, estimated_tokens: int = 8000, progress_callback: Callable | None = None) -> bool:
        """Acquire permission to make API call with token awareness

        Args:
            estimated_tokens: Estimated number of tokens for the operation
            progress_callback: Optional async callback for progress updates during wait
        """
        while True:  # Loop instead of recursion to avoid stack overflow
            wait_time_to_sleep = None

            async with self._lock:
    async def acquire(self, estimated_tokens: int = 8000) -> bool:
        """Acquire permission to make API call with token awareness"""
        async with self._lock:
            while True:  # Use a loop instead of recursion
                now = time.time()

                # Clean old entries

@@ -106,41 +99,30 @@ class RateLimiter:
                self.request_times.append(now)
                self.token_usage.append((now, estimated_tokens))
                return True

                # Calculate wait time if we can't make the request

                # Calculate wait time
                wait_time = self._calculate_wait_time(estimated_tokens)
                if wait_time > 0:
                    logfire_logger.info(
                        f"Rate limiting: waiting {wait_time:.1f}s",
                        extra={
                            "tokens": estimated_tokens,
                            "current_usage": self._get_current_usage(),
                        }
                    )
                    wait_time_to_sleep = wait_time
                else:
                if wait_time <= 0:
                    return False

            # Sleep outside the lock to avoid deadlock
            if wait_time_to_sleep is not None:
                # For long waits, break into smaller chunks with progress updates
                if wait_time_to_sleep > 5 and progress_callback:
                    chunks = int(wait_time_to_sleep / 5)  # 5 second chunks
                    for i in range(chunks):
                        await asyncio.sleep(5)
                        remaining = wait_time_to_sleep - (i + 1) * 5
                        if progress_callback:
                            await progress_callback({
                                "type": "rate_limit_wait",
                                "remaining_seconds": max(0, remaining),
                                "message": f"waiting {max(0, remaining):.1f}s more..."
                            })
                    # Sleep any remaining time
                    if wait_time_to_sleep % 5 > 0:
                        await asyncio.sleep(wait_time_to_sleep % 5)
                else:
                    await asyncio.sleep(wait_time_to_sleep)
                # Continue the loop to try again

                logfire_logger.info(
                    f"Rate limiting: waiting {wait_time:.1f}s",
                    extra={
                        "tokens": estimated_tokens,
                        "current_usage": self._get_current_usage(),
                    }
                )

                # Release the lock while sleeping to allow other operations
                self._lock.release()
                try:
                    await asyncio.sleep(wait_time)
                    logfire_logger.info(f"Rate limiting: resuming after {wait_time:.1f}s wait")
                finally:
                    # Re-acquire the lock before continuing
                    await self._lock.acquire()

                # Loop will continue and re-check conditions

    def _can_make_request(self, estimated_tokens: int) -> bool:
        """Check if request can be made within limits"""
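A self-contained sketch (not part of the diff) of the waiting strategy the rewritten acquire() uses: check the limits while holding the lock, release the lock for the sleep so other coroutines are not blocked, then re-acquire and loop instead of recursing. The 5-second window and 3-request budget are illustrative, not the service's real limits.

import asyncio
import time


class TinyRateLimiter:
    def __init__(self, max_requests: int = 3, window: float = 5.0) -> None:
        self.max_requests = max_requests
        self.window = window
        self.request_times: list[float] = []
        self._lock = asyncio.Lock()

    async def acquire(self) -> None:
        async with self._lock:
            while True:  # loop instead of recursion
                now = time.time()
                # Drop timestamps that have aged out of the window
                self.request_times = [t for t in self.request_times if now - t < self.window]
                if len(self.request_times) < self.max_requests:
                    self.request_times.append(now)
                    return
                wait_time = self.window - (now - self.request_times[0])
                # Sleep without holding the lock so other tasks can check the limiter.
                self._lock.release()
                try:
                    await asyncio.sleep(wait_time)
                finally:
                    await self._lock.acquire()


async def main() -> None:
    limiter = TinyRateLimiter()
    for i in range(4):  # the 4th call waits until the oldest timestamp leaves the window
        await limiter.acquire()
        print(f"request {i} allowed at {time.strftime('%X')}")


if __name__ == "__main__":
    asyncio.run(main())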
@@ -540,15 +522,10 @@ class ThreadingService:
        logfire_logger.info("Threading service stopped")

    @asynccontextmanager
    async def rate_limited_operation(self, estimated_tokens: int = 8000, progress_callback: Callable | None = None):
        """Context manager for rate-limited operations

        Args:
            estimated_tokens: Estimated number of tokens for the operation
            progress_callback: Optional async callback for progress updates during wait
        """
    async def rate_limited_operation(self, estimated_tokens: int = 8000):
        """Context manager for rate-limited operations"""
        async with self.rate_limiter.semaphore:
            can_proceed = await self.rate_limiter.acquire(estimated_tokens, progress_callback)
            can_proceed = await self.rate_limiter.acquire(estimated_tokens)
            if not can_proceed:
                raise Exception("Rate limit exceeded")
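A self-contained sketch (not part of the diff) of the @asynccontextmanager shape rate_limited_operation keeps after this hunk: a semaphore bounds concurrency and a gate runs before the body executes. The simple spacing lock here stands in for the real RateLimiter, and every name is illustrative.

import asyncio
import time
from contextlib import asynccontextmanager


class TinyThreadingService:
    """Illustrative stand-in for the ThreadingService touched above."""

    def __init__(self, max_concurrent: int = 2, min_interval: float = 0.5) -> None:
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self._min_interval = min_interval
        self._last_call = 0.0
        self._lock = asyncio.Lock()

    @asynccontextmanager
    async def rate_limited_operation(self, estimated_tokens: int = 8000):
        async with self.semaphore:  # bound concurrent operations
            async with self._lock:  # simple call spacing in place of the real limiter
                delay = self._min_interval - (time.time() - self._last_call)
                if delay > 0:
                    await asyncio.sleep(delay)
                self._last_call = time.time()
            yield


async def worker(service: TinyThreadingService, i: int) -> None:
    async with service.rate_limited_operation(estimated_tokens=1000):
        print(f"operation {i} running at {time.time():.2f}")


async def main() -> None:
    service = TinyThreadingService()
    await asyncio.gather(*(worker(service, i) for i in range(4)))


if __name__ == "__main__":
    asyncio.run(main())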
@@ -676,4 +653,4 @@ async def stop_threading_service():
    global _threading_service
    if _threading_service:
        await _threading_service.stop()
        _threading_service = None
    _threading_service = None
@@ -26,17 +26,6 @@ sio = socketio.AsyncServer(
    ping_interval=60,  # 1 minute - check connection every minute
)

# Global Socket.IO instance for use across modules
_socketio_instance: socketio.AsyncServer | None = None


def get_socketio_instance() -> socketio.AsyncServer:
    """Get the global Socket.IO server instance."""
    global _socketio_instance
    if _socketio_instance is None:
        _socketio_instance = sio
    return _socketio_instance


def create_socketio_app(app: FastAPI) -> socketio.ASGIApp:
    """
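On the provider side, the hunk above deletes the lazy accessor and leaves the module-level sio as the single instance. A minimal sketch of that arrangement using the python-socketio ASGI integration this file already relies on; the server options shown are illustrative.

import socketio
from fastapi import FastAPI

sio = socketio.AsyncServer(async_mode="asgi", cors_allowed_origins="*")


def create_socketio_app(app: FastAPI) -> socketio.ASGIApp:
    """Mount the Socket.IO server in front of the FastAPI app."""
    return socketio.ASGIApp(sio, other_asgi_app=app)


app = FastAPI()
socket_app = create_socketio_app(app)  # serve this with uvicorn instead of `app`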
@@ -63,3 +52,24 @@ def create_socketio_app(app: FastAPI) -> socketio.ASGIApp:
    sio.app = app

    return socket_app


# Default Socket.IO event handlers
@sio.event
async def connect(sid, environ):
    """Handle new client connections."""
    logger.info(f"Client connected: {sid}")
    safe_logfire_info(f"Client connected: {sid}")


@sio.event
async def disconnect(sid):
    """Handle client disconnections."""
    logger.info(f"Client disconnected: {sid}")
    safe_logfire_info(f"Client disconnected: {sid}")


@sio.event
async def message(sid, data):
    """Handle incoming messages."""
    logger.info(f"Received message from {sid}: {data}")
    await sio.emit("response", {"data": "Message received!"}, to=sid)
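A hedged client-side sketch exercising the default handlers added above: connect, emit a "message" event, and print the "response" the server sends back. The URL and port are placeholders, not necessarily what Archon uses.

import asyncio

import socketio

client = socketio.AsyncClient()


@client.on("response")
async def on_response(data):
    print("server replied:", data)


async def main() -> None:
    await client.connect("http://localhost:8181")  # placeholder host/port
    await client.emit("message", {"hello": "archon"})
    await asyncio.sleep(1)  # give the server a moment to respond
    await client.disconnect()


if __name__ == "__main__":
    asyncio.run(main())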