"""
Code Extraction Service

Handles extraction, processing, and storage of code examples from documents.
"""

import re
from collections.abc import Callable
from typing import Any

from ...config.logfire_config import safe_logfire_error, safe_logfire_info
from ...services.credential_service import credential_service
from ..storage.code_storage_service import (
    add_code_examples_to_supabase,
    generate_code_summaries_batch,
)


class CodeExtractionService:
    """
    Service for extracting and processing code examples from documents.
    """

    # Language-specific patterns for better extraction
    LANGUAGE_PATTERNS = {
        "typescript": {
            "block_start": r"^\s*(export\s+)?(class|interface|function|const|type|enum)\s+\w+",
            "block_end": r"^\}(\s*;)?$",
            "min_indicators": [":", "{", "}", "=>", "function", "class", "interface", "type"],
        },
        "javascript": {
            "block_start": r"^\s*(export\s+)?(class|function|const|let|var)\s+\w+",
            "block_end": r"^\}(\s*;)?$",
            "min_indicators": ["function", "{", "}", "=>", "const", "let", "var"],
        },
        "python": {
            "block_start": r"^\s*(class|def|async\s+def)\s+\w+",
            "block_end": r"^\S",  # Unindented line
            "min_indicators": ["def", ":", "return", "self", "import", "class"],
        },
        "java": {
            "block_start": r"^\s*(public|private|protected)?\s*(class|interface|enum)\s+\w+",
            "block_end": r"^\}$",
            "min_indicators": ["class", "public", "private", "{", "}", ";"],
        },
        "rust": {
            "block_start": r"^\s*(pub\s+)?(fn|struct|impl|trait|enum)\s+\w+",
            "block_end": r"^\}$",
            "min_indicators": ["fn", "let", "mut", "impl", "struct", "->"],
        },
        "go": {
            "block_start": r"^\s*(func|type|struct)\s+\w+",
            "block_end": r"^\}$",
            "min_indicators": ["func", "type", "struct", "{", "}", ":="],
        },
    }

    def __init__(self, supabase_client):
        """
        Initialize the code extraction service.

        Args:
            supabase_client: The Supabase client for database operations
        """
        self.supabase_client = supabase_client
        self._settings_cache = {}

    async def _get_setting(self, key: str, default: Any) -> Any:
        """Get a setting from credential service with caching."""
        if key in self._settings_cache:
            return self._settings_cache[key]

        try:
            value = await credential_service.get_credential(key, default)
            # Convert string values to appropriate types
            if isinstance(default, bool):
                value = str(value).lower() == "true" if value is not None else default
            elif isinstance(default, int):
                value = int(value) if value is not None else default
            elif isinstance(default, float):
                value = float(value) if value is not None else default
            self._settings_cache[key] = value
            return value
        except Exception as e:
            safe_logfire_error(f"Error getting setting {key}: {e}, using default: {default}")
            # Make sure we return the default value with correct type
            self._settings_cache[key] = default
            return default
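
    # Illustrative usage sketch (values assumed): repeated reads of the same
    # key hit the credential service only once, and string values are coerced
    # to the type of the supplied default:
    #
    #   ratio = await self._get_setting("MAX_PROSE_RATIO", 0.15)  # "0.2" -> 0.2
    #   ratio = await self._get_setting("MAX_PROSE_RATIO", 0.15)  # cache hit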

    async def _get_min_code_length(self) -> int:
        """Get minimum code block length setting."""
        return await self._get_setting("MIN_CODE_BLOCK_LENGTH", 250)

    async def _get_max_code_length(self) -> int:
        """Get maximum code block length setting."""
        return await self._get_setting("MAX_CODE_BLOCK_LENGTH", 5000)

    async def _is_complete_block_detection_enabled(self) -> bool:
        """Check if complete block detection is enabled."""
        return await self._get_setting("ENABLE_COMPLETE_BLOCK_DETECTION", True)

    async def _is_language_patterns_enabled(self) -> bool:
        """Check if language-specific patterns are enabled."""
        return await self._get_setting("ENABLE_LANGUAGE_SPECIFIC_PATTERNS", True)

    async def _is_prose_filtering_enabled(self) -> bool:
        """Check if prose filtering is enabled."""
        return await self._get_setting("ENABLE_PROSE_FILTERING", True)

    async def _get_max_prose_ratio(self) -> float:
        """Get maximum allowed prose ratio."""
        return await self._get_setting("MAX_PROSE_RATIO", 0.15)

    async def _get_min_code_indicators(self) -> int:
        """Get minimum required code indicators."""
        return await self._get_setting("MIN_CODE_INDICATORS", 3)

    async def _is_diagram_filtering_enabled(self) -> bool:
        """Check if diagram filtering is enabled."""
        return await self._get_setting("ENABLE_DIAGRAM_FILTERING", True)

    async def _is_contextual_length_enabled(self) -> bool:
        """Check if contextual length adjustment is enabled."""
        return await self._get_setting("ENABLE_CONTEXTUAL_LENGTH", True)

    async def _get_context_window_size(self) -> int:
        """Get context window size for code blocks."""
        return await self._get_setting("CONTEXT_WINDOW_SIZE", 1000)

    async def _is_code_summaries_enabled(self) -> bool:
        """Check if code summaries generation is enabled."""
        return await self._get_setting("ENABLE_CODE_SUMMARIES", True)

    async def extract_and_store_code_examples(
        self,
        crawl_results: list[dict[str, Any]],
        url_to_full_document: dict[str, str],
        source_id: str,
        progress_callback: Callable | None = None,
        start_progress: int = 0,
        end_progress: int = 100,
    ) -> int:
        """
        Extract code examples from crawled documents and store them.

        Args:
            crawl_results: List of crawled documents with url and markdown content
            url_to_full_document: Mapping of URLs to full document content
            source_id: The unique source_id for all documents
            progress_callback: Optional async callback for progress updates
            start_progress: Starting progress percentage (default: 0)
            end_progress: Ending progress percentage (default: 100)

        Returns:
            Number of code examples stored
        """
        # Divide the progress range into phases:
        # - Extract code blocks: start_progress to 40% of range
        # - Generate summaries: 40% to 80% of range
        # - Store examples: 80% to end_progress
        progress_range = end_progress - start_progress
        extract_end = start_progress + int(progress_range * 0.4)
        summary_end = start_progress + int(progress_range * 0.8)

        # Extract code blocks from all documents
        all_code_blocks = await self._extract_code_blocks_from_documents(
            crawl_results, source_id, progress_callback, start_progress, extract_end
        )

        if not all_code_blocks:
            safe_logfire_info("No code examples found in any crawled documents")
            # Still report completion when no code examples found
            if progress_callback:
                await progress_callback({
                    "status": "code_extraction",
                    "percentage": end_progress,
                    "log": "No code examples found to extract",
                })
            return 0

        # Log what we found
        safe_logfire_info(f"Found {len(all_code_blocks)} total code blocks to process")
        for i, block_data in enumerate(all_code_blocks[:3]):
            block = block_data["block"]
            safe_logfire_info(
                f"Sample code block {i + 1} | language={block.get('language', 'none')} | code_length={len(block.get('code', ''))}"
            )

        # Generate summaries for code blocks with mapped progress
        summary_results = await self._generate_code_summaries(
            all_code_blocks, progress_callback, extract_end, summary_end
        )

        # Prepare code examples for storage
        storage_data = self._prepare_code_examples_for_storage(all_code_blocks, summary_results)

        # Store code examples in database with final phase progress
        return await self._store_code_examples(
            storage_data, url_to_full_document, progress_callback, summary_end, end_progress
        )
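
    # Worked example of the phase split above (illustrative numbers): with
    # start_progress=20 and end_progress=80, progress_range is 60, so block
    # extraction reports 20-44 (extract_end = 20 + int(60 * 0.4) = 44),
    # summary generation reports 44-68 (summary_end = 20 + int(60 * 0.8) = 68),
    # and storage reports 68-80.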

    async def _extract_code_blocks_from_documents(
        self,
        crawl_results: list[dict[str, Any]],
        source_id: str,
        progress_callback: Callable | None = None,
        start_progress: int = 0,
        end_progress: int = 100,
    ) -> list[dict[str, Any]]:
        """
        Extract code blocks from all documents.

        Args:
            crawl_results: List of crawled documents
            source_id: The unique source_id for all documents
            progress_callback: Optional async callback for progress updates
            start_progress: Starting progress percentage for this phase
            end_progress: Ending progress percentage for this phase

        Returns:
            List of code blocks with metadata
        """
        import asyncio
        import time

        # Progress will be reported during the loop below

        all_code_blocks = []
        total_docs = len(crawl_results)
        completed_docs = 0

        # PERFORMANCE: Track extraction time per document
        MAX_EXTRACTION_TIME_PER_DOC = 5.0  # 5 seconds max per document

        for doc in crawl_results:
            try:
                doc_start_time = time.time()
                source_url = doc["url"]
                html_content = doc.get("html", "")
                md = doc.get("markdown", "")

                # Debug logging
                safe_logfire_info(
                    f"Document content check | url={source_url} | has_html={bool(html_content)} | has_markdown={bool(md)} | html_len={len(html_content) if html_content else 0} | md_len={len(md) if md else 0}"
                )

                # Dynamic minimum length is handled inside the extraction methods

                # Check markdown first to see if it has code blocks
                if md:
                    has_backticks = "```" in md
                    backtick_count = md.count("```")
                    safe_logfire_info(
                        f"Markdown check | url={source_url} | has_backticks={has_backticks} | backtick_count={backtick_count}"
                    )

                if "getting-started" in source_url and md:
                    # Log a sample of the markdown
                    sample = md[:500]
                    safe_logfire_info(f"Markdown sample for getting-started: {sample}...")

                # Improved extraction logic - check for text files first, then HTML, then markdown
                code_blocks = []

                # Check if this is a text file (e.g., .txt, .md)
                is_text_file = source_url.endswith((
                    ".txt",
                    ".text",
                    ".md",
                )) or "text/plain" in doc.get("content_type", "")

                if is_text_file:
                    # For text files, use specialized text extraction
                    safe_logfire_info(f"🎯 TEXT FILE DETECTED | url={source_url}")
                    safe_logfire_info(
                        f"📊 Content types - has_html={bool(html_content)}, has_md={bool(md)}"
                    )
                    # For text files, the HTML content should be the raw text (not wrapped in <pre>)
                    text_content = html_content if html_content else md
                    if text_content:
                        safe_logfire_info(
                            f"📝 Using {'HTML' if html_content else 'MARKDOWN'} content for text extraction"
                        )
                        safe_logfire_info(
                            f"🔍 Content preview (first 500 chars): {repr(text_content[:500])}..."
                        )
                        code_blocks = await self._extract_text_file_code_blocks(
                            text_content, source_url
                        )
                        safe_logfire_info(
                            f"📦 Text extraction complete | found={len(code_blocks)} blocks | url={source_url}"
                        )
                    else:
                        safe_logfire_info(f"⚠️ NO CONTENT for text file | url={source_url}")

                # If not a text file and nothing was found yet, try HTML extraction
                if len(code_blocks) == 0 and html_content and not is_text_file:
                    # PERFORMANCE: Check if we've already spent too much time on this document
                    elapsed_time = time.time() - doc_start_time
                    if elapsed_time > MAX_EXTRACTION_TIME_PER_DOC:
                        safe_logfire_info(
                            f"⏱️ Skipping HTML extraction for {source_url} - already spent {elapsed_time:.1f}s"
                        )
                    else:
                        safe_logfire_info(
                            f"Trying HTML extraction first | url={source_url} | html_length={len(html_content)}"
                        )
                        # Apply a timeout to HTML extraction
                        remaining_time = MAX_EXTRACTION_TIME_PER_DOC - elapsed_time
                        try:
                            html_code_blocks = await asyncio.wait_for(
                                self._extract_html_code_blocks(html_content, source_url),
                                timeout=remaining_time,
                            )
                            if html_code_blocks:
                                code_blocks = html_code_blocks
                                safe_logfire_info(
                                    f"Found {len(code_blocks)} code blocks from HTML | url={source_url}"
                                )
                        except asyncio.TimeoutError:
                            safe_logfire_info(
                                f"⏱️ HTML extraction timed out after {remaining_time:.1f}s for {source_url}"
                            )

                # If still no code blocks, try markdown extraction as fallback
                if len(code_blocks) == 0 and md and "```" in md:
                    safe_logfire_info(
                        f"No code blocks from HTML, trying markdown extraction | url={source_url}"
                    )
                    from ..storage.code_storage_service import extract_code_blocks

                    # Use the default minimum for markdown extraction
                    base_min_length = 250  # Default for markdown
                    code_blocks = extract_code_blocks(md, min_length=base_min_length)
                    safe_logfire_info(
                        f"Found {len(code_blocks)} code blocks from markdown | url={source_url}"
                    )

                if code_blocks:
                    # Use the provided source_id for all code blocks
                    for block in code_blocks:
                        all_code_blocks.append({
                            "block": block,
                            "source_url": source_url,
                            "source_id": source_id,
                        })

                # Update progress only after completing document extraction
                completed_docs += 1
                extraction_time = time.time() - doc_start_time
                if extraction_time > 2.0:  # Log slow extractions
                    safe_logfire_info(
                        f"⏱️ Document extraction took {extraction_time:.1f}s | url={source_url} | "
                        f"html_size={len(html_content) if html_content else 0} | "
                        f"blocks_found={len([b for b in all_code_blocks if b['source_url'] == source_url])}"
                    )

                if progress_callback and total_docs > 0:
                    # Calculate progress within the specified range
                    raw_progress = completed_docs / total_docs
                    mapped_progress = start_progress + int(
                        raw_progress * (end_progress - start_progress)
                    )
                    await progress_callback({
                        "status": "code_extraction",
                        "percentage": mapped_progress,
                        "log": f"Extracted code from {completed_docs}/{total_docs} documents",
                        "completed_documents": completed_docs,
                        "total_documents": total_docs,
                    })

            except Exception as e:
                safe_logfire_error(
                    f"Error processing code from document | url={doc.get('url')} | error={str(e)}"
                )

        return all_code_blocks
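
    # Minimal sketch of the per-document time budget used above (standalone
    # names like `extract` are assumptions for illustration):
    #
    #   start = time.time()
    #   remaining = MAX_EXTRACTION_TIME_PER_DOC - (time.time() - start)
    #   try:
    #       blocks = await asyncio.wait_for(extract(html), timeout=remaining)
    #   except asyncio.TimeoutError:
    #       blocks = []  # give up on this document, keep crawling the rest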

    async def _extract_html_code_blocks(self, content: str, source_url: str = "") -> list[dict[str, Any]]:
        """
        Extract code blocks from HTML patterns in content.
        This is a fallback when markdown conversion didn't preserve code blocks.

        Args:
            content: The content to search for HTML code patterns
            source_url: The URL of the document being processed

        Returns:
            List of code blocks with metadata
        """
        # Add detailed logging
        safe_logfire_info(f"Processing HTML of length {len(content)} for code extraction")

        # PERFORMANCE OPTIMIZATION: Skip extremely large HTML files or chunk them
        MAX_HTML_SIZE = 1_000_000  # 1MB limit for single-pass processing (increased from 500KB)
        if len(content) > MAX_HTML_SIZE:
            safe_logfire_info(
                f"⚠️ HTML content is very large ({len(content)} bytes). "
                f"Limiting to first {MAX_HTML_SIZE} bytes to prevent timeout."
            )
            # For very large files, focus on the first portion where code examples are likely to be
            content = content[:MAX_HTML_SIZE]
            # Try to find a good cutoff point (end of a tag)
            last_tag_end = content.rfind(">")
            if last_tag_end > MAX_HTML_SIZE - 1000:
                content = content[: last_tag_end + 1]

        # Check if we have actual content
        if len(content) < 1000:
            safe_logfire_info(
                f"Warning: HTML content seems too short, first 500 chars: {repr(content[:500])}"
            )

        # Look for specific indicators of code blocks
        has_prism = "prism" in content.lower()
        has_highlight = "highlight" in content.lower()
        has_shiki = "shiki" in content.lower()
        has_codemirror = "codemirror" in content.lower() or "cm-" in content
        safe_logfire_info(
            f"Code library indicators | prism={has_prism} | highlight={has_highlight} | shiki={has_shiki} | codemirror={has_codemirror}"
        )

        # Check for any pre tags with different attributes
        pre_matches = re.findall(r"<pre[^>]*>", content[:5000], re.IGNORECASE)
        if pre_matches:
            safe_logfire_info(f"Found {len(pre_matches)} <pre> tags in first 5000 chars")
            for i, pre_tag in enumerate(pre_matches[:3]):  # Show first 3
                safe_logfire_info(f"Pre tag {i + 1}: {pre_tag}")

        code_blocks = []
        extracted_positions = set()  # Track already extracted code block positions

        # Comprehensive patterns for various code block formats
        # Order matters - more specific patterns first
        patterns = [
            # GitHub/GitLab patterns
            (
                r'<div[^>]*class=["\'][^"\']*highlight[^"\']*["\'][^>]*>.*?<pre[^>]*class=["\'][^"\']*(?:language-)?(\w+)[^"\']*["\'][^>]*><code[^>]*>(.*?)</code></pre>',
                "github-highlight",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*snippet-clipboard-content[^"\']*["\'][^>]*>.*?<pre[^>]*><code[^>]*>(.*?)</code></pre>',
                "github-snippet",
            ),
            # Docusaurus patterns
            (
                r'<div[^>]*class=["\'][^"\']*codeBlockContainer[^"\']*["\'][^>]*>.*?<pre[^>]*class=["\'][^"\']*prism-code[^"\']*language-(\w+)[^"\']*["\'][^>]*>(.*?)</pre>',
                "docusaurus",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*language-(\w+)[^"\']*["\'][^>]*>.*?<pre[^>]*class=["\'][^"\']*prism-code[^"\']*["\'][^>]*>(.*?)</pre>',
                "docusaurus-alt",
            ),
            # Milkdown specific patterns - check their actual HTML structure
            (
                r'<pre[^>]*><code[^>]*class=["\'][^"\']*language-(\w+)[^"\']*["\'][^>]*>(.*?)</code></pre>',
                "milkdown-typed",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*code-wrapper[^"\']*["\'][^>]*>.*?<pre[^>]*>(.*?)</pre>',
                "milkdown-wrapper",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*code-block-wrapper[^"\']*["\'][^>]*>.*?<pre[^>]*><code[^>]*>(.*?)</code></pre>',
                "milkdown-wrapper-code",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*milkdown-code-block[^"\']*["\'][^>]*>.*?<pre[^>]*><code[^>]*>(.*?)</code></pre>',
                "milkdown-code-block",
            ),
            (
                r'<pre[^>]*class=["\'][^"\']*code-block[^"\']*["\'][^>]*><code[^>]*>(.*?)</code></pre>',
                "milkdown",
            ),
            (r"<div[^>]*data-code-block[^>]*>.*?<pre[^>]*>(.*?)</pre>", "milkdown-alt"),
            (
                r'<div[^>]*class=["\'][^"\']*milkdown[^"\']*["\'][^>]*>.*?<pre[^>]*><code[^>]*>(.*?)</code></pre>',
                "milkdown-div",
            ),
            # Monaco Editor - capture all view-lines content
            (
                r'<div[^>]*class=["\'][^"\']*monaco-editor[^"\']*["\'][^>]*>.*?<div[^>]*class=["\'][^"\']*view-lines[^"\']*[^>]*>(.*?)</div>(?=.*?</div>.*?</div>)',
                "monaco",
            ),
            # CodeMirror patterns
            (
                r'<div[^>]*class=["\'][^"\']*cm-content[^"\']*["\'][^>]*>((?:<div[^>]*class=["\'][^"\']*cm-line[^"\']*["\'][^>]*>.*?</div>\s*)+)</div>',
                "codemirror",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*CodeMirror[^"\']*["\'][^>]*>.*?<div[^>]*class=["\'][^"\']*CodeMirror-code[^"\']*["\'][^>]*>(.*?)</div>',
                "codemirror-legacy",
            ),
            # Prism.js with language - must be before generic pre
            (
                r'<pre[^>]*class=["\'][^"\']*language-(\w+)[^"\']*["\'][^>]*>\s*<code[^>]*>(.*?)</code>\s*</pre>',
                "prism",
            ),
            (
                r'<pre[^>]*>\s*<code[^>]*class=["\'][^"\']*language-(\w+)[^"\']*["\'][^>]*>(.*?)</code>\s*</pre>',
                "prism-alt",
            ),
            # highlight.js - must be before generic pre/code
            (
                r'<pre[^>]*><code[^>]*class=["\'][^"\']*hljs(?:\s+language-(\w+))?[^"\']*["\'][^>]*>(.*?)</code></pre>',
                "hljs",
            ),
            (
                r'<pre[^>]*class=["\'][^"\']*hljs[^"\']*["\'][^>]*><code[^>]*>(.*?)</code></pre>',
                "hljs-pre",
            ),
            # Shiki patterns (VitePress, Astro, etc.)
            (
                r'<pre[^>]*class=["\'][^"\']*shiki[^"\']*["\'][^>]*(?:.*?style=["\'][^"\']*background-color[^"\']*["\'])?[^>]*>\s*<code[^>]*>(.*?)</code>\s*</pre>',
                "shiki",
            ),
            (r'<pre[^>]*class=["\'][^"\']*astro-code[^"\']*["\'][^>]*>(.*?)</pre>', "astro-shiki"),
            (
                r'<div[^>]*class=["\'][^"\']*astro-code[^"\']*["\'][^>]*>.*?<pre[^>]*>(.*?)</pre>',
                "astro-wrapper",
            ),
            # VitePress/Vue patterns
            (
                r'<div[^>]*class=["\'][^"\']*language-(\w+)[^"\']*["\'][^>]*>.*?<pre[^>]*>(.*?)</pre>',
                "vitepress",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*vp-code[^"\']*["\'][^>]*>.*?<pre[^>]*>(.*?)</pre>',
                "vitepress-vp",
            ),
            # Nextra patterns
            (r"<div[^>]*data-nextra-code[^>]*>.*?<pre[^>]*>(.*?)</pre>", "nextra"),
            (
                r'<pre[^>]*class=["\'][^"\']*nx-[^"\']*["\'][^>]*><code[^>]*>(.*?)</code></pre>',
                "nextra-nx",
            ),
            # Standard pre/code patterns - should be near the end
            (
                r'<pre[^>]*><code[^>]*class=["\'][^"\']*language-(\w+)[^"\']*["\'][^>]*>(.*?)</code></pre>',
                "standard-lang",
            ),
            (r"<pre[^>]*>\s*<code[^>]*>(.*?)</code>\s*</pre>", "standard"),
            # Generic patterns - should be last
            (
                r'<div[^>]*class=["\'][^"\']*code-block[^"\']*["\'][^>]*>.*?<pre[^>]*>(.*?)</pre>',
                "generic-div",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*codeblock[^"\']*["\'][^>]*>(.*?)</div>',
                "generic-codeblock",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*highlight[^"\']*["\'][^>]*>.*?<pre[^>]*>(.*?)</pre>',
                "highlight",
            ),
        ]

        # PERFORMANCE: Early exit checks to avoid unnecessary regex processing
        # Check more content (20KB instead of 5KB) and add URL-based exceptions
        check_size = min(20000, len(content))  # Check first 20KB or entire content if smaller
        has_code_indicators = any(
            indicator in content[:check_size]
            for indicator in ["<pre", "<code", "language-", "hljs", "prism", "shiki", "highlight"]
        )

        # Never skip certain documentation sites that we know have code
        is_known_code_site = any(
            domain in source_url.lower()
            for domain in ["milkdown", "github.com", "gitlab", "docs.", "dev.", "api."]
        )

        if not has_code_indicators and not is_known_code_site:
            safe_logfire_info(
                f"No code indicators found in first {check_size} chars and not a known code site, skipping HTML extraction | url={source_url}"
            )
            return []

        if is_known_code_site and not has_code_indicators:
            safe_logfire_info(
                f"Known code site but no indicators in first {check_size} chars, continuing anyway | url={source_url}"
            )

        # PERFORMANCE: Limit number of patterns to check based on detected libraries
        patterns_to_check = []
        content_lower = content[:10000].lower()  # Check first 10KB for library detection

        # Selectively add patterns based on what's detected
        if "milkdown" in content_lower:
            patterns_to_check.extend([p for p in patterns if "milkdown" in p[1]])
        if "monaco" in content_lower:
            patterns_to_check.extend([p for p in patterns if "monaco" in p[1]])
        if "codemirror" in content_lower or "cm-" in content_lower:
            patterns_to_check.extend([p for p in patterns if "codemirror" in p[1]])
        if "prism" in content_lower:
            patterns_to_check.extend([p for p in patterns if "prism" in p[1]])
        if "hljs" in content_lower or "highlight" in content_lower:
            patterns_to_check.extend([p for p in patterns if "hljs" in p[1] or "highlight" in p[1]])
        if "shiki" in content_lower or "astro" in content_lower:
            patterns_to_check.extend([p for p in patterns if "shiki" in p[1] or "astro" in p[1]])

        # Always include standard patterns as fallback (get ALL standard/generic patterns, not just last 5)
        standard_patterns = [
            p for p in patterns if any(tag in p[1] for tag in ["standard", "generic", "prism", "hljs"])
        ]
        patterns_to_check.extend(standard_patterns)

        # Remove duplicates while preserving order
        seen = set()
        unique_patterns = []
        for p in patterns_to_check:
            if p[1] not in seen:
                unique_patterns.append(p)
                seen.add(p[1])
        patterns_to_check = unique_patterns

        # If we have very few patterns and it's a known code site, add more generic patterns
        if len(patterns_to_check) < 5 and is_known_code_site:
            safe_logfire_info(
                f"Known code site with few patterns ({len(patterns_to_check)}), adding more generic patterns"
            )
            patterns_to_check = patterns  # Use all patterns for known code sites

        safe_logfire_info(
            f"Checking {len(patterns_to_check)} relevant patterns out of {len(patterns)} total"
        )

        for pattern_tuple in patterns_to_check:
            pattern_str, source_type = pattern_tuple

            # PERFORMANCE: Only use DOTALL for patterns that really need it (multi-line blocks)
            flags = re.IGNORECASE
            if "monaco" in source_type or "codemirror" in source_type:
                flags |= re.DOTALL  # These need DOTALL for multi-line matching

            matches = list(re.finditer(pattern_str, content, flags))

            # Log pattern matches for Milkdown patterns and CodeMirror
            if matches and (
                "milkdown" in source_type
                or "codemirror" in source_type
                or "milkdown" in content[:1000].lower()
            ):
                safe_logfire_info(f"Pattern {source_type} found {len(matches)} matches")

            for match in matches:
                # Extract code content based on capture-group layout: two-group
                # patterns capture the language in group 1 and the code in
                # group 2; single-group patterns capture only the code
                if match.lastindex and match.lastindex >= 2:
                    language = match.group(1) or ""
                    code_content = match.group(2).strip()
                else:
                    code_content = match.group(1).strip()
                    # Try to extract language from the full match
                    full_match = match.group(0)
                    lang_match = re.search(r'class=["\'].*?language-(\w+)', full_match)
                    language = lang_match.group(1) if lang_match else ""

                # Get the start position for complete block extraction
                code_start_pos = match.start()

                # For CodeMirror, extract text from cm-lines
                if source_type == "codemirror":
                    # Extract text from each cm-line div
                    cm_lines = re.findall(
                        r'<div[^>]*class=["\'][^"\']*cm-line[^"\']*["\'][^>]*>(.*?)</div>',
                        code_content,
                        re.DOTALL,
                    )
                    if cm_lines:
                        # Clean each line and join
                        cleaned_lines = []
                        for line in cm_lines:
                            # Remove span tags but keep content
                            line = re.sub(r"<span[^>]*>", "", line)
                            line = re.sub(r"</span>", "", line)
                            # Remove other HTML tags
                            line = re.sub(r"<[^>]+>", "", line)
                            cleaned_lines.append(line)
                        code_content = "\n".join(cleaned_lines)
                    else:
                        # Fallback: just clean HTML
                        code_content = re.sub(r"<span[^>]*>", "", code_content)
                        code_content = re.sub(r"</span>", "", code_content)
                        code_content = re.sub(r"<[^>]+>", "\n", code_content)

                # For Monaco, extract text from nested divs
                if source_type == "monaco":
                    # Extract actual code from Monaco's complex structure
                    code_content = re.sub(r"<div[^>]*>", "\n", code_content)
                    code_content = re.sub(r"</div>", "", code_content)
                    code_content = re.sub(r"<span[^>]*>", "", code_content)
                    code_content = re.sub(r"</span>", "", code_content)

                # Calculate dynamic minimum length
                context_for_length = content[max(0, code_start_pos - 500) : code_start_pos + 500]
                min_length = await self._calculate_min_length(language, context_for_length)

                # Skip if initial content is too short
                if len(code_content) < min_length:
                    # Try to find complete block if we have a language
                    if language and code_start_pos > 0:
                        # Look for complete code block
                        complete_code, block_end_pos = await self._find_complete_code_block(
                            content, code_start_pos, min_length, language
                        )
                        if len(complete_code) >= min_length:
                            code_content = complete_code
                            end_pos = block_end_pos
                        else:
                            continue
                    else:
                        continue

                # Extract position info for deduplication
                start_pos = match.start()
                end_pos = (
                    match.end()
                    if len(code_content) <= len(match.group(0))
                    else code_start_pos + len(code_content)
                )

                # Check if we've already extracted code from this position
                position_key = (start_pos, end_pos)
                overlapping = False
                for existing_start, existing_end in extracted_positions:
                    # Check if this match overlaps with an existing extraction
                    if not (end_pos <= existing_start or start_pos >= existing_end):
                        overlapping = True
                        break

                if not overlapping:
                    extracted_positions.add(position_key)

                    # Extract context
                    context_before = content[max(0, start_pos - 1000) : start_pos].strip()
                    context_after = content[end_pos : min(len(content), end_pos + 1000)].strip()

                    # Clean the code content
                    cleaned_code = self._clean_code_content(code_content, language)

                    # Validate code quality
                    if await self._validate_code_quality(cleaned_code, language):
                        # Log successful extraction
                        safe_logfire_info(
                            f"Extracted code block | source_type={source_type} | language={language} | min_length={min_length} | original_length={len(code_content)} | cleaned_length={len(cleaned_code)}"
                        )

                        code_blocks.append({
                            "code": cleaned_code,
                            "language": language,
                            "context_before": context_before,
                            "context_after": context_after,
                            "full_context": f"{context_before}\n\n{cleaned_code}\n\n{context_after}",
                            "source_type": source_type,  # Track which pattern matched
                        })
                    else:
                        safe_logfire_info(
                            f"Code block failed validation | source_type={source_type} | language={language} | length={len(cleaned_code)}"
                        )

        # Pattern 2: <code>...</code> (standalone)
        if not code_blocks:  # Only if we didn't find pre/code blocks
            code_pattern = r"<code[^>]*>(.*?)</code>"
            matches = re.finditer(code_pattern, content, re.DOTALL | re.IGNORECASE)

            for match in matches:
                code_content = match.group(1).strip()
                # Clean the code content
                cleaned_code = self._clean_code_content(code_content, "")

                # Check if it's multiline or substantial enough and validate quality
                # Use a minimal length for standalone code tags
                if len(cleaned_code) >= 100 and ("\n" in cleaned_code or len(cleaned_code) > 100):
                    if await self._validate_code_quality(cleaned_code, ""):
                        start_pos = match.start()
                        end_pos = match.end()
                        context_before = content[max(0, start_pos - 1000) : start_pos].strip()
                        context_after = content[end_pos : min(len(content), end_pos + 1000)].strip()

                        code_blocks.append({
                            "code": cleaned_code,
                            "language": "",
                            "context_before": context_before,
                            "context_after": context_after,
                            "full_context": f"{context_before}\n\n{cleaned_code}\n\n{context_after}",
                        })
                    else:
                        safe_logfire_info(
                            f"Standalone code block failed validation | length={len(cleaned_code)}"
                        )

        return code_blocks
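
    # Illustrative input for the pattern table above (HTML is assumed): the
    # "prism" pattern captures the language in group 1 and the code in group 2,
    # so <pre class="language-python"><code>print("hi")</code></pre> yields
    # language="python" and code_content='print("hi")' before cleaning and
    # length checks are applied.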

    async def _extract_text_file_code_blocks(
        self, content: str, url: str, min_length: int | None = None
    ) -> list[dict[str, Any]]:
        """
        Extract code blocks from plain text files (like .txt files).
        Handles formats like llms.txt where code blocks may be indicated by:
        - Triple backticks (```)
        - Language indicators (e.g., "typescript", "python")
        - Indentation patterns
        - Code block separators

        Args:
            content: The plain text content
            url: The URL of the text file for context
            min_length: Minimum length for code blocks (None = use settings)

        Returns:
            List of code blocks with metadata
        """
        safe_logfire_info(
            f"🔍 TEXT FILE EXTRACTION START | url={url} | content_length={len(content)}"
        )
        safe_logfire_info(f"📄 First 1000 chars: {repr(content[:1000])}...")
        safe_logfire_info(
            f"📄 Sample showing backticks: {repr(content[5000:6000])}..."
            if len(content) > 6000
            else "Content too short for mid-sample"
        )

        code_blocks = []

        # Method 1: Look for triple backtick code blocks (Markdown style)
        # Pattern allows for additional text after language (e.g., "typescript TypeScript")
        backtick_pattern = r"```(\w*)[^\n]*\n(.*?)```"
        matches = list(re.finditer(backtick_pattern, content, re.DOTALL | re.MULTILINE))
        safe_logfire_info(f"📊 Backtick pattern matches: {len(matches)}")

        for i, match in enumerate(matches):
            language = match.group(1) or ""
            code_content = match.group(2).strip()

            # Log match info without including the actual content that might break formatting
            safe_logfire_info(
                f"🔎 Match {i + 1}: language='{language}', raw_length={len(code_content)}"
            )

            # Get position info first
            start_pos = match.start()
            end_pos = match.end()

            # Calculate dynamic minimum length
            context_around = content[max(0, start_pos - 500) : min(len(content), end_pos + 500)]
            if min_length is None:
                actual_min_length = await self._calculate_min_length(language, context_around)
            else:
                actual_min_length = min_length

            if len(code_content) >= actual_min_length:
                # Get context
                context_before = content[max(0, start_pos - 500) : start_pos].strip()
                context_after = content[end_pos : min(len(content), end_pos + 500)].strip()

                # Clean and validate
                cleaned_code = self._clean_code_content(code_content, language)
                safe_logfire_info(f"🧹 After cleaning: length={len(cleaned_code)}")

                if await self._validate_code_quality(cleaned_code, language):
                    safe_logfire_info(
                        f"✅ VALID backtick code block | language={language} | length={len(cleaned_code)}"
                    )
                    code_blocks.append({
                        "code": cleaned_code,
                        "language": language,
                        "context_before": context_before,
                        "context_after": context_after,
                        "full_context": f"{context_before}\n\n{cleaned_code}\n\n{context_after}",
                        "source_type": "text_backticks",
                    })
                else:
                    safe_logfire_info(
                        f"❌ INVALID code block failed validation | language={language}"
                    )
            else:
                safe_logfire_info(
                    f"❌ Code block too short: {len(code_content)} < {actual_min_length}"
                )

        # Method 2: Look for language-labeled code blocks (e.g., "TypeScript:" or "Python example:")
        language_pattern = r"(?:^|\n)((?:typescript|javascript|python|java|c\+\+|rust|go|ruby|php|swift|kotlin|scala|r|matlab|julia|dart|elixir|erlang|haskell|clojure|lua|perl|shell|bash|sql|html|css|xml|json|yaml|toml|ini|dockerfile|makefile|cmake|gradle|maven|npm|yarn|pip|cargo|gem|pod|composer|nuget|apt|yum|brew|choco|snap|flatpak|appimage|msi|exe|dmg|pkg|deb|rpm|tar|zip|7z|rar|gz|bz2|xz|zst|lz4|lzo|lzma|lzip|lzop|compress|uncompress|gzip|gunzip|bzip2|bunzip2|xz|unxz|zstd|unzstd|lz4|unlz4|lzo|unlzo|lzma|unlzma|lzip|lunzip|lzop|unlzop)\s*(?:code|example|snippet)?)[:\s]*\n((?:(?:^[ \t]+.*\n?)+)|(?:.*\n)+?)(?=\n(?:[A-Z][a-z]+\s*:|^\s*$|\n#|\n\*|\n-|\n\d+\.))"
        matches = re.finditer(language_pattern, content, re.IGNORECASE | re.MULTILINE)

        for match in matches:
            language_info = match.group(1).lower()
            # Extract just the language name
            language = (
                re.match(r"(\w+)", language_info).group(1)
                if re.match(r"(\w+)", language_info)
                else ""
            )
            code_content = match.group(2).strip()

            # Calculate dynamic minimum length for language-labeled blocks
            if min_length is None:
                actual_min_length_lang = await self._calculate_min_length(
                    language, code_content[:500]
                )
            else:
                actual_min_length_lang = min_length

            if len(code_content) >= actual_min_length_lang:
                # Get context
                start_pos = match.start()
                end_pos = match.end()
                context_before = content[max(0, start_pos - 500) : start_pos].strip()
                context_after = content[end_pos : min(len(content), end_pos + 500)].strip()

                # Clean and validate
                cleaned_code = self._clean_code_content(code_content, language)
                if await self._validate_code_quality(cleaned_code, language):
                    safe_logfire_info(
                        f"Found language-labeled code block | language={language} | length={len(cleaned_code)}"
                    )
                    code_blocks.append({
                        "code": cleaned_code,
                        "language": language,
                        "context_before": context_before,
                        "context_after": context_after,
                        "full_context": f"{context_before}\n\n{cleaned_code}\n\n{context_after}",
                        "source_type": "text_language_label",
                    })

        # Method 3: Look for consistently indented blocks (at least 4 spaces or 1 tab)
        # This is more heuristic and should be used carefully
        if len(code_blocks) == 0:  # Only if we haven't found code blocks yet
            # Resolve the minimum length up front: min_length may be None here,
            # and comparing a length against None would raise a TypeError below
            indented_min_length = (
                min_length if min_length is not None else await self._get_min_code_length()
            )

            # Split content into potential code sections
            lines = content.split("\n")
            current_block = []
            current_indent = None
            block_start_idx = 0

            for i, line in enumerate(lines):
                # Check if line is indented
                stripped = line.lstrip()
                indent = len(line) - len(stripped)

                if indent >= 4 and stripped:  # At least 4 spaces and not empty
                    if current_indent is None:
                        current_indent = indent
                        block_start_idx = i
                    current_block.append(line)
                elif current_block and len("\n".join(current_block)) >= indented_min_length:
                    # End of indented block, check if it's code
                    code_content = "\n".join(current_block)

                    # Try to detect language from content
                    language = self._detect_language_from_content(code_content)

                    # Get context
                    context_before_lines = lines[max(0, block_start_idx - 10) : block_start_idx]
                    context_after_lines = lines[i : min(len(lines), i + 10)]
                    context_before = "\n".join(context_before_lines).strip()
                    context_after = "\n".join(context_after_lines).strip()

                    # Clean and validate
                    cleaned_code = self._clean_code_content(code_content, language)
                    if await self._validate_code_quality(cleaned_code, language):
                        safe_logfire_info(
                            f"Found indented code block | language={language} | length={len(cleaned_code)}"
                        )
                        code_blocks.append({
                            "code": cleaned_code,
                            "language": language,
                            "context_before": context_before,
                            "context_after": context_after,
                            "full_context": f"{context_before}\n\n{cleaned_code}\n\n{context_after}",
                            "source_type": "text_indented",
                        })

                    # Reset for next block
                    current_block = []
                    current_indent = None
                else:
                    # Reset if not indented
                    if current_block and not stripped:
                        # Allow empty lines within code blocks
                        current_block.append(line)
                    else:
                        current_block = []
                        current_indent = None

        safe_logfire_info(
            f"📊 TEXT FILE EXTRACTION COMPLETE | total_blocks={len(code_blocks)} | url={url}"
        )
        for i, block in enumerate(code_blocks[:3]):  # Log first 3 blocks
            safe_logfire_info(
                f"📦 Block {i + 1} summary: language='{block.get('language', '')}', source_type='{block.get('source_type', '')}', length={len(block.get('code', ''))}"
            )
        return code_blocks

    def _detect_language_from_content(self, code: str) -> str:
        """
        Try to detect programming language from code content.
        This is a simple heuristic approach.
        """
        # Language detection patterns
        patterns = {
            "python": [
                r"\bdef\s+\w+\s*\(",
                r"\bclass\s+\w+",
                r"\bimport\s+\w+",
                r"\bfrom\s+\w+\s+import",
            ],
            "javascript": [
                r"\bfunction\s+\w+\s*\(",
                r"\bconst\s+\w+\s*=",
                r"\blet\s+\w+\s*=",
                r"\bvar\s+\w+\s*=",
            ],
            "typescript": [
                r"\binterface\s+\w+",
                r":\s*\w+\[\]",
                r"\btype\s+\w+\s*=",
                r"\bclass\s+\w+.*\{",
            ],
            "java": [
                r"\bpublic\s+class\s+\w+",
                r"\bprivate\s+\w+\s+\w+",
                r"\bpublic\s+static\s+void\s+main",
            ],
            "rust": [r"\bfn\s+\w+\s*\(", r"\blet\s+mut\s+\w+", r"\bimpl\s+\w+", r"\bstruct\s+\w+"],
            "go": [r"\bfunc\s+\w+\s*\(", r"\bpackage\s+\w+", r"\btype\s+\w+\s+struct"],
        }

        # Count matches for each language
        scores = {}
        for lang, lang_patterns in patterns.items():
            score = 0
            for pattern in lang_patterns:
                if re.search(pattern, code, re.MULTILINE):
                    score += 1
            if score > 0:
                scores[lang] = score

        # Return language with highest score
        if scores:
            return max(scores, key=scores.get)

        return ""
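
    # Example (illustrative): scoring picks the language with the most pattern
    # hits, so content containing both "def f():" and "import os" scores
    # python=2 and returns "python", while content matching nothing returns "".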

    async def _find_complete_code_block(
        self,
        content: str,
        start_pos: int,
        min_length: int = 250,
        language: str = "",
        max_length: int | None = None,
    ) -> tuple[str, int]:
        """
        Find a complete code block starting from a position, extending until we find a natural boundary.

        Args:
            content: The full content to search in
            start_pos: Starting position in the content
            min_length: Minimum length for the code block
            language: Detected language for language-specific patterns
            max_length: Maximum length cap (None = use settings)

        Returns:
            Tuple of (complete_code_block, end_position)
        """
        # Start with the minimum content
        if start_pos + min_length > len(content):
            return content[start_pos:], len(content)

        # Look for natural code boundaries
        boundary_patterns = [
            r"\n}\s*$",  # Closing brace at end of line
            r"\n}\s*;?\s*$",  # Closing brace with optional semicolon
            r"\n\)\s*;?\s*$",  # Closing parenthesis
            r"\n\s*$\n\s*$",  # Double newline (paragraph break)
            r"\n(?=class\s)",  # Before next class
            r"\n(?=function\s)",  # Before next function
            r"\n(?=def\s)",  # Before next Python function
            r"\n(?=export\s)",  # Before next export
            r"\n(?=const\s)",  # Before next const declaration
            r"\n(?=//)",  # Before comment block
            r"\n(?=#)",  # Before Python comment
            r"\n(?=\*)",  # Before JSDoc/comment
            r"\n(?=```)",  # Before next code block
        ]

        # Add language-specific patterns if available
        if language and language.lower() in self.LANGUAGE_PATTERNS:
            lang_patterns = self.LANGUAGE_PATTERNS[language.lower()]
            if "block_end" in lang_patterns:
                boundary_patterns.insert(0, lang_patterns["block_end"])

        # Extend until we find a boundary
        extended_pos = start_pos + min_length
        while extended_pos < len(content):
            # Check next 500 characters for a boundary
            lookahead_end = min(extended_pos + 500, len(content))
            lookahead = content[extended_pos:lookahead_end]

            for pattern in boundary_patterns:
                match = re.search(pattern, lookahead, re.MULTILINE)
                if match:
                    final_pos = extended_pos + match.end()
                    return content[start_pos:final_pos].rstrip(), final_pos

            # If no boundary found, extend by another chunk
            extended_pos += 100

            # Cap at maximum length
            if max_length is None:
                max_length = await self._get_max_code_length()
            if extended_pos - start_pos > max_length:
                break

        # Return what we have
        return content[start_pos:extended_pos].rstrip(), extended_pos
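
    # Sketch of the boundary scan above (positions assumed): with
    # min_length=250, scanning starts at start_pos + 250, inspects a 500-char
    # lookahead window for a boundary such as a closing brace on its own line,
    # and otherwise advances 100 chars at a time until max_length is reached.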

    async def _calculate_min_length(self, language: str, context: str) -> int:
        """
        Calculate appropriate minimum length based on language and context.

        Args:
            language: The detected programming language
            context: Surrounding context of the code

        Returns:
            Calculated minimum length
        """
        # Check if contextual length adjustment is enabled
        if not await self._is_contextual_length_enabled():
            # Return default minimum length
            return await self._get_min_code_length()

        # Base lengths by language
        base_lengths = {
            "json": 100,  # JSON can be short
            "yaml": 100,  # YAML too
            "xml": 100,  # XML structures
            "html": 150,  # HTML snippets
            "css": 150,  # CSS rules
            "sql": 150,  # SQL queries
            "python": 200,  # Python functions
            "javascript": 250,  # JavaScript typically longer
            "typescript": 250,  # TypeScript typically longer
            "java": 300,  # Java even more verbose
            "c++": 300,  # C++ similar to Java
            "cpp": 300,  # C++ alternative
            "c": 250,  # C slightly less verbose
            "rust": 250,  # Rust medium verbosity
            "go": 200,  # Go is concise
        }

        # Get default minimum from settings
        default_min = await self._get_min_code_length()
        min_length = base_lengths.get(language.lower(), default_min)

        # Adjust based on context clues
        context_lower = context.lower()
        if any(word in context_lower for word in ["example", "snippet", "sample", "demo"]):
            min_length = int(min_length * 0.7)  # Examples can be shorter
        elif any(word in context_lower for word in ["implementation", "complete", "full"]):
            min_length = int(min_length * 1.5)  # Full implementations should be longer
        elif any(word in context_lower for word in ["minimal", "simple", "basic"]):
            min_length = int(min_length * 0.8)  # Simple examples can be shorter

        # Ensure reasonable bounds
        return max(100, min(1000, min_length))
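
    # Worked example (illustrative): for language="python" the base length is
    # 200; if the surrounding context mentions "example", the requirement drops
    # to int(200 * 0.7) = 140, which falls inside the final [100, 1000] clamp.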

    def _decode_html_entities(self, text: str) -> str:
        """Decode common HTML entities and clean HTML tags from code."""
        # First, handle span tags that wrap individual tokens
        # Check if spans are being used for syntax highlighting (no spaces between tags)
        if "</span><span" in text:
            # This indicates syntax highlighting - preserve the structure
            text = re.sub(r"</span>", "", text)
            text = re.sub(r"<span[^>]*>", "", text)
        else:
            # Normal span usage - might need spacing
            # Only add space if there isn't already whitespace
            text = re.sub(r"</span>(?=[A-Za-z0-9])", " ", text)
            text = re.sub(r"<span[^>]*>", "", text)

        # Remove any other HTML tags but preserve their content
        text = re.sub(r"</?[^>]+>", "", text)

        # Decode HTML entities
        replacements = {
            "&lt;": "<",
            "&gt;": ">",
            "&amp;": "&",
            "&quot;": '"',
            "&#39;": "'",
            "&nbsp;": " ",
            "&#x27;": "'",
            "&#x2F;": "/",
            "&#60;": "<",
            "&#62;": ">",
        }

        for entity, char in replacements.items():
            text = text.replace(entity, char)

        # Replace escaped newlines with actual newlines
        text = text.replace("\\n", "\n")

        # Clean up excessive whitespace while preserving intentional spacing
        # Replace multiple spaces with single space, but preserve newlines
        lines = text.split("\n")
        cleaned_lines = []
        for line in lines:
            # Replace multiple spaces with single space
            line = re.sub(r" +", " ", line)
            # Trim trailing spaces but preserve leading spaces (indentation)
            line = line.rstrip()
            cleaned_lines.append(line)

        text = "\n".join(cleaned_lines)

        return text
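
    # Before/after example (input assumed): highlighted markup such as
    # <span class="kw">def</span><span> f():</span> contains "</span><span",
    # so all span tags are stripped, leaving 'def f():'; entities like "&amp;"
    # decode to "&" and escaped "\n" sequences become real newlines.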

    def _clean_code_content(self, code: str, language: str = "") -> str:
        """
        Clean and fix common issues in extracted code content.

        Args:
            code: The code content to clean
            language: The detected language (optional)

        Returns:
            Cleaned code content
        """
        # First apply HTML entity decoding and tag cleaning
        code = self._decode_html_entities(code)

        # Fix common concatenation issues from span removal
        # Common patterns where spaces are missing between keywords
        spacing_fixes = [
            # Import statements
            (r"(\b(?:from|import|as)\b)([A-Za-z])", r"\1 \2"),
            # Function/class definitions
            (r"(\b(?:def|class|async|await|return|raise|yield)\b)([A-Za-z])", r"\1 \2"),
            # Control flow
            (r"(\b(?:if|elif|else|for|while|try|except|finally|with)\b)([A-Za-z])", r"\1 \2"),
            # Type hints and declarations
            (
                r"(\b(?:int|str|float|bool|list|dict|tuple|set|None|True|False)\b)([A-Za-z])",
                r"\1 \2",
            ),
            # Common Python keywords
            (r"(\b(?:and|or|not|in|is|lambda)\b)([A-Za-z])", r"\1 \2"),
            # Fix missing spaces around operators (but be careful with negative numbers)
            (r"([A-Za-z_)])(\+|-|\*|/|=|<|>|%)", r"\1 \2"),
            (r"(\+|-|\*|/|=|<|>|%)([A-Za-z_(])", r"\1 \2"),
        ]

        for pattern, replacement in spacing_fixes:
            code = re.sub(pattern, replacement, code)

        # Fix specific patterns for different languages
        if language.lower() in ["python", "py"]:
            # Fix Python-specific issues
            code = re.sub(r"(\b(?:from|import)\b)(\w+)(\b(?:import)\b)", r"\1 \2 \3", code)
            # Fix missing colons
            code = re.sub(
                r"(\b(?:def|class|if|elif|else|for|while|try|except|finally|with)\b[^:]+)$",
                r"\1:",
                code,
                flags=re.MULTILINE,
            )

        # Remove backticks that might have been included
        if code.startswith("```") and code.endswith("```"):
            lines = code.split("\n")
            if len(lines) > 2:
                # Remove first and last line
                code = "\n".join(lines[1:-1])
        elif code.startswith("`") and code.endswith("`"):
            code = code[1:-1]

        # Final cleanup
        # Remove any remaining excessive spaces while preserving indentation
        lines = code.split("\n")
        cleaned_lines = []
        for line in lines:
            # Don't touch leading whitespace (indentation)
            stripped = line.lstrip()
            indent = line[: len(line) - len(stripped)]
            # Clean the rest of the line
            cleaned = re.sub(r" {2,}", " ", stripped)
            cleaned_lines.append(indent + cleaned)

        return "\n".join(cleaned_lines).strip()

    async def _validate_code_quality(self, code: str, language: str = "") -> bool:
        """
        Enhanced validation to ensure extracted content is actual code.

        Args:
            code: The code content to validate
            language: The detected language (optional)

        Returns:
            True if code passes quality checks, False otherwise
        """
        # Basic checks
        if not code or len(code.strip()) < 20:
            return False

        # Skip diagram languages if filtering is enabled
        if await self._is_diagram_filtering_enabled():
            if language.lower() in ["mermaid", "plantuml", "graphviz", "dot", "diagram"]:
                safe_logfire_info(f"Skipping diagram language: {language}")
                return False

        # Check for common formatting issues that indicate poor extraction
        bad_patterns = [
            # Concatenated keywords without spaces (but allow camelCase)
            r"\b(from|import|def|class|if|for|while|return)(?=[a-z])",
            # HTML entities that weren't decoded
            r"&[lg]t;|&amp;|&quot;|&#\d+;",
            # Excessive HTML tags
            r"<[^>]{50,}>",  # Very long HTML tags
            # Multiple spans in a row (indicates poor extraction)
            r"(<span[^>]*>){5,}",
            # Suspicious character sequences
            r"[^\s]{200,}",  # Very long unbroken strings (increased threshold)
        ]

        for pattern in bad_patterns:
            if re.search(pattern, code):
                safe_logfire_info(f"Code failed quality check: pattern '{pattern}' found")
                return False

        # Check for minimum code complexity using various indicators
        code_indicators = {
            "function_calls": r"\w+\s*\([^)]*\)",
            "assignments": r"\w+\s*=\s*.+",
            "control_flow": r"\b(if|for|while|switch|case|try|catch|except)\b",
            "declarations": r"\b(var|let|const|def|class|function|interface|type|struct|enum)\b",
            "imports": r"\b(import|from|require|include|using|use)\b",
            "brackets": r"[\{\}\[\]]",
            "operators": r"[\+\-\*\/\%\&\|\^<>=!]",
            "method_chains": r"\.\w+",
            "arrows": r"(=>|->)",
            "keywords": r"\b(return|break|continue|yield|await|async)\b",
        }

        indicator_count = 0
        indicator_details = []
        for name, pattern in code_indicators.items():
            if re.search(pattern, code):
                indicator_count += 1
                indicator_details.append(name)

        # Require minimum code indicators
        min_indicators = await self._get_min_code_indicators()
        if indicator_count < min_indicators:
            safe_logfire_info(
                f"Code has insufficient indicators: {indicator_count} found ({', '.join(indicator_details)})"
            )
            return False

        # Check code-to-comment ratio
        lines = code.split("\n")
        non_empty_lines = [line for line in lines if line.strip()]

        if not non_empty_lines:
            return False

        # Count comment lines (various comment styles)
        comment_patterns = [
            r"^\s*(//|#|/\*|\*|<!--)",  # Single line comments
            r'^\s*"""',  # Python docstrings
            r"^\s*'''",  # Python docstrings alt
            r"^\s*\*\s",  # JSDoc style
        ]

        comment_lines = 0
        for line in lines:
            for pattern in comment_patterns:
                if re.match(pattern, line.strip()):
                    comment_lines += 1
                    break

        # Allow up to 70% comments (documentation is important)
        if non_empty_lines and comment_lines / len(non_empty_lines) > 0.7:
            safe_logfire_info(
                f"Code is mostly comments: {comment_lines}/{len(non_empty_lines)} lines"
            )
            return False

        # Language-specific validation
        if language.lower() in self.LANGUAGE_PATTERNS:
            lang_info = self.LANGUAGE_PATTERNS[language.lower()]
            min_indicators = lang_info.get("min_indicators", [])

            # Check for language-specific indicators
            found_lang_indicators = sum(
                1 for indicator in min_indicators if indicator in code.lower()
            )

            if found_lang_indicators < 2:  # Need at least 2 language-specific indicators
                safe_logfire_info(
                    f"Code lacks {language} indicators: only {found_lang_indicators} found"
                )
                return False

        # Check for reasonable structure
        # Too few meaningful lines
        if len(non_empty_lines) < 3:
            safe_logfire_info(f"Code has too few non-empty lines: {len(non_empty_lines)}")
            return False

        # Check for reasonable line lengths
        very_long_lines = sum(1 for line in lines if len(line) > 300)
        if len(lines) > 0 and very_long_lines > len(lines) * 0.5:
            safe_logfire_info("Code has too many very long lines")
            return False

        # Check if it's mostly prose/documentation
        prose_indicators = [
            r"\b(the|this|that|these|those|is|are|was|were|will|would|should|could|have|has|had)\b",
            r"[.!?]\s+[A-Z]",  # Sentence endings followed by capital letter
            r"\b(however|therefore|furthermore|moreover|nevertheless)\b",
        ]

        prose_score = 0
        word_count = len(code.split())
        for pattern in prose_indicators:
            matches = len(re.findall(pattern, code, re.IGNORECASE))
            prose_score += matches

        # Check prose filtering
        if await self._is_prose_filtering_enabled():
            max_prose_ratio = await self._get_max_prose_ratio()
            if word_count > 0 and prose_score / word_count > max_prose_ratio:
                safe_logfire_info(
                    f"Code appears to be prose: prose_score={prose_score}, word_count={word_count}"
                )
                return False

        # Passed all checks
        safe_logfire_info(
            f"Code passed validation: indicators={indicator_count}, language={language}, lines={len(non_empty_lines)}"
        )
        return True
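
    # Worked example of the prose check (illustrative numbers): with the
    # default MAX_PROSE_RATIO of 0.15, a 100-word block whose prose patterns
    # match 20 times has a ratio of 0.2 and is rejected as documentation
    # rather than code.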

    async def _generate_code_summaries(
        self,
        all_code_blocks: list[dict[str, Any]],
        progress_callback: Callable | None = None,
        start_progress: int = 0,
        end_progress: int = 100,
    ) -> list[dict[str, str]]:
        """
        Generate summaries for all code blocks.

        Returns:
            List of summary results
        """
        # Check if code summaries are enabled
        if not await self._is_code_summaries_enabled():
            safe_logfire_info("Code summaries generation is disabled, returning default summaries")
            # Return default summaries for all code blocks
            default_summaries = []
            for item in all_code_blocks:
                block = item["block"]
                language = block.get("language", "")
                default_summaries.append({
                    "example_name": f"Code Example{f' ({language})' if language else ''}",
                    "summary": "Code example for demonstration purposes.",
                })

            # Report progress for skipped summaries
            if progress_callback:
                await progress_callback({
                    "status": "code_extraction",
                    "percentage": end_progress,
                    "log": f"Skipped AI summary generation (disabled). Using default summaries for {len(all_code_blocks)} code blocks.",
                })

            return default_summaries

        # Progress is handled by generate_code_summaries_batch

        # Use default max workers
        max_workers = 3

        # Extract just the code blocks for batch processing
        code_blocks_for_summaries = [item["block"] for item in all_code_blocks]

        # Generate summaries with mapped progress tracking
        summary_progress_callback = None
        if progress_callback:
            # Create a wrapper that maps the progress to the correct range
            async def mapped_callback(data: dict):
                # Map the percentage from generate_code_summaries_batch (0-100) to our range
                if "percentage" in data:
                    raw_percentage = data["percentage"]
                    # Map from 0-100 to start_progress-end_progress
                    mapped_percentage = start_progress + int(
                        (raw_percentage / 100) * (end_progress - start_progress)
                    )
                    data["percentage"] = mapped_percentage
                # Change the status to match what the orchestration expects
                data["status"] = "code_extraction"
                await progress_callback(data)

            summary_progress_callback = mapped_callback

        return await generate_code_summaries_batch(
            code_blocks_for_summaries, max_workers, progress_callback=summary_progress_callback
        )

    def _prepare_code_examples_for_storage(
        self, all_code_blocks: list[dict[str, Any]], summary_results: list[dict[str, str]]
    ) -> dict[str, list[Any]]:
        """
        Prepare code examples for storage by organizing data into arrays.

        Returns:
            Dictionary with arrays for storage
        """
        code_urls = []
        code_chunk_numbers = []
        code_examples = []
        code_summaries = []
        code_metadatas = []

        for code_item, summary_result in zip(all_code_blocks, summary_results, strict=False):
            block = code_item["block"]
            source_url = code_item["source_url"]
            source_id = code_item["source_id"]

            summary = summary_result.get("summary", "Code example for demonstration purposes.")
            example_name = summary_result.get("example_name", "Code Example")

            code_urls.append(source_url)
            code_chunk_numbers.append(len(code_examples))
            code_examples.append(block["code"])
            code_summaries.append(summary)

            code_meta = {
                "chunk_index": len(code_examples) - 1,
                "url": source_url,
                "source": source_id,
                "source_id": source_id,
                "language": block.get("language", ""),
                "char_count": len(block["code"]),
                "word_count": len(block["code"].split()),
                "example_name": example_name,
                "title": example_name,
            }
            code_metadatas.append(code_meta)

        return {
            "urls": code_urls,
            "chunk_numbers": code_chunk_numbers,
            "examples": code_examples,
            "summaries": code_summaries,
            "metadatas": code_metadatas,
        }

    async def _store_code_examples(
        self,
        storage_data: dict[str, list[Any]],
        url_to_full_document: dict[str, str],
        progress_callback: Callable | None = None,
        start_progress: int = 0,
        end_progress: int = 100,
    ) -> int:
        """
        Store code examples in the database.

        Returns:
            Number of code examples stored
        """
        # Create mapped progress callback for storage phase
        storage_progress_callback = None
        if progress_callback:

            async def mapped_storage_callback(data: dict):
                # Extract values from the dictionary
                message = data.get("log", "")
                percentage = data.get("percentage", 0)

                # Map storage progress (0-100) to our range (start_progress to end_progress)
                mapped_percentage = start_progress + int(
                    (percentage / 100) * (end_progress - start_progress)
                )

                update_data = {
                    "status": "code_storage",
                    "percentage": mapped_percentage,
                    "log": message,
                }

                # Pass through any additional batch info
                if "batch_number" in data:
                    update_data["batch_number"] = data["batch_number"]
                if "total_batches" in data:
                    update_data["total_batches"] = data["total_batches"]

                await progress_callback(update_data)

            storage_progress_callback = mapped_storage_callback

        try:
            await add_code_examples_to_supabase(
                client=self.supabase_client,
                urls=storage_data["urls"],
                chunk_numbers=storage_data["chunk_numbers"],
                code_examples=storage_data["examples"],
                summaries=storage_data["summaries"],
                metadatas=storage_data["metadatas"],
                batch_size=20,
                url_to_full_document=url_to_full_document,
                progress_callback=storage_progress_callback,
                provider=None,  # Use configured provider
            )

            # Report final progress for code storage phase (not overall completion)
            if progress_callback:
                await progress_callback({
                    "status": "code_extraction",  # Keep status as code_extraction, not completed
                    "percentage": end_progress,
                    "log": f"Code extraction phase completed. Stored {len(storage_data['examples'])} code examples.",
                })

            safe_logfire_info(f"Successfully stored {len(storage_data['examples'])} code examples")
            return len(storage_data["examples"])

        except Exception as e:
            safe_logfire_error(f"Error storing code examples | error={str(e)}")
            return 0
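

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the module): how a caller might wire
# this service into a crawl pipeline. The client construction and the shape of
# crawl_results are assumptions based on the signatures above.
#
#   from supabase import create_client
#
#   client = create_client(SUPABASE_URL, SUPABASE_KEY)
#   service = CodeExtractionService(client)
#   stored = await service.extract_and_store_code_examples(
#       crawl_results=[{"url": "https://docs.example.com", "markdown": md}],
#       url_to_full_document={"https://docs.example.com": md},
#       source_id="docs.example.com",
#   )
# ---------------------------------------------------------------------------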