"""
Code Extraction Service

Handles extraction, processing, and storage of code examples from documents.
"""

import re
from collections.abc import Callable
from typing import Any

from ...config.logfire_config import safe_logfire_error, safe_logfire_info
from ...services.credential_service import credential_service
from ..storage.code_storage_service import (
    add_code_examples_to_supabase,
    generate_code_summaries_batch,
)


class CodeExtractionService:
    """
    Service for extracting and processing code examples from documents.
    """

    # Language-specific patterns for better extraction
    LANGUAGE_PATTERNS = {
        "typescript": {
            "block_start": r"^\s*(export\s+)?(class|interface|function|const|type|enum)\s+\w+",
            "block_end": r"^\}(\s*;)?$",
            "min_indicators": [":", "{", "}", "=>", "function", "class", "interface", "type"],
        },
        "javascript": {
            "block_start": r"^\s*(export\s+)?(class|function|const|let|var)\s+\w+",
            "block_end": r"^\}(\s*;)?$",
            "min_indicators": ["function", "{", "}", "=>", "const", "let", "var"],
        },
        "python": {
            "block_start": r"^\s*(class|def|async\s+def)\s+\w+",
            "block_end": r"^\S",  # Unindented line
            "min_indicators": ["def", ":", "return", "self", "import", "class"],
        },
        "java": {
            "block_start": r"^\s*(public|private|protected)?\s*(class|interface|enum)\s+\w+",
            "block_end": r"^\}$",
            "min_indicators": ["class", "public", "private", "{", "}", ";"],
        },
        "rust": {
            "block_start": r"^\s*(pub\s+)?(fn|struct|impl|trait|enum)\s+\w+",
            "block_end": r"^\}$",
            "min_indicators": ["fn", "let", "mut", "impl", "struct", "->"],
        },
        "go": {
            "block_start": r"^\s*(func|type|struct)\s+\w+",
            "block_end": r"^\}$",
            "min_indicators": ["func", "type", "struct", "{", "}", ":="],
        },
    }

    def __init__(self, supabase_client):
        """
        Initialize the code extraction service.

        Args:
            supabase_client: The Supabase client for database operations
        """
        self.supabase_client = supabase_client
        self._settings_cache = {}

    async def _get_setting(self, key: str, default: Any) -> Any:
        """Get a setting from credential service with caching."""
        if key in self._settings_cache:
            return self._settings_cache[key]

        try:
            value = await credential_service.get_credential(key, default)
            # Convert string values to appropriate types
            if isinstance(default, bool):
                value = str(value).lower() == "true" if value is not None else default
            elif isinstance(default, int):
                value = int(value) if value is not None else default
            elif isinstance(default, float):
                value = float(value) if value is not None else default
            self._settings_cache[key] = value
            return value
        except Exception as e:
            safe_logfire_error(f"Error getting setting {key}: {e}, using default: {default}")
            # Make sure we return the default value with correct type
            self._settings_cache[key] = default
            return default
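
    # Illustrative usage sketch (values assumed): repeated reads of the same
    # key hit the credential service only once, and string values are coerced
    # to the type of the supplied default:
    #
    #   ratio = await self._get_setting("MAX_PROSE_RATIO", 0.15)  # "0.2" -> 0.2
    #   ratio = await self._get_setting("MAX_PROSE_RATIO", 0.15)  # cache hit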

    async def _get_min_code_length(self) -> int:
        """Get minimum code block length setting."""
        return await self._get_setting("MIN_CODE_BLOCK_LENGTH", 250)

    async def _get_max_code_length(self) -> int:
        """Get maximum code block length setting."""
        return await self._get_setting("MAX_CODE_BLOCK_LENGTH", 5000)

    async def _is_complete_block_detection_enabled(self) -> bool:
        """Check if complete block detection is enabled."""
        return await self._get_setting("ENABLE_COMPLETE_BLOCK_DETECTION", True)

    async def _is_language_patterns_enabled(self) -> bool:
        """Check if language-specific patterns are enabled."""
        return await self._get_setting("ENABLE_LANGUAGE_SPECIFIC_PATTERNS", True)

    async def _is_prose_filtering_enabled(self) -> bool:
        """Check if prose filtering is enabled."""
        return await self._get_setting("ENABLE_PROSE_FILTERING", True)

    async def _get_max_prose_ratio(self) -> float:
        """Get maximum allowed prose ratio."""
        return await self._get_setting("MAX_PROSE_RATIO", 0.15)

    async def _get_min_code_indicators(self) -> int:
        """Get minimum required code indicators."""
        return await self._get_setting("MIN_CODE_INDICATORS", 3)

    async def _is_diagram_filtering_enabled(self) -> bool:
        """Check if diagram filtering is enabled."""
        return await self._get_setting("ENABLE_DIAGRAM_FILTERING", True)

    async def _is_contextual_length_enabled(self) -> bool:
        """Check if contextual length adjustment is enabled."""
        return await self._get_setting("ENABLE_CONTEXTUAL_LENGTH", True)

    async def _get_context_window_size(self) -> int:
        """Get context window size for code blocks."""
        return await self._get_setting("CONTEXT_WINDOW_SIZE", 1000)

    async def _is_code_summaries_enabled(self) -> bool:
        """Check if code summaries generation is enabled."""
        return await self._get_setting("ENABLE_CODE_SUMMARIES", True)

    async def extract_and_store_code_examples(
        self,
        crawl_results: list[dict[str, Any]],
        url_to_full_document: dict[str, str],
        source_id: str,
        progress_callback: Callable | None = None,
        start_progress: int = 0,
        end_progress: int = 100,
    ) -> int:
        """
        Extract code examples from crawled documents and store them.

        Args:
            crawl_results: List of crawled documents with url and markdown content
            url_to_full_document: Mapping of URLs to full document content
            source_id: The unique source_id for all documents
            progress_callback: Optional async callback for progress updates
            start_progress: Starting progress percentage (default: 0)
            end_progress: Ending progress percentage (default: 100)

        Returns:
            Number of code examples stored
        """
        # Divide the progress range into phases:
        # - Extract code blocks: start_progress to 40% of range
        # - Generate summaries: 40% to 80% of range
        # - Store examples: 80% to end_progress
        progress_range = end_progress - start_progress
        extract_end = start_progress + int(progress_range * 0.4)
        summary_end = start_progress + int(progress_range * 0.8)

        # Extract code blocks from all documents
        all_code_blocks = await self._extract_code_blocks_from_documents(
            crawl_results, source_id, progress_callback, start_progress, extract_end
        )

        if not all_code_blocks:
            safe_logfire_info("No code examples found in any crawled documents")
            # Still report completion when no code examples found
            if progress_callback:
                await progress_callback({
                    "status": "code_extraction",
                    "percentage": end_progress,
                    "log": "No code examples found to extract",
                })
            return 0

        # Log what we found
        safe_logfire_info(f"Found {len(all_code_blocks)} total code blocks to process")
        for i, block_data in enumerate(all_code_blocks[:3]):
            block = block_data["block"]
            safe_logfire_info(
                f"Sample code block {i + 1} | language={block.get('language', 'none')} | code_length={len(block.get('code', ''))}"
            )

        # Generate summaries for code blocks with mapped progress
        summary_results = await self._generate_code_summaries(
            all_code_blocks, progress_callback, extract_end, summary_end
        )

        # Prepare code examples for storage
        storage_data = self._prepare_code_examples_for_storage(all_code_blocks, summary_results)

        # Store code examples in database with final phase progress
        return await self._store_code_examples(
            storage_data, url_to_full_document, progress_callback, summary_end, end_progress
        )
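
    # Worked example of the phase split above (illustrative numbers): with
    # start_progress=20 and end_progress=80, progress_range is 60, so block
    # extraction reports 20-44 (extract_end = 20 + int(60 * 0.4) = 44),
    # summary generation reports 44-68 (summary_end = 20 + int(60 * 0.8) = 68),
    # and storage reports 68-80.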

    async def _extract_code_blocks_from_documents(
        self,
        crawl_results: list[dict[str, Any]],
        source_id: str,
        progress_callback: Callable | None = None,
        start_progress: int = 0,
        end_progress: int = 100,
    ) -> list[dict[str, Any]]:
        """
        Extract code blocks from all documents.

        Args:
            crawl_results: List of crawled documents
            source_id: The unique source_id for all documents
            progress_callback: Optional async callback for progress updates
            start_progress: Starting progress percentage for this phase
            end_progress: Ending progress percentage for this phase

        Returns:
            List of code blocks with metadata
        """
        import asyncio
        import time

        # Progress will be reported during the loop below

        all_code_blocks = []
        total_docs = len(crawl_results)
        completed_docs = 0

        # PERFORMANCE: Track extraction time per document
        MAX_EXTRACTION_TIME_PER_DOC = 5.0  # 5 seconds max per document

        for doc in crawl_results:
            try:
                doc_start_time = time.time()
                source_url = doc["url"]
                html_content = doc.get("html", "")
                md = doc.get("markdown", "")

                # Debug logging
                safe_logfire_info(
                    f"Document content check | url={source_url} | has_html={bool(html_content)} | has_markdown={bool(md)} | html_len={len(html_content) if html_content else 0} | md_len={len(md) if md else 0}"
                )

                # Dynamic minimum length is handled inside the extraction methods

                # Check markdown first to see if it has code blocks
                if md:
                    has_backticks = "```" in md
                    backtick_count = md.count("```")
                    safe_logfire_info(
                        f"Markdown check | url={source_url} | has_backticks={has_backticks} | backtick_count={backtick_count}"
                    )

                if "getting-started" in source_url and md:
                    # Log a sample of the markdown
                    sample = md[:500]
                    safe_logfire_info(f"Markdown sample for getting-started: {sample}...")

                # Improved extraction logic - check for text files first, then HTML, then markdown
                code_blocks = []

                # Check if this is a text file (e.g., .txt, .md)
                is_text_file = source_url.endswith((
                    ".txt",
                    ".text",
                    ".md",
                )) or "text/plain" in doc.get("content_type", "")

                if is_text_file:
                    # For text files, use specialized text extraction
                    safe_logfire_info(f"🎯 TEXT FILE DETECTED | url={source_url}")
                    safe_logfire_info(
                        f"📊 Content types - has_html={bool(html_content)}, has_md={bool(md)}"
                    )
                    # For text files, the HTML content should be the raw text (not wrapped in <pre>)
                    text_content = html_content if html_content else md
                    if text_content:
                        safe_logfire_info(
                            f"📝 Using {'HTML' if html_content else 'MARKDOWN'} content for text extraction"
                        )
                        safe_logfire_info(
                            f"🔍 Content preview (first 500 chars): {repr(text_content[:500])}..."
                        )
                        code_blocks = await self._extract_text_file_code_blocks(
                            text_content, source_url
                        )
                        safe_logfire_info(
                            f"📦 Text extraction complete | found={len(code_blocks)} blocks | url={source_url}"
                        )
                    else:
                        safe_logfire_info(f"⚠️ NO CONTENT for text file | url={source_url}")

                # If not a text file and nothing was found yet, try HTML extraction
                if len(code_blocks) == 0 and html_content and not is_text_file:
                    # PERFORMANCE: Check if we've already spent too much time on this document
                    elapsed_time = time.time() - doc_start_time
                    if elapsed_time > MAX_EXTRACTION_TIME_PER_DOC:
                        safe_logfire_info(
                            f"⏱️ Skipping HTML extraction for {source_url} - already spent {elapsed_time:.1f}s"
                        )
                    else:
                        safe_logfire_info(
                            f"Trying HTML extraction first | url={source_url} | html_length={len(html_content)}"
                        )
                        # Apply a timeout to HTML extraction
                        remaining_time = MAX_EXTRACTION_TIME_PER_DOC - elapsed_time
                        try:
                            html_code_blocks = await asyncio.wait_for(
                                self._extract_html_code_blocks(html_content, source_url),
                                timeout=remaining_time,
                            )
                            if html_code_blocks:
                                code_blocks = html_code_blocks
                                safe_logfire_info(
                                    f"Found {len(code_blocks)} code blocks from HTML | url={source_url}"
                                )
                        except asyncio.TimeoutError:
                            safe_logfire_info(
                                f"⏱️ HTML extraction timed out after {remaining_time:.1f}s for {source_url}"
                            )

                # If still no code blocks, try markdown extraction as fallback
                if len(code_blocks) == 0 and md and "```" in md:
                    safe_logfire_info(
                        f"No code blocks from HTML, trying markdown extraction | url={source_url}"
                    )
                    from ..storage.code_storage_service import extract_code_blocks

                    # Use the default minimum for markdown extraction
                    base_min_length = 250  # Default for markdown
                    code_blocks = extract_code_blocks(md, min_length=base_min_length)
                    safe_logfire_info(
                        f"Found {len(code_blocks)} code blocks from markdown | url={source_url}"
                    )

                if code_blocks:
                    # Use the provided source_id for all code blocks
                    for block in code_blocks:
                        all_code_blocks.append({
                            "block": block,
                            "source_url": source_url,
                            "source_id": source_id,
                        })

                # Update progress only after completing document extraction
                completed_docs += 1
                extraction_time = time.time() - doc_start_time
                if extraction_time > 2.0:  # Log slow extractions
                    safe_logfire_info(
                        f"⏱️ Document extraction took {extraction_time:.1f}s | url={source_url} | "
                        f"html_size={len(html_content) if html_content else 0} | "
                        f"blocks_found={len([b for b in all_code_blocks if b['source_url'] == source_url])}"
                    )

                if progress_callback and total_docs > 0:
                    # Calculate progress within the specified range
                    raw_progress = completed_docs / total_docs
                    mapped_progress = start_progress + int(
                        raw_progress * (end_progress - start_progress)
                    )
                    await progress_callback({
                        "status": "code_extraction",
                        "percentage": mapped_progress,
                        "log": f"Extracted code from {completed_docs}/{total_docs} documents",
                        "completed_documents": completed_docs,
                        "total_documents": total_docs,
                    })

            except Exception as e:
                safe_logfire_error(
                    f"Error processing code from document | url={doc.get('url')} | error={str(e)}"
                )

        return all_code_blocks
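
    # Minimal sketch of the per-document time budget used above (standalone
    # names like `extract` are assumptions for illustration):
    #
    #   start = time.time()
    #   remaining = MAX_EXTRACTION_TIME_PER_DOC - (time.time() - start)
    #   try:
    #       blocks = await asyncio.wait_for(extract(html), timeout=remaining)
    #   except asyncio.TimeoutError:
    #       blocks = []  # give up on this document, keep crawling the rest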

    async def _extract_html_code_blocks(self, content: str, source_url: str = "") -> list[dict[str, Any]]:
        """
        Extract code blocks from HTML patterns in content.
        This is a fallback when markdown conversion didn't preserve code blocks.

        Args:
            content: The content to search for HTML code patterns
            source_url: The URL of the document being processed

        Returns:
            List of code blocks with metadata
        """
        # Add detailed logging
        safe_logfire_info(f"Processing HTML of length {len(content)} for code extraction")

        # PERFORMANCE OPTIMIZATION: Skip extremely large HTML files or chunk them
        MAX_HTML_SIZE = 1_000_000  # 1MB limit for single-pass processing (increased from 500KB)
        if len(content) > MAX_HTML_SIZE:
            safe_logfire_info(
                f"⚠️ HTML content is very large ({len(content)} bytes). "
                f"Limiting to first {MAX_HTML_SIZE} bytes to prevent timeout."
            )
            # For very large files, focus on the first portion where code examples are likely to be
            content = content[:MAX_HTML_SIZE]
            # Try to find a good cutoff point (end of a tag)
            last_tag_end = content.rfind(">")
            if last_tag_end > MAX_HTML_SIZE - 1000:
                content = content[: last_tag_end + 1]

        # Check if we have actual content
        if len(content) < 1000:
            safe_logfire_info(
                f"Warning: HTML content seems too short, first 500 chars: {repr(content[:500])}"
            )

        # Look for specific indicators of code blocks
        has_prism = "prism" in content.lower()
        has_highlight = "highlight" in content.lower()
        has_shiki = "shiki" in content.lower()
        has_codemirror = "codemirror" in content.lower() or "cm-" in content
        safe_logfire_info(
            f"Code library indicators | prism={has_prism} | highlight={has_highlight} | shiki={has_shiki} | codemirror={has_codemirror}"
        )

        # Check for any pre tags with different attributes
        pre_matches = re.findall(r"<pre[^>]*>", content[:5000], re.IGNORECASE)
        if pre_matches:
            safe_logfire_info(f"Found {len(pre_matches)} <pre> tags in first 5000 chars")
            for i, pre_tag in enumerate(pre_matches[:3]):  # Show first 3
                safe_logfire_info(f"Pre tag {i + 1}: {pre_tag}")

        code_blocks = []
        extracted_positions = set()  # Track already extracted code block positions

        # Comprehensive patterns for various code block formats
        # Order matters - more specific patterns first
        patterns = [
            # GitHub/GitLab patterns
            (
                r'<div[^>]*class=["\'][^"\']*highlight[^"\']*["\'][^>]*>.*?<pre[^>]*class=["\'][^"\']*(?:language-)?(\w+)[^"\']*["\'][^>]*><code[^>]*>(.*?)</code></pre>',
                "github-highlight",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*snippet-clipboard-content[^"\']*["\'][^>]*>.*?<pre[^>]*><code[^>]*>(.*?)</code></pre>',
                "github-snippet",
            ),
            # Docusaurus patterns
            (
                r'<div[^>]*class=["\'][^"\']*codeBlockContainer[^"\']*["\'][^>]*>.*?<pre[^>]*class=["\'][^"\']*prism-code[^"\']*language-(\w+)[^"\']*["\'][^>]*>(.*?)</pre>',
                "docusaurus",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*language-(\w+)[^"\']*["\'][^>]*>.*?<pre[^>]*class=["\'][^"\']*prism-code[^"\']*["\'][^>]*>(.*?)</pre>',
                "docusaurus-alt",
            ),
            # Milkdown specific patterns - check their actual HTML structure
            (
                r'<pre[^>]*><code[^>]*class=["\'][^"\']*language-(\w+)[^"\']*["\'][^>]*>(.*?)</code></pre>',
                "milkdown-typed",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*code-wrapper[^"\']*["\'][^>]*>.*?<pre[^>]*>(.*?)</pre>',
                "milkdown-wrapper",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*code-block-wrapper[^"\']*["\'][^>]*>.*?<pre[^>]*><code[^>]*>(.*?)</code></pre>',
                "milkdown-wrapper-code",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*milkdown-code-block[^"\']*["\'][^>]*>.*?<pre[^>]*><code[^>]*>(.*?)</code></pre>',
                "milkdown-code-block",
            ),
            (
                r'<pre[^>]*class=["\'][^"\']*code-block[^"\']*["\'][^>]*><code[^>]*>(.*?)</code></pre>',
                "milkdown",
            ),
            (r"<div[^>]*data-code-block[^>]*>.*?<pre[^>]*>(.*?)</pre>", "milkdown-alt"),
            (
                r'<div[^>]*class=["\'][^"\']*milkdown[^"\']*["\'][^>]*>.*?<pre[^>]*><code[^>]*>(.*?)</code></pre>',
                "milkdown-div",
            ),
            # Monaco Editor - capture all view-lines content
            (
                r'<div[^>]*class=["\'][^"\']*monaco-editor[^"\']*["\'][^>]*>.*?<div[^>]*class=["\'][^"\']*view-lines[^"\']*[^>]*>(.*?)</div>(?=.*?</div>.*?</div>)',
                "monaco",
            ),
            # CodeMirror patterns
            (
                r'<div[^>]*class=["\'][^"\']*cm-content[^"\']*["\'][^>]*>((?:<div[^>]*class=["\'][^"\']*cm-line[^"\']*["\'][^>]*>.*?</div>\s*)+)</div>',
                "codemirror",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*CodeMirror[^"\']*["\'][^>]*>.*?<div[^>]*class=["\'][^"\']*CodeMirror-code[^"\']*["\'][^>]*>(.*?)</div>',
                "codemirror-legacy",
            ),
            # Prism.js with language - must be before generic pre
            (
                r'<pre[^>]*class=["\'][^"\']*language-(\w+)[^"\']*["\'][^>]*>\s*<code[^>]*>(.*?)</code>\s*</pre>',
                "prism",
            ),
            (
                r'<pre[^>]*>\s*<code[^>]*class=["\'][^"\']*language-(\w+)[^"\']*["\'][^>]*>(.*?)</code>\s*</pre>',
                "prism-alt",
            ),
            # highlight.js - must be before generic pre/code
            (
                r'<pre[^>]*><code[^>]*class=["\'][^"\']*hljs(?:\s+language-(\w+))?[^"\']*["\'][^>]*>(.*?)</code></pre>',
                "hljs",
            ),
            (
                r'<pre[^>]*class=["\'][^"\']*hljs[^"\']*["\'][^>]*><code[^>]*>(.*?)</code></pre>',
                "hljs-pre",
            ),
            # Shiki patterns (VitePress, Astro, etc.)
            (
                r'<pre[^>]*class=["\'][^"\']*shiki[^"\']*["\'][^>]*(?:.*?style=["\'][^"\']*background-color[^"\']*["\'])?[^>]*>\s*<code[^>]*>(.*?)</code>\s*</pre>',
                "shiki",
            ),
            (r'<pre[^>]*class=["\'][^"\']*astro-code[^"\']*["\'][^>]*>(.*?)</pre>', "astro-shiki"),
            (
                r'<div[^>]*class=["\'][^"\']*astro-code[^"\']*["\'][^>]*>.*?<pre[^>]*>(.*?)</pre>',
                "astro-wrapper",
            ),
            # VitePress/Vue patterns
            (
                r'<div[^>]*class=["\'][^"\']*language-(\w+)[^"\']*["\'][^>]*>.*?<pre[^>]*>(.*?)</pre>',
                "vitepress",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*vp-code[^"\']*["\'][^>]*>.*?<pre[^>]*>(.*?)</pre>',
                "vitepress-vp",
            ),
            # Nextra patterns
            (r"<div[^>]*data-nextra-code[^>]*>.*?<pre[^>]*>(.*?)</pre>", "nextra"),
            (
                r'<pre[^>]*class=["\'][^"\']*nx-[^"\']*["\'][^>]*><code[^>]*>(.*?)</code></pre>',
                "nextra-nx",
            ),
            # Standard pre/code patterns - should be near the end
            (
                r'<pre[^>]*><code[^>]*class=["\'][^"\']*language-(\w+)[^"\']*["\'][^>]*>(.*?)</code></pre>',
                "standard-lang",
            ),
            (r"<pre[^>]*>\s*<code[^>]*>(.*?)</code>\s*</pre>", "standard"),
            # Generic patterns - should be last
            (
                r'<div[^>]*class=["\'][^"\']*code-block[^"\']*["\'][^>]*>.*?<pre[^>]*>(.*?)</pre>',
                "generic-div",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*codeblock[^"\']*["\'][^>]*>(.*?)</div>',
                "generic-codeblock",
            ),
            (
                r'<div[^>]*class=["\'][^"\']*highlight[^"\']*["\'][^>]*>.*?<pre[^>]*>(.*?)</pre>',
                "highlight",
            ),
        ]

        # PERFORMANCE: Early exit checks to avoid unnecessary regex processing
        # Check more content (20KB instead of 5KB) and add URL-based exceptions
        check_size = min(20000, len(content))  # Check first 20KB or entire content if smaller
        has_code_indicators = any(
            indicator in content[:check_size]
            for indicator in ["<pre", "<code", "language-", "hljs", "prism", "shiki", "highlight"]
        )

        # Never skip certain documentation sites that we know have code
        is_known_code_site = any(
            domain in source_url.lower()
            for domain in ["milkdown", "github.com", "gitlab", "docs.", "dev.", "api."]
        )

        if not has_code_indicators and not is_known_code_site:
            safe_logfire_info(
                f"No code indicators found in first {check_size} chars and not a known code site, skipping HTML extraction | url={source_url}"
            )
            return []

        if is_known_code_site and not has_code_indicators:
            safe_logfire_info(
                f"Known code site but no indicators in first {check_size} chars, continuing anyway | url={source_url}"
            )

        # PERFORMANCE: Limit number of patterns to check based on detected libraries
        patterns_to_check = []
        content_lower = content[:10000].lower()  # Check first 10KB for library detection

        # Selectively add patterns based on what's detected
        if "milkdown" in content_lower:
            patterns_to_check.extend([p for p in patterns if "milkdown" in p[1]])
        if "monaco" in content_lower:
            patterns_to_check.extend([p for p in patterns if "monaco" in p[1]])
        if "codemirror" in content_lower or "cm-" in content_lower:
            patterns_to_check.extend([p for p in patterns if "codemirror" in p[1]])
        if "prism" in content_lower:
            patterns_to_check.extend([p for p in patterns if "prism" in p[1]])
        if "hljs" in content_lower or "highlight" in content_lower:
            patterns_to_check.extend([p for p in patterns if "hljs" in p[1] or "highlight" in p[1]])
        if "shiki" in content_lower or "astro" in content_lower:
            patterns_to_check.extend([p for p in patterns if "shiki" in p[1] or "astro" in p[1]])

        # Always include standard patterns as fallback (get ALL standard/generic patterns, not just last 5)
        standard_patterns = [
            p for p in patterns if any(tag in p[1] for tag in ["standard", "generic", "prism", "hljs"])
        ]
        patterns_to_check.extend(standard_patterns)

        # Remove duplicates while preserving order
        seen = set()
        unique_patterns = []
        for p in patterns_to_check:
            if p[1] not in seen:
                unique_patterns.append(p)
                seen.add(p[1])
        patterns_to_check = unique_patterns

        # If we have very few patterns and it's a known code site, add more generic patterns
        if len(patterns_to_check) < 5 and is_known_code_site:
            safe_logfire_info(
                f"Known code site with few patterns ({len(patterns_to_check)}), adding more generic patterns"
            )
            patterns_to_check = patterns  # Use all patterns for known code sites

        safe_logfire_info(
            f"Checking {len(patterns_to_check)} relevant patterns out of {len(patterns)} total"
        )

        for pattern_tuple in patterns_to_check:
            pattern_str, source_type = pattern_tuple

            # PERFORMANCE: Only use DOTALL for patterns that really need it (multi-line blocks)
            flags = re.IGNORECASE
            if "monaco" in source_type or "codemirror" in source_type:
                flags |= re.DOTALL  # These need DOTALL for multi-line matching

            matches = list(re.finditer(pattern_str, content, flags))

            # Log pattern matches for Milkdown patterns and CodeMirror
            if matches and (
                "milkdown" in source_type
                or "codemirror" in source_type
                or "milkdown" in content[:1000].lower()
            ):
                safe_logfire_info(f"Pattern {source_type} found {len(matches)} matches")

            for match in matches:
                # Extract code content based on capture-group layout: two-group
                # patterns capture the language in group 1 and the code in
                # group 2; single-group patterns capture only the code
                if match.lastindex and match.lastindex >= 2:
                    language = match.group(1) or ""
                    code_content = match.group(2).strip()
                else:
                    code_content = match.group(1).strip()
                    # Try to extract language from the full match
                    full_match = match.group(0)
                    lang_match = re.search(r'class=["\'].*?language-(\w+)', full_match)
                    language = lang_match.group(1) if lang_match else ""

                # Get the start position for complete block extraction
                code_start_pos = match.start()

                # For CodeMirror, extract text from cm-lines
                if source_type == "codemirror":
                    # Extract text from each cm-line div
                    cm_lines = re.findall(
                        r'<div[^>]*class=["\'][^"\']*cm-line[^"\']*["\'][^>]*>(.*?)</div>',
                        code_content,
                        re.DOTALL,
                    )
                    if cm_lines:
                        # Clean each line and join
                        cleaned_lines = []
                        for line in cm_lines:
                            # Remove span tags but keep content
                            line = re.sub(r"<span[^>]*>", "", line)
                            line = re.sub(r"</span>", "", line)
                            # Remove other HTML tags
                            line = re.sub(r"<[^>]+>", "", line)
                            cleaned_lines.append(line)
                        code_content = "\n".join(cleaned_lines)
                    else:
                        # Fallback: just clean HTML
                        code_content = re.sub(r"<span[^>]*>", "", code_content)
                        code_content = re.sub(r"</span>", "", code_content)
                        code_content = re.sub(r"<[^>]+>", "\n", code_content)

                # For Monaco, extract text from nested divs
                if source_type == "monaco":
                    # Extract actual code from Monaco's complex structure
                    code_content = re.sub(r"<div[^>]*>", "\n", code_content)
                    code_content = re.sub(r"</div>", "", code_content)
                    code_content = re.sub(r"<span[^>]*>", "", code_content)
                    code_content = re.sub(r"</span>", "", code_content)

                # Calculate dynamic minimum length
                context_for_length = content[max(0, code_start_pos - 500) : code_start_pos + 500]
                min_length = await self._calculate_min_length(language, context_for_length)

                # Skip if initial content is too short
                if len(code_content) < min_length:
                    # Try to find complete block if we have a language
                    if language and code_start_pos > 0:
                        # Look for complete code block
                        complete_code, block_end_pos = await self._find_complete_code_block(
                            content, code_start_pos, min_length, language
                        )
                        if len(complete_code) >= min_length:
                            code_content = complete_code
                            end_pos = block_end_pos
                        else:
                            continue
                    else:
                        continue

                # Extract position info for deduplication
                start_pos = match.start()
                end_pos = (
                    match.end()
                    if len(code_content) <= len(match.group(0))
                    else code_start_pos + len(code_content)
                )

                # Check if we've already extracted code from this position
                position_key = (start_pos, end_pos)
                overlapping = False
                for existing_start, existing_end in extracted_positions:
                    # Check if this match overlaps with an existing extraction
                    if not (end_pos <= existing_start or start_pos >= existing_end):
                        overlapping = True
                        break

                if not overlapping:
                    extracted_positions.add(position_key)

                    # Extract context
                    context_before = content[max(0, start_pos - 1000) : start_pos].strip()
                    context_after = content[end_pos : min(len(content), end_pos + 1000)].strip()

                    # Clean the code content
                    cleaned_code = self._clean_code_content(code_content, language)

                    # Validate code quality
                    if await self._validate_code_quality(cleaned_code, language):
                        # Log successful extraction
                        safe_logfire_info(
                            f"Extracted code block | source_type={source_type} | language={language} | min_length={min_length} | original_length={len(code_content)} | cleaned_length={len(cleaned_code)}"
                        )

                        code_blocks.append({
                            "code": cleaned_code,
                            "language": language,
                            "context_before": context_before,
                            "context_after": context_after,
                            "full_context": f"{context_before}\n\n{cleaned_code}\n\n{context_after}",
                            "source_type": source_type,  # Track which pattern matched
                        })
                    else:
                        safe_logfire_info(
                            f"Code block failed validation | source_type={source_type} | language={language} | length={len(cleaned_code)}"
                        )

        # Pattern 2: <code>...</code> (standalone)
        if not code_blocks:  # Only if we didn't find pre/code blocks
            code_pattern = r"<code[^>]*>(.*?)</code>"
            matches = re.finditer(code_pattern, content, re.DOTALL | re.IGNORECASE)

            for match in matches:
                code_content = match.group(1).strip()
                # Clean the code content
                cleaned_code = self._clean_code_content(code_content, "")

                # Check if it's multiline or substantial enough and validate quality
                # Use a minimal length for standalone code tags
                if len(cleaned_code) >= 100 and ("\n" in cleaned_code or len(cleaned_code) > 100):
                    if await self._validate_code_quality(cleaned_code, ""):
                        start_pos = match.start()
                        end_pos = match.end()
                        context_before = content[max(0, start_pos - 1000) : start_pos].strip()
                        context_after = content[end_pos : min(len(content), end_pos + 1000)].strip()

                        code_blocks.append({
                            "code": cleaned_code,
                            "language": "",
                            "context_before": context_before,
                            "context_after": context_after,
                            "full_context": f"{context_before}\n\n{cleaned_code}\n\n{context_after}",
                        })
                    else:
                        safe_logfire_info(
                            f"Standalone code block failed validation | length={len(cleaned_code)}"
                        )

        return code_blocks
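
    # Illustrative input for the pattern table above (HTML is assumed): the
    # "prism" pattern captures the language in group 1 and the code in group 2,
    # so <pre class="language-python"><code>print("hi")</code></pre> yields
    # language="python" and code_content='print("hi")' before cleaning and
    # length checks are applied.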

    async def _extract_text_file_code_blocks(
        self, content: str, url: str, min_length: int | None = None
    ) -> list[dict[str, Any]]:
        """
        Extract code blocks from plain text files (like .txt files).
        Handles formats like llms.txt where code blocks may be indicated by:
        - Triple backticks (```)
        - Language indicators (e.g., "typescript", "python")
        - Indentation patterns
        - Code block separators

        Args:
            content: The plain text content
            url: The URL of the text file for context
            min_length: Minimum length for code blocks (None = use settings)

        Returns:
            List of code blocks with metadata
        """
        safe_logfire_info(
            f"🔍 TEXT FILE EXTRACTION START | url={url} | content_length={len(content)}"
        )
        safe_logfire_info(f"📄 First 1000 chars: {repr(content[:1000])}...")
        safe_logfire_info(
            f"📄 Sample showing backticks: {repr(content[5000:6000])}..."
            if len(content) > 6000
            else "Content too short for mid-sample"
        )

        code_blocks = []

        # Method 1: Look for triple backtick code blocks (Markdown style)
        # Pattern allows for additional text after language (e.g., "typescript TypeScript")
        backtick_pattern = r"```(\w*)[^\n]*\n(.*?)```"
        matches = list(re.finditer(backtick_pattern, content, re.DOTALL | re.MULTILINE))
        safe_logfire_info(f"📊 Backtick pattern matches: {len(matches)}")

        for i, match in enumerate(matches):
            language = match.group(1) or ""
            code_content = match.group(2).strip()

            # Log match info without including the actual content that might break formatting
            safe_logfire_info(
                f"🔎 Match {i + 1}: language='{language}', raw_length={len(code_content)}"
            )

            # Get position info first
            start_pos = match.start()
            end_pos = match.end()

            # Calculate dynamic minimum length
            context_around = content[max(0, start_pos - 500) : min(len(content), end_pos + 500)]
            if min_length is None:
                actual_min_length = await self._calculate_min_length(language, context_around)
            else:
                actual_min_length = min_length

            if len(code_content) >= actual_min_length:
                # Get context
                context_before = content[max(0, start_pos - 500) : start_pos].strip()
                context_after = content[end_pos : min(len(content), end_pos + 500)].strip()

                # Clean and validate
                cleaned_code = self._clean_code_content(code_content, language)
                safe_logfire_info(f"🧹 After cleaning: length={len(cleaned_code)}")

                if await self._validate_code_quality(cleaned_code, language):
                    safe_logfire_info(
                        f"✅ VALID backtick code block | language={language} | length={len(cleaned_code)}"
                    )
                    code_blocks.append({
                        "code": cleaned_code,
                        "language": language,
                        "context_before": context_before,
                        "context_after": context_after,
                        "full_context": f"{context_before}\n\n{cleaned_code}\n\n{context_after}",
                        "source_type": "text_backticks",
                    })
                else:
                    safe_logfire_info(
                        f"❌ INVALID code block failed validation | language={language}"
                    )
            else:
                safe_logfire_info(
                    f"❌ Code block too short: {len(code_content)} < {actual_min_length}"
                )

        # Method 2: Look for language-labeled code blocks (e.g., "TypeScript:" or "Python example:")
        language_pattern = r"(?:^|\n)((?:typescript|javascript|python|java|c\+\+|rust|go|ruby|php|swift|kotlin|scala|r|matlab|julia|dart|elixir|erlang|haskell|clojure|lua|perl|shell|bash|sql|html|css|xml|json|yaml|toml|ini|dockerfile|makefile|cmake|gradle|maven|npm|yarn|pip|cargo|gem|pod|composer|nuget|apt|yum|brew|choco|snap|flatpak|appimage|msi|exe|dmg|pkg|deb|rpm|tar|zip|7z|rar|gz|bz2|xz|zst|lz4|lzo|lzma|lzip|lzop|compress|uncompress|gzip|gunzip|bzip2|bunzip2|xz|unxz|zstd|unzstd|lz4|unlz4|lzo|unlzo|lzma|unlzma|lzip|lunzip|lzop|unlzop)\s*(?:code|example|snippet)?)[:\s]*\n((?:(?:^[ \t]+.*\n?)+)|(?:.*\n)+?)(?=\n(?:[A-Z][a-z]+\s*:|^\s*$|\n#|\n\*|\n-|\n\d+\.))"
        matches = re.finditer(language_pattern, content, re.IGNORECASE | re.MULTILINE)

        for match in matches:
            language_info = match.group(1).lower()
            # Extract just the language name
            language = (
                re.match(r"(\w+)", language_info).group(1)
                if re.match(r"(\w+)", language_info)
                else ""
            )
            code_content = match.group(2).strip()

            # Calculate dynamic minimum length for language-labeled blocks
            if min_length is None:
                actual_min_length_lang = await self._calculate_min_length(
                    language, code_content[:500]
                )
            else:
                actual_min_length_lang = min_length

            if len(code_content) >= actual_min_length_lang:
                # Get context
                start_pos = match.start()
                end_pos = match.end()
                context_before = content[max(0, start_pos - 500) : start_pos].strip()
                context_after = content[end_pos : min(len(content), end_pos + 500)].strip()

                # Clean and validate
                cleaned_code = self._clean_code_content(code_content, language)
                if await self._validate_code_quality(cleaned_code, language):
                    safe_logfire_info(
                        f"Found language-labeled code block | language={language} | length={len(cleaned_code)}"
                    )
                    code_blocks.append({
                        "code": cleaned_code,
                        "language": language,
                        "context_before": context_before,
                        "context_after": context_after,
                        "full_context": f"{context_before}\n\n{cleaned_code}\n\n{context_after}",
                        "source_type": "text_language_label",
                    })

        # Method 3: Look for consistently indented blocks (at least 4 spaces or 1 tab)
        # This is more heuristic and should be used carefully
        if len(code_blocks) == 0:  # Only if we haven't found code blocks yet
            # Resolve the minimum length up front: min_length may be None here,
            # and comparing a length against None would raise a TypeError below
            indented_min_length = (
                min_length if min_length is not None else await self._get_min_code_length()
            )

            # Split content into potential code sections
            lines = content.split("\n")
            current_block = []
            current_indent = None
            block_start_idx = 0

            for i, line in enumerate(lines):
                # Check if line is indented
                stripped = line.lstrip()
                indent = len(line) - len(stripped)

                if indent >= 4 and stripped:  # At least 4 spaces and not empty
                    if current_indent is None:
                        current_indent = indent
                        block_start_idx = i
                    current_block.append(line)
                elif current_block and len("\n".join(current_block)) >= indented_min_length:
                    # End of indented block, check if it's code
                    code_content = "\n".join(current_block)

                    # Try to detect language from content
                    language = self._detect_language_from_content(code_content)

                    # Get context
                    context_before_lines = lines[max(0, block_start_idx - 10) : block_start_idx]
                    context_after_lines = lines[i : min(len(lines), i + 10)]
                    context_before = "\n".join(context_before_lines).strip()
                    context_after = "\n".join(context_after_lines).strip()

                    # Clean and validate
                    cleaned_code = self._clean_code_content(code_content, language)
                    if await self._validate_code_quality(cleaned_code, language):
                        safe_logfire_info(
                            f"Found indented code block | language={language} | length={len(cleaned_code)}"
                        )
                        code_blocks.append({
                            "code": cleaned_code,
                            "language": language,
                            "context_before": context_before,
                            "context_after": context_after,
                            "full_context": f"{context_before}\n\n{cleaned_code}\n\n{context_after}",
                            "source_type": "text_indented",
                        })

                    # Reset for next block
                    current_block = []
                    current_indent = None
                else:
                    # Reset if not indented
                    if current_block and not stripped:
                        # Allow empty lines within code blocks
                        current_block.append(line)
                    else:
                        current_block = []
                        current_indent = None

        safe_logfire_info(
            f"📊 TEXT FILE EXTRACTION COMPLETE | total_blocks={len(code_blocks)} | url={url}"
        )
        for i, block in enumerate(code_blocks[:3]):  # Log first 3 blocks
            safe_logfire_info(
                f"📦 Block {i + 1} summary: language='{block.get('language', '')}', source_type='{block.get('source_type', '')}', length={len(block.get('code', ''))}"
            )
        return code_blocks

    def _detect_language_from_content(self, code: str) -> str:
        """
        Try to detect programming language from code content.
        This is a simple heuristic approach.
        """
        # Language detection patterns
        patterns = {
            "python": [
                r"\bdef\s+\w+\s*\(",
                r"\bclass\s+\w+",
                r"\bimport\s+\w+",
                r"\bfrom\s+\w+\s+import",
            ],
            "javascript": [
                r"\bfunction\s+\w+\s*\(",
                r"\bconst\s+\w+\s*=",
                r"\blet\s+\w+\s*=",
                r"\bvar\s+\w+\s*=",
            ],
            "typescript": [
                r"\binterface\s+\w+",
                r":\s*\w+\[\]",
                r"\btype\s+\w+\s*=",
                r"\bclass\s+\w+.*\{",
            ],
            "java": [
                r"\bpublic\s+class\s+\w+",
                r"\bprivate\s+\w+\s+\w+",
                r"\bpublic\s+static\s+void\s+main",
            ],
            "rust": [r"\bfn\s+\w+\s*\(", r"\blet\s+mut\s+\w+", r"\bimpl\s+\w+", r"\bstruct\s+\w+"],
            "go": [r"\bfunc\s+\w+\s*\(", r"\bpackage\s+\w+", r"\btype\s+\w+\s+struct"],
        }

        # Count matches for each language
        scores = {}
        for lang, lang_patterns in patterns.items():
            score = 0
            for pattern in lang_patterns:
                if re.search(pattern, code, re.MULTILINE):
                    score += 1
            if score > 0:
                scores[lang] = score

        # Return language with highest score
        if scores:
            return max(scores, key=scores.get)

        return ""
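
    # Example (illustrative): scoring picks the language with the most pattern
    # hits, so content containing both "def f():" and "import os" scores
    # python=2 and returns "python", while content matching nothing returns "".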

    async def _find_complete_code_block(
        self,
        content: str,
        start_pos: int,
        min_length: int = 250,
        language: str = "",
        max_length: int | None = None,
    ) -> tuple[str, int]:
        """
        Find a complete code block starting from a position, extending until we find a natural boundary.

        Args:
            content: The full content to search in
            start_pos: Starting position in the content
            min_length: Minimum length for the code block
            language: Detected language for language-specific patterns
            max_length: Maximum length cap (None = use settings)

        Returns:
            Tuple of (complete_code_block, end_position)
        """
        # Start with the minimum content
        if start_pos + min_length > len(content):
            return content[start_pos:], len(content)

        # Look for natural code boundaries
        boundary_patterns = [
            r"\n}\s*$",  # Closing brace at end of line
            r"\n}\s*;?\s*$",  # Closing brace with optional semicolon
            r"\n\)\s*;?\s*$",  # Closing parenthesis
            r"\n\s*$\n\s*$",  # Double newline (paragraph break)
            r"\n(?=class\s)",  # Before next class
            r"\n(?=function\s)",  # Before next function
            r"\n(?=def\s)",  # Before next Python function
            r"\n(?=export\s)",  # Before next export
            r"\n(?=const\s)",  # Before next const declaration
            r"\n(?=//)",  # Before comment block
            r"\n(?=#)",  # Before Python comment
            r"\n(?=\*)",  # Before JSDoc/comment
            r"\n(?=```)",  # Before next code block
        ]

        # Add language-specific patterns if available
        if language and language.lower() in self.LANGUAGE_PATTERNS:
            lang_patterns = self.LANGUAGE_PATTERNS[language.lower()]
            if "block_end" in lang_patterns:
                boundary_patterns.insert(0, lang_patterns["block_end"])

        # Extend until we find a boundary
        extended_pos = start_pos + min_length
        while extended_pos < len(content):
            # Check next 500 characters for a boundary
            lookahead_end = min(extended_pos + 500, len(content))
            lookahead = content[extended_pos:lookahead_end]

            for pattern in boundary_patterns:
                match = re.search(pattern, lookahead, re.MULTILINE)
                if match:
                    final_pos = extended_pos + match.end()
                    return content[start_pos:final_pos].rstrip(), final_pos

            # If no boundary found, extend by another chunk
            extended_pos += 100

            # Cap at maximum length
            if max_length is None:
                max_length = await self._get_max_code_length()
            if extended_pos - start_pos > max_length:
                break

        # Return what we have
        return content[start_pos:extended_pos].rstrip(), extended_pos
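
    # Sketch of the boundary scan above (positions assumed): with
    # min_length=250, scanning starts at start_pos + 250, inspects a 500-char
    # lookahead window for a boundary such as a closing brace on its own line,
    # and otherwise advances 100 chars at a time until max_length is reached.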

    async def _calculate_min_length(self, language: str, context: str) -> int:
        """
        Calculate appropriate minimum length based on language and context.

        Args:
            language: The detected programming language
            context: Surrounding context of the code

        Returns:
            Calculated minimum length
        """
        # Check if contextual length adjustment is enabled
        if not await self._is_contextual_length_enabled():
            # Return default minimum length
            return await self._get_min_code_length()

        # Base lengths by language
        base_lengths = {
            "json": 100,  # JSON can be short
            "yaml": 100,  # YAML too
            "xml": 100,  # XML structures
            "html": 150,  # HTML snippets
            "css": 150,  # CSS rules
            "sql": 150,  # SQL queries
            "python": 200,  # Python functions
            "javascript": 250,  # JavaScript typically longer
            "typescript": 250,  # TypeScript typically longer
            "java": 300,  # Java even more verbose
            "c++": 300,  # C++ similar to Java
            "cpp": 300,  # C++ alternative
            "c": 250,  # C slightly less verbose
            "rust": 250,  # Rust medium verbosity
            "go": 200,  # Go is concise
        }

        # Get default minimum from settings
        default_min = await self._get_min_code_length()
        min_length = base_lengths.get(language.lower(), default_min)

        # Adjust based on context clues
        context_lower = context.lower()
        if any(word in context_lower for word in ["example", "snippet", "sample", "demo"]):
            min_length = int(min_length * 0.7)  # Examples can be shorter
        elif any(word in context_lower for word in ["implementation", "complete", "full"]):
            min_length = int(min_length * 1.5)  # Full implementations should be longer
        elif any(word in context_lower for word in ["minimal", "simple", "basic"]):
            min_length = int(min_length * 0.8)  # Simple examples can be shorter

        # Ensure reasonable bounds
        return max(100, min(1000, min_length))
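
    # Worked example (illustrative): for language="python" the base length is
    # 200; if the surrounding context mentions "example", the requirement drops
    # to int(200 * 0.7) = 140, which falls inside the final [100, 1000] clamp.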

    def _decode_html_entities(self, text: str) -> str:
        """Decode common HTML entities and clean HTML tags from code."""
        # First, handle span tags that wrap individual tokens
        # Check if spans are being used for syntax highlighting (no spaces between tags)
        if "</span><span" in text:
            # This indicates syntax highlighting - preserve the structure
            text = re.sub(r"</span>", "", text)
            text = re.sub(r"<span[^>]*>", "", text)
        else:
            # Normal span usage - might need spacing
            # Only add space if there isn't already whitespace
            text = re.sub(r"</span>(?=[A-Za-z0-9])", " ", text)
            text = re.sub(r"<span[^>]*>", "", text)

        # Remove any other HTML tags but preserve their content
        text = re.sub(r"</?[^>]+>", "", text)

        # Decode HTML entities
        replacements = {
            "&lt;": "<",
            "&gt;": ">",
            "&amp;": "&",
            "&quot;": '"',
            "&#39;": "'",
            "&nbsp;": " ",
            "&#x27;": "'",
            "&#x2F;": "/",
            "&#60;": "<",
            "&#62;": ">",
        }

        for entity, char in replacements.items():
            text = text.replace(entity, char)

        # Replace escaped newlines with actual newlines
        text = text.replace("\\n", "\n")

        # Clean up excessive whitespace while preserving intentional spacing
        # Replace multiple spaces with single space, but preserve newlines
        lines = text.split("\n")
        cleaned_lines = []
        for line in lines:
            # Replace multiple spaces with single space
            line = re.sub(r" +", " ", line)
            # Trim trailing spaces but preserve leading spaces (indentation)
            line = line.rstrip()
            cleaned_lines.append(line)

        text = "\n".join(cleaned_lines)

        return text
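
    # Before/after example (input assumed): highlighted markup such as
    # <span class="kw">def</span><span> f():</span> contains "</span><span",
    # so all span tags are stripped, leaving 'def f():'; entities like "&amp;"
    # decode to "&" and escaped "\n" sequences become real newlines.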

    def _clean_code_content(self, code: str, language: str = "") -> str:
        """
        Clean and fix common issues in extracted code content.

        Args:
            code: The code content to clean
            language: The detected language (optional)

        Returns:
            Cleaned code content
        """
        # First apply HTML entity decoding and tag cleaning
        code = self._decode_html_entities(code)

        # Fix common concatenation issues from span removal
        # Common patterns where spaces are missing between keywords
        spacing_fixes = [
            # Import statements
            (r"(\b(?:from|import|as)\b)([A-Za-z])", r"\1 \2"),
            # Function/class definitions
            (r"(\b(?:def|class|async|await|return|raise|yield)\b)([A-Za-z])", r"\1 \2"),
            # Control flow
            (r"(\b(?:if|elif|else|for|while|try|except|finally|with)\b)([A-Za-z])", r"\1 \2"),
            # Type hints and declarations
            (
                r"(\b(?:int|str|float|bool|list|dict|tuple|set|None|True|False)\b)([A-Za-z])",
                r"\1 \2",
            ),
            # Common Python keywords
            (r"(\b(?:and|or|not|in|is|lambda)\b)([A-Za-z])", r"\1 \2"),
            # Fix missing spaces around operators (but be careful with negative numbers)
            (r"([A-Za-z_)])(\+|-|\*|/|=|<|>|%)", r"\1 \2"),
            (r"(\+|-|\*|/|=|<|>|%)([A-Za-z_(])", r"\1 \2"),
        ]

        for pattern, replacement in spacing_fixes:
            code = re.sub(pattern, replacement, code)

        # Fix specific patterns for different languages
        if language.lower() in ["python", "py"]:
            # Fix Python-specific issues
            code = re.sub(r"(\b(?:from|import)\b)(\w+)(\b(?:import)\b)", r"\1 \2 \3", code)
            # Fix missing colons
            code = re.sub(
                r"(\b(?:def|class|if|elif|else|for|while|try|except|finally|with)\b[^:]+)$",
                r"\1:",
                code,
                flags=re.MULTILINE,
            )

        # Remove backticks that might have been included
        if code.startswith("```") and code.endswith("```"):
            lines = code.split("\n")
            if len(lines) > 2:
                # Remove first and last line
                code = "\n".join(lines[1:-1])
        elif code.startswith("`") and code.endswith("`"):
            code = code[1:-1]

        # Final cleanup
        # Remove any remaining excessive spaces while preserving indentation
        lines = code.split("\n")
        cleaned_lines = []
        for line in lines:
            # Don't touch leading whitespace (indentation)
            stripped = line.lstrip()
            indent = line[: len(line) - len(stripped)]
            # Clean the rest of the line
            cleaned = re.sub(r" {2,}", " ", stripped)
            cleaned_lines.append(indent + cleaned)

        return "\n".join(cleaned_lines).strip()

    async def _validate_code_quality(self, code: str, language: str = "") -> bool:
        """
        Enhanced validation to ensure extracted content is actual code.

        Args:
            code: The code content to validate
            language: The detected language (optional)

        Returns:
            True if code passes quality checks, False otherwise
        """
        # Basic checks
        if not code or len(code.strip()) < 20:
            return False

        # Skip diagram languages if filtering is enabled
        if await self._is_diagram_filtering_enabled():
            if language.lower() in ["mermaid", "plantuml", "graphviz", "dot", "diagram"]:
                safe_logfire_info(f"Skipping diagram language: {language}")
                return False

        # Check for common formatting issues that indicate poor extraction
        bad_patterns = [
            # Concatenated keywords without spaces (but allow camelCase)
            r"\b(from|import|def|class|if|for|while|return)(?=[a-z])",
            # HTML entities that weren't decoded
            r"&[lg]t;|&amp;|&quot;|&#\d+;",
            # Excessive HTML tags
            r"<[^>]{50,}>",  # Very long HTML tags
            # Multiple spans in a row (indicates poor extraction)
            r"(<span[^>]*>){5,}",
            # Suspicious character sequences
            r"[^\s]{200,}",  # Very long unbroken strings (increased threshold)
        ]

        for pattern in bad_patterns:
            if re.search(pattern, code):
                safe_logfire_info(f"Code failed quality check: pattern '{pattern}' found")
                return False

        # Check for minimum code complexity using various indicators
        code_indicators = {
            "function_calls": r"\w+\s*\([^)]*\)",
            "assignments": r"\w+\s*=\s*.+",
            "control_flow": r"\b(if|for|while|switch|case|try|catch|except)\b",
            "declarations": r"\b(var|let|const|def|class|function|interface|type|struct|enum)\b",
            "imports": r"\b(import|from|require|include|using|use)\b",
            "brackets": r"[\{\}\[\]]",
            "operators": r"[\+\-\*\/\%\&\|\^<>=!]",
            "method_chains": r"\.\w+",
            "arrows": r"(=>|->)",
            "keywords": r"\b(return|break|continue|yield|await|async)\b",
        }

        indicator_count = 0
        indicator_details = []
        for name, pattern in code_indicators.items():
            if re.search(pattern, code):
                indicator_count += 1
                indicator_details.append(name)

        # Require minimum code indicators
        min_indicators = await self._get_min_code_indicators()
        if indicator_count < min_indicators:
            safe_logfire_info(
                f"Code has insufficient indicators: {indicator_count} found ({', '.join(indicator_details)})"
            )
            return False

        # Check code-to-comment ratio
        lines = code.split("\n")
        non_empty_lines = [line for line in lines if line.strip()]

        if not non_empty_lines:
            return False

        # Count comment lines (various comment styles)
        comment_patterns = [
            r"^\s*(//|#|/\*|\*|<!--)",  # Single line comments
            r'^\s*"""',  # Python docstrings
            r"^\s*'''",  # Python docstrings alt
            r"^\s*\*\s",  # JSDoc style
        ]

        comment_lines = 0
        for line in lines:
            for pattern in comment_patterns:
                if re.match(pattern, line.strip()):
                    comment_lines += 1
                    break

        # Allow up to 70% comments (documentation is important)
        if non_empty_lines and comment_lines / len(non_empty_lines) > 0.7:
            safe_logfire_info(
                f"Code is mostly comments: {comment_lines}/{len(non_empty_lines)} lines"
            )
            return False

        # Language-specific validation
        if language.lower() in self.LANGUAGE_PATTERNS:
            lang_info = self.LANGUAGE_PATTERNS[language.lower()]
            min_indicators = lang_info.get("min_indicators", [])

            # Check for language-specific indicators
            found_lang_indicators = sum(
                1 for indicator in min_indicators if indicator in code.lower()
            )

            if found_lang_indicators < 2:  # Need at least 2 language-specific indicators
                safe_logfire_info(
                    f"Code lacks {language} indicators: only {found_lang_indicators} found"
                )
                return False

        # Check for reasonable structure
        # Too few meaningful lines
        if len(non_empty_lines) < 3:
            safe_logfire_info(f"Code has too few non-empty lines: {len(non_empty_lines)}")
            return False

        # Check for reasonable line lengths
        very_long_lines = sum(1 for line in lines if len(line) > 300)
        if len(lines) > 0 and very_long_lines > len(lines) * 0.5:
            safe_logfire_info("Code has too many very long lines")
            return False

        # Check if it's mostly prose/documentation
        prose_indicators = [
            r"\b(the|this|that|these|those|is|are|was|were|will|would|should|could|have|has|had)\b",
            r"[.!?]\s+[A-Z]",  # Sentence endings followed by capital letter
            r"\b(however|therefore|furthermore|moreover|nevertheless)\b",
        ]

        prose_score = 0
        word_count = len(code.split())
        for pattern in prose_indicators:
            matches = len(re.findall(pattern, code, re.IGNORECASE))
            prose_score += matches

        # Check prose filtering
        if await self._is_prose_filtering_enabled():
            max_prose_ratio = await self._get_max_prose_ratio()
            if word_count > 0 and prose_score / word_count > max_prose_ratio:
                safe_logfire_info(
                    f"Code appears to be prose: prose_score={prose_score}, word_count={word_count}"
                )
                return False

        # Passed all checks
        safe_logfire_info(
            f"Code passed validation: indicators={indicator_count}, language={language}, lines={len(non_empty_lines)}"
        )
        return True
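
    # Worked example of the prose check (illustrative numbers): with the
    # default MAX_PROSE_RATIO of 0.15, a 100-word block whose prose patterns
    # match 20 times has a ratio of 0.2 and is rejected as documentation
    # rather than code.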

    async def _generate_code_summaries(
        self,
        all_code_blocks: list[dict[str, Any]],
        progress_callback: Callable | None = None,
        start_progress: int = 0,
        end_progress: int = 100,
    ) -> list[dict[str, str]]:
        """
        Generate summaries for all code blocks.

        Returns:
            List of summary results
        """
        # Check if code summaries are enabled
        if not await self._is_code_summaries_enabled():
            safe_logfire_info("Code summaries generation is disabled, returning default summaries")
            # Return default summaries for all code blocks
            default_summaries = []
            for item in all_code_blocks:
                block = item["block"]
                language = block.get("language", "")
                default_summaries.append({
                    "example_name": f"Code Example{f' ({language})' if language else ''}",
                    "summary": "Code example for demonstration purposes.",
                })

            # Report progress for skipped summaries
            if progress_callback:
                await progress_callback({
                    "status": "code_extraction",
                    "percentage": end_progress,
                    "log": f"Skipped AI summary generation (disabled). Using default summaries for {len(all_code_blocks)} code blocks.",
                })

            return default_summaries

        # Progress is handled by generate_code_summaries_batch

        # Use default max workers
        max_workers = 3

        # Extract just the code blocks for batch processing
        code_blocks_for_summaries = [item["block"] for item in all_code_blocks]

        # Generate summaries with mapped progress tracking
        summary_progress_callback = None
        if progress_callback:
            # Create a wrapper that maps the progress to the correct range
            async def mapped_callback(data: dict):
                # Map the percentage from generate_code_summaries_batch (0-100) to our range
                if "percentage" in data:
                    raw_percentage = data["percentage"]
                    # Map from 0-100 to start_progress-end_progress
                    mapped_percentage = start_progress + int(
                        (raw_percentage / 100) * (end_progress - start_progress)
                    )
                    data["percentage"] = mapped_percentage
                # Change the status to match what the orchestration expects
                data["status"] = "code_extraction"
                await progress_callback(data)

            summary_progress_callback = mapped_callback

        return await generate_code_summaries_batch(
            code_blocks_for_summaries, max_workers, progress_callback=summary_progress_callback
        )

    def _prepare_code_examples_for_storage(
        self, all_code_blocks: list[dict[str, Any]], summary_results: list[dict[str, str]]
    ) -> dict[str, list[Any]]:
        """
        Prepare code examples for storage by organizing data into arrays.

        Returns:
            Dictionary with arrays for storage
        """
        code_urls = []
        code_chunk_numbers = []
        code_examples = []
        code_summaries = []
        code_metadatas = []

        for code_item, summary_result in zip(all_code_blocks, summary_results, strict=False):
            block = code_item["block"]
            source_url = code_item["source_url"]
            source_id = code_item["source_id"]

            summary = summary_result.get("summary", "Code example for demonstration purposes.")
            example_name = summary_result.get("example_name", "Code Example")

            code_urls.append(source_url)
            code_chunk_numbers.append(len(code_examples))
            code_examples.append(block["code"])
            code_summaries.append(summary)

            code_meta = {
                "chunk_index": len(code_examples) - 1,
                "url": source_url,
                "source": source_id,
                "source_id": source_id,
                "language": block.get("language", ""),
                "char_count": len(block["code"]),
                "word_count": len(block["code"].split()),
                "example_name": example_name,
                "title": example_name,
            }
            code_metadatas.append(code_meta)

        return {
            "urls": code_urls,
            "chunk_numbers": code_chunk_numbers,
            "examples": code_examples,
            "summaries": code_summaries,
            "metadatas": code_metadatas,
        }

    async def _store_code_examples(
        self,
        storage_data: dict[str, list[Any]],
        url_to_full_document: dict[str, str],
        progress_callback: Callable | None = None,
        start_progress: int = 0,
        end_progress: int = 100,
    ) -> int:
        """
        Store code examples in the database.

        Returns:
            Number of code examples stored
        """
        # Create mapped progress callback for storage phase
        storage_progress_callback = None
        if progress_callback:

            async def mapped_storage_callback(data: dict):
                # Extract values from the dictionary
                message = data.get("log", "")
                percentage = data.get("percentage", 0)

                # Map storage progress (0-100) to our range (start_progress to end_progress)
                mapped_percentage = start_progress + int(
                    (percentage / 100) * (end_progress - start_progress)
                )

                update_data = {
                    "status": "code_storage",
                    "percentage": mapped_percentage,
                    "log": message,
                }

                # Pass through any additional batch info
                if "batch_number" in data:
                    update_data["batch_number"] = data["batch_number"]
                if "total_batches" in data:
                    update_data["total_batches"] = data["total_batches"]

                await progress_callback(update_data)

            storage_progress_callback = mapped_storage_callback

        try:
            await add_code_examples_to_supabase(
                client=self.supabase_client,
                urls=storage_data["urls"],
                chunk_numbers=storage_data["chunk_numbers"],
                code_examples=storage_data["examples"],
                summaries=storage_data["summaries"],
                metadatas=storage_data["metadatas"],
                batch_size=20,
                url_to_full_document=url_to_full_document,
                progress_callback=storage_progress_callback,
                provider=None,  # Use configured provider
            )

            # Report final progress for code storage phase (not overall completion)
            if progress_callback:
                await progress_callback({
                    "status": "code_extraction",  # Keep status as code_extraction, not completed
                    "percentage": end_progress,
                    "log": f"Code extraction phase completed. Stored {len(storage_data['examples'])} code examples.",
                })

            safe_logfire_info(f"Successfully stored {len(storage_data['examples'])} code examples")
            return len(storage_data["examples"])

        except Exception as e:
            safe_logfire_error(f"Error storing code examples | error={str(e)}")
            return 0
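

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the module): how a caller might wire
# this service into a crawl pipeline. The client construction and the shape of
# crawl_results are assumptions based on the signatures above.
#
#   from supabase import create_client
#
#   client = create_client(SUPABASE_URL, SUPABASE_KEY)
#   service = CodeExtractionService(client)
#   stored = await service.extract_and_store_code_examples(
#       crawl_results=[{"url": "https://docs.example.com", "markdown": md}],
#       url_to_full_document={"https://docs.example.com": md},
#       source_id="docs.example.com",
#   )
# ---------------------------------------------------------------------------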