Fixes: crawl code storage issue with <think> tags for ollama models. (#775)

* Fixes: crawl code storage issue with <think> tags for ollama models.

* Updates from CodeRabbit review
sean-eskerium
2025-10-10 18:09:53 -04:00
committed by GitHub
parent 94e28f85fd
commit 7c3823e08f
2 changed files with 437 additions and 314 deletions

View File

@@ -782,6 +782,12 @@ CREATE POLICY "Allow public read access to archon_code_examples"
TO public
USING (true);

CREATE POLICY "Allow public read access to archon_page_metadata"
ON archon_page_metadata
FOR SELECT
TO public
USING (true);
-- =====================================================
-- SECTION 7: PROJECTS AND TASKS MODULE
-- =====================================================
@@ -954,6 +960,73 @@ COMMENT ON COLUMN archon_document_versions.change_type IS 'Type of change: creat
COMMENT ON COLUMN archon_document_versions.document_id IS 'For docs arrays, the specific document ID that was changed';
COMMENT ON COLUMN archon_document_versions.task_id IS 'DEPRECATED: No longer used for new versions, kept for historical task version data';
-- =====================================================
-- SECTION 6.5: PAGE METADATA FOR PAGE-BASED RAG
-- =====================================================
-- Create archon_page_metadata table
-- This table stores complete documentation pages alongside chunks for improved agent context retrieval
CREATE TABLE IF NOT EXISTS archon_page_metadata (
-- Primary identification
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
source_id TEXT NOT NULL,
url TEXT NOT NULL,
-- Content
full_content TEXT NOT NULL,
-- Section metadata (for llms-full.txt H1 sections)
section_title TEXT,
section_order INT DEFAULT 0,
-- Statistics
word_count INT NOT NULL,
char_count INT NOT NULL,
chunk_count INT NOT NULL DEFAULT 0,
-- Timestamps
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
-- Flexible metadata storage
metadata JSONB DEFAULT '{}'::jsonb,
-- Constraints
CONSTRAINT archon_page_metadata_url_unique UNIQUE(url),
CONSTRAINT archon_page_metadata_source_fk FOREIGN KEY (source_id)
REFERENCES archon_sources(source_id) ON DELETE CASCADE
);
-- Add page_id foreign key to archon_crawled_pages
-- This links chunks back to their parent page
-- NULLABLE because existing chunks won't have a page_id yet
ALTER TABLE archon_crawled_pages
ADD COLUMN IF NOT EXISTS page_id UUID REFERENCES archon_page_metadata(id) ON DELETE SET NULL;
-- Create indexes for query performance
CREATE INDEX IF NOT EXISTS idx_archon_page_metadata_source_id ON archon_page_metadata(source_id);
CREATE INDEX IF NOT EXISTS idx_archon_page_metadata_url ON archon_page_metadata(url);
CREATE INDEX IF NOT EXISTS idx_archon_page_metadata_section ON archon_page_metadata(source_id, section_title, section_order);
CREATE INDEX IF NOT EXISTS idx_archon_page_metadata_created_at ON archon_page_metadata(created_at);
CREATE INDEX IF NOT EXISTS idx_archon_page_metadata_metadata ON archon_page_metadata USING GIN(metadata);
CREATE INDEX IF NOT EXISTS idx_archon_crawled_pages_page_id ON archon_crawled_pages(page_id);
-- Add comments to document the table structure
COMMENT ON TABLE archon_page_metadata IS 'Stores complete documentation pages for agent retrieval';
COMMENT ON COLUMN archon_page_metadata.source_id IS 'References the source this page belongs to';
COMMENT ON COLUMN archon_page_metadata.url IS 'Unique URL of the page (synthetic for llms-full.txt sections with #anchor)';
COMMENT ON COLUMN archon_page_metadata.full_content IS 'Complete markdown/text content of the page';
COMMENT ON COLUMN archon_page_metadata.section_title IS 'H1 section title for llms-full.txt pages';
COMMENT ON COLUMN archon_page_metadata.section_order IS 'Order of section in llms-full.txt file (0-based)';
COMMENT ON COLUMN archon_page_metadata.word_count IS 'Number of words in full_content';
COMMENT ON COLUMN archon_page_metadata.char_count IS 'Number of characters in full_content';
COMMENT ON COLUMN archon_page_metadata.chunk_count IS 'Number of chunks created from this page';
COMMENT ON COLUMN archon_page_metadata.metadata IS 'Flexible JSON metadata (page_type, knowledge_type, tags, etc)';
COMMENT ON COLUMN archon_crawled_pages.page_id IS 'Foreign key linking chunk to parent page';
-- Enable RLS on archon_page_metadata
ALTER TABLE archon_page_metadata ENABLE ROW LEVEL SECURITY;
-- =====================================================
-- SECTION 7: MIGRATION TRACKING
-- =====================================================
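Editor's note: the new page_id column added above gives each chunk a pointer back to the full page it was cut from. Below is a minimal sketch of how a retrieval layer might follow that link, assuming a supabase-py client and the table/column names created in this migration; the helper name and the integer chunk id are illustrative and not part of this commit.

from supabase import create_client

supabase = create_client("https://YOUR-PROJECT.supabase.co", "YOUR-SERVICE-ROLE-KEY")

def get_parent_page(chunk_id: int) -> dict | None:
    """Follow archon_crawled_pages.page_id to the full page in archon_page_metadata."""
    chunk = (
        supabase.table("archon_crawled_pages")
        .select("id, url, page_id")
        .eq("id", chunk_id)
        .single()
        .execute()
    ).data
    if not chunk or not chunk.get("page_id"):
        return None  # chunks crawled before this migration have no page_id yet
    return (
        supabase.table("archon_page_metadata")
        .select("url, section_title, full_content, word_count")
        .eq("id", chunk["page_id"])
        .single()
        .execute()
    ).data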
@@ -991,7 +1064,9 @@ VALUES
    ('0.1.0', '006_ollama_create_indexes_optional'),
    ('0.1.0', '007_add_priority_column_to_tasks'),
    ('0.1.0', '008_add_migration_tracking'),
    ('0.1.0', '009_add_cascade_delete_constraints'),
    ('0.1.0', '009_add_provider_placeholders'),
    ('0.1.0', '010_add_page_metadata_table')
ON CONFLICT (version, migration_name) DO NOTHING;

-- Enable Row Level Security on migrations table

View File

@@ -82,6 +82,10 @@ def _is_reasoning_text_response(text: str) -> bool:
    text_lower = text.lower().strip()

    # Check for XML-style thinking tags (common in models with extended thinking)
    if text_lower.startswith("<think>") or "<think>" in text_lower[:100]:
        return True

    # Check if it's clearly not JSON (starts with reasoning text)
    starts_with_reasoning = any(text_lower.startswith(starter) for starter in REASONING_STARTERS)
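Editor's note: Ollama reasoning models often wrap their chain of thought in <think> tags before (or instead of) the JSON payload, which is what broke code-example storage. The check above now classifies such output as reasoning text. The snippet below is a rough sketch of the general idea only, not the repository's _extract_json_payload implementation; it assumes nothing beyond the standard library.

import json
import re

THINK_BLOCK = re.compile(r"<think>.*?</think>", re.DOTALL | re.IGNORECASE)

def parse_json_ignoring_think_tags(raw: str) -> dict | None:
    """Drop <think>...</think> blocks, then try to parse what remains as JSON."""
    cleaned = THINK_BLOCK.sub("", raw).strip()
    if cleaned.lower().startswith("<think>"):
        # Unterminated think block: keep whatever follows the opening tag
        cleaned = cleaned[len("<think>"):].strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        return None

# Example: '<think>reasoning...</think>{"example_name": "X", "summary": "Y"}' -> {'example_name': 'X', 'summary': 'Y'}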
@@ -592,10 +596,23 @@ def generate_code_example_summary(
async def _generate_code_example_summary_async(
    code: str,
    context_before: str,
    context_after: str,
    language: str = "",
    provider: str = None,
    client = None
) -> dict[str, str]:
    """
    Async version of generate_code_example_summary using unified LLM provider service.

    Args:
        code: The code example to summarize
        context_before: Context before the code block
        context_after: Context after the code block
        language: Programming language of the code
        provider: LLM provider to use (optional)
        client: Pre-initialized LLM client for reuse (optional, improves performance)
    """
    # Get model choice from credential service (RAG setting)
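Editor's note: the new optional client argument lets a caller open one LLM client and reuse it across many summaries instead of creating a connection per call. A hedged sketch of the intended call pattern, with illustrative argument values; get_llm_client is the factory already used elsewhere in this file.

async with get_llm_client(provider="ollama") as shared_client:
    summary = await _generate_code_example_summary_async(
        code="print('hello')",
        context_before="Setup section",
        context_after="Next we cover logging",
        language="python",
        provider="ollama",
        client=shared_client,  # reuse the same client for every code block
    )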
@@ -647,283 +664,312 @@ Format your response as JSON:
+ "\n\nSecond attempt enforcement: Return JSON only with the exact schema. No additional text or reasoning content." + "\n\nSecond attempt enforcement: Return JSON only with the exact schema. No additional text or reasoning content."
) )
# Use provided client or create a new one
if client is not None:
# Reuse provided client for better performance
return await _generate_summary_with_client(
client, code, context_before, context_after, language, provider,
model_choice, guard_prompt, strict_prompt
)
else:
# Create new client (backward compatibility)
async with get_llm_client(provider=provider) as new_client:
return await _generate_summary_with_client(
new_client, code, context_before, context_after, language, provider,
model_choice, guard_prompt, strict_prompt
)
async def _generate_summary_with_client(
llm_client, code: str, context_before: str, context_after: str,
language: str, provider: str, model_choice: str,
guard_prompt: str, strict_prompt: str
) -> dict[str, str]:
"""Helper function that generates summary using a provided client."""
search_logger.info(
f"Generating summary for {hash(code) & 0xffffff:06x} using model: {model_choice}"
)
provider_lower = provider.lower()
is_grok_model = (provider_lower == "grok") or ("grok" in model_choice.lower())
is_ollama = provider_lower == "ollama"
supports_response_format_base = (
provider_lower in {"openai", "google", "anthropic"}
or (provider_lower == "openrouter" and model_choice.startswith("openai/"))
)
last_response_obj = None
last_elapsed_time = None
last_response_content = ""
last_json_error: json.JSONDecodeError | None = None
try: try:
# Use unified LLM provider service for enforce_json, current_prompt in ((False, guard_prompt), (True, strict_prompt)):
async with get_llm_client(provider=provider) as client: request_params = {
search_logger.info( "model": model_choice,
f"Generating summary for {hash(code) & 0xffffff:06x} using model: {model_choice}" "messages": [
) {
"role": "system",
"content": "You are a helpful assistant that analyzes code examples and provides JSON responses with example names and summaries.",
},
{"role": "user", "content": current_prompt},
],
"max_tokens": 2000,
"temperature": 0.3,
}
provider_lower = provider.lower() should_use_response_format = False
is_grok_model = (provider_lower == "grok") or ("grok" in model_choice.lower()) if enforce_json:
if not is_grok_model and (supports_response_format_base or provider_lower == "openrouter"):
should_use_response_format = True
else:
if supports_response_format_base:
should_use_response_format = True
supports_response_format_base = ( if should_use_response_format:
provider_lower in {"openai", "google", "anthropic"} request_params["response_format"] = {"type": "json_object"}
or (provider_lower == "openrouter" and model_choice.startswith("openai/"))
)
last_response_obj = None # Ollama uses a different parameter format for JSON mode
last_elapsed_time = None if is_ollama and enforce_json:
last_response_content = "" # Remove response_format if it was set (shouldn't be for ollama)
last_json_error: json.JSONDecodeError | None = None request_params.pop("response_format", None)
# Ollama expects "format": "json" parameter
request_params["format"] = "json"
search_logger.debug("Using Ollama-specific JSON format parameter")
for enforce_json, current_prompt in ((False, guard_prompt), (True, strict_prompt)): if is_grok_model:
request_params = { unsupported_params = ["presence_penalty", "frequency_penalty", "stop", "reasoning_effort"]
"model": model_choice, for param in unsupported_params:
"messages": [ if param in request_params:
{ removed_value = request_params.pop(param)
"role": "system", search_logger.warning(f"Removed unsupported Grok parameter '{param}': {removed_value}")
"content": "You are a helpful assistant that analyzes code examples and provides JSON responses with example names and summaries.",
},
{"role": "user", "content": current_prompt},
],
"max_tokens": 2000,
"temperature": 0.3,
}
should_use_response_format = False supported_params = ["model", "messages", "max_tokens", "temperature", "response_format", "stream", "tools", "tool_choice"]
if enforce_json: for param in list(request_params.keys()):
if not is_grok_model and (supports_response_format_base or provider_lower == "openrouter"): if param not in supported_params:
should_use_response_format = True search_logger.warning(f"Parameter '{param}' may not be supported by Grok reasoning models")
else:
if supports_response_format_base:
should_use_response_format = True
if should_use_response_format: start_time = time.time()
request_params["response_format"] = {"type": "json_object"} max_retries = 3 if is_grok_model else 1
retry_delay = 1.0
response_content_local = ""
reasoning_text_local = ""
json_error_occurred = False
if is_grok_model: for attempt in range(max_retries):
unsupported_params = ["presence_penalty", "frequency_penalty", "stop", "reasoning_effort"] try:
for param in unsupported_params: if is_grok_model and attempt > 0:
if param in request_params: search_logger.info(f"Grok retry attempt {attempt + 1}/{max_retries} after {retry_delay:.1f}s delay")
removed_value = request_params.pop(param) await asyncio.sleep(retry_delay)
search_logger.warning(f"Removed unsupported Grok parameter '{param}': {removed_value}")
supported_params = ["model", "messages", "max_tokens", "temperature", "response_format", "stream", "tools", "tool_choice"] final_params = prepare_chat_completion_params(model_choice, request_params)
for param in list(request_params.keys()): response = await llm_client.chat.completions.create(**final_params)
if param not in supported_params: last_response_obj = response
search_logger.warning(f"Parameter '{param}' may not be supported by Grok reasoning models")
start_time = time.time() choice = response.choices[0] if response.choices else None
max_retries = 3 if is_grok_model else 1 message = choice.message if choice and hasattr(choice, "message") else None
retry_delay = 1.0 response_content_local = ""
response_content_local = "" reasoning_text_local = ""
reasoning_text_local = ""
json_error_occurred = False
for attempt in range(max_retries): if choice:
try: response_content_local, reasoning_text_local, _ = extract_message_text(choice)
if is_grok_model and attempt > 0:
search_logger.info(f"Grok retry attempt {attempt + 1}/{max_retries} after {retry_delay:.1f}s delay")
await asyncio.sleep(retry_delay)
final_params = prepare_chat_completion_params(model_choice, request_params) # Enhanced logging for response analysis
response = await client.chat.completions.create(**final_params) if message and reasoning_text_local:
last_response_obj = response content_preview = response_content_local[:100] if response_content_local else "None"
reasoning_preview = reasoning_text_local[:100] if reasoning_text_local else "None"
search_logger.debug(
f"Response has reasoning content - content: '{content_preview}', reasoning: '{reasoning_preview}'"
)
choice = response.choices[0] if response.choices else None if response_content_local:
message = choice.message if choice and hasattr(choice, "message") else None last_response_content = response_content_local.strip()
response_content_local = ""
reasoning_text_local = ""
if choice: # Pre-validate response before processing
response_content_local, reasoning_text_local, _ = extract_message_text(choice) if len(last_response_content) < 20 or (len(last_response_content) < 50 and not last_response_content.strip().startswith('{')):
# Very minimal response - likely "Okay\nOkay" type
# Enhanced logging for response analysis search_logger.debug(f"Minimal response detected: {repr(last_response_content)}")
if message and reasoning_text_local: # Generate fallback directly from context
content_preview = response_content_local[:100] if response_content_local else "None" fallback_json = synthesize_json_from_reasoning("", code, language)
reasoning_preview = reasoning_text_local[:100] if reasoning_text_local else "None" if fallback_json:
search_logger.debug( try:
f"Response has reasoning content - content: '{content_preview}', reasoning: '{reasoning_preview}'" result = json.loads(fallback_json)
)
if response_content_local:
last_response_content = response_content_local.strip()
# Pre-validate response before processing
if len(last_response_content) < 20 or (len(last_response_content) < 50 and not last_response_content.strip().startswith('{')):
# Very minimal response - likely "Okay\nOkay" type
search_logger.debug(f"Minimal response detected: {repr(last_response_content)}")
# Generate fallback directly from context
fallback_json = synthesize_json_from_reasoning("", code, language)
if fallback_json:
try:
result = json.loads(fallback_json)
final_result = {
"example_name": result.get("example_name", f"Code Example{f' ({language})' if language else ''}"),
"summary": result.get("summary", "Code example for demonstration purposes."),
}
search_logger.info(f"Generated fallback summary from context - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}")
return final_result
except json.JSONDecodeError:
pass # Continue to normal error handling
else:
# Even synthesis failed - provide hardcoded fallback for minimal responses
final_result = { final_result = {
"example_name": f"Code Example{f' ({language})' if language else ''}", "example_name": result.get("example_name", f"Code Example{f' ({language})' if language else ''}"),
"summary": "Code example extracted from development context.", "summary": result.get("summary", "Code example for demonstration purposes."),
} }
search_logger.info(f"Used hardcoded fallback for minimal response - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}") search_logger.info(f"Generated fallback summary from context - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}")
return final_result return final_result
except json.JSONDecodeError:
payload = _extract_json_payload(last_response_content, code, language) pass # Continue to normal error handling
if payload != last_response_content: else:
search_logger.debug( # Even synthesis failed - provide hardcoded fallback for minimal responses
f"Sanitized LLM response payload before parsing: {repr(payload[:200])}..."
)
try:
result = json.loads(payload)
if not result.get("example_name") or not result.get("summary"):
search_logger.warning(f"Incomplete response from LLM: {result}")
final_result = { final_result = {
"example_name": result.get( "example_name": f"Code Example{f' ({language})' if language else ''}",
"example_name", f"Code Example{f' ({language})' if language else ''}" "summary": "Code example extracted from development context.",
),
"summary": result.get("summary", "Code example for demonstration purposes."),
} }
search_logger.info(f"Used hardcoded fallback for minimal response - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}")
search_logger.info(
f"Generated code example summary - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}"
)
return final_result return final_result
except json.JSONDecodeError as json_error: payload = _extract_json_payload(last_response_content, code, language)
last_json_error = json_error if payload != last_response_content:
json_error_occurred = True search_logger.debug(
snippet = last_response_content[:200] f"Sanitized LLM response payload before parsing: {repr(payload[:200])}..."
if not enforce_json: )
# Check if this was reasoning text that couldn't be parsed
if _is_reasoning_text_response(last_response_content): try:
search_logger.debug( result = json.loads(payload)
f"Reasoning text detected but no JSON extracted. Response snippet: {repr(snippet)}"
) if not result.get("example_name") or not result.get("summary"):
else: search_logger.warning(f"Incomplete response from LLM: {result}")
search_logger.warning(
f"Failed to parse JSON response from LLM (non-strict attempt). Error: {json_error}. Response snippet: {repr(snippet)}" final_result = {
) "example_name": result.get(
break "example_name", f"Code Example{f' ({language})' if language else ''}"
else: ),
search_logger.error( "summary": result.get("summary", "Code example for demonstration purposes."),
f"Strict JSON enforcement still failed to produce valid JSON: {json_error}. Response snippet: {repr(snippet)}" }
search_logger.info(
f"Generated code example summary - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}"
)
return final_result
except json.JSONDecodeError as json_error:
last_json_error = json_error
json_error_occurred = True
snippet = last_response_content[:200]
if not enforce_json:
# Check if this was reasoning text that couldn't be parsed
if _is_reasoning_text_response(last_response_content):
search_logger.debug(
f"Reasoning text detected but no JSON extracted. Response snippet: {repr(snippet)}"
) )
break else:
search_logger.warning(
f"Failed to parse JSON response from LLM (non-strict attempt). Error: {json_error}. Response snippet: {repr(snippet)}"
)
break
else:
search_logger.error(
f"Strict JSON enforcement still failed to produce valid JSON: {json_error}. Response snippet: {repr(snippet)}"
)
break
elif is_grok_model and attempt < max_retries - 1: elif is_grok_model and attempt < max_retries - 1:
search_logger.warning(f"Grok empty response on attempt {attempt + 1}, retrying...") search_logger.warning(f"Grok empty response on attempt {attempt + 1}, retrying...")
retry_delay *= 2 retry_delay *= 2
continue
else:
break
except Exception as e:
if is_grok_model and attempt < max_retries - 1:
search_logger.error(f"Grok request failed on attempt {attempt + 1}: {e}, retrying...")
retry_delay *= 2
continue
else:
raise
if is_grok_model:
elapsed_time = time.time() - start_time
last_elapsed_time = elapsed_time
search_logger.debug(f"Grok total response time: {elapsed_time:.2f}s")
if json_error_occurred:
if not enforce_json:
continue continue
else: else:
break break
if response_content_local: except Exception as e:
# We would have returned already on success; if we reach here, parsing failed but we are not retrying if is_grok_model and attempt < max_retries - 1:
search_logger.error(f"Grok request failed on attempt {attempt + 1}: {e}, retrying...")
retry_delay *= 2
continue
else:
raise
if is_grok_model:
elapsed_time = time.time() - start_time
last_elapsed_time = elapsed_time
search_logger.debug(f"Grok total response time: {elapsed_time:.2f}s")
if json_error_occurred:
if not enforce_json:
continue continue
response_content = last_response_content
response = last_response_obj
elapsed_time = last_elapsed_time if last_elapsed_time is not None else 0.0
if last_json_error is not None and response_content:
search_logger.error(
f"LLM response after strict enforcement was still not valid JSON: {last_json_error}. Clearing response to trigger error handling."
)
response_content = ""
if not response_content:
search_logger.error(f"Empty response from LLM for model: {model_choice} (provider: {provider})")
if is_grok_model:
search_logger.error("Grok empty response debugging:")
search_logger.error(f" - Request took: {elapsed_time:.2f}s")
search_logger.error(f" - Response status: {getattr(response, 'status_code', 'N/A')}")
search_logger.error(f" - Response headers: {getattr(response, 'headers', 'N/A')}")
search_logger.error(f" - Full response: {response}")
search_logger.error(f" - Response choices length: {len(response.choices) if response.choices else 0}")
if response.choices:
search_logger.error(f" - First choice: {response.choices[0]}")
search_logger.error(f" - Message content: '{response.choices[0].message.content}'")
search_logger.error(f" - Message role: {response.choices[0].message.role}")
search_logger.error("Check: 1) API key validity, 2) rate limits, 3) model availability")
# Implement fallback for Grok failures
search_logger.warning("Attempting fallback to OpenAI due to Grok failure...")
try:
# Use OpenAI as fallback with similar parameters
fallback_params = {
"model": "gpt-4o-mini",
"messages": request_params["messages"],
"temperature": request_params.get("temperature", 0.1),
"max_tokens": request_params.get("max_tokens", 500),
}
async with get_llm_client(provider="openai") as fallback_client:
fallback_response = await fallback_client.chat.completions.create(**fallback_params)
fallback_content = fallback_response.choices[0].message.content
if fallback_content and fallback_content.strip():
search_logger.info("gpt-4o-mini fallback succeeded")
response_content = fallback_content.strip()
else:
search_logger.error("gpt-4o-mini fallback also returned empty response")
raise ValueError(f"Both {model_choice} and gpt-4o-mini fallback failed")
except Exception as fallback_error:
search_logger.error(f"gpt-4o-mini fallback failed: {fallback_error}")
raise ValueError(f"{model_choice} failed and fallback to gpt-4o-mini also failed: {fallback_error}") from fallback_error
else: else:
search_logger.debug(f"Full response object: {response}") break
raise ValueError("Empty response from LLM")
if not response_content: if response_content_local:
# This should not happen after fallback logic, but safety check # We would have returned already on success; if we reach here, parsing failed but we are not retrying
raise ValueError("No valid response content after all attempts") continue
response_content = response_content.strip() response_content = last_response_content
search_logger.debug(f"LLM API response: {repr(response_content[:200])}...") response = last_response_obj
elapsed_time = last_elapsed_time if last_elapsed_time is not None else 0.0
payload = _extract_json_payload(response_content, code, language) if last_json_error is not None and response_content:
if payload != response_content: search_logger.error(
search_logger.debug( f"LLM response after strict enforcement was still not valid JSON: {last_json_error}. Clearing response to trigger error handling."
f"Sanitized LLM response payload before parsing: {repr(payload[:200])}..."
)
result = json.loads(payload)
# Validate the response has the required fields
if not result.get("example_name") or not result.get("summary"):
search_logger.warning(f"Incomplete response from LLM: {result}")
final_result = {
"example_name": result.get(
"example_name", f"Code Example{f' ({language})' if language else ''}"
),
"summary": result.get("summary", "Code example for demonstration purposes."),
}
search_logger.info(
f"Generated code example summary - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}"
) )
return final_result response_content = ""
if not response_content:
search_logger.error(f"Empty response from LLM for model: {model_choice} (provider: {provider})")
if is_grok_model:
search_logger.error("Grok empty response debugging:")
search_logger.error(f" - Request took: {elapsed_time:.2f}s")
search_logger.error(f" - Response status: {getattr(response, 'status_code', 'N/A')}")
search_logger.error(f" - Response headers: {getattr(response, 'headers', 'N/A')}")
search_logger.error(f" - Full response: {response}")
search_logger.error(f" - Response choices length: {len(response.choices) if response.choices else 0}")
if response.choices:
search_logger.error(f" - First choice: {response.choices[0]}")
search_logger.error(f" - Message content: '{response.choices[0].message.content}'")
search_logger.error(f" - Message role: {response.choices[0].message.role}")
search_logger.error("Check: 1) API key validity, 2) rate limits, 3) model availability")
# Implement fallback for Grok failures
search_logger.warning("Attempting fallback to OpenAI due to Grok failure...")
try:
# Use OpenAI as fallback with similar parameters
fallback_params = {
"model": "gpt-4o-mini",
"messages": request_params["messages"],
"temperature": request_params.get("temperature", 0.1),
"max_tokens": request_params.get("max_tokens", 500),
}
async with get_llm_client(provider="openai") as fallback_client:
fallback_response = await fallback_client.chat.completions.create(**fallback_params)
fallback_content = fallback_response.choices[0].message.content
if fallback_content and fallback_content.strip():
search_logger.info("gpt-4o-mini fallback succeeded")
response_content = fallback_content.strip()
else:
search_logger.error("gpt-4o-mini fallback also returned empty response")
raise ValueError(f"Both {model_choice} and gpt-4o-mini fallback failed")
except Exception as fallback_error:
search_logger.error(f"gpt-4o-mini fallback failed: {fallback_error}")
raise ValueError(f"{model_choice} failed and fallback to gpt-4o-mini also failed: {fallback_error}") from fallback_error
else:
search_logger.debug(f"Full response object: {response}")
raise ValueError("Empty response from LLM")
if not response_content:
# This should not happen after fallback logic, but safety check
raise ValueError("No valid response content after all attempts")
response_content = response_content.strip()
search_logger.debug(f"LLM API response: {repr(response_content[:200])}...")
payload = _extract_json_payload(response_content, code, language)
if payload != response_content:
search_logger.debug(
f"Sanitized LLM response payload before parsing: {repr(payload[:200])}..."
)
result = json.loads(payload)
# Validate the response has the required fields
if not result.get("example_name") or not result.get("summary"):
search_logger.warning(f"Incomplete response from LLM: {result}")
final_result = {
"example_name": result.get(
"example_name", f"Code Example{f' ({language})' if language else ''}"
),
"summary": result.get("summary", "Code example for demonstration purposes."),
}
search_logger.info(
f"Generated code example summary - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}"
)
return final_result
except json.JSONDecodeError as e: except json.JSONDecodeError as e:
search_logger.error( search_logger.error(
@@ -934,7 +980,7 @@ Format your response as JSON:
        fallback_json = synthesize_json_from_reasoning("", code, language)
        if fallback_json:
            fallback_result = json.loads(fallback_json)
            search_logger.info("Generated context-aware fallback summary")
            return {
                "example_name": fallback_result.get("example_name", f"Code Example{f' ({language})' if language else ''}"),
                "summary": fallback_result.get("summary", "Code example for demonstration purposes."),
@@ -953,7 +999,7 @@ Format your response as JSON:
        fallback_json = synthesize_json_from_reasoning("", code, language)
        if fallback_json:
            fallback_result = json.loads(fallback_json)
            search_logger.info("Generated context-aware fallback summary after error")
            return {
                "example_name": fallback_result.get("example_name", f"Code Example{f' ({language})' if language else ''}"),
                "summary": fallback_result.get("summary", "Code example for demonstration purposes."),
@@ -1002,82 +1048,84 @@ async def generate_code_summaries_batch(
f"Generating summaries for {len(code_blocks)} code blocks with max_workers={max_workers}" f"Generating summaries for {len(code_blocks)} code blocks with max_workers={max_workers}"
) )
# Semaphore to limit concurrent requests # Create a shared LLM client for all summaries (performance optimization)
semaphore = asyncio.Semaphore(max_workers) async with get_llm_client(provider=provider) as shared_client:
completed_count = 0 search_logger.debug("Created shared LLM client for batch summary generation")
lock = asyncio.Lock()
async def generate_single_summary_with_limit(block: dict[str, Any]) -> dict[str, str]: # Semaphore to limit concurrent requests
nonlocal completed_count semaphore = asyncio.Semaphore(max_workers)
async with semaphore: completed_count = 0
# Add delay between requests to avoid rate limiting lock = asyncio.Lock()
await asyncio.sleep(0.5) # 500ms delay between requests
# Run the synchronous function in a thread async def generate_single_summary_with_limit(block: dict[str, Any]) -> dict[str, str]:
loop = asyncio.get_event_loop() nonlocal completed_count
result = await loop.run_in_executor( async with semaphore:
None, # Add delay between requests to avoid rate limiting
generate_code_example_summary, await asyncio.sleep(0.5) # 500ms delay between requests
block["code"],
block["context_before"], # Call async version directly with shared client (no event loop overhead)
block["context_after"], result = await _generate_code_example_summary_async(
block.get("language", ""), block["code"],
provider, block["context_before"],
block["context_after"],
block.get("language", ""),
provider,
shared_client # Pass shared client for reuse
)
# Update progress
async with lock:
completed_count += 1
if progress_callback:
# Simple progress based on summaries completed
progress_percentage = int((completed_count / len(code_blocks)) * 100)
await progress_callback({
"status": "code_extraction",
"percentage": progress_percentage,
"log": f"Generated {completed_count}/{len(code_blocks)} code summaries",
"completed_summaries": completed_count,
"total_summaries": len(code_blocks),
})
return result
# Process all blocks concurrently but with rate limiting
try:
summaries = await asyncio.gather(
*[generate_single_summary_with_limit(block) for block in code_blocks],
return_exceptions=True,
) )
# Update progress # Handle any exceptions in the results
async with lock: final_summaries = []
completed_count += 1 for i, summary in enumerate(summaries):
if progress_callback: if isinstance(summary, Exception):
# Simple progress based on summaries completed search_logger.error(f"Error generating summary for code block {i}: {summary}")
progress_percentage = int((completed_count / len(code_blocks)) * 100) # Use fallback summary
await progress_callback({ language = code_blocks[i].get("language", "")
"status": "code_extraction", fallback = {
"percentage": progress_percentage, "example_name": f"Code Example{f' ({language})' if language else ''}",
"log": f"Generated {completed_count}/{len(code_blocks)} code summaries", "summary": "Code example for demonstration purposes.",
"completed_summaries": completed_count, }
"total_summaries": len(code_blocks), final_summaries.append(fallback)
}) else:
final_summaries.append(summary)
return result search_logger.info(f"Successfully generated {len(final_summaries)} code summaries")
return final_summaries
# Process all blocks concurrently but with rate limiting except Exception as e:
try: search_logger.error(f"Error in batch summary generation: {e}")
summaries = await asyncio.gather( # Return fallback summaries for all blocks
*[generate_single_summary_with_limit(block) for block in code_blocks], fallback_summaries = []
return_exceptions=True, for block in code_blocks:
) language = block.get("language", "")
# Handle any exceptions in the results
final_summaries = []
for i, summary in enumerate(summaries):
if isinstance(summary, Exception):
search_logger.error(f"Error generating summary for code block {i}: {summary}")
# Use fallback summary
language = code_blocks[i].get("language", "")
fallback = { fallback = {
"example_name": f"Code Example{f' ({language})' if language else ''}", "example_name": f"Code Example{f' ({language})' if language else ''}",
"summary": "Code example for demonstration purposes.", "summary": "Code example for demonstration purposes.",
} }
final_summaries.append(fallback) fallback_summaries.append(fallback)
else: return fallback_summaries
final_summaries.append(summary)
search_logger.info(f"Successfully generated {len(final_summaries)} code summaries")
return final_summaries
except Exception as e:
search_logger.error(f"Error in batch summary generation: {e}")
# Return fallback summaries for all blocks
fallback_summaries = []
for block in code_blocks:
language = block.get("language", "")
fallback = {
"example_name": f"Code Example{f' ({language})' if language else ''}",
"summary": "Code example for demonstration purposes.",
}
fallback_summaries.append(fallback)
return fallback_summaries
async def add_code_examples_to_supabase( async def add_code_examples_to_supabase(