Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-30 21:49:30 -05:00)
Fixes: crawl code storage issue with <think> tags for ollama models. (#775)

* Fixes: crawl code storage issue with <think> tags for Ollama models.
* Updates from CodeRabbit review.
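For orientation, the commit touches two Ollama-specific behaviors visible in the diff below: reasoning models served through Ollama can prefix (or replace) their JSON reply with a <think>...</think> block, and Ollama requests JSON mode with a "format": "json" parameter rather than OpenAI's response_format. A minimal sketch of both ideas, assuming illustrative helper names (strip_think_block and build_json_mode_params are not functions from this repository):

import json
import re

def strip_think_block(text: str) -> str:
    # Reasoning models may wrap chain-of-thought in <think>...</think> before the JSON payload;
    # drop that block before attempting json.loads().
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL | re.IGNORECASE).strip()

def build_json_mode_params(provider: str, params: dict) -> dict:
    # OpenAI-style providers accept response_format; Ollama expects format="json" instead.
    if provider == "ollama":
        params.pop("response_format", None)
        params["format"] = "json"
    else:
        params["response_format"] = {"type": "json_object"}
    return params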
@@ -82,6 +82,10 @@ def _is_reasoning_text_response(text: str) -> bool:
    text_lower = text.lower().strip()

    # Check for XML-style thinking tags (common in models with extended thinking)
    if text_lower.startswith("<think>") or "<think>" in text_lower[:100]:
        return True

    # Check if it's clearly not JSON (starts with reasoning text)
    starts_with_reasoning = any(text_lower.startswith(starter) for starter in REASONING_STARTERS)
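As a rough illustration of how the detector above might behave (the rest of the function body and the contents of REASONING_STARTERS are not shown in this hunk, so the expected values are assumptions):

# Illustrative only; assumes the function returns True for reasoning-style text.
print(_is_reasoning_text_response("<think>Let me look at this code...</think>"))  # True (starts with <think>)
print(_is_reasoning_text_response("Okay, so the user wants a summary..."))        # True only if "okay" is in REASONING_STARTERS
print(_is_reasoning_text_response('{"example_name": "X", "summary": "Y"}'))       # expected False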
@@ -592,10 +596,23 @@ def generate_code_example_summary(
async def _generate_code_example_summary_async(
    code: str, context_before: str, context_after: str, language: str = "", provider: str = None
    code: str,
    context_before: str,
    context_after: str,
    language: str = "",
    provider: str = None,
    client = None
) -> dict[str, str]:
    """
    Async version of generate_code_example_summary using unified LLM provider service.

    Args:
        code: The code example to summarize
        context_before: Context before the code block
        context_after: Context after the code block
        language: Programming language of the code
        provider: LLM provider to use (optional)
        client: Pre-initialized LLM client for reuse (optional, improves performance)
    """

    # Get model choice from credential service (RAG setting)
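Based on the docstring above and the batch code later in this diff, the intended reuse pattern for the new client argument looks roughly like the following sketch (wrapper name and argument values are illustrative):

async def summarize_one(code_block: dict, provider: str | None = None) -> dict[str, str]:
    async with get_llm_client(provider=provider) as shared_client:
        return await _generate_code_example_summary_async(
            code_block["code"],
            code_block["context_before"],
            code_block["context_after"],
            code_block.get("language", ""),
            provider,
            shared_client,  # pass the pre-initialized client so it is reused instead of reopened
        )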
@@ -647,283 +664,312 @@ Format your response as JSON:
        + "\n\nSecond attempt enforcement: Return JSON only with the exact schema. No additional text or reasoning content."
    )

    # Use provided client or create a new one
    if client is not None:
        # Reuse provided client for better performance
        return await _generate_summary_with_client(
            client, code, context_before, context_after, language, provider,
            model_choice, guard_prompt, strict_prompt
        )
    else:
        # Create new client (backward compatibility)
        async with get_llm_client(provider=provider) as new_client:
            return await _generate_summary_with_client(
                new_client, code, context_before, context_after, language, provider,
                model_choice, guard_prompt, strict_prompt
            )


async def _generate_summary_with_client(
    llm_client, code: str, context_before: str, context_after: str,
    language: str, provider: str, model_choice: str,
    guard_prompt: str, strict_prompt: str
) -> dict[str, str]:
    """Helper function that generates summary using a provided client."""
    search_logger.info(
        f"Generating summary for {hash(code) & 0xffffff:06x} using model: {model_choice}"
    )

    provider_lower = provider.lower()
    is_grok_model = (provider_lower == "grok") or ("grok" in model_choice.lower())
    is_ollama = provider_lower == "ollama"

    supports_response_format_base = (
        provider_lower in {"openai", "google", "anthropic"}
        or (provider_lower == "openrouter" and model_choice.startswith("openai/"))
    )

    last_response_obj = None
    last_elapsed_time = None
    last_response_content = ""
    last_json_error: json.JSONDecodeError | None = None

    try:
        # Use unified LLM provider service
        async with get_llm_client(provider=provider) as client:
            search_logger.info(
                f"Generating summary for {hash(code) & 0xffffff:06x} using model: {model_choice}"
            )
        for enforce_json, current_prompt in ((False, guard_prompt), (True, strict_prompt)):
            request_params = {
                "model": model_choice,
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a helpful assistant that analyzes code examples and provides JSON responses with example names and summaries.",
                    },
                    {"role": "user", "content": current_prompt},
                ],
                "max_tokens": 2000,
                "temperature": 0.3,
            }

            provider_lower = provider.lower()
            is_grok_model = (provider_lower == "grok") or ("grok" in model_choice.lower())
            should_use_response_format = False
            if enforce_json:
                if not is_grok_model and (supports_response_format_base or provider_lower == "openrouter"):
                    should_use_response_format = True
            else:
                if supports_response_format_base:
                    should_use_response_format = True

            supports_response_format_base = (
                provider_lower in {"openai", "google", "anthropic"}
                or (provider_lower == "openrouter" and model_choice.startswith("openai/"))
            )
            if should_use_response_format:
                request_params["response_format"] = {"type": "json_object"}

            last_response_obj = None
            last_elapsed_time = None
            last_response_content = ""
            last_json_error: json.JSONDecodeError | None = None
            # Ollama uses a different parameter format for JSON mode
            if is_ollama and enforce_json:
                # Remove response_format if it was set (shouldn't be for ollama)
                request_params.pop("response_format", None)
                # Ollama expects "format": "json" parameter
                request_params["format"] = "json"
                search_logger.debug("Using Ollama-specific JSON format parameter")

        for enforce_json, current_prompt in ((False, guard_prompt), (True, strict_prompt)):
            request_params = {
                "model": model_choice,
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a helpful assistant that analyzes code examples and provides JSON responses with example names and summaries.",
                    },
                    {"role": "user", "content": current_prompt},
                ],
                "max_tokens": 2000,
                "temperature": 0.3,
            }
            if is_grok_model:
                unsupported_params = ["presence_penalty", "frequency_penalty", "stop", "reasoning_effort"]
                for param in unsupported_params:
                    if param in request_params:
                        removed_value = request_params.pop(param)
                        search_logger.warning(f"Removed unsupported Grok parameter '{param}': {removed_value}")

            should_use_response_format = False
            if enforce_json:
                if not is_grok_model and (supports_response_format_base or provider_lower == "openrouter"):
                    should_use_response_format = True
            else:
                if supports_response_format_base:
                    should_use_response_format = True
                supported_params = ["model", "messages", "max_tokens", "temperature", "response_format", "stream", "tools", "tool_choice"]
                for param in list(request_params.keys()):
                    if param not in supported_params:
                        search_logger.warning(f"Parameter '{param}' may not be supported by Grok reasoning models")

            if should_use_response_format:
                request_params["response_format"] = {"type": "json_object"}
            start_time = time.time()
            max_retries = 3 if is_grok_model else 1
            retry_delay = 1.0
            response_content_local = ""
            reasoning_text_local = ""
            json_error_occurred = False

            if is_grok_model:
                unsupported_params = ["presence_penalty", "frequency_penalty", "stop", "reasoning_effort"]
                for param in unsupported_params:
                    if param in request_params:
                        removed_value = request_params.pop(param)
                        search_logger.warning(f"Removed unsupported Grok parameter '{param}': {removed_value}")
            for attempt in range(max_retries):
                try:
                    if is_grok_model and attempt > 0:
                        search_logger.info(f"Grok retry attempt {attempt + 1}/{max_retries} after {retry_delay:.1f}s delay")
                        await asyncio.sleep(retry_delay)

                    supported_params = ["model", "messages", "max_tokens", "temperature", "response_format", "stream", "tools", "tool_choice"]
                    for param in list(request_params.keys()):
                        if param not in supported_params:
                            search_logger.warning(f"Parameter '{param}' may not be supported by Grok reasoning models")
                    final_params = prepare_chat_completion_params(model_choice, request_params)
                    response = await llm_client.chat.completions.create(**final_params)
                    last_response_obj = response

                    start_time = time.time()
                    max_retries = 3 if is_grok_model else 1
                    retry_delay = 1.0
                    response_content_local = ""
                    reasoning_text_local = ""
                    json_error_occurred = False
                    choice = response.choices[0] if response.choices else None
                    message = choice.message if choice and hasattr(choice, "message") else None
                    response_content_local = ""
                    reasoning_text_local = ""

            for attempt in range(max_retries):
                try:
                    if is_grok_model and attempt > 0:
                        search_logger.info(f"Grok retry attempt {attempt + 1}/{max_retries} after {retry_delay:.1f}s delay")
                        await asyncio.sleep(retry_delay)
                    if choice:
                        response_content_local, reasoning_text_local, _ = extract_message_text(choice)

                    final_params = prepare_chat_completion_params(model_choice, request_params)
                    response = await client.chat.completions.create(**final_params)
                    last_response_obj = response
                    # Enhanced logging for response analysis
                    if message and reasoning_text_local:
                        content_preview = response_content_local[:100] if response_content_local else "None"
                        reasoning_preview = reasoning_text_local[:100] if reasoning_text_local else "None"
                        search_logger.debug(
                            f"Response has reasoning content - content: '{content_preview}', reasoning: '{reasoning_preview}'"
                        )

                    choice = response.choices[0] if response.choices else None
                    message = choice.message if choice and hasattr(choice, "message") else None
                    response_content_local = ""
                    reasoning_text_local = ""
                    if response_content_local:
                        last_response_content = response_content_local.strip()

                    if choice:
                        response_content_local, reasoning_text_local, _ = extract_message_text(choice)

                    # Enhanced logging for response analysis
                    if message and reasoning_text_local:
                        content_preview = response_content_local[:100] if response_content_local else "None"
                        reasoning_preview = reasoning_text_local[:100] if reasoning_text_local else "None"
                        search_logger.debug(
                            f"Response has reasoning content - content: '{content_preview}', reasoning: '{reasoning_preview}'"
                        )

                    if response_content_local:
                        last_response_content = response_content_local.strip()

                        # Pre-validate response before processing
                        if len(last_response_content) < 20 or (len(last_response_content) < 50 and not last_response_content.strip().startswith('{')):
                            # Very minimal response - likely "Okay\nOkay" type
                            search_logger.debug(f"Minimal response detected: {repr(last_response_content)}")
                            # Generate fallback directly from context
                            fallback_json = synthesize_json_from_reasoning("", code, language)
                            if fallback_json:
                                try:
                                    result = json.loads(fallback_json)
                                    final_result = {
                                        "example_name": result.get("example_name", f"Code Example{f' ({language})' if language else ''}"),
                                        "summary": result.get("summary", "Code example for demonstration purposes."),
                                    }
                                    search_logger.info(f"Generated fallback summary from context - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}")
                                    return final_result
                                except json.JSONDecodeError:
                                    pass # Continue to normal error handling
                            else:
                                # Even synthesis failed - provide hardcoded fallback for minimal responses
                        # Pre-validate response before processing
                        if len(last_response_content) < 20 or (len(last_response_content) < 50 and not last_response_content.strip().startswith('{')):
                            # Very minimal response - likely "Okay\nOkay" type
                            search_logger.debug(f"Minimal response detected: {repr(last_response_content)}")
                            # Generate fallback directly from context
                            fallback_json = synthesize_json_from_reasoning("", code, language)
                            if fallback_json:
                                try:
                                    result = json.loads(fallback_json)
                                    final_result = {
                                        "example_name": f"Code Example{f' ({language})' if language else ''}",
                                        "summary": "Code example extracted from development context.",
                                        "example_name": result.get("example_name", f"Code Example{f' ({language})' if language else ''}"),
                                        "summary": result.get("summary", "Code example for demonstration purposes."),
                                    }
                                    search_logger.info(f"Used hardcoded fallback for minimal response - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}")
                                    search_logger.info(f"Generated fallback summary from context - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}")
                                    return final_result

                        payload = _extract_json_payload(last_response_content, code, language)
                        if payload != last_response_content:
                            search_logger.debug(
                                f"Sanitized LLM response payload before parsing: {repr(payload[:200])}..."
                            )

                        try:
                            result = json.loads(payload)

                            if not result.get("example_name") or not result.get("summary"):
                                search_logger.warning(f"Incomplete response from LLM: {result}")

                            except json.JSONDecodeError:
                                pass # Continue to normal error handling
                            else:
                                # Even synthesis failed - provide hardcoded fallback for minimal responses
                            final_result = {
                                "example_name": result.get(
                                    "example_name", f"Code Example{f' ({language})' if language else ''}"
                                ),
                                "summary": result.get("summary", "Code example for demonstration purposes."),
                                "example_name": f"Code Example{f' ({language})' if language else ''}",
                                "summary": "Code example extracted from development context.",
                            }

                            search_logger.info(
                                f"Generated code example summary - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}"
                            )
                            search_logger.info(f"Used hardcoded fallback for minimal response - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}")
                            return final_result
                    except json.JSONDecodeError as json_error:
                        last_json_error = json_error
                        json_error_occurred = True
                        snippet = last_response_content[:200]
                        if not enforce_json:
                            # Check if this was reasoning text that couldn't be parsed
                            if _is_reasoning_text_response(last_response_content):
                                search_logger.debug(
                                    f"Reasoning text detected but no JSON extracted. Response snippet: {repr(snippet)}"
                                )
                            else:
                                search_logger.warning(
                                    f"Failed to parse JSON response from LLM (non-strict attempt). Error: {json_error}. Response snippet: {repr(snippet)}"
                                )
                            break
                        else:
                            search_logger.error(
                                f"Strict JSON enforcement still failed to produce valid JSON: {json_error}. Response snippet: {repr(snippet)}"
                        payload = _extract_json_payload(last_response_content, code, language)
                        if payload != last_response_content:
                            search_logger.debug(
                                f"Sanitized LLM response payload before parsing: {repr(payload[:200])}..."
                            )

                        try:
                            result = json.loads(payload)

                            if not result.get("example_name") or not result.get("summary"):
                                search_logger.warning(f"Incomplete response from LLM: {result}")

                            final_result = {
                                "example_name": result.get(
                                    "example_name", f"Code Example{f' ({language})' if language else ''}"
                                ),
                                "summary": result.get("summary", "Code example for demonstration purposes."),
                            }

                            search_logger.info(
                                f"Generated code example summary - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}"
                            )
                            return final_result

                        except json.JSONDecodeError as json_error:
                            last_json_error = json_error
                            json_error_occurred = True
                            snippet = last_response_content[:200]
                            if not enforce_json:
                                # Check if this was reasoning text that couldn't be parsed
                                if _is_reasoning_text_response(last_response_content):
                                    search_logger.debug(
                                        f"Reasoning text detected but no JSON extracted. Response snippet: {repr(snippet)}"
                                    )
                                    break
                                else:
                                    search_logger.warning(
                                        f"Failed to parse JSON response from LLM (non-strict attempt). Error: {json_error}. Response snippet: {repr(snippet)}"
                                    )
                                    break
                            else:
                                search_logger.error(
                                    f"Strict JSON enforcement still failed to produce valid JSON: {json_error}. Response snippet: {repr(snippet)}"
                                )
                                break
                    elif is_grok_model and attempt < max_retries - 1:
                        search_logger.warning(f"Grok empty response on attempt {attempt + 1}, retrying...")
                        retry_delay *= 2
                        continue
                    else:
                        break

                except Exception as e:
                    if is_grok_model and attempt < max_retries - 1:
                        search_logger.error(f"Grok request failed on attempt {attempt + 1}: {e}, retrying...")
                        retry_delay *= 2
                        continue
                    else:
                        raise

                if is_grok_model:
                    elapsed_time = time.time() - start_time
                    last_elapsed_time = elapsed_time
                    search_logger.debug(f"Grok total response time: {elapsed_time:.2f}s")

                if json_error_occurred:
                    if not enforce_json:
                    elif is_grok_model and attempt < max_retries - 1:
                        search_logger.warning(f"Grok empty response on attempt {attempt + 1}, retrying...")
                        retry_delay *= 2
                        continue
                    else:
                        break

                if response_content_local:
                    # We would have returned already on success; if we reach here, parsing failed but we are not retrying
                except Exception as e:
                    if is_grok_model and attempt < max_retries - 1:
                        search_logger.error(f"Grok request failed on attempt {attempt + 1}: {e}, retrying...")
                        retry_delay *= 2
                        continue
                    else:
                        raise

                if is_grok_model:
                    elapsed_time = time.time() - start_time
                    last_elapsed_time = elapsed_time
                    search_logger.debug(f"Grok total response time: {elapsed_time:.2f}s")

                if json_error_occurred:
                    if not enforce_json:
                        continue

        response_content = last_response_content
        response = last_response_obj
        elapsed_time = last_elapsed_time if last_elapsed_time is not None else 0.0

        if last_json_error is not None and response_content:
            search_logger.error(
                f"LLM response after strict enforcement was still not valid JSON: {last_json_error}. Clearing response to trigger error handling."
            )
            response_content = ""

        if not response_content:
            search_logger.error(f"Empty response from LLM for model: {model_choice} (provider: {provider})")
            if is_grok_model:
                search_logger.error("Grok empty response debugging:")
                search_logger.error(f" - Request took: {elapsed_time:.2f}s")
                search_logger.error(f" - Response status: {getattr(response, 'status_code', 'N/A')}")
                search_logger.error(f" - Response headers: {getattr(response, 'headers', 'N/A')}")
                search_logger.error(f" - Full response: {response}")
                search_logger.error(f" - Response choices length: {len(response.choices) if response.choices else 0}")
                if response.choices:
                    search_logger.error(f" - First choice: {response.choices[0]}")
                    search_logger.error(f" - Message content: '{response.choices[0].message.content}'")
                    search_logger.error(f" - Message role: {response.choices[0].message.role}")
                search_logger.error("Check: 1) API key validity, 2) rate limits, 3) model availability")

                # Implement fallback for Grok failures
                search_logger.warning("Attempting fallback to OpenAI due to Grok failure...")
                try:
                    # Use OpenAI as fallback with similar parameters
                    fallback_params = {
                        "model": "gpt-4o-mini",
                        "messages": request_params["messages"],
                        "temperature": request_params.get("temperature", 0.1),
                        "max_tokens": request_params.get("max_tokens", 500),
                    }

                    async with get_llm_client(provider="openai") as fallback_client:
                        fallback_response = await fallback_client.chat.completions.create(**fallback_params)
                        fallback_content = fallback_response.choices[0].message.content
                        if fallback_content and fallback_content.strip():
                            search_logger.info("gpt-4o-mini fallback succeeded")
                            response_content = fallback_content.strip()
                        else:
                            search_logger.error("gpt-4o-mini fallback also returned empty response")
                            raise ValueError(f"Both {model_choice} and gpt-4o-mini fallback failed")

                except Exception as fallback_error:
                    search_logger.error(f"gpt-4o-mini fallback failed: {fallback_error}")
                    raise ValueError(f"{model_choice} failed and fallback to gpt-4o-mini also failed: {fallback_error}") from fallback_error
            else:
                search_logger.debug(f"Full response object: {response}")
                raise ValueError("Empty response from LLM")
            break

        if not response_content:
            # This should not happen after fallback logic, but safety check
            raise ValueError("No valid response content after all attempts")
            if response_content_local:
                # We would have returned already on success; if we reach here, parsing failed but we are not retrying
                continue

        response_content = response_content.strip()
        search_logger.debug(f"LLM API response: {repr(response_content[:200])}...")
        response_content = last_response_content
        response = last_response_obj
        elapsed_time = last_elapsed_time if last_elapsed_time is not None else 0.0
        payload = _extract_json_payload(response_content, code, language)
        if payload != response_content:
            search_logger.debug(
                f"Sanitized LLM response payload before parsing: {repr(payload[:200])}..."
            )

        result = json.loads(payload)

        # Validate the response has the required fields
        if not result.get("example_name") or not result.get("summary"):
            search_logger.warning(f"Incomplete response from LLM: {result}")

        final_result = {
            "example_name": result.get(
                "example_name", f"Code Example{f' ({language})' if language else ''}"
            ),
            "summary": result.get("summary", "Code example for demonstration purposes."),
        }

        search_logger.info(
            f"Generated code example summary - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}"
        if last_json_error is not None and response_content:
            search_logger.error(
                f"LLM response after strict enforcement was still not valid JSON: {last_json_error}. Clearing response to trigger error handling."
            )
        return final_result
            response_content = ""

        if not response_content:
            search_logger.error(f"Empty response from LLM for model: {model_choice} (provider: {provider})")
            if is_grok_model:
                search_logger.error("Grok empty response debugging:")
                search_logger.error(f" - Request took: {elapsed_time:.2f}s")
                search_logger.error(f" - Response status: {getattr(response, 'status_code', 'N/A')}")
                search_logger.error(f" - Response headers: {getattr(response, 'headers', 'N/A')}")
                search_logger.error(f" - Full response: {response}")
                search_logger.error(f" - Response choices length: {len(response.choices) if response.choices else 0}")
                if response.choices:
                    search_logger.error(f" - First choice: {response.choices[0]}")
                    search_logger.error(f" - Message content: '{response.choices[0].message.content}'")
                    search_logger.error(f" - Message role: {response.choices[0].message.role}")
                search_logger.error("Check: 1) API key validity, 2) rate limits, 3) model availability")

                # Implement fallback for Grok failures
                search_logger.warning("Attempting fallback to OpenAI due to Grok failure...")
                try:
                    # Use OpenAI as fallback with similar parameters
                    fallback_params = {
                        "model": "gpt-4o-mini",
                        "messages": request_params["messages"],
                        "temperature": request_params.get("temperature", 0.1),
                        "max_tokens": request_params.get("max_tokens", 500),
                    }

                    async with get_llm_client(provider="openai") as fallback_client:
                        fallback_response = await fallback_client.chat.completions.create(**fallback_params)
                        fallback_content = fallback_response.choices[0].message.content
                        if fallback_content and fallback_content.strip():
                            search_logger.info("gpt-4o-mini fallback succeeded")
                            response_content = fallback_content.strip()
                        else:
                            search_logger.error("gpt-4o-mini fallback also returned empty response")
                            raise ValueError(f"Both {model_choice} and gpt-4o-mini fallback failed")

                except Exception as fallback_error:
                    search_logger.error(f"gpt-4o-mini fallback failed: {fallback_error}")
                    raise ValueError(f"{model_choice} failed and fallback to gpt-4o-mini also failed: {fallback_error}") from fallback_error
            else:
                search_logger.debug(f"Full response object: {response}")
                raise ValueError("Empty response from LLM")

        if not response_content:
            # This should not happen after fallback logic, but safety check
            raise ValueError("No valid response content after all attempts")

        response_content = response_content.strip()
        search_logger.debug(f"LLM API response: {repr(response_content[:200])}...")

        payload = _extract_json_payload(response_content, code, language)
        if payload != response_content:
            search_logger.debug(
                f"Sanitized LLM response payload before parsing: {repr(payload[:200])}..."
            )

        result = json.loads(payload)

        # Validate the response has the required fields
        if not result.get("example_name") or not result.get("summary"):
            search_logger.warning(f"Incomplete response from LLM: {result}")

        final_result = {
            "example_name": result.get(
                "example_name", f"Code Example{f' ({language})' if language else ''}"
            ),
            "summary": result.get("summary", "Code example for demonstration purposes."),
        }

        search_logger.info(
            f"Generated code example summary - Name: '{final_result['example_name']}', Summary length: {len(final_result['summary'])}"
        )
        return final_result

    except json.JSONDecodeError as e:
        search_logger.error(
@@ -934,7 +980,7 @@ Format your response as JSON:
        fallback_json = synthesize_json_from_reasoning("", code, language)
        if fallback_json:
            fallback_result = json.loads(fallback_json)
            search_logger.info(f"Generated context-aware fallback summary")
            search_logger.info("Generated context-aware fallback summary")
            return {
                "example_name": fallback_result.get("example_name", f"Code Example{f' ({language})' if language else ''}"),
                "summary": fallback_result.get("summary", "Code example for demonstration purposes."),
@@ -953,7 +999,7 @@ Format your response as JSON:
        fallback_json = synthesize_json_from_reasoning("", code, language)
        if fallback_json:
            fallback_result = json.loads(fallback_json)
            search_logger.info(f"Generated context-aware fallback summary after error")
            search_logger.info("Generated context-aware fallback summary after error")
            return {
                "example_name": fallback_result.get("example_name", f"Code Example{f' ({language})' if language else ''}"),
                "summary": fallback_result.get("summary", "Code example for demonstration purposes."),
@@ -1002,82 +1048,84 @@ async def generate_code_summaries_batch(
        f"Generating summaries for {len(code_blocks)} code blocks with max_workers={max_workers}"
    )

    # Semaphore to limit concurrent requests
    semaphore = asyncio.Semaphore(max_workers)
    completed_count = 0
    lock = asyncio.Lock()
    # Create a shared LLM client for all summaries (performance optimization)
    async with get_llm_client(provider=provider) as shared_client:
        search_logger.debug("Created shared LLM client for batch summary generation")

        async def generate_single_summary_with_limit(block: dict[str, Any]) -> dict[str, str]:
            nonlocal completed_count
            async with semaphore:
                # Add delay between requests to avoid rate limiting
                await asyncio.sleep(0.5) # 500ms delay between requests
    # Semaphore to limit concurrent requests
    semaphore = asyncio.Semaphore(max_workers)
    completed_count = 0
    lock = asyncio.Lock()

                # Run the synchronous function in a thread
                loop = asyncio.get_event_loop()
                result = await loop.run_in_executor(
                    None,
                    generate_code_example_summary,
                    block["code"],
                    block["context_before"],
                    block["context_after"],
                    block.get("language", ""),
                    provider,
    async def generate_single_summary_with_limit(block: dict[str, Any]) -> dict[str, str]:
        nonlocal completed_count
        async with semaphore:
            # Add delay between requests to avoid rate limiting
            await asyncio.sleep(0.5) # 500ms delay between requests

                # Call async version directly with shared client (no event loop overhead)
                result = await _generate_code_example_summary_async(
                    block["code"],
                    block["context_before"],
                    block["context_after"],
                    block.get("language", ""),
                    provider,
                    shared_client # Pass shared client for reuse
                )

                # Update progress
                async with lock:
                    completed_count += 1
                    if progress_callback:
                        # Simple progress based on summaries completed
                        progress_percentage = int((completed_count / len(code_blocks)) * 100)
                        await progress_callback({
                            "status": "code_extraction",
                            "percentage": progress_percentage,
                            "log": f"Generated {completed_count}/{len(code_blocks)} code summaries",
                            "completed_summaries": completed_count,
                            "total_summaries": len(code_blocks),
                        })

                return result

        # Process all blocks concurrently but with rate limiting
        try:
            summaries = await asyncio.gather(
                *[generate_single_summary_with_limit(block) for block in code_blocks],
                return_exceptions=True,
            )

            # Update progress
            async with lock:
                completed_count += 1
                if progress_callback:
                    # Simple progress based on summaries completed
                    progress_percentage = int((completed_count / len(code_blocks)) * 100)
                    await progress_callback({
                        "status": "code_extraction",
                        "percentage": progress_percentage,
                        "log": f"Generated {completed_count}/{len(code_blocks)} code summaries",
                        "completed_summaries": completed_count,
                        "total_summaries": len(code_blocks),
                    })
            # Handle any exceptions in the results
            final_summaries = []
            for i, summary in enumerate(summaries):
                if isinstance(summary, Exception):
                    search_logger.error(f"Error generating summary for code block {i}: {summary}")
                    # Use fallback summary
                    language = code_blocks[i].get("language", "")
                    fallback = {
                        "example_name": f"Code Example{f' ({language})' if language else ''}",
                        "summary": "Code example for demonstration purposes.",
                    }
                    final_summaries.append(fallback)
                else:
                    final_summaries.append(summary)

            return result
            search_logger.info(f"Successfully generated {len(final_summaries)} code summaries")
            return final_summaries

    # Process all blocks concurrently but with rate limiting
    try:
        summaries = await asyncio.gather(
            *[generate_single_summary_with_limit(block) for block in code_blocks],
            return_exceptions=True,
        )

        # Handle any exceptions in the results
        final_summaries = []
        for i, summary in enumerate(summaries):
            if isinstance(summary, Exception):
                search_logger.error(f"Error generating summary for code block {i}: {summary}")
                # Use fallback summary
                language = code_blocks[i].get("language", "")
        except Exception as e:
            search_logger.error(f"Error in batch summary generation: {e}")
            # Return fallback summaries for all blocks
            fallback_summaries = []
            for block in code_blocks:
                language = block.get("language", "")
                fallback = {
                    "example_name": f"Code Example{f' ({language})' if language else ''}",
                    "summary": "Code example for demonstration purposes.",
                }
                final_summaries.append(fallback)
            else:
                final_summaries.append(summary)

        search_logger.info(f"Successfully generated {len(final_summaries)} code summaries")
        return final_summaries

    except Exception as e:
        search_logger.error(f"Error in batch summary generation: {e}")
        # Return fallback summaries for all blocks
        fallback_summaries = []
        for block in code_blocks:
            language = block.get("language", "")
            fallback = {
                "example_name": f"Code Example{f' ({language})' if language else ''}",
                "summary": "Code example for demonstration purposes.",
            }
            fallback_summaries.append(fallback)
        return fallback_summaries
            fallback_summaries.append(fallback)
        return fallback_summaries
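The batch function above bounds concurrency with a semaphore, spaces requests out with a short sleep, and uses gather(return_exceptions=True) so one failed summary falls back instead of aborting the whole batch. A generic, self-contained sketch of that pattern (names are illustrative, not from this repository):

import asyncio

async def run_bounded(items: list, worker, max_workers: int = 3) -> list:
    # Semaphore caps how many worker calls run at once.
    sem = asyncio.Semaphore(max_workers)

    async def limited(item):
        async with sem:
            await asyncio.sleep(0.5)  # crude rate limiting between requests
            return await worker(item)

    # return_exceptions=True keeps one failure from cancelling the rest;
    # callers can substitute a fallback result for any Exception in the list.
    return await asyncio.gather(*(limited(i) for i in items), return_exceptions=True)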
async def add_code_examples_to_supabase(