improve: robust URL processing with whitespace handling and consistent source_type

Enhanced both list_items() and _transform_source_to_item() methods:

- Trim whitespace from original_url and treat empty/whitespace-only strings as None
- Compute display_url before deriving source_type for consistency
- Base source_type determination on final display_url (not raw original_url)
- Ensure file:// schemes and other URL types are detected correctly
- Prevent whitespace-only original_url from being used
- Handle None/empty original_url gracefully during trimming

This ensures consistent source_type classification and prevents
whitespace issues from causing incorrect URL display.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
leex279
2025-09-10 07:22:18 +02:00
parent 5913d7a8ac
commit 9061fde028

View File

@@ -141,11 +141,14 @@ class KnowledgeItemService:
code_examples_count = code_example_counts.get(source_id, 0)
chunks_count = chunk_counts.get(source_id, 0)
# Determine source type
source_type = self._determine_source_type(source_metadata, first_page_url)
# Compute display_url with proper trimming and fallback
original_url = source_metadata.get("original_url")
# Trim whitespace and treat empty/whitespace-only strings as None
trimmed_original_url = original_url.strip() if original_url and original_url.strip() else None
display_url = trimmed_original_url or first_page_url
# Use original URL from metadata if available, fallback to first page URL
display_url = source_metadata.get("original_url") or first_page_url
# Determine source type based on final display_url (not raw original_url)
source_type = self._determine_source_type(source_metadata, display_url)
item = {
"id": source_id,
@@ -359,15 +362,17 @@ class KnowledgeItemService:
# Get first page URL
first_page_url = await self._get_first_page_url(source_id)
# Determine source type
source_type = self._determine_source_type(source_metadata, first_page_url)
# Compute display_url with proper trimming and fallback before deriving source_type
original_url = source_metadata.get("original_url")
# Trim whitespace and treat empty/whitespace-only strings as None
trimmed_original_url = original_url.strip() if original_url and original_url.strip() else None
display_url = trimmed_original_url or first_page_url
# Determine source type based on final display_url (not raw original_url)
source_type = self._determine_source_type(source_metadata, display_url)
# Get code examples
code_examples = await self._get_code_examples(source_id)
# Use original URL from metadata if available, fallback to first page URL
original_url = source_metadata.get("original_url")
display_url = original_url or first_page_url
return {
"id": source_id,