From dbe5855d846f12a2c7dad4e74d9b05d62381dda4 Mon Sep 17 00:00:00 2001
From: leex279
Date: Mon, 22 Sep 2025 13:52:15 +0200
Subject: [PATCH] docs: Add comprehensive JSDoc documentation for CrawlConfig

Added detailed documentation for the CrawlConfig interface, including:

## Documentation improvements:

- Clear precedence rules (excluded_domains > allowed_domains > exclude_patterns > include_patterns)
- Pattern syntax explanation (glob patterns with fnmatch for URLs, wildcards for domains)
- Comprehensive examples showing common use cases:
  - Single subdomain with path exclusions
  - Multiple subdomains with specific exclusions
  - File type and directory blocking
- Individual property documentation with examples

## Code improvements:

- Refactored DocumentBrowser to avoid repeated URL/domain computation
- Extracted resolvedUrl and resolvedDomain once as constants
- Improved readability and performance

This documentation helps developers understand:

- How conflicting rules are resolved (the blacklist always wins)
- What pattern syntax to use (glob patterns)
- How to compose allow/deny lists effectively

---
 .../knowledge/components/DocumentBrowser.tsx  | 32 ++++++----
 .../src/features/knowledge/types/knowledge.ts | 59 +++++++++++++++++++
 2 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx b/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx
index 306ee0a5..efe61b7f 100644
--- a/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx
+++ b/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx
@@ -245,18 +245,26 @@ export const DocumentBrowser: React.FC = ({ sourceId, open
                       {chunk.metadata.title}
                   )}
-                  {(chunk.url || chunk.metadata?.url) && (
-                      {extractDomain(chunk.url || chunk.metadata?.url || "")}
-
-
-                  )}
+                  {(() => {
+                    // Extract URL and domain once to avoid repeated computation
+                    const resolvedUrl = chunk.url || chunk.metadata?.url;
+                    if (!resolvedUrl) return null;
+
+                    const resolvedDomain = extractDomain(resolvedUrl);
+
+                    return (
+
+                        {resolvedDomain}
+
+
+                    );
+                  })()}
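For reviewers who want to try the refactor outside of JSX, here is a minimal standalone sketch of the same compute-once pattern. The `Chunk` shape and the `extractDomain` helper below are simplified assumptions inferred from how the diff uses them, not the project's actual definitions:

```typescript
// Hypothetical, simplified stand-ins for the real types/helpers in
// DocumentBrowser.tsx, inferred from usage in the diff above.
interface Chunk {
  url?: string;
  metadata?: { title?: string; url?: string };
}

// Assumed behavior: extract the hostname from a URL string,
// falling back to the raw input if parsing fails.
function extractDomain(url: string): string {
  try {
    return new URL(url).hostname;
  } catch {
    return url;
  }
}

// Resolve the link once, instead of re-evaluating
// `chunk.url || chunk.metadata?.url` in several JSX expressions.
function resolveChunkLink(chunk: Chunk): { url: string; domain: string } | null {
  const resolvedUrl = chunk.url || chunk.metadata?.url;
  if (!resolvedUrl) return null;
  return { url: resolvedUrl, domain: extractDomain(resolvedUrl) };
}

// resolveChunkLink({ metadata: { url: "https://docs.example.com/guide" } })
// -> { url: "https://docs.example.com/guide", domain: "docs.example.com" }
```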
diff --git a/archon-ui-main/src/features/knowledge/types/knowledge.ts b/archon-ui-main/src/features/knowledge/types/knowledge.ts
index 007a4034..bbb360ea 100644
--- a/archon-ui-main/src/features/knowledge/types/knowledge.ts
+++ b/archon-ui-main/src/features/knowledge/types/knowledge.ts
@@ -133,10 +133,69 @@ export interface KnowledgeItemsFilter {
   per_page?: number;
 }
 
+/**
+ * Advanced crawler configuration for domain and URL pattern filtering.
+ *
+ * ## Precedence Rules (highest to lowest priority):
+ * 1. **excluded_domains** - Always blocks, takes highest priority
+ * 2. **allowed_domains** - If specified, only these domains are crawled
+ * 3. **exclude_patterns** - Blocks matching URL patterns
+ * 4. **include_patterns** - If specified, only matching patterns are crawled
+ *
+ * ## Pattern Syntax:
+ * - **Domain patterns**: Support wildcards (*.example.com) and exact matches
+ * - **URL patterns**: Use glob syntax with fnmatch (*, ?, [seq], [!seq])
+ *
+ * ## Common Examples:
+ * ```typescript
+ * // Crawl only the docs subdomain, excluding API references
+ * {
+ *   allowed_domains: ["docs.example.com"],
+ *   exclude_patterns: ["*/api-reference/*", "*/deprecated/*"]
+ * }
+ *
+ * // Crawl all subdomains except blog, only documentation paths
+ * {
+ *   allowed_domains: ["*.example.com"],
+ *   excluded_domains: ["blog.example.com"],
+ *   include_patterns: ["*/docs/*", "*/guide/*", "*/tutorial/*"]
+ * }
+ *
+ * // Block specific file types across all domains
+ * {
+ *   exclude_patterns: ["*.pdf", "*.zip", "*/downloads/*"]
+ * }
+ * ```
+ */
 export interface CrawlConfig {
+  /**
+   * Whitelist of domains to crawl. Supports exact matches and wildcards.
+   * Examples: ["docs.example.com", "*.example.com", "api.example.com"]
+   * If specified, ONLY these domains will be crawled (unless blocked by excluded_domains).
+   */
   allowed_domains?: string[];
+
+  /**
+   * Blacklist of domains to never crawl. Takes precedence over allowed_domains.
+   * Examples: ["blog.example.com", "*.internal.example.com"]
+   * These domains are ALWAYS blocked, even if they match allowed_domains.
+   */
   excluded_domains?: string[];
+
+  /**
+   * URL patterns that must match for pages to be crawled. Uses glob syntax.
+   * Examples: ["*/docs/*", "*/api/v2/*", "*tutorial*"]
+   * If specified, ONLY URLs matching at least one pattern will be crawled.
+   * Patterns are matched against the full URL.
+   */
   include_patterns?: string[];
+
+  /**
+   * URL patterns to exclude from crawling. Uses glob syntax. Takes precedence over include_patterns.
+   * Examples: ["*/admin/*", "*.pdf", "*/temp/*", "*test*"]
+   * URLs matching these patterns are ALWAYS blocked.
+   * Patterns are matched against the full URL.
+   */
   exclude_patterns?: string[];
 }
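The JSDoc above documents the filtering semantics, but the crawler's matching logic itself is not part of this patch (the commit message notes the backend uses fnmatch for URL patterns). As a rough TypeScript illustration of the documented precedence order only, a checker might look like the sketch below; `globToRegExp` and `shouldCrawl` are hypothetical names, and the glob translation is an approximation, not the backend's actual implementation:

```typescript
import type { CrawlConfig } from "./knowledge";

// Hypothetical glob-to-RegExp conversion approximating fnmatch-style
// matching: * matches any run of characters, ? matches one character.
function globToRegExp(pattern: string): RegExp {
  const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&");
  return new RegExp(`^${escaped.replace(/\*/g, ".*").replace(/\?/g, ".")}$`);
}

function matchesAny(value: string, patterns?: string[]): boolean {
  return (patterns ?? []).some((p) => globToRegExp(p).test(value));
}

// Illustrative checker applying the documented precedence:
// excluded_domains > allowed_domains > exclude_patterns > include_patterns.
function shouldCrawl(url: string, config: CrawlConfig): boolean {
  const domain = new URL(url).hostname;

  // 1. excluded_domains always blocks, even if allowed_domains matches.
  if (matchesAny(domain, config.excluded_domains)) return false;

  // 2. allowed_domains, when present, acts as a whitelist.
  if (config.allowed_domains?.length && !matchesAny(domain, config.allowed_domains)) {
    return false;
  }

  // 3. exclude_patterns blocks matching URLs.
  if (matchesAny(url, config.exclude_patterns)) return false;

  // 4. include_patterns, when present, must match at least once.
  if (config.include_patterns?.length && !matchesAny(url, config.include_patterns)) {
    return false;
  }

  return true;
}

// shouldCrawl("https://blog.example.com/docs/intro", {
//   allowed_domains: ["*.example.com"],
//   excluded_domains: ["blog.example.com"],
// }) -> false: the domain blacklist wins over the wildcard whitelist.
```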