docs: Add comprehensive JSDoc documentation for CrawlConfig

Added detailed documentation for the CrawlConfig interface, including:

## Documentation improvements:
- Clear precedence rules (excluded_domains > allowed_domains > exclude_patterns > include_patterns); see the sketch after this list
- Pattern syntax explanation (glob patterns with fnmatch for URLs, wildcards for domains)
- Comprehensive examples showing common use cases:
  - Single subdomain with path exclusions
  - Multiple subdomains with specific exclusions
  - File type and directory blocking
- Individual property documentation with examples
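
As a rough illustration of that precedence order, here is a minimal TypeScript sketch. The helper names (`isUrlAllowed`, `globToRegExp`, `matchesAny`) are hypothetical and not part of the codebase; the `[seq]`/`[!seq]` character classes of fnmatch are omitted for brevity, and the actual filtering happens in the crawler itself. This only mirrors the documented rule order:

```typescript
// Standalone sketch; redeclares CrawlConfig locally so it runs on its own.
interface CrawlConfig {
  allowed_domains?: string[];
  excluded_domains?: string[];
  include_patterns?: string[];
  exclude_patterns?: string[];
}

// Translate a glob ("*" and "?" only; fnmatch's [seq]/[!seq] omitted)
// into an anchored RegExp, escaping all other regex metacharacters.
function globToRegExp(glob: string): RegExp {
  const escaped = glob.replace(/[.+^${}()|[\]\\]/g, "\\$&");
  return new RegExp(
    "^" + escaped.replace(/\*/g, ".*").replace(/\?/g, ".") + "$",
  );
}

const matchesAny = (value: string, patterns: string[]): boolean =>
  patterns.some((p) => globToRegExp(p).test(value));

// Walk the four rules in priority order; the first rule that decides, wins.
function isUrlAllowed(url: string, config: CrawlConfig): boolean {
  const domain = new URL(url).hostname;

  // 1. excluded_domains: always blocks, highest priority.
  if (config.excluded_domains && matchesAny(domain, config.excluded_domains)) {
    return false;
  }
  // 2. allowed_domains: if present, the domain must match one entry.
  if (config.allowed_domains && !matchesAny(domain, config.allowed_domains)) {
    return false;
  }
  // 3. exclude_patterns: blocks matching URLs.
  if (config.exclude_patterns && matchesAny(url, config.exclude_patterns)) {
    return false;
  }
  // 4. include_patterns: if present, the URL must match one pattern.
  if (config.include_patterns && !matchesAny(url, config.include_patterns)) {
    return false;
  }
  return true;
}

// The blog subdomain is blocked even though it matches *.example.com,
// because the domain blacklist is checked first.
isUrlAllowed("https://blog.example.com/docs/intro", {
  allowed_domains: ["*.example.com"],
  excluded_domains: ["blog.example.com"],
}); // => false
```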

## Code improvements:
- Refactored DocumentBrowser to avoid repeated URL/domain computation
- Extracted resolvedUrl and resolvedDomain once as constants
- Improved readability and performance

This documentation helps developers understand:
- How conflicting rules are resolved (blacklist always wins)
- What pattern syntax to use (glob patterns)
- How to compose allow/deny lists effectively

Author: leex279
Date: 2025-09-22 13:52:15 +02:00
Parent: 7ea4d99a27
Commit: dbe5855d84
2 changed files with 79 additions and 12 deletions


@@ -245,18 +245,26 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
       {chunk.metadata.title}
     </h4>
   )}
-  {(chunk.url || chunk.metadata?.url) && (
-    <a
-      href={chunk.url || chunk.metadata?.url}
-      target="_blank"
-      rel="noopener noreferrer"
-      className="text-[10px] px-2 py-1 rounded bg-white/5 text-gray-500 hover:text-cyan-400 hover:bg-cyan-500/10 font-mono shrink-0 transition-colors flex items-center gap-1"
-      title={`View on ${extractDomain(chunk.url || chunk.metadata?.url || "")}`}
-    >
-      {extractDomain(chunk.url || chunk.metadata?.url || "")}
-      <ExternalLink className="w-3 h-3" />
-    </a>
-  )}
+  {(() => {
+    // Extract URL and domain once to avoid repeated computation
+    const resolvedUrl = chunk.url || chunk.metadata?.url;
+    if (!resolvedUrl) return null;
+    const resolvedDomain = extractDomain(resolvedUrl);
+    return (
+      <a
+        href={resolvedUrl}
+        target="_blank"
+        rel="noopener noreferrer"
+        className="text-[10px] px-2 py-1 rounded bg-white/5 text-gray-500 hover:text-cyan-400 hover:bg-cyan-500/10 font-mono shrink-0 transition-colors flex items-center gap-1"
+        title={`View on ${resolvedDomain}`}
+      >
+        {resolvedDomain}
+        <ExternalLink className="w-3 h-3" />
+      </a>
+    );
+  })()}
 </div>
 <div className="text-sm text-gray-300 whitespace-pre-wrap">


@@ -133,10 +133,69 @@ export interface KnowledgeItemsFilter {
per_page?: number;
}
/**
* Advanced crawler configuration for domain and URL pattern filtering.
*
* ## Precedence Rules (highest to lowest priority):
* 1. **excluded_domains** - Always blocks, takes highest priority
* 2. **allowed_domains** - If specified, only these domains are crawled
* 3. **exclude_patterns** - Blocks matching URL patterns
* 4. **include_patterns** - If specified, only matching patterns are crawled
*
* ## Pattern Syntax:
* - **Domain patterns**: Support wildcards (*.example.com) and exact matches
* - **URL patterns**: Use glob syntax with fnmatch (*, ?, [seq], [!seq])
*
* ## Common Examples:
* ```typescript
* // Crawl only docs subdomain, excluding API references
* {
* allowed_domains: ["docs.example.com"],
* exclude_patterns: ["*/api-reference/*", "*/deprecated/*"]
* }
*
* // Crawl all subdomains except blog, only documentation paths
* {
* allowed_domains: ["*.example.com"],
* excluded_domains: ["blog.example.com"],
* include_patterns: ["*/docs/*", "*/guide/*", "*/tutorial/*"]
* }
*
* // Block specific file types across all domains
* {
* exclude_patterns: ["*.pdf", "*.zip", "*/downloads/*"]
* }
* ```
*/
export interface CrawlConfig {
/**
* Whitelist of domains to crawl. Supports exact matches and wildcards.
* Examples: ["docs.example.com", "*.example.com", "api.example.com"]
* If specified, ONLY these domains will be crawled (unless blocked by excluded_domains).
*/
allowed_domains?: string[];
/**
* Blacklist of domains to never crawl. Takes precedence over allowed_domains.
* Examples: ["blog.example.com", "*.internal.example.com"]
* These domains are ALWAYS blocked, even if they match allowed_domains.
*/
excluded_domains?: string[];
/**
* URL patterns that must match for pages to be crawled. Uses glob syntax.
* Examples: ["*/docs/*", "*/api/v2/*", "*tutorial*"]
* If specified, ONLY URLs matching at least one pattern will be crawled.
* Patterns are matched against the full URL.
*/
include_patterns?: string[];
/**
* URL patterns to exclude from crawling. Uses glob syntax. Takes precedence over include_patterns.
* Examples: ["*/admin/*", "*.pdf", "*/temp/*", "*test*"]
* URLs matching these patterns are ALWAYS blocked.
* Patterns are matched against the full URL.
*/
exclude_patterns?: string[];
}