From dbe5855d846f12a2c7dad4e74d9b05d62381dda4 Mon Sep 17 00:00:00 2001
From: leex279
Date: Mon, 22 Sep 2025 13:52:15 +0200
Subject: [PATCH] docs: Add comprehensive JSDoc documentation for CrawlConfig

Added detailed documentation for the CrawlConfig interface, including:

## Documentation improvements:

- Clear precedence rules (excluded_domains > allowed_domains > exclude_patterns > include_patterns)
- Pattern syntax explanation (glob patterns with fnmatch for URLs, wildcards for domains)
- Comprehensive examples showing common use cases:
  - Single subdomain with path exclusions
  - Multiple subdomains with specific exclusions
  - File type and directory blocking
- Individual property documentation with examples

## Code improvements:

- Refactored DocumentBrowser to avoid repeated URL/domain computation
- Extracted resolvedUrl and resolvedDomain once as constants
- Improved readability and performance

This documentation helps developers understand:

- How conflicting rules are resolved (the blacklist always wins)
- What pattern syntax to use (glob patterns)
- How to compose allow/deny lists effectively

---
 .../knowledge/components/DocumentBrowser.tsx  | 32 ++++++----
 .../src/features/knowledge/types/knowledge.ts | 59 +++++++++++++++++++
 2 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx b/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx
index 306ee0a5..efe61b7f 100644
--- a/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx
+++ b/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx
@@ -245,18 +245,26 @@ export const DocumentBrowser: React.FC = ({ sourceId, open
                       {chunk.metadata.title}
                   )}
-                  {(chunk.url || chunk.metadata?.url) && (
-                      {extractDomain(chunk.url || chunk.metadata?.url || "")}
-
-
-                  )}
+                  {(() => {
+                    // Extract URL and domain once to avoid repeated computation
+                    const resolvedUrl = chunk.url || chunk.metadata?.url;
+                    if (!resolvedUrl) return null;
+
+                    const resolvedDomain = extractDomain(resolvedUrl);
+
+                    return (
+
+                        {resolvedDomain}
+
+
+                    );
+                  })()}
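For reviewers who want to try the refactor outside of JSX, here is a minimal standalone sketch of the same compute-once pattern. The `Chunk` shape and the `extractDomain` helper below are simplified assumptions inferred from how the diff uses them, not the project's actual definitions:

```typescript
// Hypothetical, simplified stand-ins for the real types/helpers in
// DocumentBrowser.tsx, inferred from usage in the diff above.
interface Chunk {
  url?: string;
  metadata?: { title?: string; url?: string };
}

// Assumed behavior: extract the hostname from a URL string,
// falling back to the raw input if parsing fails.
function extractDomain(url: string): string {
  try {
    return new URL(url).hostname;
  } catch {
    return url;
  }
}

// Resolve the link once, instead of re-evaluating
// `chunk.url || chunk.metadata?.url` in several JSX expressions.
function resolveChunkLink(chunk: Chunk): { url: string; domain: string } | null {
  const resolvedUrl = chunk.url || chunk.metadata?.url;
  if (!resolvedUrl) return null;
  return { url: resolvedUrl, domain: extractDomain(resolvedUrl) };
}

// resolveChunkLink({ metadata: { url: "https://docs.example.com/guide" } })
// -> { url: "https://docs.example.com/guide", domain: "docs.example.com" }
```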
diff --git a/archon-ui-main/src/features/knowledge/types/knowledge.ts b/archon-ui-main/src/features/knowledge/types/knowledge.ts
index 007a4034..bbb360ea 100644
--- a/archon-ui-main/src/features/knowledge/types/knowledge.ts
+++ b/archon-ui-main/src/features/knowledge/types/knowledge.ts
@@ -133,10 +133,69 @@ export interface KnowledgeItemsFilter {
   per_page?: number;
 }
 
+/**
+ * Advanced crawler configuration for domain and URL pattern filtering.
+ *
+ * ## Precedence Rules (highest to lowest priority):
+ * 1. **excluded_domains** - Always blocks, takes highest priority
+ * 2. **allowed_domains** - If specified, only these domains are crawled
+ * 3. **exclude_patterns** - Blocks matching URL patterns
+ * 4. **include_patterns** - If specified, only matching patterns are crawled
+ *
+ * ## Pattern Syntax:
+ * - **Domain patterns**: Support wildcards (*.example.com) and exact matches
+ * - **URL patterns**: Use glob syntax with fnmatch (*, ?, [seq], [!seq])
+ *
+ * ## Common Examples:
+ * ```typescript
+ * // Crawl only the docs subdomain, excluding API references
+ * {
+ *   allowed_domains: ["docs.example.com"],
+ *   exclude_patterns: ["*/api-reference/*", "*/deprecated/*"]
+ * }
+ *
+ * // Crawl all subdomains except blog, only documentation paths
+ * {
+ *   allowed_domains: ["*.example.com"],
+ *   excluded_domains: ["blog.example.com"],
+ *   include_patterns: ["*/docs/*", "*/guide/*", "*/tutorial/*"]
+ * }
+ *
+ * // Block specific file types across all domains
+ * {
+ *   exclude_patterns: ["*.pdf", "*.zip", "*/downloads/*"]
+ * }
+ * ```
+ */
 export interface CrawlConfig {
+  /**
+   * Whitelist of domains to crawl. Supports exact matches and wildcards.
+   * Examples: ["docs.example.com", "*.example.com", "api.example.com"]
+   * If specified, ONLY these domains will be crawled (unless blocked by excluded_domains).
+   */
   allowed_domains?: string[];
+
+  /**
+   * Blacklist of domains to never crawl. Takes precedence over allowed_domains.
+   * Examples: ["blog.example.com", "*.internal.example.com"]
+   * These domains are ALWAYS blocked, even if they match allowed_domains.
+   */
   excluded_domains?: string[];
+
+  /**
+   * URL patterns that must match for pages to be crawled. Uses glob syntax.
+   * Examples: ["*/docs/*", "*/api/v2/*", "*tutorial*"]
+   * If specified, ONLY URLs matching at least one pattern will be crawled.
+   * Patterns are matched against the full URL.
+   */
   include_patterns?: string[];
+
+  /**
+   * URL patterns to exclude from crawling. Uses glob syntax. Takes precedence over include_patterns.
+   * Examples: ["*/admin/*", "*.pdf", "*/temp/*", "*test*"]
+   * URLs matching these patterns are ALWAYS blocked.
+   * Patterns are matched against the full URL.
+   */
   exclude_patterns?: string[];
 }
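The JSDoc above documents the filtering semantics, but the crawler's matching logic itself is not part of this patch (the commit message notes the backend uses fnmatch for URL patterns). As a rough TypeScript illustration of the documented precedence order only, a checker might look like the sketch below; `globToRegExp` and `shouldCrawl` are hypothetical names, and the glob translation is an approximation, not the backend's actual implementation:

```typescript
import type { CrawlConfig } from "./knowledge";

// Hypothetical glob-to-RegExp conversion approximating fnmatch-style
// matching: * matches any run of characters, ? matches one character.
function globToRegExp(pattern: string): RegExp {
  const escaped = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&");
  return new RegExp(`^${escaped.replace(/\*/g, ".*").replace(/\?/g, ".")}$`);
}

function matchesAny(value: string, patterns?: string[]): boolean {
  return (patterns ?? []).some((p) => globToRegExp(p).test(value));
}

// Illustrative checker applying the documented precedence:
// excluded_domains > allowed_domains > exclude_patterns > include_patterns.
function shouldCrawl(url: string, config: CrawlConfig): boolean {
  const domain = new URL(url).hostname;

  // 1. excluded_domains always blocks, even if allowed_domains matches.
  if (matchesAny(domain, config.excluded_domains)) return false;

  // 2. allowed_domains, when present, acts as a whitelist.
  if (config.allowed_domains?.length && !matchesAny(domain, config.allowed_domains)) {
    return false;
  }

  // 3. exclude_patterns blocks matching URLs.
  if (matchesAny(url, config.exclude_patterns)) return false;

  // 4. include_patterns, when present, must match at least once.
  if (config.include_patterns?.length && !matchesAny(url, config.include_patterns)) {
    return false;
  }

  return true;
}

// shouldCrawl("https://blog.example.com/docs/intro", {
//   allowed_domains: ["*.example.com"],
//   excluded_domains: ["blog.example.com"],
// }) -> false: the domain blacklist wins over the wildcard whitelist.
```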