Mirror of https://github.com/coleam00/Archon.git
docs: Add comprehensive JSDoc documentation for CrawlConfig
Added detailed documentation for the CrawlConfig interface, including:

## Documentation improvements:
- Clear precedence rules (excluded_domains > allowed_domains > exclude_patterns > include_patterns)
- Pattern syntax explanation (glob patterns with fnmatch for URLs, wildcards for domains)
- Comprehensive examples showing common use cases:
  - Single subdomain with path exclusions
  - Multiple subdomains with specific exclusions
  - File type and directory blocking
- Individual property documentation with examples

## Code improvements:
- Refactored DocumentBrowser to avoid repeated URL/domain computation
- Extracted resolvedUrl and resolvedDomain once as constants
- Improved readability and performance

This documentation helps developers understand:
- How conflicting rules are resolved (the blacklist always wins)
- What pattern syntax to use (glob patterns)
- How to compose allow/deny lists effectively
```diff
@@ -245,18 +245,26 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
                 {chunk.metadata.title}
               </h4>
             )}
-            {(chunk.url || chunk.metadata?.url) && (
-              <a
-                href={chunk.url || chunk.metadata?.url}
-                target="_blank"
-                rel="noopener noreferrer"
-                className="text-[10px] px-2 py-1 rounded bg-white/5 text-gray-500 hover:text-cyan-400 hover:bg-cyan-500/10 font-mono shrink-0 transition-colors flex items-center gap-1"
-                title={`View on ${extractDomain(chunk.url || chunk.metadata?.url || "")}`}
-              >
-                {extractDomain(chunk.url || chunk.metadata?.url || "")}
-                <ExternalLink className="w-3 h-3" />
-              </a>
-            )}
+            {(() => {
+              // Extract URL and domain once to avoid repeated computation
+              const resolvedUrl = chunk.url || chunk.metadata?.url;
+              if (!resolvedUrl) return null;
+
+              const resolvedDomain = extractDomain(resolvedUrl);
+
+              return (
+                <a
+                  href={resolvedUrl}
+                  target="_blank"
+                  rel="noopener noreferrer"
+                  className="text-[10px] px-2 py-1 rounded bg-white/5 text-gray-500 hover:text-cyan-400 hover:bg-cyan-500/10 font-mono shrink-0 transition-colors flex items-center gap-1"
+                  title={`View on ${resolvedDomain}`}
+                >
+                  {resolvedDomain}
+                  <ExternalLink className="w-3 h-3" />
+                </a>
+              );
+            })()}
           </div>
 
           <div className="text-sm text-gray-300 whitespace-pre-wrap">
```
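The refactored block above calls an `extractDomain` helper defined elsewhere in the module and not shown in this hunk. As a rough sketch of what such a helper might look like (a hypothetical reconstruction, assuming it only needs a display-friendly hostname; the repository's actual implementation may differ):

```typescript
// Hypothetical sketch; the real extractDomain helper in this module may differ.
// Pulls a display-friendly hostname out of a URL string, falling back to the
// raw input when it is not a parseable absolute URL.
function extractDomain(url: string): string {
  try {
    return new URL(url).hostname; // "https://docs.example.com/guide" -> "docs.example.com"
  } catch {
    return url; // not an absolute URL; show the raw string rather than throw
  }
}
```

A fallback of some kind is worth having here because `chunk.metadata?.url` comes from crawled data and is not guaranteed to be a well-formed absolute URL.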
````diff
@@ -133,10 +133,69 @@ export interface KnowledgeItemsFilter {
   per_page?: number;
 }
 
+/**
+ * Advanced crawler configuration for domain and URL pattern filtering.
+ *
+ * ## Precedence Rules (highest to lowest priority):
+ * 1. **excluded_domains** - Always blocks, takes highest priority
+ * 2. **allowed_domains** - If specified, only these domains are crawled
+ * 3. **exclude_patterns** - Blocks matching URL patterns
+ * 4. **include_patterns** - If specified, only matching patterns are crawled
+ *
+ * ## Pattern Syntax:
+ * - **Domain patterns**: Support wildcards (*.example.com) and exact matches
+ * - **URL patterns**: Use glob syntax with fnmatch (*, ?, [seq], [!seq])
+ *
+ * ## Common Examples:
+ * ```typescript
+ * // Crawl only the docs subdomain, excluding API references
+ * {
+ *   allowed_domains: ["docs.example.com"],
+ *   exclude_patterns: ["*/api-reference/*", "*/deprecated/*"]
+ * }
+ *
+ * // Crawl all subdomains except blog, only documentation paths
+ * {
+ *   allowed_domains: ["*.example.com"],
+ *   excluded_domains: ["blog.example.com"],
+ *   include_patterns: ["*/docs/*", "*/guide/*", "*/tutorial/*"]
+ * }
+ *
+ * // Block specific file types across all domains
+ * {
+ *   exclude_patterns: ["*.pdf", "*.zip", "*/downloads/*"]
+ * }
+ * ```
+ */
+export interface CrawlConfig {
+  /**
+   * Whitelist of domains to crawl. Supports exact matches and wildcards.
+   * Examples: ["docs.example.com", "*.example.com", "api.example.com"]
+   * If specified, ONLY these domains will be crawled (unless blocked by excluded_domains).
+   */
+  allowed_domains?: string[];
+
+  /**
+   * Blacklist of domains to never crawl. Takes precedence over allowed_domains.
+   * Examples: ["blog.example.com", "*.internal.example.com"]
+   * These domains are ALWAYS blocked, even if they match allowed_domains.
+   */
+  excluded_domains?: string[];
+
+  /**
+   * URL patterns that must match for pages to be crawled. Uses glob syntax.
+   * Examples: ["*/docs/*", "*/api/v2/*", "*tutorial*"]
+   * If specified, ONLY URLs matching at least one pattern will be crawled.
+   * Patterns are matched against the full URL.
+   */
+  include_patterns?: string[];
+
+  /**
+   * URL patterns to exclude from crawling. Uses glob syntax. Takes precedence over include_patterns.
+   * Examples: ["*/admin/*", "*.pdf", "*/temp/*", "*test*"]
+   * URLs matching these patterns are ALWAYS blocked.
+   * Patterns are matched against the full URL.
+   */
+  exclude_patterns?: string[];
+}
````
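To make the documented precedence order concrete, here is a minimal sketch of how a crawler might evaluate a URL against a `CrawlConfig`. Both `shouldCrawl` and the `globMatch` helper are hypothetical illustrations, not the crawler's actual implementation, and `globMatch` handles only the `*` and `?` wildcards (the `[seq]`/`[!seq]` classes supported by fnmatch are omitted for brevity):

```typescript
// Hypothetical sketch of the documented precedence order; not the real crawler code.

// Translate a glob pattern into a RegExp. Handles * and ? only; the [seq] and
// [!seq] character classes from fnmatch are omitted to keep the sketch short.
function globMatch(pattern: string, value: string): boolean {
  const source = pattern
    .replace(/[.+^${}()|[\]\\]/g, "\\$&") // escape regex metacharacters
    .replace(/\*/g, ".*") // * matches any run of characters
    .replace(/\?/g, "."); // ? matches exactly one character
  return new RegExp(`^${source}$`).test(value);
}

// Apply the four rules in the documented priority order. Assumes `url` is absolute.
function shouldCrawl(url: string, config: CrawlConfig): boolean {
  const domain = new URL(url).hostname;

  // 1. excluded_domains: always blocks, regardless of every other list
  if (config.excluded_domains?.some((d) => globMatch(d, domain))) return false;

  // 2. allowed_domains: if present, acts as an exclusive whitelist
  if (config.allowed_domains && !config.allowed_domains.some((d) => globMatch(d, domain))) {
    return false;
  }

  // 3. exclude_patterns: blocks matching URLs even when include_patterns also match
  if (config.exclude_patterns?.some((p) => globMatch(p, url))) return false;

  // 4. include_patterns: if present, at least one pattern must match
  if (config.include_patterns && !config.include_patterns.some((p) => globMatch(p, url))) {
    return false;
  }

  return true;
}
```

Under this sketch, `shouldCrawl("https://docs.example.com/guide/intro", { allowed_domains: ["*.example.com"], exclude_patterns: ["*.pdf"] })` returns `true`, while any `excluded_domains` hit short-circuits to `false` before the other lists are consulted.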