diff --git a/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx b/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx index f6c7bc2a..db526faf 100644 --- a/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx +++ b/archon-ui-main/src/features/knowledge/components/AddKnowledgeDialog.tsx @@ -10,8 +10,9 @@ import { Button, Input, Label } from "../../ui/primitives"; import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "../../ui/primitives/dialog"; import { cn } from "../../ui/primitives/styles"; import { Tabs, TabsContent } from "../../ui/primitives/tabs"; -import { useCrawlUrl, useUploadDocument } from "../hooks"; -import type { CrawlRequest, UploadMetadata } from "../types"; +import { useCrawlUrl, useCrawlUrlV2, useUploadDocument } from "../hooks"; +import type { CrawlConfig, CrawlRequest, CrawlRequestV2, UploadMetadata } from "../types"; +import { AdvancedCrawlConfig } from "./AdvancedCrawlConfig"; import { KnowledgeTypeSelector } from "./KnowledgeTypeSelector"; import { LevelSelector } from "./LevelSelector"; import { TagInput } from "./TagInput"; @@ -32,6 +33,7 @@ export const AddKnowledgeDialog: React.FC = ({ const [activeTab, setActiveTab] = useState<"crawl" | "upload">("crawl"); const { showToast } = useToast(); const crawlMutation = useCrawlUrl(); + const crawlV2Mutation = useCrawlUrlV2(); const uploadMutation = useUploadDocument(); // Generate unique IDs for form elements @@ -43,6 +45,7 @@ export const AddKnowledgeDialog: React.FC = ({ const [crawlType, setCrawlType] = useState<"technical" | "business">("technical"); const [maxDepth, setMaxDepth] = useState("2"); const [tags, setTags] = useState([]); + const [crawlConfig, setCrawlConfig] = useState({}); // Upload form state const [selectedFile, setSelectedFile] = useState(null); @@ -54,6 +57,7 @@ export const AddKnowledgeDialog: React.FC = ({ setCrawlType("technical"); setMaxDepth("2"); setTags([]); + 
setCrawlConfig({}); setSelectedFile(null); setUploadType("technical"); setUploadTags([]); @@ -66,21 +70,42 @@ export const AddKnowledgeDialog: React.FC = ({ } try { - const request: CrawlRequest = { - url: crawlUrl, - knowledge_type: crawlType, - max_depth: parseInt(maxDepth, 10), - tags: tags.length > 0 ? tags : undefined, - }; + // Check if we have any domain filtering configuration + const hasCrawlConfig = + (crawlConfig.allowed_domains && crawlConfig.allowed_domains.length > 0) || + (crawlConfig.excluded_domains && crawlConfig.excluded_domains.length > 0) || + (crawlConfig.include_patterns && crawlConfig.include_patterns.length > 0) || + (crawlConfig.exclude_patterns && crawlConfig.exclude_patterns.length > 0); - const response = await crawlMutation.mutateAsync(request); + let response; + + if (hasCrawlConfig) { + // Use v2 endpoint with domain filtering + const requestV2: CrawlRequestV2 = { + url: crawlUrl, + knowledge_type: crawlType, + max_depth: parseInt(maxDepth, 10), + tags: tags.length > 0 ? tags : undefined, + crawl_config: crawlConfig, + }; + response = await crawlV2Mutation.mutateAsync(requestV2); + } else { + // Use regular endpoint + const request: CrawlRequest = { + url: crawlUrl, + knowledge_type: crawlType, + max_depth: parseInt(maxDepth, 10), + tags: tags.length > 0 ? tags : undefined, + }; + response = await crawlMutation.mutateAsync(request); + } // Notify parent about the new crawl operation if (response?.progressId && onCrawlStarted) { onCrawlStarted(response.progressId); } - showToast("Crawl started successfully", "success"); + showToast(hasCrawlConfig ? 
"Crawl started with domain filtering" : "Crawl started successfully", "success"); resetForm(); onSuccess(); onOpenChange(false); @@ -123,19 +148,19 @@ export const AddKnowledgeDialog: React.FC = ({ } }; - const isProcessing = crawlMutation.isPending || uploadMutation.isPending; + const isProcessing = crawlMutation.isPending || crawlV2Mutation.isPending || uploadMutation.isPending; return ( - - + + Add Knowledge Crawl websites or upload documents to expand your knowledge base. - setActiveTab(v as "crawl" | "upload")}> + setActiveTab(v as "crawl" | "upload")} className="flex-1 flex flex-col min-h-0"> {/* Enhanced Tab Buttons */} -
+
{/* Crawl Website Tab */}
{/* Crawl Tab */} - + +
+
{/* Enhanced URL Input Section */}
+ {/* Advanced Configuration - positioned directly below URL */} + +
@@ -233,7 +270,7 @@ export const AddKnowledgeDialog: React.FC = ({ disabled={isProcessing || !crawlUrl} className="w-full bg-gradient-to-r from-cyan-500 to-cyan-600 hover:from-cyan-600 hover:to-cyan-700 backdrop-blur-md border border-cyan-400/50 shadow-[0_0_20px_rgba(6,182,212,0.25)] hover:shadow-[0_0_30px_rgba(6,182,212,0.35)] transition-all duration-200" > - {crawlMutation.isPending ? ( + {(crawlMutation.isPending || crawlV2Mutation.isPending) ? ( <> Starting Crawl... @@ -245,10 +282,21 @@ export const AddKnowledgeDialog: React.FC = ({ )} +
+
{/* Upload Tab */} - + +
+
{/* Enhanced File Input Section */}
+
diff --git a/archon-ui-main/src/features/knowledge/components/AdvancedCrawlConfig.tsx b/archon-ui-main/src/features/knowledge/components/AdvancedCrawlConfig.tsx
new file mode 100644
index 00000000..afd7d77a
--- /dev/null
+++ b/archon-ui-main/src/features/knowledge/components/AdvancedCrawlConfig.tsx
@@ -0,0 +1,325 @@
+/**
+ * Advanced Crawl Configuration Component
+ * Provides UI for configuring domain filtering and URL patterns
+ */
+
+import { ChevronDown, Info, Plus, X } from "lucide-react";
+import React, { useState } from "react";
+import type { CrawlConfig } from "../types";
+
+interface Props {
+  config: CrawlConfig;
+  onChange: (config: CrawlConfig) => void;
+}
+
+/**
+ * Controlled editor for a crawl's domain lists and URL patterns.
+ * Keeps only input/tab/expansion state locally; the add/remove handlers report each list change via `onChange`.
+ */
+export const AdvancedCrawlConfig: React.FC<Props> = ({ config, onChange }) => {
+  const [isExpanded, setIsExpanded] = useState(false);
+  const [newDomain, setNewDomain] = useState("");
+  const [newPattern, setNewPattern] = useState("");
+  const [activeTab, setActiveTab] = useState<"allowed" | "excluded">("allowed");
+  const [patternTab, setPatternTab] = useState<"include" | "exclude">("include");
+
+  // Adds the typed domain to the allowed/excluded list, ignoring duplicates, then clears the input.
+  const handleAddDomain = (type: "allowed" | "excluded") => {
+    // Reduce pasted URLs to a bare host: drop the scheme, then any path, query or fragment.
+    const domain = newDomain
+      .trim()
+      .toLowerCase()
+      .replace(/^https?:\/\//, "")
+      .replace(/[/?#].*$/, "");
+    // Guard on the normalized value so input such as "https://" cannot add an empty entry.
+    if (!domain) return;
+
+    const key = `${type}_domains` as keyof CrawlConfig;
+    const current = config[key] || [];
+
+    if (!current.includes(domain)) {
+      onChange({
+        ...config,
+        [key]: [...current, domain],
+      });
+    }
+
+    setNewDomain("");
+  };
+
+  // Removes one domain from the allowed/excluded list.
+  const handleRemoveDomain = (type: "allowed" | "excluded", domain: string) => {
+    const key = `${type}_domains` as keyof CrawlConfig;
+    onChange({
+      ...config,
+      [key]: (config[key] || []).filter(d => d !== domain),
+    });
+  };
+
+  // Adds the typed glob pattern to the include/exclude list, ignoring duplicates, then clears the input.
+  const handleAddPattern = (type: "include" | "exclude") => {
+    // Store the trimmed pattern; stray whitespace would otherwise be saved in the glob and defeat the duplicate check.
+    const pattern = newPattern.trim();
+    if (!pattern) return;
+
+    const key = `${type}_patterns` as keyof CrawlConfig;
+    const current = config[key] || [];
+
+    if (!current.includes(pattern)) {
+      onChange({
+        ...config,
+        [key]: [...current, pattern],
+      });
+    }
+
+    setNewPattern("");
+  };
+
+  // Removes one pattern from the include/exclude list.
+  const handleRemovePattern = (type: "include" | "exclude", pattern: string) => {
+    const key = `${type}_patterns` as keyof CrawlConfig;
+    onChange({
+      ...config,
+      [key]: (config[key] || []).filter(p => p !== pattern),
+    });
+  };
+
+  // Truthy when at least one of the four filter lists is non-empty.
+  const hasAnyConfig =
+    (config.allowed_domains && config.allowed_domains.length > 0) ||
+    (config.excluded_domains && config.excluded_domains.length > 0) ||
+    (config.include_patterns && config.include_patterns.length > 0) ||
+    (config.exclude_patterns && config.exclude_patterns.length > 0);
+
+  return (
+
+ + + {isExpanded && ( +
+ {/* Domain Filters Section */} +
+
+

Domain Filters

+
+ +
+ Control which domains are crawled. Blacklist takes priority over whitelist. +
+
+
+ + {/* Domain Tabs */} +
+ + +
+ + {/* Domain Input */} +
+ setNewDomain(e.target.value)} + onKeyDown={(e) => { + if (e.key === "Enter") { + handleAddDomain(activeTab); + } + }} + placeholder={`Add ${activeTab} domain (e.g., docs.example.com)`} + className="flex-1 px-3 py-2 bg-gray-800 border border-gray-700 rounded text-sm text-gray-200 + placeholder-gray-500 focus:outline-none focus:border-blue-500 transition-colors" + /> + +
+ + {/* Domain List */} +
+ {activeTab === "allowed" && config.allowed_domains?.map(domain => ( +
+ {domain} + +
+ ))} + {activeTab === "excluded" && config.excluded_domains?.map(domain => ( +
+ {domain} + +
+ ))} +
+
+ + {/* URL Patterns Section */} +
+
+

URL Patterns

+
+ +
+ Use glob patterns to filter URLs. Example: */docs/* or *.pdf +
+
+
+ + {/* Pattern Tabs */} +
+ + +
+ + {/* Pattern Input */} +
+ setNewPattern(e.target.value)} + onKeyDown={(e) => { + if (e.key === "Enter") { + handleAddPattern(patternTab); + } + }} + placeholder={`Add ${patternTab} pattern (e.g., */api/* or *.pdf)`} + className="flex-1 px-3 py-2 bg-gray-800 border border-gray-700 rounded text-sm text-gray-200 + placeholder-gray-500 focus:outline-none focus:border-blue-500 transition-colors" + /> + +
+ + {/* Pattern List */} +
+ {patternTab === "include" && config.include_patterns?.map(pattern => ( +
+ {pattern} + +
+ ))} + {patternTab === "exclude" && config.exclude_patterns?.map(pattern => ( +
+ {pattern} + +
+ ))} +
+
+ + {/* Clear All Button */} + {hasAnyConfig && ( + + )} +
+ )} +
+ ); +}; \ No newline at end of file diff --git a/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx b/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx index 6da79b18..306ee0a5 100644 --- a/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx +++ b/archon-ui-main/src/features/knowledge/components/DocumentBrowser.tsx @@ -3,13 +3,14 @@ * Shows document chunks and code examples for a knowledge item */ -import { ChevronDown, ChevronRight, Code, FileText, Search } from "lucide-react"; -import { useState } from "react"; +import { ChevronDown, ChevronRight, Code, ExternalLink, FileText, Globe, Search, X } from "lucide-react"; +import { useMemo, useState } from "react"; import { Input } from "../../ui/primitives"; import { Dialog, DialogContent, DialogHeader, DialogTitle } from "../../ui/primitives/dialog"; import { cn } from "../../ui/primitives/styles"; import { Tabs, TabsContent, TabsList, TabsTrigger } from "../../ui/primitives/tabs"; -import { useCodeExamples, useKnowledgeItemChunks } from "../hooks"; +import { useCodeExamples, useKnowledgeItem, useKnowledgeItemChunks } from "../hooks"; +import { extractDomain } from "../utils/knowledge-utils"; interface DocumentBrowserProps { sourceId: string; @@ -21,7 +22,9 @@ export const DocumentBrowser: React.FC = ({ sourceId, open const [activeTab, setActiveTab] = useState<"documents" | "code">("documents"); const [searchQuery, setSearchQuery] = useState(""); const [expandedChunks, setExpandedChunks] = useState>(new Set()); + const [selectedDomains, setSelectedDomains] = useState>(new Set()); + const { data: sourceItem } = useKnowledgeItem(sourceId); const { data: chunksData, isLoading: chunksLoading, @@ -33,12 +36,36 @@ export const DocumentBrowser: React.FC = ({ sourceId, open const chunks = chunksData?.chunks || []; const codeExamples = codeData?.code_examples || []; - // Filter chunks based on search - const filteredChunks = chunks.filter( - (chunk) => + // Extract unique 
domains from chunks + const domainStats = useMemo(() => { + const stats = new Map(); + chunks.forEach((chunk) => { + const url = chunk.url || chunk.metadata?.url; + if (url) { + const domain = extractDomain(url); + stats.set(domain, (stats.get(domain) || 0) + 1); + } + }); + + return Array.from(stats.entries()) + .sort((a, b) => b[1] - a[1]) // Sort by count descending + .map(([domain, count]) => ({ domain, count })); + }, [chunks]); + + // Filter chunks based on search and domain + const filteredChunks = chunks.filter((chunk) => { + // Search filter + const matchesSearch = + !searchQuery || chunk.content.toLowerCase().includes(searchQuery.toLowerCase()) || - chunk.metadata?.title?.toLowerCase().includes(searchQuery.toLowerCase()), - ); + chunk.metadata?.title?.toLowerCase().includes(searchQuery.toLowerCase()); + + // Domain filter + const url = chunk.url || chunk.metadata?.url; + const matchesDomain = selectedDomains.size === 0 || (url && selectedDomains.has(extractDomain(url))); + + return matchesSearch && matchesDomain; + }); // Filter code examples based on search const filteredCode = codeExamples.filter((example) => { @@ -66,9 +93,30 @@ export const DocumentBrowser: React.FC = ({ sourceId, open - Document Browser -
-
+ +
+ Document Browser + {chunksData && ( + + ({chunks.length} documents from {domainStats.length} domain{domainStats.length !== 1 ? "s" : ""}) + + )} +
+ {sourceItem && sourceItem.url && ( + + + View Source + + )} +
+
+ {/* Search Bar */} +
= ({ sourceId, open className="pl-10 bg-black/30 border-white/10 focus:border-cyan-500/50" />
+ + {/* Domain Filter */} + {domainStats.length > 0 && ( +
+
+ + Domain Filter + {selectedDomains.size > 0 && ( + + )} +
+
+ {domainStats.map(({ domain, count }) => { + const isSelected = selectedDomains.has(domain); + return ( + + ); + })} +
+
+ )}
@@ -123,8 +226,9 @@ export const DocumentBrowser: React.FC = ({ sourceId, open key={chunk.id} className="bg-black/30 rounded-lg border border-white/10 p-4 hover:border-cyan-500/30 transition-colors" > - {chunk.metadata?.title && ( -

+
+ {chunk.metadata?.title && ( +

{needsExpansion && (

- )} + )} + {(chunk.url || chunk.metadata?.url) && ( + + {extractDomain(chunk.url || chunk.metadata?.url || "")} + + + )} +
{isExpanded || !needsExpansion ? ( diff --git a/archon-ui-main/src/features/knowledge/components/index.ts b/archon-ui-main/src/features/knowledge/components/index.ts index e9174d5b..31732139 100644 --- a/archon-ui-main/src/features/knowledge/components/index.ts +++ b/archon-ui-main/src/features/knowledge/components/index.ts @@ -1,4 +1,5 @@ export * from "./AddKnowledgeDialog"; +export * from "./AdvancedCrawlConfig"; export * from "./DocumentBrowser"; export * from "./KnowledgeCard"; export * from "./KnowledgeList"; diff --git a/archon-ui-main/src/features/knowledge/hooks/useKnowledgeQueries.ts b/archon-ui-main/src/features/knowledge/hooks/useKnowledgeQueries.ts index 874499e2..020e9626 100644 --- a/archon-ui-main/src/features/knowledge/hooks/useKnowledgeQueries.ts +++ b/archon-ui-main/src/features/knowledge/hooks/useKnowledgeQueries.ts @@ -15,6 +15,7 @@ import { useToast } from "../../ui/hooks/useToast"; import { knowledgeService } from "../services"; import type { CrawlRequest, + CrawlRequestV2, CrawlStartResponse, KnowledgeItem, KnowledgeItemsFilter, @@ -298,6 +299,181 @@ export function useCrawlUrl() { }); } +/** + * Crawl URL mutation with domain filtering (v2) with optimistic updates + * Returns the progressId that can be used to track crawl progress + */ +export function useCrawlUrlV2() { + const queryClient = useQueryClient(); + const { showToast } = useToast(); + + return useMutation< + CrawlStartResponse, + Error, + CrawlRequestV2, + { + previousKnowledge?: KnowledgeItem[]; + previousSummaries?: Array<[readonly unknown[], KnowledgeItemsResponse | undefined]>; + previousOperations?: ActiveOperationsResponse; + tempProgressId: string; + tempItemId: string; + } + >({ + mutationFn: (request: CrawlRequestV2) => knowledgeService.crawlUrlV2(request), + onMutate: async (request) => { + // Cancel any outgoing refetches to prevent race conditions + await queryClient.cancelQueries({ queryKey: knowledgeKeys.summariesPrefix() }); + await queryClient.cancelQueries({ 
queryKey: progressKeys.active() }); + + // Snapshot the previous values for rollback + const previousSummaries = queryClient.getQueriesData({ + queryKey: knowledgeKeys.summariesPrefix(), + }); + const previousOperations = queryClient.getQueryData(progressKeys.active()); + + // Generate temporary progress ID and optimistic entity + const tempProgressId = createOptimisticId(); + const optimisticItem = createOptimisticEntity({ + title: (() => { + try { + return new URL(request.url).hostname || "New crawl"; + } catch { + return "New crawl"; + } + })(), + url: request.url, + source_id: tempProgressId, + source_type: "url", + knowledge_type: request.knowledge_type || "technical", + status: "processing", + document_count: 0, + code_examples_count: 0, + metadata: { + knowledge_type: request.knowledge_type || "technical", + tags: request.tags || [], + source_type: "url", + status: "processing", + description: `Crawling ${request.url} with domain filters`, + crawl_config: request.crawl_config, + }, + created_at: new Date().toISOString(), + updated_at: new Date().toISOString(), + } as Omit); + const tempItemId = optimisticItem.id; + + // Update all summaries caches with optimistic data + const entries = queryClient.getQueriesData({ + queryKey: knowledgeKeys.summariesPrefix(), + }); + for (const [qk, old] of entries) { + const filter = qk[qk.length - 1] as KnowledgeItemsFilter | undefined; + const matchesType = !filter?.knowledge_type || optimisticItem.knowledge_type === filter.knowledge_type; + const matchesTags = + !filter?.tags || filter.tags.every((t) => (optimisticItem.metadata?.tags ?? []).includes(t)); + if (!(matchesType && matchesTags)) continue; + if (!old) { + queryClient.setQueryData(qk, { + items: [optimisticItem], + total: 1, + page: 1, + per_page: 100, + }); + } else { + queryClient.setQueryData(qk, { + ...old, + items: [optimisticItem, ...old.items], + total: (old.total ?? 
old.items.length) + 1, + }); + } + } + + // Add optimistic progress entry + if (!previousOperations) { + queryClient.setQueryData(progressKeys.active(), { + operations: [ + { + operation_id: tempProgressId, + operation_type: "crawl", + status: "starting", + progress: 0, + message: `Starting crawl of ${request.url} with domain filtering`, + started_at: new Date().toISOString(), + progressId: tempProgressId, + } as ActiveOperation, + ], + }); + } else { + queryClient.setQueryData(progressKeys.active(), { + operations: [ + { + operation_id: tempProgressId, + operation_type: "crawl", + status: "starting", + progress: 0, + message: `Starting crawl of ${request.url} with domain filtering`, + started_at: new Date().toISOString(), + progressId: tempProgressId, + } as ActiveOperation, + ...(previousOperations.operations || []), + ], + }); + } + + return { previousSummaries, previousOperations, tempProgressId, tempItemId }; + }, + onSuccess: async (response, _variables, context) => { + // Show success message + showToast("Crawl started with domain filtering", "success"); + + // Update the temporary progress ID with the real one + if (context) { + const activeOps = queryClient.getQueryData(progressKeys.active()); + if (activeOps) { + const updated = { + operations: activeOps.operations.map((op) => + op.progressId === context.tempProgressId ? { ...op, progressId: response.progressId } : op, + ), + }; + queryClient.setQueryData(progressKeys.active(), updated); + } + + // Update item in all summaries caches + const entries = queryClient.getQueriesData({ + queryKey: knowledgeKeys.summariesPrefix(), + }); + for (const [qk, data] of entries) { + if (data) { + const updated = { + ...data, + items: data.items.map((item) => + item.id === context.tempItemId ? 
{ ...item, source_id: response.progressId } : item, + ), + }; + queryClient.setQueryData(qk, updated); + } + } + } + + // Return the response so caller can access progressId + return response; + }, + onError: (error, _variables, context) => { + // Rollback optimistic updates on error + if (context?.previousSummaries) { + for (const [queryKey, data] of context.previousSummaries) { + queryClient.setQueryData(queryKey, data); + } + } + if (context?.previousOperations) { + queryClient.setQueryData(progressKeys.active(), context.previousOperations); + } + + const errorMessage = getProviderErrorMessage(error) || "Failed to start crawl with filters"; + showToast(errorMessage, "error"); + }, + }); +} + /** * Upload document mutation with optimistic updates */ diff --git a/archon-ui-main/src/features/knowledge/inspector/components/InspectorSidebar.tsx b/archon-ui-main/src/features/knowledge/inspector/components/InspectorSidebar.tsx index 09b9e441..da4f36aa 100644 --- a/archon-ui-main/src/features/knowledge/inspector/components/InspectorSidebar.tsx +++ b/archon-ui-main/src/features/knowledge/inspector/components/InspectorSidebar.tsx @@ -4,10 +4,12 @@ */ import { motion } from "framer-motion"; -import { Code, FileText, Hash, Loader2, Search } from "lucide-react"; +import { Code, FileText, Globe, Hash, Loader2, Search, X } from "lucide-react"; +import { useMemo } from "react"; import { Button, Input } from "../../../ui/primitives"; import { cn } from "../../../ui/primitives/styles"; import type { CodeExample, DocumentChunk } from "../../types"; +import { extractDomain } from "../../utils/knowledge-utils"; interface InspectorSidebarProps { viewMode: "documents" | "code"; @@ -20,6 +22,8 @@ interface InspectorSidebarProps { hasNextPage: boolean; onLoadMore: () => void; isFetchingNextPage: boolean; + selectedDomains?: Set; + onDomainsChange?: (domains: Set) => void; } export const InspectorSidebar: React.FC = ({ @@ -33,7 +37,39 @@ export const InspectorSidebar: React.FC = ({ 
hasNextPage, onLoadMore, isFetchingNextPage, + selectedDomains = new Set(), + onDomainsChange, }) => { + // Extract unique domains from documents + const domainStats = useMemo(() => { + if (viewMode !== "documents") return []; + + const stats = new Map(); + (items as DocumentChunk[]).forEach((doc) => { + const url = doc.url || doc.metadata?.url; + if (url) { + const domain = extractDomain(url); + stats.set(domain, (stats.get(domain) || 0) + 1); + } + }); + + return Array.from(stats.entries()) + .sort((a, b) => b[1] - a[1]) + .map(([domain, count]) => ({ domain, count })); + }, [items, viewMode]); + + // Filter items by selected domains + const filteredItems = useMemo(() => { + if (viewMode !== "documents" || selectedDomains.size === 0) { + return items; + } + + return (items as DocumentChunk[]).filter((doc) => { + const url = doc.url || doc.metadata?.url; + if (!url) return false; + return selectedDomains.has(extractDomain(url)); + }); + }, [items, selectedDomains, viewMode]); const getItemTitle = (item: DocumentChunk | CodeExample) => { const idSuffix = String(item.id).slice(-6); if (viewMode === "documents") { @@ -62,8 +98,9 @@ export const InspectorSidebar: React.FC = ({ return (