Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-24 02:39:17 -05:00)
feat: Add advanced web crawling with domain filtering
- Implement domain filtering for web crawler with whitelist/blacklist support
- Add URL pattern matching (glob-style) for include/exclude patterns
- Create AdvancedCrawlConfig UI component with collapsible panel
- Add domain filter to Knowledge Inspector sidebar for easy filtering
- Implement crawl-v2 API endpoint with backward compatibility
- Add comprehensive unit tests for domain filtering logic

Implements priority-based filtering:
1. Blacklist (excluded_domains) - highest priority
2. Whitelist (allowed_domains) - must match if provided
3. Exclude patterns - glob patterns to exclude
4. Include patterns - glob patterns to include

UI improvements:
- Advanced configuration section in Add Knowledge dialog
- Domain pills in Inspector sidebar showing document distribution
- Visual domain indicators on each document
- Responsive domain filtering with document counts

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
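The priority order above is the core of the filtering behavior. Below is a minimal standalone sketch of that decision order, for illustration only; the actual implementation is DomainFilter.is_url_allowed in python/src/server/services/crawling/domain_filter.py later in this diff, and the config keys match CrawlConfig.

```python
import fnmatch


def url_passes(domain: str, url: str, cfg: dict) -> bool:
    """Illustrative sketch of the priority order: blacklist, whitelist, exclude, include."""
    # 1. Blacklist (excluded_domains): highest priority, always blocks.
    if domain in cfg.get("excluded_domains", []):
        return False
    # 2. Whitelist (allowed_domains): if provided, the domain must match.
    allowed = cfg.get("allowed_domains", [])
    if allowed and domain not in allowed:
        return False
    # 3. Exclude patterns: glob patterns that block matching URLs.
    if any(fnmatch.fnmatch(url, p) for p in cfg.get("exclude_patterns", [])):
        return False
    # 4. Include patterns: if provided, the URL must match at least one.
    include = cfg.get("include_patterns", [])
    if include and not any(fnmatch.fnmatch(url, p) for p in include):
        return False
    return True
```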
@@ -10,8 +10,9 @@ import { Button, Input, Label } from "../../ui/primitives";
|
||||
import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "../../ui/primitives/dialog";
|
||||
import { cn } from "../../ui/primitives/styles";
|
||||
import { Tabs, TabsContent } from "../../ui/primitives/tabs";
|
||||
import { useCrawlUrl, useUploadDocument } from "../hooks";
|
||||
import type { CrawlRequest, UploadMetadata } from "../types";
|
||||
import { useCrawlUrl, useCrawlUrlV2, useUploadDocument } from "../hooks";
|
||||
import type { CrawlConfig, CrawlRequest, CrawlRequestV2, UploadMetadata } from "../types";
|
||||
import { AdvancedCrawlConfig } from "./AdvancedCrawlConfig";
|
||||
import { KnowledgeTypeSelector } from "./KnowledgeTypeSelector";
|
||||
import { LevelSelector } from "./LevelSelector";
|
||||
import { TagInput } from "./TagInput";
|
||||
@@ -32,6 +33,7 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
const [activeTab, setActiveTab] = useState<"crawl" | "upload">("crawl");
|
||||
const { showToast } = useToast();
|
||||
const crawlMutation = useCrawlUrl();
|
||||
const crawlV2Mutation = useCrawlUrlV2();
|
||||
const uploadMutation = useUploadDocument();
|
||||
|
||||
// Generate unique IDs for form elements
|
||||
@@ -43,6 +45,7 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
const [crawlType, setCrawlType] = useState<"technical" | "business">("technical");
|
||||
const [maxDepth, setMaxDepth] = useState("2");
|
||||
const [tags, setTags] = useState<string[]>([]);
|
||||
const [crawlConfig, setCrawlConfig] = useState<CrawlConfig>({});
|
||||
|
||||
// Upload form state
|
||||
const [selectedFile, setSelectedFile] = useState<File | null>(null);
|
||||
@@ -54,6 +57,7 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
setCrawlType("technical");
|
||||
setMaxDepth("2");
|
||||
setTags([]);
|
||||
setCrawlConfig({});
|
||||
setSelectedFile(null);
|
||||
setUploadType("technical");
|
||||
setUploadTags([]);
|
||||
@@ -66,21 +70,42 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
}
|
||||
|
||||
try {
|
||||
const request: CrawlRequest = {
|
||||
url: crawlUrl,
|
||||
knowledge_type: crawlType,
|
||||
max_depth: parseInt(maxDepth, 10),
|
||||
tags: tags.length > 0 ? tags : undefined,
|
||||
};
|
||||
// Check if we have any domain filtering configuration
|
||||
const hasCrawlConfig =
|
||||
(crawlConfig.allowed_domains && crawlConfig.allowed_domains.length > 0) ||
|
||||
(crawlConfig.excluded_domains && crawlConfig.excluded_domains.length > 0) ||
|
||||
(crawlConfig.include_patterns && crawlConfig.include_patterns.length > 0) ||
|
||||
(crawlConfig.exclude_patterns && crawlConfig.exclude_patterns.length > 0);
|
||||
|
||||
const response = await crawlMutation.mutateAsync(request);
|
||||
let response;
|
||||
|
||||
if (hasCrawlConfig) {
|
||||
// Use v2 endpoint with domain filtering
|
||||
const requestV2: CrawlRequestV2 = {
|
||||
url: crawlUrl,
|
||||
knowledge_type: crawlType,
|
||||
max_depth: parseInt(maxDepth, 10),
|
||||
tags: tags.length > 0 ? tags : undefined,
|
||||
crawl_config: crawlConfig,
|
||||
};
|
||||
response = await crawlV2Mutation.mutateAsync(requestV2);
|
||||
} else {
|
||||
// Use regular endpoint
|
||||
const request: CrawlRequest = {
|
||||
url: crawlUrl,
|
||||
knowledge_type: crawlType,
|
||||
max_depth: parseInt(maxDepth, 10),
|
||||
tags: tags.length > 0 ? tags : undefined,
|
||||
};
|
||||
response = await crawlMutation.mutateAsync(request);
|
||||
}
|
||||
|
||||
// Notify parent about the new crawl operation
|
||||
if (response?.progressId && onCrawlStarted) {
|
||||
onCrawlStarted(response.progressId);
|
||||
}
|
||||
|
||||
showToast("Crawl started successfully", "success");
|
||||
showToast(hasCrawlConfig ? "Crawl started with domain filtering" : "Crawl started successfully", "success");
|
||||
resetForm();
|
||||
onSuccess();
|
||||
onOpenChange(false);
|
||||
@@ -123,19 +148,19 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
}
|
||||
};
|
||||
|
||||
const isProcessing = crawlMutation.isPending || uploadMutation.isPending;
|
||||
const isProcessing = crawlMutation.isPending || crawlV2Mutation.isPending || uploadMutation.isPending;
|
||||
|
||||
return (
|
||||
<Dialog open={open} onOpenChange={onOpenChange}>
|
||||
<DialogContent className="sm:max-w-[600px]">
|
||||
<DialogHeader>
|
||||
<DialogContent className="sm:max-w-[600px]" style={{ maxHeight: "85vh", display: "flex", flexDirection: "column" }}>
|
||||
<DialogHeader className="flex-shrink-0">
|
||||
<DialogTitle>Add Knowledge</DialogTitle>
|
||||
<DialogDescription>Crawl websites or upload documents to expand your knowledge base.</DialogDescription>
|
||||
</DialogHeader>
|
||||
|
||||
<Tabs value={activeTab} onValueChange={(v) => setActiveTab(v as "crawl" | "upload")}>
|
||||
<Tabs value={activeTab} onValueChange={(v) => setActiveTab(v as "crawl" | "upload")} className="flex-1 flex flex-col min-h-0">
|
||||
{/* Enhanced Tab Buttons */}
|
||||
<div className="grid grid-cols-2 gap-3 p-2 rounded-xl backdrop-blur-md bg-gradient-to-b from-gray-100/30 via-gray-50/20 to-white/40 dark:from-gray-900/30 dark:via-gray-800/20 dark:to-black/40 border border-gray-200/40 dark:border-gray-700/40">
|
||||
<div className="grid grid-cols-2 gap-3 p-2 rounded-xl backdrop-blur-md bg-gradient-to-b from-gray-100/30 via-gray-50/20 to-white/40 dark:from-gray-900/30 dark:via-gray-800/20 dark:to-black/40 border border-gray-200/40 dark:border-gray-700/40 flex-shrink-0">
|
||||
{/* Crawl Website Tab */}
|
||||
<button
|
||||
type="button"
|
||||
@@ -190,7 +215,16 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
</div>
|
||||
|
||||
{/* Crawl Tab */}
|
||||
<TabsContent value="crawl" className="space-y-6 mt-6">
|
||||
<TabsContent value="crawl" className="mt-6 flex-1 min-h-0">
|
||||
<div
|
||||
className="overflow-y-auto overflow-x-hidden pr-2 scrollbar-thin scrollbar-thumb-gray-400 dark:scrollbar-thumb-gray-600 scrollbar-track-transparent"
|
||||
style={{
|
||||
maxHeight: "calc(85vh - 200px)",
|
||||
overflowY: "scroll",
|
||||
WebkitOverflowScrolling: "touch",
|
||||
scrollbarWidth: "thin"
|
||||
}}>
|
||||
<div className="space-y-6 pb-4">
|
||||
{/* Enhanced URL Input Section */}
|
||||
<div className="space-y-3">
|
||||
<Label htmlFor={urlId} className="text-sm font-medium text-gray-900 dark:text-white/90">
|
||||
@@ -215,6 +249,9 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{/* Advanced Configuration - positioned directly below URL */}
|
||||
<AdvancedCrawlConfig config={crawlConfig} onChange={setCrawlConfig} />
|
||||
|
||||
<div className="space-y-6">
|
||||
<KnowledgeTypeSelector value={crawlType} onValueChange={setCrawlType} disabled={isProcessing} />
|
||||
|
||||
@@ -233,7 +270,7 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
disabled={isProcessing || !crawlUrl}
|
||||
className="w-full bg-gradient-to-r from-cyan-500 to-cyan-600 hover:from-cyan-600 hover:to-cyan-700 backdrop-blur-md border border-cyan-400/50 shadow-[0_0_20px_rgba(6,182,212,0.25)] hover:shadow-[0_0_30px_rgba(6,182,212,0.35)] transition-all duration-200"
|
||||
>
|
||||
{crawlMutation.isPending ? (
|
||||
{(crawlMutation.isPending || crawlV2Mutation.isPending) ? (
|
||||
<>
|
||||
<Loader2 className="w-4 h-4 mr-2 animate-spin" />
|
||||
Starting Crawl...
|
||||
@@ -245,10 +282,21 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
</>
|
||||
)}
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
</TabsContent>
|
||||
|
||||
{/* Upload Tab */}
|
||||
<TabsContent value="upload" className="space-y-6 mt-6">
|
||||
<TabsContent value="upload" className="mt-6 flex-1 min-h-0">
|
||||
<div
|
||||
className="overflow-y-auto overflow-x-hidden pr-2 scrollbar-thin scrollbar-thumb-gray-400 dark:scrollbar-thumb-gray-600 scrollbar-track-transparent"
|
||||
style={{
|
||||
maxHeight: "calc(85vh - 200px)",
|
||||
overflowY: "scroll",
|
||||
WebkitOverflowScrolling: "touch",
|
||||
scrollbarWidth: "thin"
|
||||
}}>
|
||||
<div className="space-y-6 pb-4">
|
||||
{/* Enhanced File Input Section */}
|
||||
<div className="space-y-3">
|
||||
<Label htmlFor={fileId} className="text-sm font-medium text-gray-900 dark:text-white/90">
|
||||
@@ -326,6 +374,8 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
</>
|
||||
)}
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
</TabsContent>
|
||||
</Tabs>
|
||||
</DialogContent>
|
||||
|
||||
@@ -0,0 +1,308 @@
|
||||
/**
|
||||
* Advanced Crawl Configuration Component
|
||||
* Provides UI for configuring domain filtering and URL patterns
|
||||
*/
|
||||
|
||||
import { ChevronDown, Info, Plus, X } from "lucide-react";
|
||||
import React, { useState } from "react";
|
||||
import type { CrawlConfig } from "../types";
|
||||
|
||||
interface Props {
|
||||
config: CrawlConfig;
|
||||
onChange: (config: CrawlConfig) => void;
|
||||
}
|
||||
|
||||
export const AdvancedCrawlConfig: React.FC<Props> = ({ config, onChange }) => {
|
||||
const [isExpanded, setIsExpanded] = useState(false);
|
||||
const [newDomain, setNewDomain] = useState("");
|
||||
const [newPattern, setNewPattern] = useState("");
|
||||
const [activeTab, setActiveTab] = useState<"allowed" | "excluded">("allowed");
|
||||
const [patternTab, setPatternTab] = useState<"include" | "exclude">("include");
|
||||
|
||||
const handleAddDomain = (type: "allowed" | "excluded") => {
|
||||
if (!newDomain.trim()) return;
|
||||
|
||||
const domain = newDomain.trim().toLowerCase().replace(/^https?:\/\//, "").replace(/\/$/, "");
|
||||
const key = `${type}_domains` as keyof CrawlConfig;
|
||||
const current = config[key] || [];
|
||||
|
||||
if (!current.includes(domain)) {
|
||||
onChange({
|
||||
...config,
|
||||
[key]: [...current, domain],
|
||||
});
|
||||
}
|
||||
|
||||
setNewDomain("");
|
||||
};
|
||||
|
||||
const handleRemoveDomain = (type: "allowed" | "excluded", domain: string) => {
|
||||
const key = `${type}_domains` as keyof CrawlConfig;
|
||||
onChange({
|
||||
...config,
|
||||
[key]: (config[key] || []).filter(d => d !== domain),
|
||||
});
|
||||
};
|
||||
|
||||
const handleAddPattern = (type: "include" | "exclude") => {
|
||||
if (!newPattern.trim()) return;
|
||||
|
||||
const key = `${type}_patterns` as keyof CrawlConfig;
|
||||
const current = config[key] || [];
|
||||
|
||||
if (!current.includes(newPattern)) {
|
||||
onChange({
|
||||
...config,
|
||||
[key]: [...current, newPattern],
|
||||
});
|
||||
}
|
||||
|
||||
setNewPattern("");
|
||||
};
|
||||
|
||||
const handleRemovePattern = (type: "include" | "exclude", pattern: string) => {
|
||||
const key = `${type}_patterns` as keyof CrawlConfig;
|
||||
onChange({
|
||||
...config,
|
||||
[key]: (config[key] || []).filter(p => p !== pattern),
|
||||
});
|
||||
};
|
||||
|
||||
const hasAnyConfig =
|
||||
(config.allowed_domains && config.allowed_domains.length > 0) ||
|
||||
(config.excluded_domains && config.excluded_domains.length > 0) ||
|
||||
(config.include_patterns && config.include_patterns.length > 0) ||
|
||||
(config.exclude_patterns && config.exclude_patterns.length > 0);
|
||||
|
||||
return (
|
||||
<div className="border border-gray-800 rounded-lg bg-gray-900/50 backdrop-blur-sm">
|
||||
<button
|
||||
onClick={() => setIsExpanded(!isExpanded)}
|
||||
className="w-full p-4 flex items-center justify-between hover:bg-gray-800/30 transition-colors"
|
||||
>
|
||||
<div className="flex items-center gap-2">
|
||||
<span className="text-gray-200 font-medium">Advanced Configuration</span>
|
||||
{hasAnyConfig && (
|
||||
<span className="text-xs bg-blue-500/20 text-blue-400 px-2 py-1 rounded-full">
|
||||
Active filters
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
<ChevronDown
|
||||
className={`w-5 h-5 text-gray-400 transform transition-transform ${
|
||||
isExpanded ? "rotate-180" : ""
|
||||
}`}
|
||||
/>
|
||||
</button>
|
||||
|
||||
{isExpanded && (
|
||||
<div className="p-4 space-y-4 border-t border-gray-800">
|
||||
{/* Domain Filters Section */}
|
||||
<div>
|
||||
<div className="flex items-center gap-2 mb-3">
|
||||
<h3 className="text-sm font-medium text-gray-300">Domain Filters</h3>
|
||||
<div className="group relative">
|
||||
<Info className="w-4 h-4 text-gray-500 cursor-help" />
|
||||
<div className="absolute left-0 bottom-full mb-1 w-64 p-2 bg-gray-800 rounded text-xs text-gray-300
|
||||
opacity-0 group-hover:opacity-100 transition-opacity pointer-events-none z-50">
|
||||
Control which domains are crawled. Blacklist takes priority over whitelist.
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Domain Tabs */}
|
||||
<div className="flex gap-2 mb-3">
|
||||
<button
|
||||
onClick={() => setActiveTab("allowed")}
|
||||
className={`px-3 py-1 text-sm rounded transition-colors ${
|
||||
activeTab === "allowed"
|
||||
? "bg-green-500/20 text-green-400"
|
||||
: "bg-gray-800 text-gray-400 hover:bg-gray-700"
|
||||
}`}
|
||||
>
|
||||
Allowed Domains ({config.allowed_domains?.length || 0})
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setActiveTab("excluded")}
|
||||
className={`px-3 py-1 text-sm rounded transition-colors ${
|
||||
activeTab === "excluded"
|
||||
? "bg-red-500/20 text-red-400"
|
||||
: "bg-gray-800 text-gray-400 hover:bg-gray-700"
|
||||
}`}
|
||||
>
|
||||
Excluded Domains ({config.excluded_domains?.length || 0})
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{/* Domain Input */}
|
||||
<div className="flex gap-2 mb-2">
|
||||
<input
|
||||
type="text"
|
||||
value={newDomain}
|
||||
onChange={(e) => setNewDomain(e.target.value)}
|
||||
onKeyDown={(e) => {
|
||||
if (e.key === "Enter") {
|
||||
handleAddDomain(activeTab);
|
||||
}
|
||||
}}
|
||||
placeholder={`Add ${activeTab} domain (e.g., docs.example.com)`}
|
||||
className="flex-1 px-3 py-2 bg-gray-800 border border-gray-700 rounded text-sm text-gray-200
|
||||
placeholder-gray-500 focus:outline-none focus:border-blue-500 transition-colors"
|
||||
/>
|
||||
<button
|
||||
onClick={() => handleAddDomain(activeTab)}
|
||||
className="px-4 py-2 bg-blue-600 hover:bg-blue-700 text-white rounded text-sm
|
||||
transition-colors flex items-center gap-1"
|
||||
>
|
||||
<Plus className="w-4 h-4" />
|
||||
Add
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{/* Domain List */}
|
||||
<div className="space-y-1 max-h-32 overflow-y-auto">
|
||||
{activeTab === "allowed" && config.allowed_domains?.map(domain => (
|
||||
<div
|
||||
key={domain}
|
||||
className="flex items-center justify-between px-3 py-1 bg-green-500/10
|
||||
rounded text-sm text-green-400 group"
|
||||
>
|
||||
<span>{domain}</span>
|
||||
<button
|
||||
onClick={() => handleRemoveDomain("allowed", domain)}
|
||||
className="opacity-0 group-hover:opacity-100 transition-opacity"
|
||||
>
|
||||
<X className="w-4 h-4 hover:text-red-400" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
{activeTab === "excluded" && config.excluded_domains?.map(domain => (
|
||||
<div
|
||||
key={domain}
|
||||
className="flex items-center justify-between px-3 py-1 bg-red-500/10
|
||||
rounded text-sm text-red-400 group"
|
||||
>
|
||||
<span>{domain}</span>
|
||||
<button
|
||||
onClick={() => handleRemoveDomain("excluded", domain)}
|
||||
className="opacity-0 group-hover:opacity-100 transition-opacity"
|
||||
>
|
||||
<X className="w-4 h-4 hover:text-red-400" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* URL Patterns Section */}
|
||||
<div>
|
||||
<div className="flex items-center gap-2 mb-3">
|
||||
<h3 className="text-sm font-medium text-gray-300">URL Patterns</h3>
|
||||
<div className="group relative">
|
||||
<Info className="w-4 h-4 text-gray-500 cursor-help" />
|
||||
<div className="absolute left-0 bottom-full mb-1 w-64 p-2 bg-gray-800 rounded text-xs text-gray-300
|
||||
opacity-0 group-hover:opacity-100 transition-opacity pointer-events-none z-50">
|
||||
Use glob patterns to filter URLs. Example: */docs/* or *.pdf
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Pattern Tabs */}
|
||||
<div className="flex gap-2 mb-3">
|
||||
<button
|
||||
onClick={() => setPatternTab("include")}
|
||||
className={`px-3 py-1 text-sm rounded transition-colors ${
|
||||
patternTab === "include"
|
||||
? "bg-green-500/20 text-green-400"
|
||||
: "bg-gray-800 text-gray-400 hover:bg-gray-700"
|
||||
}`}
|
||||
>
|
||||
Include Patterns ({config.include_patterns?.length || 0})
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setPatternTab("exclude")}
|
||||
className={`px-3 py-1 text-sm rounded transition-colors ${
|
||||
patternTab === "exclude"
|
||||
? "bg-red-500/20 text-red-400"
|
||||
: "bg-gray-800 text-gray-400 hover:bg-gray-700"
|
||||
}`}
|
||||
>
|
||||
Exclude Patterns ({config.exclude_patterns?.length || 0})
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{/* Pattern Input */}
|
||||
<div className="flex gap-2 mb-2">
|
||||
<input
|
||||
type="text"
|
||||
value={newPattern}
|
||||
onChange={(e) => setNewPattern(e.target.value)}
|
||||
onKeyDown={(e) => {
|
||||
if (e.key === "Enter") {
|
||||
handleAddPattern(patternTab);
|
||||
}
|
||||
}}
|
||||
placeholder={`Add ${patternTab} pattern (e.g., */api/* or *.pdf)`}
|
||||
className="flex-1 px-3 py-2 bg-gray-800 border border-gray-700 rounded text-sm text-gray-200
|
||||
placeholder-gray-500 focus:outline-none focus:border-blue-500 transition-colors"
|
||||
/>
|
||||
<button
|
||||
onClick={() => handleAddPattern(patternTab)}
|
||||
className="px-4 py-2 bg-blue-600 hover:bg-blue-700 text-white rounded text-sm
|
||||
transition-colors flex items-center gap-1"
|
||||
>
|
||||
<Plus className="w-4 h-4" />
|
||||
Add
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{/* Pattern List */}
|
||||
<div className="space-y-1 max-h-32 overflow-y-auto">
|
||||
{patternTab === "include" && config.include_patterns?.map(pattern => (
|
||||
<div
|
||||
key={pattern}
|
||||
className="flex items-center justify-between px-3 py-1 bg-green-500/10
|
||||
rounded text-sm text-green-400 group font-mono"
|
||||
>
|
||||
<span>{pattern}</span>
|
||||
<button
|
||||
onClick={() => handleRemovePattern("include", pattern)}
|
||||
className="opacity-0 group-hover:opacity-100 transition-opacity"
|
||||
>
|
||||
<X className="w-4 h-4 hover:text-red-400" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
{patternTab === "exclude" && config.exclude_patterns?.map(pattern => (
|
||||
<div
|
||||
key={pattern}
|
||||
className="flex items-center justify-between px-3 py-1 bg-red-500/10
|
||||
rounded text-sm text-red-400 group font-mono"
|
||||
>
|
||||
<span>{pattern}</span>
|
||||
<button
|
||||
onClick={() => handleRemovePattern("exclude", pattern)}
|
||||
className="opacity-0 group-hover:opacity-100 transition-opacity"
|
||||
>
|
||||
<X className="w-4 h-4 hover:text-red-400" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Clear All Button */}
|
||||
{hasAnyConfig && (
|
||||
<button
|
||||
onClick={() => onChange({})}
|
||||
className="px-3 py-1 text-xs bg-gray-800 hover:bg-gray-700 text-gray-400
|
||||
rounded transition-colors"
|
||||
>
|
||||
Clear All Filters
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
};
|
||||
@@ -3,13 +3,14 @@
|
||||
* Shows document chunks and code examples for a knowledge item
|
||||
*/
|
||||
|
||||
import { ChevronDown, ChevronRight, Code, FileText, Search } from "lucide-react";
|
||||
import { useState } from "react";
|
||||
import { ChevronDown, ChevronRight, Code, ExternalLink, FileText, Globe, Search, X } from "lucide-react";
|
||||
import { useMemo, useState } from "react";
|
||||
import { Input } from "../../ui/primitives";
|
||||
import { Dialog, DialogContent, DialogHeader, DialogTitle } from "../../ui/primitives/dialog";
|
||||
import { cn } from "../../ui/primitives/styles";
|
||||
import { Tabs, TabsContent, TabsList, TabsTrigger } from "../../ui/primitives/tabs";
|
||||
import { useCodeExamples, useKnowledgeItemChunks } from "../hooks";
|
||||
import { useCodeExamples, useKnowledgeItem, useKnowledgeItemChunks } from "../hooks";
|
||||
import { extractDomain } from "../utils/knowledge-utils";
|
||||
|
||||
interface DocumentBrowserProps {
|
||||
sourceId: string;
|
||||
@@ -21,7 +22,9 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
|
||||
const [activeTab, setActiveTab] = useState<"documents" | "code">("documents");
|
||||
const [searchQuery, setSearchQuery] = useState("");
|
||||
const [expandedChunks, setExpandedChunks] = useState<Set<string>>(new Set());
|
||||
const [selectedDomains, setSelectedDomains] = useState<Set<string>>(new Set());
|
||||
|
||||
const { data: sourceItem } = useKnowledgeItem(sourceId);
|
||||
const {
|
||||
data: chunksData,
|
||||
isLoading: chunksLoading,
|
||||
@@ -33,12 +36,36 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
|
||||
const chunks = chunksData?.chunks || [];
|
||||
const codeExamples = codeData?.code_examples || [];
|
||||
|
||||
// Filter chunks based on search
|
||||
const filteredChunks = chunks.filter(
|
||||
(chunk) =>
|
||||
// Extract unique domains from chunks
|
||||
const domainStats = useMemo(() => {
|
||||
const stats = new Map<string, number>();
|
||||
chunks.forEach((chunk) => {
|
||||
const url = chunk.url || chunk.metadata?.url;
|
||||
if (url) {
|
||||
const domain = extractDomain(url);
|
||||
stats.set(domain, (stats.get(domain) || 0) + 1);
|
||||
}
|
||||
});
|
||||
|
||||
return Array.from(stats.entries())
|
||||
.sort((a, b) => b[1] - a[1]) // Sort by count descending
|
||||
.map(([domain, count]) => ({ domain, count }));
|
||||
}, [chunks]);
|
||||
|
||||
// Filter chunks based on search and domain
|
||||
const filteredChunks = chunks.filter((chunk) => {
|
||||
// Search filter
|
||||
const matchesSearch =
|
||||
!searchQuery ||
|
||||
chunk.content.toLowerCase().includes(searchQuery.toLowerCase()) ||
|
||||
chunk.metadata?.title?.toLowerCase().includes(searchQuery.toLowerCase()),
|
||||
);
|
||||
chunk.metadata?.title?.toLowerCase().includes(searchQuery.toLowerCase());
|
||||
|
||||
// Domain filter
|
||||
const url = chunk.url || chunk.metadata?.url;
|
||||
const matchesDomain = selectedDomains.size === 0 || (url && selectedDomains.has(extractDomain(url)));
|
||||
|
||||
return matchesSearch && matchesDomain;
|
||||
});
|
||||
|
||||
// Filter code examples based on search
|
||||
const filteredCode = codeExamples.filter((example) => {
|
||||
@@ -66,9 +93,30 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
|
||||
<Dialog open={open} onOpenChange={onOpenChange}>
|
||||
<DialogContent className="max-w-4xl h-[80vh] flex flex-col">
|
||||
<DialogHeader>
|
||||
<DialogTitle>Document Browser</DialogTitle>
|
||||
<div className="flex items-center gap-2 mt-4">
|
||||
<div className="relative flex-1">
|
||||
<DialogTitle className="flex items-center justify-between">
|
||||
<div className="flex items-center gap-2">
|
||||
Document Browser
|
||||
{chunksData && (
|
||||
<span className="text-sm text-gray-400 font-normal">
|
||||
({chunks.length} documents from {domainStats.length} domain{domainStats.length !== 1 ? "s" : ""})
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
{sourceItem && sourceItem.url && (
|
||||
<a
|
||||
href={sourceItem.url}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="flex items-center gap-1 text-xs text-cyan-400 hover:text-cyan-300 transition-colors"
|
||||
>
|
||||
<ExternalLink className="w-3 h-3" />
|
||||
View Source
|
||||
</a>
|
||||
)}
|
||||
</DialogTitle>
|
||||
<div className="space-y-3 mt-4">
|
||||
{/* Search Bar */}
|
||||
<div className="relative">
|
||||
<Search className="absolute left-3 top-1/2 -translate-y-1/2 w-4 h-4 text-gray-400" />
|
||||
<Input
|
||||
type="text"
|
||||
@@ -78,6 +126,61 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
|
||||
className="pl-10 bg-black/30 border-white/10 focus:border-cyan-500/50"
|
||||
/>
|
||||
</div>
|
||||
|
||||
{/* Domain Filter */}
|
||||
{domainStats.length > 0 && (
|
||||
<div className="space-y-2">
|
||||
<div className="text-sm text-gray-400 flex items-center gap-2">
|
||||
<Globe className="w-4 h-4" />
|
||||
Domain Filter
|
||||
{selectedDomains.size > 0 && (
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => setSelectedDomains(new Set())}
|
||||
className="ml-auto text-xs text-cyan-400 hover:text-cyan-300 flex items-center gap-1"
|
||||
>
|
||||
<X className="w-3 h-3" />
|
||||
Clear filter
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
<div className="flex flex-wrap gap-2">
|
||||
{domainStats.map(({ domain, count }) => {
|
||||
const isSelected = selectedDomains.has(domain);
|
||||
return (
|
||||
<button
|
||||
key={domain}
|
||||
type="button"
|
||||
onClick={() => {
|
||||
const newSelection = new Set(selectedDomains);
|
||||
if (isSelected) {
|
||||
newSelection.delete(domain);
|
||||
} else {
|
||||
newSelection.add(domain);
|
||||
}
|
||||
setSelectedDomains(newSelection);
|
||||
}}
|
||||
className={cn(
|
||||
"px-3 py-1 text-xs rounded-full border transition-all",
|
||||
"flex items-center gap-2",
|
||||
isSelected
|
||||
? "bg-cyan-500/20 border-cyan-500/50 text-cyan-400"
|
||||
: "bg-black/20 border-white/10 text-gray-400 hover:border-cyan-500/30 hover:text-cyan-400"
|
||||
)}
|
||||
>
|
||||
<span className="truncate max-w-[200px]">{domain}</span>
|
||||
<span className={cn(
|
||||
"px-1.5 py-0.5 rounded text-[10px] font-mono",
|
||||
isSelected ? "bg-cyan-500/30" : "bg-white/10"
|
||||
)}>
|
||||
{count}
|
||||
</span>
|
||||
</button>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</DialogHeader>
|
||||
|
||||
@@ -123,8 +226,9 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
|
||||
key={chunk.id}
|
||||
className="bg-black/30 rounded-lg border border-white/10 p-4 hover:border-cyan-500/30 transition-colors"
|
||||
>
|
||||
{chunk.metadata?.title && (
|
||||
<h4 className="font-medium text-white/90 mb-2 flex items-center gap-2">
|
||||
<div className="flex items-start justify-between gap-2 mb-2">
|
||||
{chunk.metadata?.title && (
|
||||
<h4 className="font-medium text-white/90 flex items-center gap-2 flex-1">
|
||||
{needsExpansion && (
|
||||
<button
|
||||
type="button"
|
||||
@@ -140,7 +244,20 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
|
||||
)}
|
||||
{chunk.metadata.title}
|
||||
</h4>
|
||||
)}
|
||||
)}
|
||||
{(chunk.url || chunk.metadata?.url) && (
|
||||
<a
|
||||
href={chunk.url || chunk.metadata?.url}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="text-[10px] px-2 py-1 rounded bg-white/5 text-gray-500 hover:text-cyan-400 hover:bg-cyan-500/10 font-mono shrink-0 transition-colors flex items-center gap-1"
|
||||
title={`View on ${extractDomain(chunk.url || chunk.metadata?.url || "")}`}
|
||||
>
|
||||
{extractDomain(chunk.url || chunk.metadata?.url || "")}
|
||||
<ExternalLink className="w-3 h-3" />
|
||||
</a>
|
||||
)}
|
||||
</div>
|
||||
|
||||
<div className="text-sm text-gray-300 whitespace-pre-wrap">
|
||||
{isExpanded || !needsExpansion ? (
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
export * from "./AddKnowledgeDialog";
|
||||
export * from "./AdvancedCrawlConfig";
|
||||
export * from "./DocumentBrowser";
|
||||
export * from "./KnowledgeCard";
|
||||
export * from "./KnowledgeList";
|
||||
|
||||
@@ -15,6 +15,7 @@ import { useToast } from "../../ui/hooks/useToast";
|
||||
import { knowledgeService } from "../services";
|
||||
import type {
|
||||
CrawlRequest,
|
||||
CrawlRequestV2,
|
||||
CrawlStartResponse,
|
||||
KnowledgeItem,
|
||||
KnowledgeItemsFilter,
|
||||
@@ -298,6 +299,181 @@ export function useCrawlUrl() {
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Crawl URL mutation with domain filtering (v2) with optimistic updates
|
||||
* Returns the progressId that can be used to track crawl progress
|
||||
*/
|
||||
export function useCrawlUrlV2() {
|
||||
const queryClient = useQueryClient();
|
||||
const { showToast } = useToast();
|
||||
|
||||
return useMutation<
|
||||
CrawlStartResponse,
|
||||
Error,
|
||||
CrawlRequestV2,
|
||||
{
|
||||
previousKnowledge?: KnowledgeItem[];
|
||||
previousSummaries?: Array<[readonly unknown[], KnowledgeItemsResponse | undefined]>;
|
||||
previousOperations?: ActiveOperationsResponse;
|
||||
tempProgressId: string;
|
||||
tempItemId: string;
|
||||
}
|
||||
>({
|
||||
mutationFn: (request: CrawlRequestV2) => knowledgeService.crawlUrlV2(request),
|
||||
onMutate: async (request) => {
|
||||
// Cancel any outgoing refetches to prevent race conditions
|
||||
await queryClient.cancelQueries({ queryKey: knowledgeKeys.summariesPrefix() });
|
||||
await queryClient.cancelQueries({ queryKey: progressKeys.active() });
|
||||
|
||||
// Snapshot the previous values for rollback
|
||||
const previousSummaries = queryClient.getQueriesData<KnowledgeItemsResponse>({
|
||||
queryKey: knowledgeKeys.summariesPrefix(),
|
||||
});
|
||||
const previousOperations = queryClient.getQueryData<ActiveOperationsResponse>(progressKeys.active());
|
||||
|
||||
// Generate temporary progress ID and optimistic entity
|
||||
const tempProgressId = createOptimisticId();
|
||||
const optimisticItem = createOptimisticEntity<KnowledgeItem>({
|
||||
title: (() => {
|
||||
try {
|
||||
return new URL(request.url).hostname || "New crawl";
|
||||
} catch {
|
||||
return "New crawl";
|
||||
}
|
||||
})(),
|
||||
url: request.url,
|
||||
source_id: tempProgressId,
|
||||
source_type: "url",
|
||||
knowledge_type: request.knowledge_type || "technical",
|
||||
status: "processing",
|
||||
document_count: 0,
|
||||
code_examples_count: 0,
|
||||
metadata: {
|
||||
knowledge_type: request.knowledge_type || "technical",
|
||||
tags: request.tags || [],
|
||||
source_type: "url",
|
||||
status: "processing",
|
||||
description: `Crawling ${request.url} with domain filters`,
|
||||
crawl_config: request.crawl_config,
|
||||
},
|
||||
created_at: new Date().toISOString(),
|
||||
updated_at: new Date().toISOString(),
|
||||
} as Omit<KnowledgeItem, "id">);
|
||||
const tempItemId = optimisticItem.id;
|
||||
|
||||
// Update all summaries caches with optimistic data
|
||||
const entries = queryClient.getQueriesData<KnowledgeItemsResponse>({
|
||||
queryKey: knowledgeKeys.summariesPrefix(),
|
||||
});
|
||||
for (const [qk, old] of entries) {
|
||||
const filter = qk[qk.length - 1] as KnowledgeItemsFilter | undefined;
|
||||
const matchesType = !filter?.knowledge_type || optimisticItem.knowledge_type === filter.knowledge_type;
|
||||
const matchesTags =
|
||||
!filter?.tags || filter.tags.every((t) => (optimisticItem.metadata?.tags ?? []).includes(t));
|
||||
if (!(matchesType && matchesTags)) continue;
|
||||
if (!old) {
|
||||
queryClient.setQueryData<KnowledgeItemsResponse>(qk, {
|
||||
items: [optimisticItem],
|
||||
total: 1,
|
||||
page: 1,
|
||||
per_page: 100,
|
||||
});
|
||||
} else {
|
||||
queryClient.setQueryData<KnowledgeItemsResponse>(qk, {
|
||||
...old,
|
||||
items: [optimisticItem, ...old.items],
|
||||
total: (old.total ?? old.items.length) + 1,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Add optimistic progress entry
|
||||
if (!previousOperations) {
|
||||
queryClient.setQueryData<ActiveOperationsResponse>(progressKeys.active(), {
|
||||
operations: [
|
||||
{
|
||||
operation_id: tempProgressId,
|
||||
operation_type: "crawl",
|
||||
status: "starting",
|
||||
progress: 0,
|
||||
message: `Starting crawl of ${request.url} with domain filtering`,
|
||||
started_at: new Date().toISOString(),
|
||||
progressId: tempProgressId,
|
||||
} as ActiveOperation,
|
||||
],
|
||||
});
|
||||
} else {
|
||||
queryClient.setQueryData<ActiveOperationsResponse>(progressKeys.active(), {
|
||||
operations: [
|
||||
{
|
||||
operation_id: tempProgressId,
|
||||
operation_type: "crawl",
|
||||
status: "starting",
|
||||
progress: 0,
|
||||
message: `Starting crawl of ${request.url} with domain filtering`,
|
||||
started_at: new Date().toISOString(),
|
||||
progressId: tempProgressId,
|
||||
} as ActiveOperation,
|
||||
...(previousOperations.operations || []),
|
||||
],
|
||||
});
|
||||
}
|
||||
|
||||
return { previousSummaries, previousOperations, tempProgressId, tempItemId };
|
||||
},
|
||||
onSuccess: async (response, _variables, context) => {
|
||||
// Show success message
|
||||
showToast("Crawl started with domain filtering", "success");
|
||||
|
||||
// Update the temporary progress ID with the real one
|
||||
if (context) {
|
||||
const activeOps = queryClient.getQueryData<ActiveOperationsResponse>(progressKeys.active());
|
||||
if (activeOps) {
|
||||
const updated = {
|
||||
operations: activeOps.operations.map((op) =>
|
||||
op.progressId === context.tempProgressId ? { ...op, progressId: response.progressId } : op,
|
||||
),
|
||||
};
|
||||
queryClient.setQueryData(progressKeys.active(), updated);
|
||||
}
|
||||
|
||||
// Update item in all summaries caches
|
||||
const entries = queryClient.getQueriesData<KnowledgeItemsResponse>({
|
||||
queryKey: knowledgeKeys.summariesPrefix(),
|
||||
});
|
||||
for (const [qk, data] of entries) {
|
||||
if (data) {
|
||||
const updated = {
|
||||
...data,
|
||||
items: data.items.map((item) =>
|
||||
item.id === context.tempItemId ? { ...item, source_id: response.progressId } : item,
|
||||
),
|
||||
};
|
||||
queryClient.setQueryData(qk, updated);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return the response so caller can access progressId
|
||||
return response;
|
||||
},
|
||||
onError: (error, _variables, context) => {
|
||||
// Rollback optimistic updates on error
|
||||
if (context?.previousSummaries) {
|
||||
for (const [queryKey, data] of context.previousSummaries) {
|
||||
queryClient.setQueryData(queryKey, data);
|
||||
}
|
||||
}
|
||||
if (context?.previousOperations) {
|
||||
queryClient.setQueryData(progressKeys.active(), context.previousOperations);
|
||||
}
|
||||
|
||||
const errorMessage = getProviderErrorMessage(error) || "Failed to start crawl with filters";
|
||||
showToast(errorMessage, "error");
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Upload document mutation with optimistic updates
|
||||
*/
|
||||
|
||||
@@ -4,10 +4,12 @@
|
||||
*/
|
||||
|
||||
import { motion } from "framer-motion";
|
||||
import { Code, FileText, Hash, Loader2, Search } from "lucide-react";
|
||||
import { Code, FileText, Globe, Hash, Loader2, Search, X } from "lucide-react";
|
||||
import { useMemo } from "react";
|
||||
import { Button, Input } from "../../../ui/primitives";
|
||||
import { cn } from "../../../ui/primitives/styles";
|
||||
import type { CodeExample, DocumentChunk } from "../../types";
|
||||
import { extractDomain } from "../../utils/knowledge-utils";
|
||||
|
||||
interface InspectorSidebarProps {
|
||||
viewMode: "documents" | "code";
|
||||
@@ -20,6 +22,8 @@ interface InspectorSidebarProps {
|
||||
hasNextPage: boolean;
|
||||
onLoadMore: () => void;
|
||||
isFetchingNextPage: boolean;
|
||||
selectedDomains?: Set<string>;
|
||||
onDomainsChange?: (domains: Set<string>) => void;
|
||||
}
|
||||
|
||||
export const InspectorSidebar: React.FC<InspectorSidebarProps> = ({
|
||||
@@ -33,7 +37,39 @@ export const InspectorSidebar: React.FC<InspectorSidebarProps> = ({
|
||||
hasNextPage,
|
||||
onLoadMore,
|
||||
isFetchingNextPage,
|
||||
selectedDomains = new Set(),
|
||||
onDomainsChange,
|
||||
}) => {
|
||||
// Extract unique domains from documents
|
||||
const domainStats = useMemo(() => {
|
||||
if (viewMode !== "documents") return [];
|
||||
|
||||
const stats = new Map<string, number>();
|
||||
(items as DocumentChunk[]).forEach((doc) => {
|
||||
const url = doc.url || doc.metadata?.url;
|
||||
if (url) {
|
||||
const domain = extractDomain(url);
|
||||
stats.set(domain, (stats.get(domain) || 0) + 1);
|
||||
}
|
||||
});
|
||||
|
||||
return Array.from(stats.entries())
|
||||
.sort((a, b) => b[1] - a[1])
|
||||
.map(([domain, count]) => ({ domain, count }));
|
||||
}, [items, viewMode]);
|
||||
|
||||
// Filter items by selected domains
|
||||
const filteredItems = useMemo(() => {
|
||||
if (viewMode !== "documents" || selectedDomains.size === 0) {
|
||||
return items;
|
||||
}
|
||||
|
||||
return (items as DocumentChunk[]).filter((doc) => {
|
||||
const url = doc.url || doc.metadata?.url;
|
||||
if (!url) return false;
|
||||
return selectedDomains.has(extractDomain(url));
|
||||
});
|
||||
}, [items, selectedDomains, viewMode]);
|
||||
const getItemTitle = (item: DocumentChunk | CodeExample) => {
|
||||
const idSuffix = String(item.id).slice(-6);
|
||||
if (viewMode === "documents") {
|
||||
@@ -62,8 +98,9 @@ export const InspectorSidebar: React.FC<InspectorSidebarProps> = ({
|
||||
|
||||
return (
|
||||
<aside className="w-80 border-r border-white/10 flex flex-col bg-black/40" aria-label="Document and code browser">
|
||||
{/* Search */}
|
||||
<div className="p-4 border-b border-white/10 flex-shrink-0">
|
||||
{/* Search and Filters */}
|
||||
<div className="p-4 border-b border-white/10 flex-shrink-0 space-y-3">
|
||||
{/* Search Bar */}
|
||||
<div className="relative">
|
||||
<Search
|
||||
className="absolute left-3 top-1/2 -translate-y-1/2 w-4 h-4 text-gray-500 pointer-events-none"
|
||||
@@ -77,6 +114,66 @@ export const InspectorSidebar: React.FC<InspectorSidebarProps> = ({
|
||||
aria-label={`Search ${viewMode}`}
|
||||
/>
|
||||
</div>
|
||||
|
||||
{/* Domain Filter - Only show for documents */}
|
||||
{viewMode === "documents" && domainStats.length > 0 && onDomainsChange && (
|
||||
<div className="space-y-2">
|
||||
<div className="text-xs text-gray-400 flex items-center gap-2">
|
||||
<Globe className="w-3 h-3" />
|
||||
Domain Filter
|
||||
{selectedDomains.size > 0 && (
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => onDomainsChange(new Set())}
|
||||
className="ml-auto text-cyan-400 hover:text-cyan-300 flex items-center gap-1"
|
||||
>
|
||||
<X className="w-3 h-3" />
|
||||
Clear
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
<div className="flex flex-wrap gap-1">
|
||||
{domainStats.slice(0, 5).map(({ domain, count }) => {
|
||||
const isSelected = selectedDomains.has(domain);
|
||||
return (
|
||||
<button
|
||||
key={domain}
|
||||
type="button"
|
||||
onClick={() => {
|
||||
const newSelection = new Set(selectedDomains);
|
||||
if (isSelected) {
|
||||
newSelection.delete(domain);
|
||||
} else {
|
||||
newSelection.add(domain);
|
||||
}
|
||||
onDomainsChange(newSelection);
|
||||
}}
|
||||
className={cn(
|
||||
"px-2 py-0.5 text-[10px] rounded-full border transition-all",
|
||||
"flex items-center gap-1",
|
||||
isSelected
|
||||
? "bg-cyan-500/20 border-cyan-500/50 text-cyan-400"
|
||||
: "bg-black/20 border-white/10 text-gray-500 hover:border-cyan-500/30 hover:text-cyan-400"
|
||||
)}
|
||||
>
|
||||
<span className="truncate max-w-[100px]">{domain}</span>
|
||||
<span className={cn(
|
||||
"px-1 rounded text-[9px] font-mono",
|
||||
isSelected ? "bg-cyan-500/30" : "bg-white/10"
|
||||
)}>
|
||||
{count}
|
||||
</span>
|
||||
</button>
|
||||
);
|
||||
})}
|
||||
{domainStats.length > 5 && (
|
||||
<span className="text-[10px] text-gray-600 px-2 py-0.5">
|
||||
+{domainStats.length - 5} more
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Item List */}
|
||||
@@ -93,7 +190,7 @@ export const InspectorSidebar: React.FC<InspectorSidebarProps> = ({
|
||||
</div>
|
||||
) : (
|
||||
<div className="p-2">
|
||||
{items.map((item) => (
|
||||
{filteredItems.map((item) => (
|
||||
<motion.button
|
||||
type="button"
|
||||
key={item.id}
|
||||
@@ -133,9 +230,16 @@ export const InspectorSidebar: React.FC<InspectorSidebarProps> = ({
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
<p className="text-xs text-gray-500 line-clamp-2" title={getItemDescription(item)}>
|
||||
{getItemDescription(item)}
|
||||
</p>
|
||||
<div className="flex items-center justify-between gap-2">
|
||||
<p className="text-xs text-gray-500 line-clamp-2 flex-1" title={getItemDescription(item)}>
|
||||
{getItemDescription(item)}
|
||||
</p>
|
||||
{viewMode === "documents" && (item as DocumentChunk).url && (
|
||||
<span className="text-[9px] px-1.5 py-0.5 rounded bg-white/5 text-gray-600 font-mono shrink-0">
|
||||
{extractDomain((item as DocumentChunk).url || "")}
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
{item.metadata?.relevance_score != null && (
|
||||
<div className="flex items-center gap-1 mt-1">
|
||||
<Hash className="w-3 h-3 text-gray-600" aria-hidden="true" />
|
||||
|
||||
@@ -31,6 +31,7 @@ export const KnowledgeInspector: React.FC<KnowledgeInspectorProps> = ({
|
||||
const [searchQuery, setSearchQuery] = useState("");
|
||||
const [selectedItem, setSelectedItem] = useState<InspectorSelectedItem | null>(null);
|
||||
const [copiedId, setCopiedId] = useState<string | null>(null);
|
||||
const [selectedDomains, setSelectedDomains] = useState<Set<string>>(new Set());
|
||||
|
||||
// Reset view mode when item or initialTab changes
|
||||
useEffect(() => {
|
||||
@@ -141,6 +142,7 @@ export const KnowledgeInspector: React.FC<KnowledgeInspectorProps> = ({
|
||||
setViewMode(mode);
|
||||
setSelectedItem(null);
|
||||
setSearchQuery("");
|
||||
setSelectedDomains(new Set()); // Clear domain filter when switching modes
|
||||
}, []);
|
||||
|
||||
return (
|
||||
@@ -175,6 +177,8 @@ export const KnowledgeInspector: React.FC<KnowledgeInspectorProps> = ({
|
||||
hasNextPage={hasNextPage}
|
||||
onLoadMore={fetchNextPage}
|
||||
isFetchingNextPage={isFetchingNextPage}
|
||||
selectedDomains={selectedDomains}
|
||||
onDomainsChange={setSelectedDomains}
|
||||
/>
|
||||
|
||||
{/* Content Viewer */}
|
||||
|
||||
@@ -9,6 +9,7 @@ import type {
|
||||
ChunksResponse,
|
||||
CodeExamplesResponse,
|
||||
CrawlRequest,
|
||||
CrawlRequestV2,
|
||||
CrawlStartResponse,
|
||||
KnowledgeItem,
|
||||
KnowledgeItemsFilter,
|
||||
@@ -89,6 +90,18 @@ export const knowledgeService = {
|
||||
return response;
|
||||
},
|
||||
|
||||
/**
|
||||
* Start crawling a URL with domain filtering (v2)
|
||||
*/
|
||||
async crawlUrlV2(request: CrawlRequestV2): Promise<CrawlStartResponse> {
|
||||
const response = await callAPIWithETag<CrawlStartResponse>("/api/knowledge-items/crawl-v2", {
|
||||
method: "POST",
|
||||
body: JSON.stringify(request),
|
||||
});
|
||||
|
||||
return response;
|
||||
},
|
||||
|
||||
/**
|
||||
* Refresh an existing knowledge item
|
||||
*/
|
||||
|
||||
@@ -133,6 +133,13 @@ export interface KnowledgeItemsFilter {
|
||||
per_page?: number;
|
||||
}
|
||||
|
||||
export interface CrawlConfig {
|
||||
allowed_domains?: string[];
|
||||
excluded_domains?: string[];
|
||||
include_patterns?: string[];
|
||||
exclude_patterns?: string[];
|
||||
}
|
||||
|
||||
export interface CrawlRequest {
|
||||
url: string;
|
||||
knowledge_type?: "technical" | "business";
|
||||
@@ -142,6 +149,10 @@ export interface CrawlRequest {
|
||||
extract_code_examples?: boolean;
|
||||
}
|
||||
|
||||
export interface CrawlRequestV2 extends CrawlRequest {
|
||||
crawl_config?: CrawlConfig;
|
||||
}
|
||||
|
||||
export interface UploadMetadata {
|
||||
knowledge_type?: "technical" | "business";
|
||||
tags?: string[];
|
||||
|
||||
@@ -29,6 +29,7 @@ from ..services.search.rag_service import RAGService
|
||||
from ..services.storage import DocumentStorageService
|
||||
from ..utils import get_supabase_client
|
||||
from ..utils.document_processing import extract_text_from_document
|
||||
from ..utils.progress.progress_tracker import ProgressTracker
|
||||
|
||||
# Get logger for this module
|
||||
logger = get_logger(__name__)
|
||||
@@ -855,6 +856,135 @@ async def _perform_crawl_with_progress(
|
||||
)
|
||||
|
||||
|
||||
@router.post("/knowledge-items/crawl-v2")
|
||||
async def crawl_knowledge_item_v2(request: dict):
|
||||
"""
|
||||
Crawl a URL with advanced domain filtering configuration.
|
||||
|
||||
This is version 2 of the crawl endpoint that supports domain filtering.
|
||||
"""
|
||||
# Import CrawlRequestV2 model
|
||||
from ..models.crawl_models import CrawlRequestV2, CrawlConfig
|
||||
|
||||
# Parse and validate request
|
||||
crawl_request = CrawlRequestV2(**request)
|
||||
|
||||
# Validate API key before starting expensive operation
|
||||
logger.info("🔍 About to validate API key for crawl-v2...")
|
||||
provider_config = await credential_service.get_active_provider("embedding")
|
||||
provider = provider_config.get("provider", "openai")
|
||||
await _validate_provider_api_key(provider)
|
||||
logger.info("✅ API key validation completed successfully")
|
||||
|
||||
try:
|
||||
safe_logfire_info(
|
||||
f"Starting knowledge item crawl v2 | url={crawl_request.url} | "
|
||||
f"knowledge_type={crawl_request.knowledge_type} | "
|
||||
f"has_crawl_config={crawl_request.crawl_config is not None}"
|
||||
)
|
||||
|
||||
# Generate unique progress ID
|
||||
progress_id = str(uuid.uuid4())
|
||||
|
||||
# Create progress tracker for HTTP polling
|
||||
tracker = ProgressTracker(progress_id, operation_type="crawl")
|
||||
await tracker.start({
|
||||
"status": "starting",
|
||||
"url": crawl_request.url,
|
||||
"has_filters": crawl_request.crawl_config is not None
|
||||
})
|
||||
|
||||
# Create async task for crawling
|
||||
crawl_task = asyncio.create_task(_run_crawl_v2(request_dict=crawl_request.dict(), progress_id=progress_id))
|
||||
active_crawl_tasks[progress_id] = crawl_task
|
||||
|
||||
safe_logfire_info(
|
||||
f"Crawl v2 task created | progress_id={progress_id} | url={crawl_request.url}"
|
||||
)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"progressId": progress_id,
|
||||
"message": "Crawl started with domain filtering",
|
||||
"estimatedDuration": "2-10 minutes depending on site size"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Failed to start crawl v2 | error={str(e)}")
|
||||
raise HTTPException(status_code=500, detail={"error": str(e)})
|
||||
|
||||
|
||||
async def _run_crawl_v2(request_dict: dict, progress_id: str):
|
||||
"""Run the crawl v2 with domain filtering in background."""
|
||||
tracker = ProgressTracker(progress_id, operation_type="crawl")
|
||||
|
||||
try:
|
||||
safe_logfire_info(
|
||||
f"Starting crawl v2 with progress tracking | progress_id={progress_id} | url={request_dict['url']}"
|
||||
)
|
||||
|
||||
# Get crawler from CrawlerManager
|
||||
try:
|
||||
crawler = await get_crawler()
|
||||
if crawler is None:
|
||||
raise Exception("Crawler not available - initialization may have failed")
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Failed to get crawler | error={str(e)}")
|
||||
await tracker.error(f"Failed to initialize crawler: {str(e)}")
|
||||
return
|
||||
|
||||
supabase_client = get_supabase_client()
|
||||
|
||||
# Extract crawl_config if present
|
||||
crawl_config_dict = request_dict.get("crawl_config")
|
||||
crawl_config = None
|
||||
if crawl_config_dict:
|
||||
from ..models.crawl_models import CrawlConfig
|
||||
crawl_config = CrawlConfig(**crawl_config_dict)
|
||||
|
||||
# Create orchestration service with crawl_config
|
||||
orchestration_service = CrawlingService(
|
||||
crawler,
|
||||
supabase_client,
|
||||
crawl_config=crawl_config
|
||||
)
|
||||
orchestration_service.set_progress_id(progress_id)
|
||||
|
||||
# Add crawl_config to metadata for storage
|
||||
if crawl_config:
|
||||
request_dict["metadata"] = request_dict.get("metadata", {})
|
||||
request_dict["metadata"]["crawl_config"] = crawl_config.dict()
|
||||
|
||||
# Orchestrate the crawl - this returns immediately with task info
|
||||
result = await orchestration_service.orchestrate_crawl(request_dict)
|
||||
|
||||
# Store the actual crawl task for proper cancellation
|
||||
crawl_task = result.get("task")
|
||||
if crawl_task:
|
||||
active_crawl_tasks[progress_id] = crawl_task
|
||||
safe_logfire_info(
|
||||
f"Stored actual crawl v2 task in active_crawl_tasks | progress_id={progress_id}"
|
||||
)
|
||||
else:
|
||||
safe_logfire_error(f"No task returned from orchestrate_crawl v2 | progress_id={progress_id}")
|
||||
|
||||
safe_logfire_info(
|
||||
f"Crawl v2 task started | progress_id={progress_id} | task_id={result.get('task_id')}"
|
||||
)
|
||||
|
||||
except asyncio.CancelledError:
|
||||
safe_logfire_info(f"Crawl v2 cancelled | progress_id={progress_id}")
|
||||
raise
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Crawl v2 task failed | progress_id={progress_id} | error={str(e)}")
|
||||
await tracker.error(str(e))
|
||||
finally:
|
||||
# Clean up task from registry when done
|
||||
if progress_id in active_crawl_tasks:
|
||||
del active_crawl_tasks[progress_id]
|
||||
safe_logfire_info(f"Cleaned up crawl v2 task from registry | progress_id={progress_id}")
|
||||
|
||||
|
||||
@router.post("/documents/upload")
|
||||
async def upload_document(
|
||||
file: UploadFile = File(...),
|
||||
|
||||
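For reference, the new POST /api/knowledge-items/crawl-v2 endpoint above can be exercised with a plain HTTP request. The sketch below is an assumed usage example: the host and port are guesses for a local dev server, while the payload fields come from CrawlRequestV2 and the progressId key comes from the endpoint's response shown above.

```python
import requests

payload = {
    "url": "https://docs.example.com",
    "knowledge_type": "technical",
    "max_depth": 2,
    "tags": ["docs"],
    "crawl_config": {
        "allowed_domains": ["docs.example.com"],
        "exclude_patterns": ["*/changelog/*"],
    },
}

resp = requests.post(
    "http://localhost:8181/api/knowledge-items/crawl-v2",  # base URL/port is an assumption
    json=payload,
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["progressId"])  # use this ID to poll crawl progress
```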
python/src/server/models/__init__.py (new file, 0 lines)
python/src/server/models/crawl_models.py (new file, 63 lines)
@@ -0,0 +1,63 @@
"""
Crawling Models Module

This module contains Pydantic models for crawling configuration,
specifically for domain filtering and URL pattern matching.
"""


from pydantic import BaseModel, Field, validator


class CrawlConfig(BaseModel):
    """Configuration for domain filtering during crawl."""

    allowed_domains: list[str] | None = Field(None, description="Whitelist of domains to crawl")
    excluded_domains: list[str] | None = Field(None, description="Blacklist of domains to exclude")
    include_patterns: list[str] | None = Field(None, description="URL patterns to include (glob-style)")
    exclude_patterns: list[str] | None = Field(None, description="URL patterns to exclude (glob-style)")

    @validator("allowed_domains", "excluded_domains", pre=True)
    def normalize_domains(cls, v):
        """Normalize domain formats for consistent matching."""
        if v is None:
            return v
        return [d.lower().strip().replace("http://", "").replace("https://", "").rstrip("/") for d in v]

    @validator("include_patterns", "exclude_patterns", pre=True)
    def validate_patterns(cls, v):
        """Validate URL patterns are valid glob patterns."""
        if v is None:
            return v
        # Ensure patterns are strings and not empty
        return [p.strip() for p in v if p and isinstance(p, str) and p.strip()]


class CrawlRequestV2(BaseModel):
    """Extended crawl request with domain filtering."""

    url: str = Field(..., description="URL to start crawling from")
    knowledge_type: str | None = Field("technical", description="Type of knowledge (technical/business)")
    tags: list[str] | None = Field(default_factory=list, description="Tags to apply to crawled content")
    update_frequency: int | None = Field(None, description="Update frequency in days")
    max_depth: int | None = Field(3, description="Maximum crawl depth")
    crawl_config: CrawlConfig | None = Field(None, description="Domain filtering configuration")
    crawl_options: dict | None = Field(None, description="Additional crawl options")
    extract_code_examples: bool | None = Field(True, description="Whether to extract code examples")

    @validator("url")
    def validate_url(cls, v):
        """Ensure URL is properly formatted."""
        if not v or not v.strip():
            raise ValueError("URL cannot be empty")
        # Add http:// if no protocol specified
        if not v.startswith(("http://", "https://")):
            v = f"https://{v}"
        return v.strip()

    @validator("knowledge_type")
    def validate_knowledge_type(cls, v):
        """Ensure knowledge type is valid."""
        if v and v not in ["technical", "business"]:
            return "technical"  # Default to technical if invalid
        return v or "technical"
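A quick illustration of what these validators do to incoming data. The absolute import path is an assumption based on the file location python/src/server/models/crawl_models.py; the field values are made up.

```python
from src.server.models.crawl_models import CrawlRequestV2  # import path assumed

req = CrawlRequestV2(
    url="docs.example.com",          # no scheme: validate_url prepends https://
    knowledge_type="marketing",      # invalid value: coerced back to "technical"
    crawl_config={
        "allowed_domains": ["https://Docs.Example.com/"],  # normalized to "docs.example.com"
        "exclude_patterns": ["*/changelog/*", "  "],        # blank entries are dropped
    },
)

assert req.url == "https://docs.example.com"
assert req.knowledge_type == "technical"
assert req.crawl_config.allowed_domains == ["docs.example.com"]
assert req.crawl_config.exclude_patterns == ["*/changelog/*"]
```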
@@ -12,12 +12,14 @@ from collections.abc import Awaitable, Callable
|
||||
from typing import Any, Optional
|
||||
|
||||
from ...config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info
|
||||
from ...models.crawl_models import CrawlConfig
|
||||
from ...utils import get_supabase_client
|
||||
from ...utils.progress.progress_tracker import ProgressTracker
|
||||
|
||||
# Import strategies
|
||||
# Import operations
|
||||
from .document_storage_operations import DocumentStorageOperations
|
||||
from .domain_filter import DomainFilter
|
||||
from .helpers.site_config import SiteConfig
|
||||
|
||||
# Import helpers
|
||||
@@ -56,7 +58,7 @@ class CrawlingService:
|
||||
Combines functionality from both CrawlingService and CrawlOrchestrationService.
|
||||
"""
|
||||
|
||||
def __init__(self, crawler=None, supabase_client=None, progress_id=None):
|
||||
def __init__(self, crawler=None, supabase_client=None, progress_id=None, crawl_config=None):
|
||||
"""
|
||||
Initialize the crawling service.
|
||||
|
||||
@@ -64,21 +66,24 @@ class CrawlingService:
|
||||
crawler: The Crawl4AI crawler instance
|
||||
supabase_client: The Supabase client for database operations
|
||||
progress_id: Optional progress ID for HTTP polling updates
|
||||
crawl_config: Optional CrawlConfig for domain filtering
|
||||
"""
|
||||
self.crawler = crawler
|
||||
self.supabase_client = supabase_client or get_supabase_client()
|
||||
self.progress_id = progress_id
|
||||
self.progress_tracker = None
|
||||
self.crawl_config = crawl_config
|
||||
|
||||
# Initialize helpers
|
||||
self.url_handler = URLHandler()
|
||||
self.site_config = SiteConfig()
|
||||
self.markdown_generator = self.site_config.get_markdown_generator()
|
||||
self.link_pruning_markdown_generator = self.site_config.get_link_pruning_markdown_generator()
|
||||
self.domain_filter = DomainFilter()
|
||||
|
||||
# Initialize strategies
|
||||
self.batch_strategy = BatchCrawlStrategy(crawler, self.link_pruning_markdown_generator)
|
||||
self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.link_pruning_markdown_generator)
|
||||
self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.link_pruning_markdown_generator, self.domain_filter)
|
||||
self.single_page_strategy = SinglePageCrawlStrategy(crawler, self.markdown_generator)
|
||||
self.sitemap_strategy = SitemapCrawlStrategy()
|
||||
|
||||
@@ -225,6 +230,7 @@ class CrawlingService:
|
||||
max_concurrent,
|
||||
progress_callback,
|
||||
self._check_cancellation, # Pass cancellation check
|
||||
self.crawl_config, # Pass crawl config for domain filtering
|
||||
)
|
||||
|
||||
# Orchestration methods
|
||||
|
||||
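The hunk above injects a DomainFilter and the request's CrawlConfig into RecursiveCrawlStrategy. The strategy's internals are not part of this diff, so the sketch below is only a hedged illustration of the expected hand-off; the function and parameter names inside it are hypothetical.

```python
# Hedged sketch: how a recursive strategy might consult the injected filter
# before following discovered links. RecursiveCrawlStrategy's real code is
# not shown in this commit, so treat this as an illustration of the contract.
def expand_links(discovered_links, base_url, domain_filter, crawl_config):
    """Return only the links that the DomainFilter allows."""
    return [
        link
        for link in discovered_links
        if domain_filter.is_url_allowed(link, base_url, crawl_config)
    ]
```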
python/src/server/services/crawling/domain_filter.py (new file, 169 lines)
@@ -0,0 +1,169 @@
|
||||
"""
|
||||
Domain Filtering Module
|
||||
|
||||
This module provides domain filtering utilities for web crawling,
|
||||
allowing users to control which domains and URL patterns are crawled.
|
||||
"""
|
||||
|
||||
import fnmatch
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from ...config.logfire_config import get_logger
|
||||
from ...models.crawl_models import CrawlConfig
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class DomainFilter:
|
||||
"""
|
||||
Handles domain and URL pattern filtering for crawl operations.
|
||||
|
||||
Priority order:
|
||||
1. Blacklist (excluded_domains) - always blocks
|
||||
2. Whitelist (allowed_domains) - must match if specified
|
||||
3. Exclude patterns - blocks matching URLs
|
||||
4. Include patterns - must match if specified
|
||||
"""
|
||||
|
||||
def is_url_allowed(self, url: str, base_url: str, config: CrawlConfig | None) -> bool:
|
||||
"""
|
||||
Check if a URL should be crawled based on domain filtering configuration.
|
||||
|
||||
Args:
|
||||
url: The URL to check
|
||||
base_url: The base URL of the crawl (for resolving relative URLs)
|
||||
config: The crawl configuration with filtering rules
|
||||
|
||||
Returns:
|
||||
True if the URL should be crawled, False otherwise
|
||||
"""
|
||||
if not config:
|
||||
# No filtering configured, allow all URLs
|
||||
return True
|
||||
|
||||
try:
|
||||
# Parse the URL
|
||||
parsed = urlparse(url)
|
||||
|
||||
# Handle relative URLs by using base URL's domain
|
||||
if not parsed.netloc:
|
||||
base_parsed = urlparse(base_url)
|
||||
domain = base_parsed.netloc.lower()
|
||||
# Construct full URL for pattern matching
|
||||
full_url = f"{base_parsed.scheme}://{base_parsed.netloc}{parsed.path or '/'}"
|
||||
else:
|
||||
domain = parsed.netloc.lower()
|
||||
full_url = url
|
||||
|
||||
# Remove www. prefix for consistent matching
|
||||
normalized_domain = domain.replace("www.", "")
|
||||
|
||||
# PRIORITY 1: Blacklist always wins
|
||||
if config.excluded_domains:
|
||||
for excluded in config.excluded_domains:
|
||||
if self._matches_domain(normalized_domain, excluded):
|
||||
logger.debug(f"URL blocked by excluded domain | url={url} | domain={normalized_domain} | excluded={excluded}")
|
||||
return False
|
||||
|
||||
# PRIORITY 2: If whitelist exists, URL must match
|
||||
if config.allowed_domains:
|
||||
allowed = False
|
||||
for allowed_domain in config.allowed_domains:
|
||||
if self._matches_domain(normalized_domain, allowed_domain):
|
||||
allowed = True
|
||||
break
|
||||
|
||||
if not allowed:
|
||||
logger.debug(f"URL blocked - not in allowed domains | url={url} | domain={normalized_domain}")
|
||||
return False
|
||||
|
||||
# PRIORITY 3: Check exclude patterns (glob-style)
|
||||
if config.exclude_patterns:
|
||||
for pattern in config.exclude_patterns:
|
||||
if fnmatch.fnmatch(full_url, pattern):
|
||||
logger.debug(f"URL blocked by exclude pattern | url={url} | pattern={pattern}")
|
||||
return False
|
||||
|
||||
# PRIORITY 4: Check include patterns if specified
|
||||
if config.include_patterns:
|
||||
matched = False
|
||||
for pattern in config.include_patterns:
|
||||
if fnmatch.fnmatch(full_url, pattern):
|
||||
matched = True
|
||||
break
|
||||
|
||||
if not matched:
|
||||
logger.debug(f"URL blocked - doesn't match include patterns | url={url}")
|
||||
return False
|
||||
|
||||
logger.debug(f"URL allowed | url={url} | domain={normalized_domain}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error filtering URL | url={url} | error={str(e)}")
|
||||
# On error, be conservative and block the URL
|
||||
return False
|
||||
|
||||
def _matches_domain(self, domain: str, pattern: str) -> bool:
|
||||
"""
|
||||
Check if a domain matches a pattern.
|
||||
|
||||
Supports:
|
||||
- Exact matches: example.com matches example.com
|
||||
- Subdomain wildcards: *.example.com matches sub.example.com
|
||||
- Subdomain matching: sub.example.com matches sub.example.com and subsub.sub.example.com
|
||||
|
||||
Args:
|
||||
domain: The domain to check (already normalized and lowercase)
|
||||
pattern: The pattern to match against (already normalized and lowercase)
|
||||
|
||||
Returns:
|
||||
True if the domain matches the pattern
|
||||
"""
|
||||
# Remove any remaining protocol or path from pattern
|
||||
pattern = pattern.replace("http://", "").replace("https://", "").split("/")[0]
|
||||
pattern = pattern.replace("www.", "") # Remove www. for consistent matching
|
||||
|
||||
# Exact match
|
||||
if domain == pattern:
|
||||
return True
|
||||
|
||||
# Wildcard subdomain match (*.example.com)
|
||||
if pattern.startswith("*."):
|
||||
base_pattern = pattern[2:] # Remove *.
|
||||
# Check if domain ends with the base pattern and has a subdomain
|
||||
if domain.endswith(base_pattern):
|
||||
# Make sure it's a proper subdomain, not just containing the pattern
|
||||
prefix = domain[:-len(base_pattern)]
|
||||
if prefix and prefix.endswith("."):
|
||||
return True
|
||||
|
||||
# Subdomain match (allow any subdomain of the pattern)
|
||||
# e.g., pattern=example.com should match sub.example.com
|
||||
if domain.endswith(f".{pattern}"):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def get_domains_from_urls(self, urls: list[str]) -> set[str]:
|
||||
"""
|
||||
Extract unique domains from a list of URLs.
|
||||
|
||||
Args:
|
||||
urls: List of URLs to extract domains from
|
||||
|
||||
Returns:
|
||||
Set of unique domains (normalized and lowercase)
|
||||
"""
|
||||
domains = set()
|
||||
for url in urls:
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
if parsed.netloc:
|
||||
domain = parsed.netloc.lower().replace("www.", "")
|
||||
domains.add(domain)
|
||||
except Exception as e:
|
||||
logger.debug(f"Could not extract domain from URL | url={url} | error={str(e)}")
|
||||
continue
|
||||
|
||||
return domains
|
||||
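A quick illustration (not part of the commit) of the priority order implemented above. The CrawlConfig field names and import paths mirror the unit tests below; lowercase domains are used so the sketch does not depend on any normalization the CrawlConfig model itself may perform.

from src.server.models.crawl_models import CrawlConfig
from src.server.services.crawling.domain_filter import DomainFilter

domain_filter = DomainFilter()
config = CrawlConfig(
    allowed_domains=["example.com"],       # whitelist: subdomains are matched implicitly
    excluded_domains=["ads.example.com"],  # blacklist: always wins
    include_patterns=["*/docs/*"],         # glob patterns are matched against the full URL
    exclude_patterns=["*.pdf"],
)

base = "https://example.com"
assert domain_filter.is_url_allowed("https://docs.example.com/docs/intro", base, config) is True
assert domain_filter.is_url_allowed("https://ads.example.com/docs/intro", base, config) is False   # 1: blacklisted
assert domain_filter.is_url_allowed("https://other.com/docs/intro", base, config) is False         # 2: not whitelisted
assert domain_filter.is_url_allowed("https://example.com/docs/manual.pdf", base, config) is False  # 3: exclude pattern
assert domain_filter.is_url_allowed("https://example.com/blog/post", base, config) is False        # 4: no include match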
@@ -21,17 +21,19 @@ logger = get_logger(__name__)
class RecursiveCrawlStrategy:
    """Strategy for recursive crawling of websites."""

    def __init__(self, crawler, markdown_generator):
    def __init__(self, crawler, markdown_generator, domain_filter=None):
        """
        Initialize recursive crawl strategy.

        Args:
            crawler (AsyncWebCrawler): The Crawl4AI crawler instance for web crawling operations
            markdown_generator (DefaultMarkdownGenerator): The markdown generator instance for converting HTML to markdown
            domain_filter: Optional DomainFilter instance for URL filtering
        """
        self.crawler = crawler
        self.markdown_generator = markdown_generator
        self.url_handler = URLHandler()
        self.domain_filter = domain_filter

    async def crawl_recursive_with_progress(
        self,
@@ -42,6 +44,7 @@ class RecursiveCrawlStrategy:
        max_concurrent: int | None = None,
        progress_callback: Callable[..., Awaitable[None]] | None = None,
        cancellation_check: Callable[[], None] | None = None,
        crawl_config=None,
    ) -> list[dict[str, Any]]:
        """
        Recursively crawl internal links from start URLs up to a maximum depth with progress reporting.
@@ -291,6 +294,13 @@ class RecursiveCrawlStrategy:
                    # Skip binary files and already visited URLs
                    is_binary = self.url_handler.is_binary_file(next_url)
                    if next_url not in visited and not is_binary:
                        # Apply domain filtering if configured
                        if self.domain_filter and crawl_config:
                            base_url = start_urls[0] if start_urls else original_url
                            if not self.domain_filter.is_url_allowed(next_url, base_url, crawl_config):
                                logger.debug(f"Filtering URL based on domain rules: {next_url}")
                                continue

                        if next_url not in next_level_urls:
                            next_level_urls.add(next_url)
                            total_discovered += 1  # Increment when we discover a new URL
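A stand-alone sketch (not part of the commit) of the link-discovery filtering added in the hunk above: candidate links found on a page are checked against DomainFilter before being queued for the next depth level (the visited-set and binary-file checks of the real strategy are omitted here).

from src.server.models.crawl_models import CrawlConfig
from src.server.services.crawling.domain_filter import DomainFilter

domain_filter = DomainFilter()
config = CrawlConfig(allowed_domains=["example.com"], exclude_patterns=["*/login/*"])
base_url = "https://example.com"

discovered_links = [
    "https://example.com/guide",        # kept
    "https://example.com/login/reset",  # dropped by exclude pattern
    "https://tracker.other.com/pixel",  # dropped: not in allowed_domains
]

next_level_urls = {
    url for url in discovered_links
    if domain_filter.is_url_allowed(url, base_url, config)
}

assert next_level_urls == {"https://example.com/guide"}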
python/src/server/services/tests/__init__.py (new file, 0 lines)
python/src/server/services/tests/test_domain_filter.py (new file, 204 lines)
@@ -0,0 +1,204 @@
"""
|
||||
Unit tests for domain filtering functionality
|
||||
"""
|
||||
|
||||
from src.server.models.crawl_models import CrawlConfig
|
||||
from src.server.services.crawling.domain_filter import DomainFilter
|
||||
|
||||
|
||||
class TestDomainFilter:
|
||||
"""Test suite for DomainFilter class."""
|
||||
|
||||
def setup_method(self):
|
||||
"""Set up test fixtures."""
|
||||
self.filter = DomainFilter()
|
||||
|
||||
def test_no_config_allows_all(self):
|
||||
"""Test that no configuration allows all URLs."""
|
||||
assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", None) is True
|
||||
assert self.filter.is_url_allowed("https://other.com/page", "https://example.com", None) is True
|
||||
|
||||
def test_whitelist_only(self):
|
||||
"""Test whitelist-only configuration."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["example.com", "docs.example.com"]
|
||||
)
|
||||
|
||||
# Should allow whitelisted domains
|
||||
assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://docs.example.com/api", "https://example.com", config) is True
|
||||
|
||||
# Should block non-whitelisted domains
|
||||
assert self.filter.is_url_allowed("https://other.com/page", "https://example.com", config) is False
|
||||
assert self.filter.is_url_allowed("https://evil.com", "https://example.com", config) is False
|
||||
|
||||
def test_blacklist_only(self):
|
||||
"""Test blacklist-only configuration."""
|
||||
config = CrawlConfig(
|
||||
excluded_domains=["evil.com", "ads.example.com"]
|
||||
)
|
||||
|
||||
# Should block blacklisted domains
|
||||
assert self.filter.is_url_allowed("https://evil.com/page", "https://example.com", config) is False
|
||||
assert self.filter.is_url_allowed("https://ads.example.com/track", "https://example.com", config) is False
|
||||
|
||||
# Should allow non-blacklisted domains
|
||||
assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://docs.example.com/api", "https://example.com", config) is True
|
||||
|
||||
def test_blacklist_overrides_whitelist(self):
|
||||
"""Test that blacklist takes priority over whitelist."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["example.com", "blog.example.com"],
|
||||
excluded_domains=["blog.example.com"]
|
||||
)
|
||||
|
||||
# Blacklist should override whitelist
|
||||
assert self.filter.is_url_allowed("https://blog.example.com/post", "https://example.com", config) is False
|
||||
|
||||
# Non-blacklisted whitelisted domain should work
|
||||
assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is True
|
||||
|
||||
def test_subdomain_matching(self):
|
||||
"""Test subdomain matching patterns."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["example.com"]
|
||||
)
|
||||
|
||||
# Should match subdomains of allowed domain
|
||||
assert self.filter.is_url_allowed("https://docs.example.com/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://api.example.com/v1", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://sub.sub.example.com", "https://example.com", config) is True
|
||||
|
||||
# Should not match different domains
|
||||
assert self.filter.is_url_allowed("https://notexample.com", "https://example.com", config) is False
|
||||
|
||||
def test_wildcard_subdomain_matching(self):
|
||||
"""Test wildcard subdomain patterns."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["*.example.com"]
|
||||
)
|
||||
|
||||
# Should match subdomains
|
||||
assert self.filter.is_url_allowed("https://docs.example.com/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://api.example.com/v1", "https://example.com", config) is True
|
||||
|
||||
# Should NOT match the base domain without subdomain
|
||||
assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is False
|
||||
|
||||
def test_url_patterns_include(self):
|
||||
"""Test include URL patterns."""
|
||||
config = CrawlConfig(
|
||||
include_patterns=["*/api/*", "*/docs/*"]
|
||||
)
|
||||
|
||||
# Should match include patterns
|
||||
assert self.filter.is_url_allowed("https://example.com/api/v1", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://example.com/docs/guide", "https://example.com", config) is True
|
||||
|
||||
# Should not match URLs not in patterns
|
||||
assert self.filter.is_url_allowed("https://example.com/blog/post", "https://example.com", config) is False
|
||||
assert self.filter.is_url_allowed("https://example.com/", "https://example.com", config) is False
|
||||
|
||||
def test_url_patterns_exclude(self):
|
||||
"""Test exclude URL patterns."""
|
||||
config = CrawlConfig(
|
||||
exclude_patterns=["*/private/*", "*.pdf", "*/admin/*"]
|
||||
)
|
||||
|
||||
# Should block excluded patterns
|
||||
assert self.filter.is_url_allowed("https://example.com/private/data", "https://example.com", config) is False
|
||||
assert self.filter.is_url_allowed("https://example.com/file.pdf", "https://example.com", config) is False
|
||||
assert self.filter.is_url_allowed("https://example.com/admin/panel", "https://example.com", config) is False
|
||||
|
||||
# Should allow non-excluded URLs
|
||||
assert self.filter.is_url_allowed("https://example.com/public/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://example.com/file.html", "https://example.com", config) is True
|
||||
|
||||
def test_combined_filters(self):
|
||||
"""Test combination of all filter types."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["example.com", "docs.example.com"],
|
||||
excluded_domains=["ads.example.com"],
|
||||
include_patterns=["*/api/*", "*/guide/*"],
|
||||
exclude_patterns=["*/deprecated/*"]
|
||||
)
|
||||
|
||||
# Should pass all filters
|
||||
assert self.filter.is_url_allowed("https://docs.example.com/api/v2", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://example.com/guide/intro", "https://example.com", config) is True
|
||||
|
||||
# Should fail on blacklist (highest priority)
|
||||
assert self.filter.is_url_allowed("https://ads.example.com/api/track", "https://example.com", config) is False
|
||||
|
||||
# Should fail on not in whitelist
|
||||
assert self.filter.is_url_allowed("https://other.com/api/v1", "https://example.com", config) is False
|
||||
|
||||
# Should fail on exclude pattern
|
||||
assert self.filter.is_url_allowed("https://example.com/api/deprecated/old", "https://example.com", config) is False
|
||||
|
||||
# Should fail on not matching include pattern
|
||||
assert self.filter.is_url_allowed("https://example.com/blog/post", "https://example.com", config) is False
|
||||
|
||||
def test_relative_urls(self):
|
||||
"""Test handling of relative URLs."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["example.com"]
|
||||
)
|
||||
|
||||
# Relative URLs should use base URL's domain
|
||||
assert self.filter.is_url_allowed("/page/path", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("page.html", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("../other/page", "https://example.com", config) is True
|
||||
|
||||
def test_domain_normalization(self):
|
||||
"""Test that domains are properly normalized."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["EXAMPLE.COM", "https://docs.example.com/", "www.test.com"]
|
||||
)
|
||||
|
||||
# Should handle different cases and formats
|
||||
assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://EXAMPLE.COM/PAGE", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://docs.example.com/api", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://www.test.com/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://test.com/page", "https://example.com", config) is True
|
||||
|
||||
def test_edge_cases(self):
|
||||
"""Test edge cases and error handling."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["example.com"]
|
||||
)
|
||||
|
||||
# Should handle malformed URLs gracefully
|
||||
assert self.filter.is_url_allowed("not-a-url", "https://example.com", config) is True # Treated as relative
|
||||
assert self.filter.is_url_allowed("", "https://example.com", config) is True # Empty URL
|
||||
assert self.filter.is_url_allowed("//example.com/page", "https://example.com", config) is True # Protocol-relative
|
||||
|
||||
def test_get_domains_from_urls(self):
|
||||
"""Test extracting domains from URL list."""
|
||||
urls = [
|
||||
"https://example.com/page1",
|
||||
"https://docs.example.com/api",
|
||||
"https://example.com/page2",
|
||||
"https://other.com/resource",
|
||||
"https://WWW.TEST.COM/page",
|
||||
"/relative/path", # Should be skipped
|
||||
"invalid-url", # Should be skipped
|
||||
]
|
||||
|
||||
domains = self.filter.get_domains_from_urls(urls)
|
||||
|
||||
assert domains == {"example.com", "docs.example.com", "other.com", "test.com"}
|
||||
|
||||
def test_empty_filter_lists(self):
|
||||
"""Test that empty filter lists behave correctly."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=[],
|
||||
excluded_domains=[],
|
||||
include_patterns=[],
|
||||
exclude_patterns=[]
|
||||
)
|
||||
|
||||
# Empty lists should be ignored (allow all)
|
||||
assert self.filter.is_url_allowed("https://any.com/page", "https://example.com", config) is True
|
||||
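A small sketch (not part of the commit) showing how get_domains_from_urls can feed the per-domain view described in the commit message (domain pills with document counts in the Inspector sidebar). The Counter-based tally below is illustrative only; it uses the same normalization as get_domains_from_urls and is not the UI's actual code.

from collections import Counter
from urllib.parse import urlparse

from src.server.services.crawling.domain_filter import DomainFilter

crawled_urls = [
    "https://example.com/a",
    "https://docs.example.com/api",
    "https://www.example.com/b",
]

domain_filter = DomainFilter()
print(domain_filter.get_domains_from_urls(crawled_urls))
# {'example.com', 'docs.example.com'}  (a set, so order may vary)

# Per-domain document counts, normalized the same way:
counts = Counter(urlparse(u).netloc.lower().replace("www.", "") for u in crawled_urls)
print(counts)  # Counter({'example.com': 2, 'docs.example.com': 1})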