Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-24 02:39:17 -05:00)
feat: Add advanced web crawling with domain filtering
- Implement domain filtering for web crawler with whitelist/blacklist support
- Add URL pattern matching (glob-style) for include/exclude patterns
- Create AdvancedCrawlConfig UI component with collapsible panel
- Add domain filter to Knowledge Inspector sidebar for easy filtering
- Implement crawl-v2 API endpoint with backward compatibility
- Add comprehensive unit tests for domain filtering logic

Implements priority-based filtering:
1. Blacklist (excluded_domains) - highest priority
2. Whitelist (allowed_domains) - must match if provided
3. Exclude patterns - glob patterns to exclude
4. Include patterns - glob patterns to include

UI improvements:
- Advanced configuration section in Add Knowledge dialog
- Domain pills in Inspector sidebar showing document distribution
- Visual domain indicators on each document
- Responsive domain filtering with document counts

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
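The priority order above is the core of the filtering behavior. Below is a minimal standalone sketch of that decision order, for illustration only; the actual implementation is DomainFilter.is_url_allowed in python/src/server/services/crawling/domain_filter.py later in this diff, and the config keys match CrawlConfig.

```python
import fnmatch


def url_passes(domain: str, url: str, cfg: dict) -> bool:
    """Illustrative sketch of the priority order: blacklist, whitelist, exclude, include."""
    # 1. Blacklist (excluded_domains): highest priority, always blocks.
    if domain in cfg.get("excluded_domains", []):
        return False
    # 2. Whitelist (allowed_domains): if provided, the domain must match.
    allowed = cfg.get("allowed_domains", [])
    if allowed and domain not in allowed:
        return False
    # 3. Exclude patterns: glob patterns that block matching URLs.
    if any(fnmatch.fnmatch(url, p) for p in cfg.get("exclude_patterns", [])):
        return False
    # 4. Include patterns: if provided, the URL must match at least one.
    include = cfg.get("include_patterns", [])
    if include and not any(fnmatch.fnmatch(url, p) for p in include):
        return False
    return True
```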
@@ -10,8 +10,9 @@ import { Button, Input, Label } from "../../ui/primitives";
|
||||
import { Dialog, DialogContent, DialogDescription, DialogHeader, DialogTitle } from "../../ui/primitives/dialog";
|
||||
import { cn } from "../../ui/primitives/styles";
|
||||
import { Tabs, TabsContent } from "../../ui/primitives/tabs";
|
||||
import { useCrawlUrl, useUploadDocument } from "../hooks";
|
||||
import type { CrawlRequest, UploadMetadata } from "../types";
|
||||
import { useCrawlUrl, useCrawlUrlV2, useUploadDocument } from "../hooks";
|
||||
import type { CrawlConfig, CrawlRequest, CrawlRequestV2, UploadMetadata } from "../types";
|
||||
import { AdvancedCrawlConfig } from "./AdvancedCrawlConfig";
|
||||
import { KnowledgeTypeSelector } from "./KnowledgeTypeSelector";
|
||||
import { LevelSelector } from "./LevelSelector";
|
||||
import { TagInput } from "./TagInput";
|
||||
@@ -32,6 +33,7 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
const [activeTab, setActiveTab] = useState<"crawl" | "upload">("crawl");
|
||||
const { showToast } = useToast();
|
||||
const crawlMutation = useCrawlUrl();
|
||||
const crawlV2Mutation = useCrawlUrlV2();
|
||||
const uploadMutation = useUploadDocument();
|
||||
|
||||
// Generate unique IDs for form elements
|
||||
@@ -43,6 +45,7 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
const [crawlType, setCrawlType] = useState<"technical" | "business">("technical");
|
||||
const [maxDepth, setMaxDepth] = useState("2");
|
||||
const [tags, setTags] = useState<string[]>([]);
|
||||
const [crawlConfig, setCrawlConfig] = useState<CrawlConfig>({});
|
||||
|
||||
// Upload form state
|
||||
const [selectedFile, setSelectedFile] = useState<File | null>(null);
|
||||
@@ -54,6 +57,7 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
setCrawlType("technical");
|
||||
setMaxDepth("2");
|
||||
setTags([]);
|
||||
setCrawlConfig({});
|
||||
setSelectedFile(null);
|
||||
setUploadType("technical");
|
||||
setUploadTags([]);
|
||||
@@ -66,21 +70,42 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
}
|
||||
|
||||
try {
|
||||
const request: CrawlRequest = {
|
||||
url: crawlUrl,
|
||||
knowledge_type: crawlType,
|
||||
max_depth: parseInt(maxDepth, 10),
|
||||
tags: tags.length > 0 ? tags : undefined,
|
||||
};
|
||||
// Check if we have any domain filtering configuration
|
||||
const hasCrawlConfig =
|
||||
(crawlConfig.allowed_domains && crawlConfig.allowed_domains.length > 0) ||
|
||||
(crawlConfig.excluded_domains && crawlConfig.excluded_domains.length > 0) ||
|
||||
(crawlConfig.include_patterns && crawlConfig.include_patterns.length > 0) ||
|
||||
(crawlConfig.exclude_patterns && crawlConfig.exclude_patterns.length > 0);
|
||||
|
||||
const response = await crawlMutation.mutateAsync(request);
|
||||
let response;
|
||||
|
||||
if (hasCrawlConfig) {
|
||||
// Use v2 endpoint with domain filtering
|
||||
const requestV2: CrawlRequestV2 = {
|
||||
url: crawlUrl,
|
||||
knowledge_type: crawlType,
|
||||
max_depth: parseInt(maxDepth, 10),
|
||||
tags: tags.length > 0 ? tags : undefined,
|
||||
crawl_config: crawlConfig,
|
||||
};
|
||||
response = await crawlV2Mutation.mutateAsync(requestV2);
|
||||
} else {
|
||||
// Use regular endpoint
|
||||
const request: CrawlRequest = {
|
||||
url: crawlUrl,
|
||||
knowledge_type: crawlType,
|
||||
max_depth: parseInt(maxDepth, 10),
|
||||
tags: tags.length > 0 ? tags : undefined,
|
||||
};
|
||||
response = await crawlMutation.mutateAsync(request);
|
||||
}
|
||||
|
||||
// Notify parent about the new crawl operation
|
||||
if (response?.progressId && onCrawlStarted) {
|
||||
onCrawlStarted(response.progressId);
|
||||
}
|
||||
|
||||
showToast("Crawl started successfully", "success");
|
||||
showToast(hasCrawlConfig ? "Crawl started with domain filtering" : "Crawl started successfully", "success");
|
||||
resetForm();
|
||||
onSuccess();
|
||||
onOpenChange(false);
|
||||
@@ -123,19 +148,19 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
}
|
||||
};
|
||||
|
||||
const isProcessing = crawlMutation.isPending || uploadMutation.isPending;
|
||||
const isProcessing = crawlMutation.isPending || crawlV2Mutation.isPending || uploadMutation.isPending;
|
||||
|
||||
return (
|
||||
<Dialog open={open} onOpenChange={onOpenChange}>
|
||||
<DialogContent className="sm:max-w-[600px]">
|
||||
<DialogHeader>
|
||||
<DialogContent className="sm:max-w-[600px]" style={{ maxHeight: "85vh", display: "flex", flexDirection: "column" }}>
|
||||
<DialogHeader className="flex-shrink-0">
|
||||
<DialogTitle>Add Knowledge</DialogTitle>
|
||||
<DialogDescription>Crawl websites or upload documents to expand your knowledge base.</DialogDescription>
|
||||
</DialogHeader>
|
||||
|
||||
<Tabs value={activeTab} onValueChange={(v) => setActiveTab(v as "crawl" | "upload")}>
|
||||
<Tabs value={activeTab} onValueChange={(v) => setActiveTab(v as "crawl" | "upload")} className="flex-1 flex flex-col min-h-0">
|
||||
{/* Enhanced Tab Buttons */}
|
||||
<div className="grid grid-cols-2 gap-3 p-2 rounded-xl backdrop-blur-md bg-gradient-to-b from-gray-100/30 via-gray-50/20 to-white/40 dark:from-gray-900/30 dark:via-gray-800/20 dark:to-black/40 border border-gray-200/40 dark:border-gray-700/40">
|
||||
<div className="grid grid-cols-2 gap-3 p-2 rounded-xl backdrop-blur-md bg-gradient-to-b from-gray-100/30 via-gray-50/20 to-white/40 dark:from-gray-900/30 dark:via-gray-800/20 dark:to-black/40 border border-gray-200/40 dark:border-gray-700/40 flex-shrink-0">
|
||||
{/* Crawl Website Tab */}
|
||||
<button
|
||||
type="button"
|
||||
@@ -190,7 +215,16 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
</div>
|
||||
|
||||
{/* Crawl Tab */}
|
||||
<TabsContent value="crawl" className="space-y-6 mt-6">
|
||||
<TabsContent value="crawl" className="mt-6 flex-1 min-h-0">
|
||||
<div
|
||||
className="overflow-y-auto overflow-x-hidden pr-2 scrollbar-thin scrollbar-thumb-gray-400 dark:scrollbar-thumb-gray-600 scrollbar-track-transparent"
|
||||
style={{
|
||||
maxHeight: "calc(85vh - 200px)",
|
||||
overflowY: "scroll",
|
||||
WebkitOverflowScrolling: "touch",
|
||||
scrollbarWidth: "thin"
|
||||
}}>
|
||||
<div className="space-y-6 pb-4">
|
||||
{/* Enhanced URL Input Section */}
|
||||
<div className="space-y-3">
|
||||
<Label htmlFor={urlId} className="text-sm font-medium text-gray-900 dark:text-white/90">
|
||||
@@ -215,6 +249,9 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{/* Advanced Configuration - positioned directly below URL */}
|
||||
<AdvancedCrawlConfig config={crawlConfig} onChange={setCrawlConfig} />
|
||||
|
||||
<div className="space-y-6">
|
||||
<KnowledgeTypeSelector value={crawlType} onValueChange={setCrawlType} disabled={isProcessing} />
|
||||
|
||||
@@ -233,7 +270,7 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
disabled={isProcessing || !crawlUrl}
|
||||
className="w-full bg-gradient-to-r from-cyan-500 to-cyan-600 hover:from-cyan-600 hover:to-cyan-700 backdrop-blur-md border border-cyan-400/50 shadow-[0_0_20px_rgba(6,182,212,0.25)] hover:shadow-[0_0_30px_rgba(6,182,212,0.35)] transition-all duration-200"
|
||||
>
|
||||
{crawlMutation.isPending ? (
|
||||
{(crawlMutation.isPending || crawlV2Mutation.isPending) ? (
|
||||
<>
|
||||
<Loader2 className="w-4 h-4 mr-2 animate-spin" />
|
||||
Starting Crawl...
|
||||
@@ -245,10 +282,21 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
</>
|
||||
)}
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
</TabsContent>
|
||||
|
||||
{/* Upload Tab */}
|
||||
<TabsContent value="upload" className="space-y-6 mt-6">
|
||||
<TabsContent value="upload" className="mt-6 flex-1 min-h-0">
|
||||
<div
|
||||
className="overflow-y-auto overflow-x-hidden pr-2 scrollbar-thin scrollbar-thumb-gray-400 dark:scrollbar-thumb-gray-600 scrollbar-track-transparent"
|
||||
style={{
|
||||
maxHeight: "calc(85vh - 200px)",
|
||||
overflowY: "scroll",
|
||||
WebkitOverflowScrolling: "touch",
|
||||
scrollbarWidth: "thin"
|
||||
}}>
|
||||
<div className="space-y-6 pb-4">
|
||||
{/* Enhanced File Input Section */}
|
||||
<div className="space-y-3">
|
||||
<Label htmlFor={fileId} className="text-sm font-medium text-gray-900 dark:text-white/90">
|
||||
@@ -326,6 +374,8 @@ export const AddKnowledgeDialog: React.FC<AddKnowledgeDialogProps> = ({
|
||||
</>
|
||||
)}
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
</TabsContent>
|
||||
</Tabs>
|
||||
</DialogContent>
|
||||
|
||||
@@ -0,0 +1,308 @@
|
||||
/**
|
||||
* Advanced Crawl Configuration Component
|
||||
* Provides UI for configuring domain filtering and URL patterns
|
||||
*/
|
||||
|
||||
import { ChevronDown, Info, Plus, X } from "lucide-react";
|
||||
import React, { useState } from "react";
|
||||
import type { CrawlConfig } from "../types";
|
||||
|
||||
interface Props {
|
||||
config: CrawlConfig;
|
||||
onChange: (config: CrawlConfig) => void;
|
||||
}
|
||||
|
||||
export const AdvancedCrawlConfig: React.FC<Props> = ({ config, onChange }) => {
|
||||
const [isExpanded, setIsExpanded] = useState(false);
|
||||
const [newDomain, setNewDomain] = useState("");
|
||||
const [newPattern, setNewPattern] = useState("");
|
||||
const [activeTab, setActiveTab] = useState<"allowed" | "excluded">("allowed");
|
||||
const [patternTab, setPatternTab] = useState<"include" | "exclude">("include");
|
||||
|
||||
const handleAddDomain = (type: "allowed" | "excluded") => {
|
||||
if (!newDomain.trim()) return;
|
||||
|
||||
const domain = newDomain.trim().toLowerCase().replace(/^https?:\/\//, "").replace(/\/$/, "");
|
||||
const key = `${type}_domains` as keyof CrawlConfig;
|
||||
const current = config[key] || [];
|
||||
|
||||
if (!current.includes(domain)) {
|
||||
onChange({
|
||||
...config,
|
||||
[key]: [...current, domain],
|
||||
});
|
||||
}
|
||||
|
||||
setNewDomain("");
|
||||
};
|
||||
|
||||
const handleRemoveDomain = (type: "allowed" | "excluded", domain: string) => {
|
||||
const key = `${type}_domains` as keyof CrawlConfig;
|
||||
onChange({
|
||||
...config,
|
||||
[key]: (config[key] || []).filter(d => d !== domain),
|
||||
});
|
||||
};
|
||||
|
||||
const handleAddPattern = (type: "include" | "exclude") => {
|
||||
if (!newPattern.trim()) return;
|
||||
|
||||
const key = `${type}_patterns` as keyof CrawlConfig;
|
||||
const current = config[key] || [];
|
||||
|
||||
if (!current.includes(newPattern)) {
|
||||
onChange({
|
||||
...config,
|
||||
[key]: [...current, newPattern],
|
||||
});
|
||||
}
|
||||
|
||||
setNewPattern("");
|
||||
};
|
||||
|
||||
const handleRemovePattern = (type: "include" | "exclude", pattern: string) => {
|
||||
const key = `${type}_patterns` as keyof CrawlConfig;
|
||||
onChange({
|
||||
...config,
|
||||
[key]: (config[key] || []).filter(p => p !== pattern),
|
||||
});
|
||||
};
|
||||
|
||||
const hasAnyConfig =
|
||||
(config.allowed_domains && config.allowed_domains.length > 0) ||
|
||||
(config.excluded_domains && config.excluded_domains.length > 0) ||
|
||||
(config.include_patterns && config.include_patterns.length > 0) ||
|
||||
(config.exclude_patterns && config.exclude_patterns.length > 0);
|
||||
|
||||
return (
|
||||
<div className="border border-gray-800 rounded-lg bg-gray-900/50 backdrop-blur-sm">
|
||||
<button
|
||||
onClick={() => setIsExpanded(!isExpanded)}
|
||||
className="w-full p-4 flex items-center justify-between hover:bg-gray-800/30 transition-colors"
|
||||
>
|
||||
<div className="flex items-center gap-2">
|
||||
<span className="text-gray-200 font-medium">Advanced Configuration</span>
|
||||
{hasAnyConfig && (
|
||||
<span className="text-xs bg-blue-500/20 text-blue-400 px-2 py-1 rounded-full">
|
||||
Active filters
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
<ChevronDown
|
||||
className={`w-5 h-5 text-gray-400 transform transition-transform ${
|
||||
isExpanded ? "rotate-180" : ""
|
||||
}`}
|
||||
/>
|
||||
</button>
|
||||
|
||||
{isExpanded && (
|
||||
<div className="p-4 space-y-4 border-t border-gray-800">
|
||||
{/* Domain Filters Section */}
|
||||
<div>
|
||||
<div className="flex items-center gap-2 mb-3">
|
||||
<h3 className="text-sm font-medium text-gray-300">Domain Filters</h3>
|
||||
<div className="group relative">
|
||||
<Info className="w-4 h-4 text-gray-500 cursor-help" />
|
||||
<div className="absolute left-0 bottom-full mb-1 w-64 p-2 bg-gray-800 rounded text-xs text-gray-300
|
||||
opacity-0 group-hover:opacity-100 transition-opacity pointer-events-none z-50">
|
||||
Control which domains are crawled. Blacklist takes priority over whitelist.
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Domain Tabs */}
|
||||
<div className="flex gap-2 mb-3">
|
||||
<button
|
||||
onClick={() => setActiveTab("allowed")}
|
||||
className={`px-3 py-1 text-sm rounded transition-colors ${
|
||||
activeTab === "allowed"
|
||||
? "bg-green-500/20 text-green-400"
|
||||
: "bg-gray-800 text-gray-400 hover:bg-gray-700"
|
||||
}`}
|
||||
>
|
||||
Allowed Domains ({config.allowed_domains?.length || 0})
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setActiveTab("excluded")}
|
||||
className={`px-3 py-1 text-sm rounded transition-colors ${
|
||||
activeTab === "excluded"
|
||||
? "bg-red-500/20 text-red-400"
|
||||
: "bg-gray-800 text-gray-400 hover:bg-gray-700"
|
||||
}`}
|
||||
>
|
||||
Excluded Domains ({config.excluded_domains?.length || 0})
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{/* Domain Input */}
|
||||
<div className="flex gap-2 mb-2">
|
||||
<input
|
||||
type="text"
|
||||
value={newDomain}
|
||||
onChange={(e) => setNewDomain(e.target.value)}
|
||||
onKeyDown={(e) => {
|
||||
if (e.key === "Enter") {
|
||||
handleAddDomain(activeTab);
|
||||
}
|
||||
}}
|
||||
placeholder={`Add ${activeTab} domain (e.g., docs.example.com)`}
|
||||
className="flex-1 px-3 py-2 bg-gray-800 border border-gray-700 rounded text-sm text-gray-200
|
||||
placeholder-gray-500 focus:outline-none focus:border-blue-500 transition-colors"
|
||||
/>
|
||||
<button
|
||||
onClick={() => handleAddDomain(activeTab)}
|
||||
className="px-4 py-2 bg-blue-600 hover:bg-blue-700 text-white rounded text-sm
|
||||
transition-colors flex items-center gap-1"
|
||||
>
|
||||
<Plus className="w-4 h-4" />
|
||||
Add
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{/* Domain List */}
|
||||
<div className="space-y-1 max-h-32 overflow-y-auto">
|
||||
{activeTab === "allowed" && config.allowed_domains?.map(domain => (
|
||||
<div
|
||||
key={domain}
|
||||
className="flex items-center justify-between px-3 py-1 bg-green-500/10
|
||||
rounded text-sm text-green-400 group"
|
||||
>
|
||||
<span>{domain}</span>
|
||||
<button
|
||||
onClick={() => handleRemoveDomain("allowed", domain)}
|
||||
className="opacity-0 group-hover:opacity-100 transition-opacity"
|
||||
>
|
||||
<X className="w-4 h-4 hover:text-red-400" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
{activeTab === "excluded" && config.excluded_domains?.map(domain => (
|
||||
<div
|
||||
key={domain}
|
||||
className="flex items-center justify-between px-3 py-1 bg-red-500/10
|
||||
rounded text-sm text-red-400 group"
|
||||
>
|
||||
<span>{domain}</span>
|
||||
<button
|
||||
onClick={() => handleRemoveDomain("excluded", domain)}
|
||||
className="opacity-0 group-hover:opacity-100 transition-opacity"
|
||||
>
|
||||
<X className="w-4 h-4 hover:text-red-400" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* URL Patterns Section */}
|
||||
<div>
|
||||
<div className="flex items-center gap-2 mb-3">
|
||||
<h3 className="text-sm font-medium text-gray-300">URL Patterns</h3>
|
||||
<div className="group relative">
|
||||
<Info className="w-4 h-4 text-gray-500 cursor-help" />
|
||||
<div className="absolute left-0 bottom-full mb-1 w-64 p-2 bg-gray-800 rounded text-xs text-gray-300
|
||||
opacity-0 group-hover:opacity-100 transition-opacity pointer-events-none z-50">
|
||||
Use glob patterns to filter URLs. Example: */docs/* or *.pdf
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Pattern Tabs */}
|
||||
<div className="flex gap-2 mb-3">
|
||||
<button
|
||||
onClick={() => setPatternTab("include")}
|
||||
className={`px-3 py-1 text-sm rounded transition-colors ${
|
||||
patternTab === "include"
|
||||
? "bg-green-500/20 text-green-400"
|
||||
: "bg-gray-800 text-gray-400 hover:bg-gray-700"
|
||||
}`}
|
||||
>
|
||||
Include Patterns ({config.include_patterns?.length || 0})
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setPatternTab("exclude")}
|
||||
className={`px-3 py-1 text-sm rounded transition-colors ${
|
||||
patternTab === "exclude"
|
||||
? "bg-red-500/20 text-red-400"
|
||||
: "bg-gray-800 text-gray-400 hover:bg-gray-700"
|
||||
}`}
|
||||
>
|
||||
Exclude Patterns ({config.exclude_patterns?.length || 0})
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{/* Pattern Input */}
|
||||
<div className="flex gap-2 mb-2">
|
||||
<input
|
||||
type="text"
|
||||
value={newPattern}
|
||||
onChange={(e) => setNewPattern(e.target.value)}
|
||||
onKeyDown={(e) => {
|
||||
if (e.key === "Enter") {
|
||||
handleAddPattern(patternTab);
|
||||
}
|
||||
}}
|
||||
placeholder={`Add ${patternTab} pattern (e.g., */api/* or *.pdf)`}
|
||||
className="flex-1 px-3 py-2 bg-gray-800 border border-gray-700 rounded text-sm text-gray-200
|
||||
placeholder-gray-500 focus:outline-none focus:border-blue-500 transition-colors"
|
||||
/>
|
||||
<button
|
||||
onClick={() => handleAddPattern(patternTab)}
|
||||
className="px-4 py-2 bg-blue-600 hover:bg-blue-700 text-white rounded text-sm
|
||||
transition-colors flex items-center gap-1"
|
||||
>
|
||||
<Plus className="w-4 h-4" />
|
||||
Add
|
||||
</button>
|
||||
</div>
|
||||
|
||||
{/* Pattern List */}
|
||||
<div className="space-y-1 max-h-32 overflow-y-auto">
|
||||
{patternTab === "include" && config.include_patterns?.map(pattern => (
|
||||
<div
|
||||
key={pattern}
|
||||
className="flex items-center justify-between px-3 py-1 bg-green-500/10
|
||||
rounded text-sm text-green-400 group font-mono"
|
||||
>
|
||||
<span>{pattern}</span>
|
||||
<button
|
||||
onClick={() => handleRemovePattern("include", pattern)}
|
||||
className="opacity-0 group-hover:opacity-100 transition-opacity"
|
||||
>
|
||||
<X className="w-4 h-4 hover:text-red-400" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
{patternTab === "exclude" && config.exclude_patterns?.map(pattern => (
|
||||
<div
|
||||
key={pattern}
|
||||
className="flex items-center justify-between px-3 py-1 bg-red-500/10
|
||||
rounded text-sm text-red-400 group font-mono"
|
||||
>
|
||||
<span>{pattern}</span>
|
||||
<button
|
||||
onClick={() => handleRemovePattern("exclude", pattern)}
|
||||
className="opacity-0 group-hover:opacity-100 transition-opacity"
|
||||
>
|
||||
<X className="w-4 h-4 hover:text-red-400" />
|
||||
</button>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Clear All Button */}
|
||||
{hasAnyConfig && (
|
||||
<button
|
||||
onClick={() => onChange({})}
|
||||
className="px-3 py-1 text-xs bg-gray-800 hover:bg-gray-700 text-gray-400
|
||||
rounded transition-colors"
|
||||
>
|
||||
Clear All Filters
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
);
|
||||
};
|
||||
@@ -3,13 +3,14 @@
|
||||
* Shows document chunks and code examples for a knowledge item
|
||||
*/
|
||||
|
||||
import { ChevronDown, ChevronRight, Code, FileText, Search } from "lucide-react";
|
||||
import { useState } from "react";
|
||||
import { ChevronDown, ChevronRight, Code, ExternalLink, FileText, Globe, Search, X } from "lucide-react";
|
||||
import { useMemo, useState } from "react";
|
||||
import { Input } from "../../ui/primitives";
|
||||
import { Dialog, DialogContent, DialogHeader, DialogTitle } from "../../ui/primitives/dialog";
|
||||
import { cn } from "../../ui/primitives/styles";
|
||||
import { Tabs, TabsContent, TabsList, TabsTrigger } from "../../ui/primitives/tabs";
|
||||
import { useCodeExamples, useKnowledgeItemChunks } from "../hooks";
|
||||
import { useCodeExamples, useKnowledgeItem, useKnowledgeItemChunks } from "../hooks";
|
||||
import { extractDomain } from "../utils/knowledge-utils";
|
||||
|
||||
interface DocumentBrowserProps {
|
||||
sourceId: string;
|
||||
@@ -21,7 +22,9 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
|
||||
const [activeTab, setActiveTab] = useState<"documents" | "code">("documents");
|
||||
const [searchQuery, setSearchQuery] = useState("");
|
||||
const [expandedChunks, setExpandedChunks] = useState<Set<string>>(new Set());
|
||||
const [selectedDomains, setSelectedDomains] = useState<Set<string>>(new Set());
|
||||
|
||||
const { data: sourceItem } = useKnowledgeItem(sourceId);
|
||||
const {
|
||||
data: chunksData,
|
||||
isLoading: chunksLoading,
|
||||
@@ -33,12 +36,36 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
|
||||
const chunks = chunksData?.chunks || [];
|
||||
const codeExamples = codeData?.code_examples || [];
|
||||
|
||||
// Filter chunks based on search
|
||||
const filteredChunks = chunks.filter(
|
||||
(chunk) =>
|
||||
// Extract unique domains from chunks
|
||||
const domainStats = useMemo(() => {
|
||||
const stats = new Map<string, number>();
|
||||
chunks.forEach((chunk) => {
|
||||
const url = chunk.url || chunk.metadata?.url;
|
||||
if (url) {
|
||||
const domain = extractDomain(url);
|
||||
stats.set(domain, (stats.get(domain) || 0) + 1);
|
||||
}
|
||||
});
|
||||
|
||||
return Array.from(stats.entries())
|
||||
.sort((a, b) => b[1] - a[1]) // Sort by count descending
|
||||
.map(([domain, count]) => ({ domain, count }));
|
||||
}, [chunks]);
|
||||
|
||||
// Filter chunks based on search and domain
|
||||
const filteredChunks = chunks.filter((chunk) => {
|
||||
// Search filter
|
||||
const matchesSearch =
|
||||
!searchQuery ||
|
||||
chunk.content.toLowerCase().includes(searchQuery.toLowerCase()) ||
|
||||
chunk.metadata?.title?.toLowerCase().includes(searchQuery.toLowerCase()),
|
||||
);
|
||||
chunk.metadata?.title?.toLowerCase().includes(searchQuery.toLowerCase());
|
||||
|
||||
// Domain filter
|
||||
const url = chunk.url || chunk.metadata?.url;
|
||||
const matchesDomain = selectedDomains.size === 0 || (url && selectedDomains.has(extractDomain(url)));
|
||||
|
||||
return matchesSearch && matchesDomain;
|
||||
});
|
||||
|
||||
// Filter code examples based on search
|
||||
const filteredCode = codeExamples.filter((example) => {
|
||||
@@ -66,9 +93,30 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
|
||||
<Dialog open={open} onOpenChange={onOpenChange}>
|
||||
<DialogContent className="max-w-4xl h-[80vh] flex flex-col">
|
||||
<DialogHeader>
|
||||
<DialogTitle>Document Browser</DialogTitle>
|
||||
<div className="flex items-center gap-2 mt-4">
|
||||
<div className="relative flex-1">
|
||||
<DialogTitle className="flex items-center justify-between">
|
||||
<div className="flex items-center gap-2">
|
||||
Document Browser
|
||||
{chunksData && (
|
||||
<span className="text-sm text-gray-400 font-normal">
|
||||
({chunks.length} documents from {domainStats.length} domain{domainStats.length !== 1 ? "s" : ""})
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
{sourceItem && sourceItem.url && (
|
||||
<a
|
||||
href={sourceItem.url}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="flex items-center gap-1 text-xs text-cyan-400 hover:text-cyan-300 transition-colors"
|
||||
>
|
||||
<ExternalLink className="w-3 h-3" />
|
||||
View Source
|
||||
</a>
|
||||
)}
|
||||
</DialogTitle>
|
||||
<div className="space-y-3 mt-4">
|
||||
{/* Search Bar */}
|
||||
<div className="relative">
|
||||
<Search className="absolute left-3 top-1/2 -translate-y-1/2 w-4 h-4 text-gray-400" />
|
||||
<Input
|
||||
type="text"
|
||||
@@ -78,6 +126,61 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
|
||||
className="pl-10 bg-black/30 border-white/10 focus:border-cyan-500/50"
|
||||
/>
|
||||
</div>
|
||||
|
||||
{/* Domain Filter */}
|
||||
{domainStats.length > 0 && (
|
||||
<div className="space-y-2">
|
||||
<div className="text-sm text-gray-400 flex items-center gap-2">
|
||||
<Globe className="w-4 h-4" />
|
||||
Domain Filter
|
||||
{selectedDomains.size > 0 && (
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => setSelectedDomains(new Set())}
|
||||
className="ml-auto text-xs text-cyan-400 hover:text-cyan-300 flex items-center gap-1"
|
||||
>
|
||||
<X className="w-3 h-3" />
|
||||
Clear filter
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
<div className="flex flex-wrap gap-2">
|
||||
{domainStats.map(({ domain, count }) => {
|
||||
const isSelected = selectedDomains.has(domain);
|
||||
return (
|
||||
<button
|
||||
key={domain}
|
||||
type="button"
|
||||
onClick={() => {
|
||||
const newSelection = new Set(selectedDomains);
|
||||
if (isSelected) {
|
||||
newSelection.delete(domain);
|
||||
} else {
|
||||
newSelection.add(domain);
|
||||
}
|
||||
setSelectedDomains(newSelection);
|
||||
}}
|
||||
className={cn(
|
||||
"px-3 py-1 text-xs rounded-full border transition-all",
|
||||
"flex items-center gap-2",
|
||||
isSelected
|
||||
? "bg-cyan-500/20 border-cyan-500/50 text-cyan-400"
|
||||
: "bg-black/20 border-white/10 text-gray-400 hover:border-cyan-500/30 hover:text-cyan-400"
|
||||
)}
|
||||
>
|
||||
<span className="truncate max-w-[200px]">{domain}</span>
|
||||
<span className={cn(
|
||||
"px-1.5 py-0.5 rounded text-[10px] font-mono",
|
||||
isSelected ? "bg-cyan-500/30" : "bg-white/10"
|
||||
)}>
|
||||
{count}
|
||||
</span>
|
||||
</button>
|
||||
);
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</DialogHeader>
|
||||
|
||||
@@ -123,8 +226,9 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
|
||||
key={chunk.id}
|
||||
className="bg-black/30 rounded-lg border border-white/10 p-4 hover:border-cyan-500/30 transition-colors"
|
||||
>
|
||||
{chunk.metadata?.title && (
|
||||
<h4 className="font-medium text-white/90 mb-2 flex items-center gap-2">
|
||||
<div className="flex items-start justify-between gap-2 mb-2">
|
||||
{chunk.metadata?.title && (
|
||||
<h4 className="font-medium text-white/90 flex items-center gap-2 flex-1">
|
||||
{needsExpansion && (
|
||||
<button
|
||||
type="button"
|
||||
@@ -140,7 +244,20 @@ export const DocumentBrowser: React.FC<DocumentBrowserProps> = ({ sourceId, open
|
||||
)}
|
||||
{chunk.metadata.title}
|
||||
</h4>
|
||||
)}
|
||||
)}
|
||||
{(chunk.url || chunk.metadata?.url) && (
|
||||
<a
|
||||
href={chunk.url || chunk.metadata?.url}
|
||||
target="_blank"
|
||||
rel="noopener noreferrer"
|
||||
className="text-[10px] px-2 py-1 rounded bg-white/5 text-gray-500 hover:text-cyan-400 hover:bg-cyan-500/10 font-mono shrink-0 transition-colors flex items-center gap-1"
|
||||
title={`View on ${extractDomain(chunk.url || chunk.metadata?.url || "")}`}
|
||||
>
|
||||
{extractDomain(chunk.url || chunk.metadata?.url || "")}
|
||||
<ExternalLink className="w-3 h-3" />
|
||||
</a>
|
||||
)}
|
||||
</div>
|
||||
|
||||
<div className="text-sm text-gray-300 whitespace-pre-wrap">
|
||||
{isExpanded || !needsExpansion ? (
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
export * from "./AddKnowledgeDialog";
|
||||
export * from "./AdvancedCrawlConfig";
|
||||
export * from "./DocumentBrowser";
|
||||
export * from "./KnowledgeCard";
|
||||
export * from "./KnowledgeList";
|
||||
|
||||
@@ -15,6 +15,7 @@ import { useToast } from "../../ui/hooks/useToast";
|
||||
import { knowledgeService } from "../services";
|
||||
import type {
|
||||
CrawlRequest,
|
||||
CrawlRequestV2,
|
||||
CrawlStartResponse,
|
||||
KnowledgeItem,
|
||||
KnowledgeItemsFilter,
|
||||
@@ -298,6 +299,181 @@ export function useCrawlUrl() {
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Crawl URL mutation with domain filtering (v2) with optimistic updates
|
||||
* Returns the progressId that can be used to track crawl progress
|
||||
*/
|
||||
export function useCrawlUrlV2() {
|
||||
const queryClient = useQueryClient();
|
||||
const { showToast } = useToast();
|
||||
|
||||
return useMutation<
|
||||
CrawlStartResponse,
|
||||
Error,
|
||||
CrawlRequestV2,
|
||||
{
|
||||
previousKnowledge?: KnowledgeItem[];
|
||||
previousSummaries?: Array<[readonly unknown[], KnowledgeItemsResponse | undefined]>;
|
||||
previousOperations?: ActiveOperationsResponse;
|
||||
tempProgressId: string;
|
||||
tempItemId: string;
|
||||
}
|
||||
>({
|
||||
mutationFn: (request: CrawlRequestV2) => knowledgeService.crawlUrlV2(request),
|
||||
onMutate: async (request) => {
|
||||
// Cancel any outgoing refetches to prevent race conditions
|
||||
await queryClient.cancelQueries({ queryKey: knowledgeKeys.summariesPrefix() });
|
||||
await queryClient.cancelQueries({ queryKey: progressKeys.active() });
|
||||
|
||||
// Snapshot the previous values for rollback
|
||||
const previousSummaries = queryClient.getQueriesData<KnowledgeItemsResponse>({
|
||||
queryKey: knowledgeKeys.summariesPrefix(),
|
||||
});
|
||||
const previousOperations = queryClient.getQueryData<ActiveOperationsResponse>(progressKeys.active());
|
||||
|
||||
// Generate temporary progress ID and optimistic entity
|
||||
const tempProgressId = createOptimisticId();
|
||||
const optimisticItem = createOptimisticEntity<KnowledgeItem>({
|
||||
title: (() => {
|
||||
try {
|
||||
return new URL(request.url).hostname || "New crawl";
|
||||
} catch {
|
||||
return "New crawl";
|
||||
}
|
||||
})(),
|
||||
url: request.url,
|
||||
source_id: tempProgressId,
|
||||
source_type: "url",
|
||||
knowledge_type: request.knowledge_type || "technical",
|
||||
status: "processing",
|
||||
document_count: 0,
|
||||
code_examples_count: 0,
|
||||
metadata: {
|
||||
knowledge_type: request.knowledge_type || "technical",
|
||||
tags: request.tags || [],
|
||||
source_type: "url",
|
||||
status: "processing",
|
||||
description: `Crawling ${request.url} with domain filters`,
|
||||
crawl_config: request.crawl_config,
|
||||
},
|
||||
created_at: new Date().toISOString(),
|
||||
updated_at: new Date().toISOString(),
|
||||
} as Omit<KnowledgeItem, "id">);
|
||||
const tempItemId = optimisticItem.id;
|
||||
|
||||
// Update all summaries caches with optimistic data
|
||||
const entries = queryClient.getQueriesData<KnowledgeItemsResponse>({
|
||||
queryKey: knowledgeKeys.summariesPrefix(),
|
||||
});
|
||||
for (const [qk, old] of entries) {
|
||||
const filter = qk[qk.length - 1] as KnowledgeItemsFilter | undefined;
|
||||
const matchesType = !filter?.knowledge_type || optimisticItem.knowledge_type === filter.knowledge_type;
|
||||
const matchesTags =
|
||||
!filter?.tags || filter.tags.every((t) => (optimisticItem.metadata?.tags ?? []).includes(t));
|
||||
if (!(matchesType && matchesTags)) continue;
|
||||
if (!old) {
|
||||
queryClient.setQueryData<KnowledgeItemsResponse>(qk, {
|
||||
items: [optimisticItem],
|
||||
total: 1,
|
||||
page: 1,
|
||||
per_page: 100,
|
||||
});
|
||||
} else {
|
||||
queryClient.setQueryData<KnowledgeItemsResponse>(qk, {
|
||||
...old,
|
||||
items: [optimisticItem, ...old.items],
|
||||
total: (old.total ?? old.items.length) + 1,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Add optimistic progress entry
|
||||
if (!previousOperations) {
|
||||
queryClient.setQueryData<ActiveOperationsResponse>(progressKeys.active(), {
|
||||
operations: [
|
||||
{
|
||||
operation_id: tempProgressId,
|
||||
operation_type: "crawl",
|
||||
status: "starting",
|
||||
progress: 0,
|
||||
message: `Starting crawl of ${request.url} with domain filtering`,
|
||||
started_at: new Date().toISOString(),
|
||||
progressId: tempProgressId,
|
||||
} as ActiveOperation,
|
||||
],
|
||||
});
|
||||
} else {
|
||||
queryClient.setQueryData<ActiveOperationsResponse>(progressKeys.active(), {
|
||||
operations: [
|
||||
{
|
||||
operation_id: tempProgressId,
|
||||
operation_type: "crawl",
|
||||
status: "starting",
|
||||
progress: 0,
|
||||
message: `Starting crawl of ${request.url} with domain filtering`,
|
||||
started_at: new Date().toISOString(),
|
||||
progressId: tempProgressId,
|
||||
} as ActiveOperation,
|
||||
...(previousOperations.operations || []),
|
||||
],
|
||||
});
|
||||
}
|
||||
|
||||
return { previousSummaries, previousOperations, tempProgressId, tempItemId };
|
||||
},
|
||||
onSuccess: async (response, _variables, context) => {
|
||||
// Show success message
|
||||
showToast("Crawl started with domain filtering", "success");
|
||||
|
||||
// Update the temporary progress ID with the real one
|
||||
if (context) {
|
||||
const activeOps = queryClient.getQueryData<ActiveOperationsResponse>(progressKeys.active());
|
||||
if (activeOps) {
|
||||
const updated = {
|
||||
operations: activeOps.operations.map((op) =>
|
||||
op.progressId === context.tempProgressId ? { ...op, progressId: response.progressId } : op,
|
||||
),
|
||||
};
|
||||
queryClient.setQueryData(progressKeys.active(), updated);
|
||||
}
|
||||
|
||||
// Update item in all summaries caches
|
||||
const entries = queryClient.getQueriesData<KnowledgeItemsResponse>({
|
||||
queryKey: knowledgeKeys.summariesPrefix(),
|
||||
});
|
||||
for (const [qk, data] of entries) {
|
||||
if (data) {
|
||||
const updated = {
|
||||
...data,
|
||||
items: data.items.map((item) =>
|
||||
item.id === context.tempItemId ? { ...item, source_id: response.progressId } : item,
|
||||
),
|
||||
};
|
||||
queryClient.setQueryData(qk, updated);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return the response so caller can access progressId
|
||||
return response;
|
||||
},
|
||||
onError: (error, _variables, context) => {
|
||||
// Rollback optimistic updates on error
|
||||
if (context?.previousSummaries) {
|
||||
for (const [queryKey, data] of context.previousSummaries) {
|
||||
queryClient.setQueryData(queryKey, data);
|
||||
}
|
||||
}
|
||||
if (context?.previousOperations) {
|
||||
queryClient.setQueryData(progressKeys.active(), context.previousOperations);
|
||||
}
|
||||
|
||||
const errorMessage = getProviderErrorMessage(error) || "Failed to start crawl with filters";
|
||||
showToast(errorMessage, "error");
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Upload document mutation with optimistic updates
|
||||
*/
|
||||
|
||||
@@ -4,10 +4,12 @@
|
||||
*/
|
||||
|
||||
import { motion } from "framer-motion";
|
||||
import { Code, FileText, Hash, Loader2, Search } from "lucide-react";
|
||||
import { Code, FileText, Globe, Hash, Loader2, Search, X } from "lucide-react";
|
||||
import { useMemo } from "react";
|
||||
import { Button, Input } from "../../../ui/primitives";
|
||||
import { cn } from "../../../ui/primitives/styles";
|
||||
import type { CodeExample, DocumentChunk } from "../../types";
|
||||
import { extractDomain } from "../../utils/knowledge-utils";
|
||||
|
||||
interface InspectorSidebarProps {
|
||||
viewMode: "documents" | "code";
|
||||
@@ -20,6 +22,8 @@ interface InspectorSidebarProps {
|
||||
hasNextPage: boolean;
|
||||
onLoadMore: () => void;
|
||||
isFetchingNextPage: boolean;
|
||||
selectedDomains?: Set<string>;
|
||||
onDomainsChange?: (domains: Set<string>) => void;
|
||||
}
|
||||
|
||||
export const InspectorSidebar: React.FC<InspectorSidebarProps> = ({
|
||||
@@ -33,7 +37,39 @@ export const InspectorSidebar: React.FC<InspectorSidebarProps> = ({
|
||||
hasNextPage,
|
||||
onLoadMore,
|
||||
isFetchingNextPage,
|
||||
selectedDomains = new Set(),
|
||||
onDomainsChange,
|
||||
}) => {
|
||||
// Extract unique domains from documents
|
||||
const domainStats = useMemo(() => {
|
||||
if (viewMode !== "documents") return [];
|
||||
|
||||
const stats = new Map<string, number>();
|
||||
(items as DocumentChunk[]).forEach((doc) => {
|
||||
const url = doc.url || doc.metadata?.url;
|
||||
if (url) {
|
||||
const domain = extractDomain(url);
|
||||
stats.set(domain, (stats.get(domain) || 0) + 1);
|
||||
}
|
||||
});
|
||||
|
||||
return Array.from(stats.entries())
|
||||
.sort((a, b) => b[1] - a[1])
|
||||
.map(([domain, count]) => ({ domain, count }));
|
||||
}, [items, viewMode]);
|
||||
|
||||
// Filter items by selected domains
|
||||
const filteredItems = useMemo(() => {
|
||||
if (viewMode !== "documents" || selectedDomains.size === 0) {
|
||||
return items;
|
||||
}
|
||||
|
||||
return (items as DocumentChunk[]).filter((doc) => {
|
||||
const url = doc.url || doc.metadata?.url;
|
||||
if (!url) return false;
|
||||
return selectedDomains.has(extractDomain(url));
|
||||
});
|
||||
}, [items, selectedDomains, viewMode]);
|
||||
const getItemTitle = (item: DocumentChunk | CodeExample) => {
|
||||
const idSuffix = String(item.id).slice(-6);
|
||||
if (viewMode === "documents") {
|
||||
@@ -62,8 +98,9 @@ export const InspectorSidebar: React.FC<InspectorSidebarProps> = ({
|
||||
|
||||
return (
|
||||
<aside className="w-80 border-r border-white/10 flex flex-col bg-black/40" aria-label="Document and code browser">
|
||||
{/* Search */}
|
||||
<div className="p-4 border-b border-white/10 flex-shrink-0">
|
||||
{/* Search and Filters */}
|
||||
<div className="p-4 border-b border-white/10 flex-shrink-0 space-y-3">
|
||||
{/* Search Bar */}
|
||||
<div className="relative">
|
||||
<Search
|
||||
className="absolute left-3 top-1/2 -translate-y-1/2 w-4 h-4 text-gray-500 pointer-events-none"
|
||||
@@ -77,6 +114,66 @@ export const InspectorSidebar: React.FC<InspectorSidebarProps> = ({
|
||||
aria-label={`Search ${viewMode}`}
|
||||
/>
|
||||
</div>
|
||||
|
||||
{/* Domain Filter - Only show for documents */}
|
||||
{viewMode === "documents" && domainStats.length > 0 && onDomainsChange && (
|
||||
<div className="space-y-2">
|
||||
<div className="text-xs text-gray-400 flex items-center gap-2">
|
||||
<Globe className="w-3 h-3" />
|
||||
Domain Filter
|
||||
{selectedDomains.size > 0 && (
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => onDomainsChange(new Set())}
|
||||
className="ml-auto text-cyan-400 hover:text-cyan-300 flex items-center gap-1"
|
||||
>
|
||||
<X className="w-3 h-3" />
|
||||
Clear
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
<div className="flex flex-wrap gap-1">
|
||||
{domainStats.slice(0, 5).map(({ domain, count }) => {
|
||||
const isSelected = selectedDomains.has(domain);
|
||||
return (
|
||||
<button
|
||||
key={domain}
|
||||
type="button"
|
||||
onClick={() => {
|
||||
const newSelection = new Set(selectedDomains);
|
||||
if (isSelected) {
|
||||
newSelection.delete(domain);
|
||||
} else {
|
||||
newSelection.add(domain);
|
||||
}
|
||||
onDomainsChange(newSelection);
|
||||
}}
|
||||
className={cn(
|
||||
"px-2 py-0.5 text-[10px] rounded-full border transition-all",
|
||||
"flex items-center gap-1",
|
||||
isSelected
|
||||
? "bg-cyan-500/20 border-cyan-500/50 text-cyan-400"
|
||||
: "bg-black/20 border-white/10 text-gray-500 hover:border-cyan-500/30 hover:text-cyan-400"
|
||||
)}
|
||||
>
|
||||
<span className="truncate max-w-[100px]">{domain}</span>
|
||||
<span className={cn(
|
||||
"px-1 rounded text-[9px] font-mono",
|
||||
isSelected ? "bg-cyan-500/30" : "bg-white/10"
|
||||
)}>
|
||||
{count}
|
||||
</span>
|
||||
</button>
|
||||
);
|
||||
})}
|
||||
{domainStats.length > 5 && (
|
||||
<span className="text-[10px] text-gray-600 px-2 py-0.5">
|
||||
+{domainStats.length - 5} more
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Item List */}
|
||||
@@ -93,7 +190,7 @@ export const InspectorSidebar: React.FC<InspectorSidebarProps> = ({
|
||||
</div>
|
||||
) : (
|
||||
<div className="p-2">
|
||||
{items.map((item) => (
|
||||
{filteredItems.map((item) => (
|
||||
<motion.button
|
||||
type="button"
|
||||
key={item.id}
|
||||
@@ -133,9 +230,16 @@ export const InspectorSidebar: React.FC<InspectorSidebarProps> = ({
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
<p className="text-xs text-gray-500 line-clamp-2" title={getItemDescription(item)}>
|
||||
{getItemDescription(item)}
|
||||
</p>
|
||||
<div className="flex items-center justify-between gap-2">
|
||||
<p className="text-xs text-gray-500 line-clamp-2 flex-1" title={getItemDescription(item)}>
|
||||
{getItemDescription(item)}
|
||||
</p>
|
||||
{viewMode === "documents" && (item as DocumentChunk).url && (
|
||||
<span className="text-[9px] px-1.5 py-0.5 rounded bg-white/5 text-gray-600 font-mono shrink-0">
|
||||
{extractDomain((item as DocumentChunk).url || "")}
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
{item.metadata?.relevance_score != null && (
|
||||
<div className="flex items-center gap-1 mt-1">
|
||||
<Hash className="w-3 h-3 text-gray-600" aria-hidden="true" />
|
||||
|
||||
@@ -31,6 +31,7 @@ export const KnowledgeInspector: React.FC<KnowledgeInspectorProps> = ({
|
||||
const [searchQuery, setSearchQuery] = useState("");
|
||||
const [selectedItem, setSelectedItem] = useState<InspectorSelectedItem | null>(null);
|
||||
const [copiedId, setCopiedId] = useState<string | null>(null);
|
||||
const [selectedDomains, setSelectedDomains] = useState<Set<string>>(new Set());
|
||||
|
||||
// Reset view mode when item or initialTab changes
|
||||
useEffect(() => {
|
||||
@@ -141,6 +142,7 @@ export const KnowledgeInspector: React.FC<KnowledgeInspectorProps> = ({
|
||||
setViewMode(mode);
|
||||
setSelectedItem(null);
|
||||
setSearchQuery("");
|
||||
setSelectedDomains(new Set()); // Clear domain filter when switching modes
|
||||
}, []);
|
||||
|
||||
return (
|
||||
@@ -175,6 +177,8 @@ export const KnowledgeInspector: React.FC<KnowledgeInspectorProps> = ({
|
||||
hasNextPage={hasNextPage}
|
||||
onLoadMore={fetchNextPage}
|
||||
isFetchingNextPage={isFetchingNextPage}
|
||||
selectedDomains={selectedDomains}
|
||||
onDomainsChange={setSelectedDomains}
|
||||
/>
|
||||
|
||||
{/* Content Viewer */}
|
||||
|
||||
@@ -9,6 +9,7 @@ import type {
|
||||
ChunksResponse,
|
||||
CodeExamplesResponse,
|
||||
CrawlRequest,
|
||||
CrawlRequestV2,
|
||||
CrawlStartResponse,
|
||||
KnowledgeItem,
|
||||
KnowledgeItemsFilter,
|
||||
@@ -89,6 +90,18 @@ export const knowledgeService = {
|
||||
return response;
|
||||
},
|
||||
|
||||
/**
|
||||
* Start crawling a URL with domain filtering (v2)
|
||||
*/
|
||||
async crawlUrlV2(request: CrawlRequestV2): Promise<CrawlStartResponse> {
|
||||
const response = await callAPIWithETag<CrawlStartResponse>("/api/knowledge-items/crawl-v2", {
|
||||
method: "POST",
|
||||
body: JSON.stringify(request),
|
||||
});
|
||||
|
||||
return response;
|
||||
},
|
||||
|
||||
/**
|
||||
* Refresh an existing knowledge item
|
||||
*/
|
||||
|
||||
@@ -133,6 +133,13 @@ export interface KnowledgeItemsFilter {
|
||||
per_page?: number;
|
||||
}
|
||||
|
||||
export interface CrawlConfig {
|
||||
allowed_domains?: string[];
|
||||
excluded_domains?: string[];
|
||||
include_patterns?: string[];
|
||||
exclude_patterns?: string[];
|
||||
}
|
||||
|
||||
export interface CrawlRequest {
|
||||
url: string;
|
||||
knowledge_type?: "technical" | "business";
|
||||
@@ -142,6 +149,10 @@ export interface CrawlRequest {
|
||||
extract_code_examples?: boolean;
|
||||
}
|
||||
|
||||
export interface CrawlRequestV2 extends CrawlRequest {
|
||||
crawl_config?: CrawlConfig;
|
||||
}
|
||||
|
||||
export interface UploadMetadata {
|
||||
knowledge_type?: "technical" | "business";
|
||||
tags?: string[];
|
||||
|
||||
@@ -29,6 +29,7 @@ from ..services.search.rag_service import RAGService
|
||||
from ..services.storage import DocumentStorageService
|
||||
from ..utils import get_supabase_client
|
||||
from ..utils.document_processing import extract_text_from_document
|
||||
from ..utils.progress.progress_tracker import ProgressTracker
|
||||
|
||||
# Get logger for this module
|
||||
logger = get_logger(__name__)
|
||||
@@ -855,6 +856,135 @@ async def _perform_crawl_with_progress(
|
||||
)
|
||||
|
||||
|
||||
@router.post("/knowledge-items/crawl-v2")
|
||||
async def crawl_knowledge_item_v2(request: dict):
|
||||
"""
|
||||
Crawl a URL with advanced domain filtering configuration.
|
||||
|
||||
This is version 2 of the crawl endpoint that supports domain filtering.
|
||||
"""
|
||||
# Import CrawlRequestV2 model
|
||||
from ..models.crawl_models import CrawlRequestV2, CrawlConfig
|
||||
|
||||
# Parse and validate request
|
||||
crawl_request = CrawlRequestV2(**request)
|
||||
|
||||
# Validate API key before starting expensive operation
|
||||
logger.info("🔍 About to validate API key for crawl-v2...")
|
||||
provider_config = await credential_service.get_active_provider("embedding")
|
||||
provider = provider_config.get("provider", "openai")
|
||||
await _validate_provider_api_key(provider)
|
||||
logger.info("✅ API key validation completed successfully")
|
||||
|
||||
try:
|
||||
safe_logfire_info(
|
||||
f"Starting knowledge item crawl v2 | url={crawl_request.url} | "
|
||||
f"knowledge_type={crawl_request.knowledge_type} | "
|
||||
f"has_crawl_config={crawl_request.crawl_config is not None}"
|
||||
)
|
||||
|
||||
# Generate unique progress ID
|
||||
progress_id = str(uuid.uuid4())
|
||||
|
||||
# Create progress tracker for HTTP polling
|
||||
tracker = ProgressTracker(progress_id, operation_type="crawl")
|
||||
await tracker.start({
|
||||
"status": "starting",
|
||||
"url": crawl_request.url,
|
||||
"has_filters": crawl_request.crawl_config is not None
|
||||
})
|
||||
|
||||
# Create async task for crawling
|
||||
crawl_task = asyncio.create_task(_run_crawl_v2(request_dict=crawl_request.dict(), progress_id=progress_id))
|
||||
active_crawl_tasks[progress_id] = crawl_task
|
||||
|
||||
safe_logfire_info(
|
||||
f"Crawl v2 task created | progress_id={progress_id} | url={crawl_request.url}"
|
||||
)
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"progressId": progress_id,
|
||||
"message": "Crawl started with domain filtering",
|
||||
"estimatedDuration": "2-10 minutes depending on site size"
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Failed to start crawl v2 | error={str(e)}")
|
||||
raise HTTPException(status_code=500, detail={"error": str(e)})
|
||||
|
||||
|
||||
async def _run_crawl_v2(request_dict: dict, progress_id: str):
|
||||
"""Run the crawl v2 with domain filtering in background."""
|
||||
tracker = ProgressTracker(progress_id, operation_type="crawl")
|
||||
|
||||
try:
|
||||
safe_logfire_info(
|
||||
f"Starting crawl v2 with progress tracking | progress_id={progress_id} | url={request_dict['url']}"
|
||||
)
|
||||
|
||||
# Get crawler from CrawlerManager
|
||||
try:
|
||||
crawler = await get_crawler()
|
||||
if crawler is None:
|
||||
raise Exception("Crawler not available - initialization may have failed")
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Failed to get crawler | error={str(e)}")
|
||||
await tracker.error(f"Failed to initialize crawler: {str(e)}")
|
||||
return
|
||||
|
||||
supabase_client = get_supabase_client()
|
||||
|
||||
# Extract crawl_config if present
|
||||
crawl_config_dict = request_dict.get("crawl_config")
|
||||
crawl_config = None
|
||||
if crawl_config_dict:
|
||||
from ..models.crawl_models import CrawlConfig
|
||||
crawl_config = CrawlConfig(**crawl_config_dict)
|
||||
|
||||
# Create orchestration service with crawl_config
|
||||
orchestration_service = CrawlingService(
|
||||
crawler,
|
||||
supabase_client,
|
||||
crawl_config=crawl_config
|
||||
)
|
||||
orchestration_service.set_progress_id(progress_id)
|
||||
|
||||
# Add crawl_config to metadata for storage
|
||||
if crawl_config:
|
||||
request_dict["metadata"] = request_dict.get("metadata", {})
|
||||
request_dict["metadata"]["crawl_config"] = crawl_config.dict()
|
||||
|
||||
# Orchestrate the crawl - this returns immediately with task info
|
||||
result = await orchestration_service.orchestrate_crawl(request_dict)
|
||||
|
||||
# Store the actual crawl task for proper cancellation
|
||||
crawl_task = result.get("task")
|
||||
if crawl_task:
|
||||
active_crawl_tasks[progress_id] = crawl_task
|
||||
safe_logfire_info(
|
||||
f"Stored actual crawl v2 task in active_crawl_tasks | progress_id={progress_id}"
|
||||
)
|
||||
else:
|
||||
safe_logfire_error(f"No task returned from orchestrate_crawl v2 | progress_id={progress_id}")
|
||||
|
||||
safe_logfire_info(
|
||||
f"Crawl v2 task started | progress_id={progress_id} | task_id={result.get('task_id')}"
|
||||
)
|
||||
|
||||
except asyncio.CancelledError:
|
||||
safe_logfire_info(f"Crawl v2 cancelled | progress_id={progress_id}")
|
||||
raise
|
||||
except Exception as e:
|
||||
safe_logfire_error(f"Crawl v2 task failed | progress_id={progress_id} | error={str(e)}")
|
||||
await tracker.error(str(e))
|
||||
finally:
|
||||
# Clean up task from registry when done
|
||||
if progress_id in active_crawl_tasks:
|
||||
del active_crawl_tasks[progress_id]
|
||||
safe_logfire_info(f"Cleaned up crawl v2 task from registry | progress_id={progress_id}")
|
||||
|
||||
|
||||
@router.post("/documents/upload")
|
||||
async def upload_document(
|
||||
file: UploadFile = File(...),
|
||||
|
||||
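For reference, the new POST /api/knowledge-items/crawl-v2 endpoint above can be exercised with a plain HTTP request. The sketch below is an assumed usage example: the host and port are guesses for a local dev server, while the payload fields come from CrawlRequestV2 and the progressId key comes from the endpoint's response shown above.

```python
import requests

payload = {
    "url": "https://docs.example.com",
    "knowledge_type": "technical",
    "max_depth": 2,
    "tags": ["docs"],
    "crawl_config": {
        "allowed_domains": ["docs.example.com"],
        "exclude_patterns": ["*/changelog/*"],
    },
}

resp = requests.post(
    "http://localhost:8181/api/knowledge-items/crawl-v2",  # base URL/port is an assumption
    json=payload,
    timeout=30,
)
resp.raise_for_status()
print(resp.json()["progressId"])  # use this ID to poll crawl progress
```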
python/src/server/models/__init__.py (new file, 0 lines)
python/src/server/models/crawl_models.py (new file, 63 lines)
@@ -0,0 +1,63 @@
"""
Crawling Models Module

This module contains Pydantic models for crawling configuration,
specifically for domain filtering and URL pattern matching.
"""


from pydantic import BaseModel, Field, validator


class CrawlConfig(BaseModel):
    """Configuration for domain filtering during crawl."""

    allowed_domains: list[str] | None = Field(None, description="Whitelist of domains to crawl")
    excluded_domains: list[str] | None = Field(None, description="Blacklist of domains to exclude")
    include_patterns: list[str] | None = Field(None, description="URL patterns to include (glob-style)")
    exclude_patterns: list[str] | None = Field(None, description="URL patterns to exclude (glob-style)")

    @validator("allowed_domains", "excluded_domains", pre=True)
    def normalize_domains(cls, v):
        """Normalize domain formats for consistent matching."""
        if v is None:
            return v
        return [d.lower().strip().replace("http://", "").replace("https://", "").rstrip("/") for d in v]

    @validator("include_patterns", "exclude_patterns", pre=True)
    def validate_patterns(cls, v):
        """Validate URL patterns are valid glob patterns."""
        if v is None:
            return v
        # Ensure patterns are strings and not empty
        return [p.strip() for p in v if p and isinstance(p, str) and p.strip()]


class CrawlRequestV2(BaseModel):
    """Extended crawl request with domain filtering."""

    url: str = Field(..., description="URL to start crawling from")
    knowledge_type: str | None = Field("technical", description="Type of knowledge (technical/business)")
    tags: list[str] | None = Field(default_factory=list, description="Tags to apply to crawled content")
    update_frequency: int | None = Field(None, description="Update frequency in days")
    max_depth: int | None = Field(3, description="Maximum crawl depth")
    crawl_config: CrawlConfig | None = Field(None, description="Domain filtering configuration")
    crawl_options: dict | None = Field(None, description="Additional crawl options")
    extract_code_examples: bool | None = Field(True, description="Whether to extract code examples")

    @validator("url")
    def validate_url(cls, v):
        """Ensure URL is properly formatted."""
        if not v or not v.strip():
            raise ValueError("URL cannot be empty")
        # Add http:// if no protocol specified
        if not v.startswith(("http://", "https://")):
            v = f"https://{v}"
        return v.strip()

    @validator("knowledge_type")
    def validate_knowledge_type(cls, v):
        """Ensure knowledge type is valid."""
        if v and v not in ["technical", "business"]:
            return "technical"  # Default to technical if invalid
        return v or "technical"
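A quick illustration of what these validators do to incoming data. The absolute import path is an assumption based on the file location python/src/server/models/crawl_models.py; the field values are made up.

```python
from src.server.models.crawl_models import CrawlRequestV2  # import path assumed

req = CrawlRequestV2(
    url="docs.example.com",          # no scheme: validate_url prepends https://
    knowledge_type="marketing",      # invalid value: coerced back to "technical"
    crawl_config={
        "allowed_domains": ["https://Docs.Example.com/"],  # normalized to "docs.example.com"
        "exclude_patterns": ["*/changelog/*", "  "],        # blank entries are dropped
    },
)

assert req.url == "https://docs.example.com"
assert req.knowledge_type == "technical"
assert req.crawl_config.allowed_domains == ["docs.example.com"]
assert req.crawl_config.exclude_patterns == ["*/changelog/*"]
```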
@@ -12,12 +12,14 @@ from collections.abc import Awaitable, Callable
|
||||
from typing import Any, Optional
|
||||
|
||||
from ...config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info
|
||||
from ...models.crawl_models import CrawlConfig
|
||||
from ...utils import get_supabase_client
|
||||
from ...utils.progress.progress_tracker import ProgressTracker
|
||||
|
||||
# Import strategies
|
||||
# Import operations
|
||||
from .document_storage_operations import DocumentStorageOperations
|
||||
from .domain_filter import DomainFilter
|
||||
from .helpers.site_config import SiteConfig
|
||||
|
||||
# Import helpers
|
||||
@@ -56,7 +58,7 @@ class CrawlingService:
|
||||
Combines functionality from both CrawlingService and CrawlOrchestrationService.
|
||||
"""
|
||||
|
||||
def __init__(self, crawler=None, supabase_client=None, progress_id=None):
|
||||
def __init__(self, crawler=None, supabase_client=None, progress_id=None, crawl_config=None):
|
||||
"""
|
||||
Initialize the crawling service.
|
||||
|
||||
@@ -64,21 +66,24 @@ class CrawlingService:
|
||||
crawler: The Crawl4AI crawler instance
|
||||
supabase_client: The Supabase client for database operations
|
||||
progress_id: Optional progress ID for HTTP polling updates
|
||||
crawl_config: Optional CrawlConfig for domain filtering
|
||||
"""
|
||||
self.crawler = crawler
|
||||
self.supabase_client = supabase_client or get_supabase_client()
|
||||
self.progress_id = progress_id
|
||||
self.progress_tracker = None
|
||||
self.crawl_config = crawl_config
|
||||
|
||||
# Initialize helpers
|
||||
self.url_handler = URLHandler()
|
||||
self.site_config = SiteConfig()
|
||||
self.markdown_generator = self.site_config.get_markdown_generator()
|
||||
self.link_pruning_markdown_generator = self.site_config.get_link_pruning_markdown_generator()
|
||||
self.domain_filter = DomainFilter()
|
||||
|
||||
# Initialize strategies
|
||||
self.batch_strategy = BatchCrawlStrategy(crawler, self.link_pruning_markdown_generator)
|
||||
self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.link_pruning_markdown_generator)
|
||||
self.recursive_strategy = RecursiveCrawlStrategy(crawler, self.link_pruning_markdown_generator, self.domain_filter)
|
||||
self.single_page_strategy = SinglePageCrawlStrategy(crawler, self.markdown_generator)
|
||||
self.sitemap_strategy = SitemapCrawlStrategy()
|
||||
|
||||
@@ -225,6 +230,7 @@ class CrawlingService:
|
||||
max_concurrent,
|
||||
progress_callback,
|
||||
self._check_cancellation, # Pass cancellation check
|
||||
self.crawl_config, # Pass crawl config for domain filtering
|
||||
)
|
||||
|
||||
# Orchestration methods
|
||||
|
||||
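The hunk above injects a DomainFilter and the request's CrawlConfig into RecursiveCrawlStrategy. The strategy's internals are not part of this diff, so the sketch below is only a hedged illustration of the expected hand-off; the function and parameter names inside it are hypothetical.

```python
# Hedged sketch: how a recursive strategy might consult the injected filter
# before following discovered links. RecursiveCrawlStrategy's real code is
# not shown in this commit, so treat this as an illustration of the contract.
def expand_links(discovered_links, base_url, domain_filter, crawl_config):
    """Return only the links that the DomainFilter allows."""
    return [
        link
        for link in discovered_links
        if domain_filter.is_url_allowed(link, base_url, crawl_config)
    ]
```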
python/src/server/services/crawling/domain_filter.py (new file, 169 lines)
@@ -0,0 +1,169 @@
|
||||
"""
|
||||
Domain Filtering Module
|
||||
|
||||
This module provides domain filtering utilities for web crawling,
|
||||
allowing users to control which domains and URL patterns are crawled.
|
||||
"""
|
||||
|
||||
import fnmatch
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from ...config.logfire_config import get_logger
|
||||
from ...models.crawl_models import CrawlConfig
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class DomainFilter:
|
||||
"""
|
||||
Handles domain and URL pattern filtering for crawl operations.
|
||||
|
||||
Priority order:
|
||||
1. Blacklist (excluded_domains) - always blocks
|
||||
2. Whitelist (allowed_domains) - must match if specified
|
||||
3. Exclude patterns - blocks matching URLs
|
||||
4. Include patterns - must match if specified
|
||||
"""
|
||||
|
||||
def is_url_allowed(self, url: str, base_url: str, config: CrawlConfig | None) -> bool:
|
||||
"""
|
||||
Check if a URL should be crawled based on domain filtering configuration.
|
||||
|
||||
Args:
|
||||
url: The URL to check
|
||||
base_url: The base URL of the crawl (for resolving relative URLs)
|
||||
config: The crawl configuration with filtering rules
|
||||
|
||||
Returns:
|
||||
True if the URL should be crawled, False otherwise
|
||||
"""
|
||||
if not config:
|
||||
# No filtering configured, allow all URLs
|
||||
return True
|
||||
|
||||
try:
|
||||
# Parse the URL
|
||||
parsed = urlparse(url)
|
||||
|
||||
# Handle relative URLs by using base URL's domain
|
||||
if not parsed.netloc:
|
||||
base_parsed = urlparse(base_url)
|
||||
domain = base_parsed.netloc.lower()
|
||||
# Construct full URL for pattern matching
|
||||
full_url = f"{base_parsed.scheme}://{base_parsed.netloc}{parsed.path or '/'}"
|
||||
else:
|
||||
domain = parsed.netloc.lower()
|
||||
full_url = url
|
||||
|
||||
# Remove www. prefix for consistent matching
|
||||
normalized_domain = domain.replace("www.", "")
|
||||
|
||||
# PRIORITY 1: Blacklist always wins
|
||||
if config.excluded_domains:
|
||||
for excluded in config.excluded_domains:
|
||||
if self._matches_domain(normalized_domain, excluded):
|
||||
logger.debug(f"URL blocked by excluded domain | url={url} | domain={normalized_domain} | excluded={excluded}")
|
||||
return False
|
||||
|
||||
# PRIORITY 2: If whitelist exists, URL must match
|
||||
if config.allowed_domains:
|
||||
allowed = False
|
||||
for allowed_domain in config.allowed_domains:
|
||||
if self._matches_domain(normalized_domain, allowed_domain):
|
||||
allowed = True
|
||||
break
|
||||
|
||||
if not allowed:
|
||||
logger.debug(f"URL blocked - not in allowed domains | url={url} | domain={normalized_domain}")
|
||||
return False
|
||||
|
||||
# PRIORITY 3: Check exclude patterns (glob-style)
|
||||
if config.exclude_patterns:
|
||||
for pattern in config.exclude_patterns:
|
||||
if fnmatch.fnmatch(full_url, pattern):
|
||||
logger.debug(f"URL blocked by exclude pattern | url={url} | pattern={pattern}")
|
||||
return False
|
||||
|
||||
# PRIORITY 4: Check include patterns if specified
|
||||
if config.include_patterns:
|
||||
matched = False
|
||||
for pattern in config.include_patterns:
|
||||
if fnmatch.fnmatch(full_url, pattern):
|
||||
matched = True
|
||||
break
|
||||
|
||||
if not matched:
|
||||
logger.debug(f"URL blocked - doesn't match include patterns | url={url}")
|
||||
return False
|
||||
|
||||
logger.debug(f"URL allowed | url={url} | domain={normalized_domain}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error filtering URL | url={url} | error={str(e)}")
|
||||
# On error, be conservative and block the URL
|
||||
return False
|
||||
|
||||
def _matches_domain(self, domain: str, pattern: str) -> bool:
|
||||
"""
|
||||
Check if a domain matches a pattern.
|
||||
|
||||
Supports:
|
||||
- Exact matches: example.com matches example.com
|
||||
- Subdomain wildcards: *.example.com matches sub.example.com
|
||||
- Subdomain matching: sub.example.com matches sub.example.com and subsub.sub.example.com
|
||||
|
||||
Args:
|
||||
domain: The domain to check (already normalized and lowercase)
|
||||
pattern: The pattern to match against (already normalized and lowercase)
|
||||
|
||||
Returns:
|
||||
True if the domain matches the pattern
|
||||
"""
|
||||
# Remove any remaining protocol or path from pattern
|
||||
pattern = pattern.replace("http://", "").replace("https://", "").split("/")[0]
|
||||
pattern = pattern.replace("www.", "") # Remove www. for consistent matching
|
||||
|
||||
# Exact match
|
||||
if domain == pattern:
|
||||
return True
|
||||
|
||||
# Wildcard subdomain match (*.example.com)
|
||||
if pattern.startswith("*."):
|
||||
base_pattern = pattern[2:] # Remove *.
|
||||
# Check if domain ends with the base pattern and has a subdomain
|
||||
if domain.endswith(base_pattern):
|
||||
# Make sure it's a proper subdomain, not just containing the pattern
|
||||
prefix = domain[:-len(base_pattern)]
|
||||
if prefix and prefix.endswith("."):
|
||||
return True
|
||||
|
||||
# Subdomain match (allow any subdomain of the pattern)
|
||||
# e.g., pattern=example.com should match sub.example.com
|
||||
if domain.endswith(f".{pattern}"):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def get_domains_from_urls(self, urls: list[str]) -> set[str]:
|
||||
"""
|
||||
Extract unique domains from a list of URLs.
|
||||
|
||||
Args:
|
||||
urls: List of URLs to extract domains from
|
||||
|
||||
Returns:
|
||||
Set of unique domains (normalized and lowercase)
|
||||
"""
|
||||
domains = set()
|
||||
for url in urls:
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
if parsed.netloc:
|
||||
domain = parsed.netloc.lower().replace("www.", "")
|
||||
domains.add(domain)
|
||||
except Exception as e:
|
||||
logger.debug(f"Could not extract domain from URL | url={url} | error={str(e)}")
|
||||
continue
|
||||
|
||||
return domains
|
||||
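A quick illustration (not part of the commit) of the priority order implemented above. The CrawlConfig field names and import paths mirror the unit tests below; lowercase domains are used so the sketch does not depend on any normalization the CrawlConfig model itself may perform.

from src.server.models.crawl_models import CrawlConfig
from src.server.services.crawling.domain_filter import DomainFilter

domain_filter = DomainFilter()
config = CrawlConfig(
    allowed_domains=["example.com"],       # whitelist: subdomains are matched implicitly
    excluded_domains=["ads.example.com"],  # blacklist: always wins
    include_patterns=["*/docs/*"],         # glob patterns are matched against the full URL
    exclude_patterns=["*.pdf"],
)

base = "https://example.com"
assert domain_filter.is_url_allowed("https://docs.example.com/docs/intro", base, config) is True
assert domain_filter.is_url_allowed("https://ads.example.com/docs/intro", base, config) is False   # 1: blacklisted
assert domain_filter.is_url_allowed("https://other.com/docs/intro", base, config) is False         # 2: not whitelisted
assert domain_filter.is_url_allowed("https://example.com/docs/manual.pdf", base, config) is False  # 3: exclude pattern
assert domain_filter.is_url_allowed("https://example.com/blog/post", base, config) is False        # 4: no include match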
@@ -21,17 +21,19 @@ logger = get_logger(__name__)
class RecursiveCrawlStrategy:
    """Strategy for recursive crawling of websites."""

    def __init__(self, crawler, markdown_generator):
    def __init__(self, crawler, markdown_generator, domain_filter=None):
        """
        Initialize recursive crawl strategy.

        Args:
            crawler (AsyncWebCrawler): The Crawl4AI crawler instance for web crawling operations
            markdown_generator (DefaultMarkdownGenerator): The markdown generator instance for converting HTML to markdown
            domain_filter: Optional DomainFilter instance for URL filtering
        """
        self.crawler = crawler
        self.markdown_generator = markdown_generator
        self.url_handler = URLHandler()
        self.domain_filter = domain_filter

    async def crawl_recursive_with_progress(
        self,
@@ -42,6 +44,7 @@ class RecursiveCrawlStrategy:
        max_concurrent: int | None = None,
        progress_callback: Callable[..., Awaitable[None]] | None = None,
        cancellation_check: Callable[[], None] | None = None,
        crawl_config=None,
    ) -> list[dict[str, Any]]:
        """
        Recursively crawl internal links from start URLs up to a maximum depth with progress reporting.
@@ -291,6 +294,13 @@ class RecursiveCrawlStrategy:
                    # Skip binary files and already visited URLs
                    is_binary = self.url_handler.is_binary_file(next_url)
                    if next_url not in visited and not is_binary:
                        # Apply domain filtering if configured
                        if self.domain_filter and crawl_config:
                            base_url = start_urls[0] if start_urls else original_url
                            if not self.domain_filter.is_url_allowed(next_url, base_url, crawl_config):
                                logger.debug(f"Filtering URL based on domain rules: {next_url}")
                                continue

                        if next_url not in next_level_urls:
                            next_level_urls.add(next_url)
                            total_discovered += 1  # Increment when we discover a new URL
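A stand-alone sketch (not part of the commit) of the link-discovery filtering added in the hunk above: candidate links found on a page are checked against DomainFilter before being queued for the next depth level (the visited-set and binary-file checks of the real strategy are omitted here).

from src.server.models.crawl_models import CrawlConfig
from src.server.services.crawling.domain_filter import DomainFilter

domain_filter = DomainFilter()
config = CrawlConfig(allowed_domains=["example.com"], exclude_patterns=["*/login/*"])
base_url = "https://example.com"

discovered_links = [
    "https://example.com/guide",        # kept
    "https://example.com/login/reset",  # dropped by exclude pattern
    "https://tracker.other.com/pixel",  # dropped: not in allowed_domains
]

next_level_urls = {
    url for url in discovered_links
    if domain_filter.is_url_allowed(url, base_url, config)
}

assert next_level_urls == {"https://example.com/guide"}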
python/src/server/services/tests/__init__.py (new file, 0 lines)
python/src/server/services/tests/test_domain_filter.py (new file, 204 lines)
@@ -0,0 +1,204 @@
"""
|
||||
Unit tests for domain filtering functionality
|
||||
"""
|
||||
|
||||
from src.server.models.crawl_models import CrawlConfig
|
||||
from src.server.services.crawling.domain_filter import DomainFilter
|
||||
|
||||
|
||||
class TestDomainFilter:
|
||||
"""Test suite for DomainFilter class."""
|
||||
|
||||
def setup_method(self):
|
||||
"""Set up test fixtures."""
|
||||
self.filter = DomainFilter()
|
||||
|
||||
def test_no_config_allows_all(self):
|
||||
"""Test that no configuration allows all URLs."""
|
||||
assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", None) is True
|
||||
assert self.filter.is_url_allowed("https://other.com/page", "https://example.com", None) is True
|
||||
|
||||
def test_whitelist_only(self):
|
||||
"""Test whitelist-only configuration."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["example.com", "docs.example.com"]
|
||||
)
|
||||
|
||||
# Should allow whitelisted domains
|
||||
assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://docs.example.com/api", "https://example.com", config) is True
|
||||
|
||||
# Should block non-whitelisted domains
|
||||
assert self.filter.is_url_allowed("https://other.com/page", "https://example.com", config) is False
|
||||
assert self.filter.is_url_allowed("https://evil.com", "https://example.com", config) is False
|
||||
|
||||
def test_blacklist_only(self):
|
||||
"""Test blacklist-only configuration."""
|
||||
config = CrawlConfig(
|
||||
excluded_domains=["evil.com", "ads.example.com"]
|
||||
)
|
||||
|
||||
# Should block blacklisted domains
|
||||
assert self.filter.is_url_allowed("https://evil.com/page", "https://example.com", config) is False
|
||||
assert self.filter.is_url_allowed("https://ads.example.com/track", "https://example.com", config) is False
|
||||
|
||||
# Should allow non-blacklisted domains
|
||||
assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://docs.example.com/api", "https://example.com", config) is True
|
||||
|
||||
def test_blacklist_overrides_whitelist(self):
|
||||
"""Test that blacklist takes priority over whitelist."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["example.com", "blog.example.com"],
|
||||
excluded_domains=["blog.example.com"]
|
||||
)
|
||||
|
||||
# Blacklist should override whitelist
|
||||
assert self.filter.is_url_allowed("https://blog.example.com/post", "https://example.com", config) is False
|
||||
|
||||
# Non-blacklisted whitelisted domain should work
|
||||
assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is True
|
||||
|
||||
def test_subdomain_matching(self):
|
||||
"""Test subdomain matching patterns."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["example.com"]
|
||||
)
|
||||
|
||||
# Should match subdomains of allowed domain
|
||||
assert self.filter.is_url_allowed("https://docs.example.com/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://api.example.com/v1", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://sub.sub.example.com", "https://example.com", config) is True
|
||||
|
||||
# Should not match different domains
|
||||
assert self.filter.is_url_allowed("https://notexample.com", "https://example.com", config) is False
|
||||
|
||||
def test_wildcard_subdomain_matching(self):
|
||||
"""Test wildcard subdomain patterns."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["*.example.com"]
|
||||
)
|
||||
|
||||
# Should match subdomains
|
||||
assert self.filter.is_url_allowed("https://docs.example.com/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://api.example.com/v1", "https://example.com", config) is True
|
||||
|
||||
# Should NOT match the base domain without subdomain
|
||||
assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is False
|
||||
|
||||
def test_url_patterns_include(self):
|
||||
"""Test include URL patterns."""
|
||||
config = CrawlConfig(
|
||||
include_patterns=["*/api/*", "*/docs/*"]
|
||||
)
|
||||
|
||||
# Should match include patterns
|
||||
assert self.filter.is_url_allowed("https://example.com/api/v1", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://example.com/docs/guide", "https://example.com", config) is True
|
||||
|
||||
# Should not match URLs not in patterns
|
||||
assert self.filter.is_url_allowed("https://example.com/blog/post", "https://example.com", config) is False
|
||||
assert self.filter.is_url_allowed("https://example.com/", "https://example.com", config) is False
|
||||
|
||||
def test_url_patterns_exclude(self):
|
||||
"""Test exclude URL patterns."""
|
||||
config = CrawlConfig(
|
||||
exclude_patterns=["*/private/*", "*.pdf", "*/admin/*"]
|
||||
)
|
||||
|
||||
# Should block excluded patterns
|
||||
assert self.filter.is_url_allowed("https://example.com/private/data", "https://example.com", config) is False
|
||||
assert self.filter.is_url_allowed("https://example.com/file.pdf", "https://example.com", config) is False
|
||||
assert self.filter.is_url_allowed("https://example.com/admin/panel", "https://example.com", config) is False
|
||||
|
||||
# Should allow non-excluded URLs
|
||||
assert self.filter.is_url_allowed("https://example.com/public/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://example.com/file.html", "https://example.com", config) is True
|
||||
|
||||
def test_combined_filters(self):
|
||||
"""Test combination of all filter types."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["example.com", "docs.example.com"],
|
||||
excluded_domains=["ads.example.com"],
|
||||
include_patterns=["*/api/*", "*/guide/*"],
|
||||
exclude_patterns=["*/deprecated/*"]
|
||||
)
|
||||
|
||||
# Should pass all filters
|
||||
assert self.filter.is_url_allowed("https://docs.example.com/api/v2", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://example.com/guide/intro", "https://example.com", config) is True
|
||||
|
||||
# Should fail on blacklist (highest priority)
|
||||
assert self.filter.is_url_allowed("https://ads.example.com/api/track", "https://example.com", config) is False
|
||||
|
||||
# Should fail on not in whitelist
|
||||
assert self.filter.is_url_allowed("https://other.com/api/v1", "https://example.com", config) is False
|
||||
|
||||
# Should fail on exclude pattern
|
||||
assert self.filter.is_url_allowed("https://example.com/api/deprecated/old", "https://example.com", config) is False
|
||||
|
||||
# Should fail on not matching include pattern
|
||||
assert self.filter.is_url_allowed("https://example.com/blog/post", "https://example.com", config) is False
|
||||
|
||||
def test_relative_urls(self):
|
||||
"""Test handling of relative URLs."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["example.com"]
|
||||
)
|
||||
|
||||
# Relative URLs should use base URL's domain
|
||||
assert self.filter.is_url_allowed("/page/path", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("page.html", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("../other/page", "https://example.com", config) is True
|
||||
|
||||
def test_domain_normalization(self):
|
||||
"""Test that domains are properly normalized."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["EXAMPLE.COM", "https://docs.example.com/", "www.test.com"]
|
||||
)
|
||||
|
||||
# Should handle different cases and formats
|
||||
assert self.filter.is_url_allowed("https://example.com/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://EXAMPLE.COM/PAGE", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://docs.example.com/api", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://www.test.com/page", "https://example.com", config) is True
|
||||
assert self.filter.is_url_allowed("https://test.com/page", "https://example.com", config) is True
|
||||
|
||||
def test_edge_cases(self):
|
||||
"""Test edge cases and error handling."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=["example.com"]
|
||||
)
|
||||
|
||||
# Should handle malformed URLs gracefully
|
||||
assert self.filter.is_url_allowed("not-a-url", "https://example.com", config) is True # Treated as relative
|
||||
assert self.filter.is_url_allowed("", "https://example.com", config) is True # Empty URL
|
||||
assert self.filter.is_url_allowed("//example.com/page", "https://example.com", config) is True # Protocol-relative
|
||||
|
||||
def test_get_domains_from_urls(self):
|
||||
"""Test extracting domains from URL list."""
|
||||
urls = [
|
||||
"https://example.com/page1",
|
||||
"https://docs.example.com/api",
|
||||
"https://example.com/page2",
|
||||
"https://other.com/resource",
|
||||
"https://WWW.TEST.COM/page",
|
||||
"/relative/path", # Should be skipped
|
||||
"invalid-url", # Should be skipped
|
||||
]
|
||||
|
||||
domains = self.filter.get_domains_from_urls(urls)
|
||||
|
||||
assert domains == {"example.com", "docs.example.com", "other.com", "test.com"}
|
||||
|
||||
def test_empty_filter_lists(self):
|
||||
"""Test that empty filter lists behave correctly."""
|
||||
config = CrawlConfig(
|
||||
allowed_domains=[],
|
||||
excluded_domains=[],
|
||||
include_patterns=[],
|
||||
exclude_patterns=[]
|
||||
)
|
||||
|
||||
# Empty lists should be ignored (allow all)
|
||||
assert self.filter.is_url_allowed("https://any.com/page", "https://example.com", config) is True
|
||||
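A small sketch (not part of the commit) showing how get_domains_from_urls can feed the per-domain view described in the commit message (domain pills with document counts in the Inspector sidebar). The Counter-based tally below is illustrative only; it uses the same normalization as get_domains_from_urls and is not the UI's actual code.

from collections import Counter
from urllib.parse import urlparse

from src.server.services.crawling.domain_filter import DomainFilter

crawled_urls = [
    "https://example.com/a",
    "https://docs.example.com/api",
    "https://www.example.com/b",
]

domain_filter = DomainFilter()
print(domain_filter.get_domains_from_urls(crawled_urls))
# {'example.com', 'docs.example.com'}  (a set, so order may vary)

# Per-domain document counts, normalized the same way:
counts = Counter(urlparse(u).netloc.lower().replace("www.", "") for u in crawled_urls)
print(counts)  # Counter({'example.com': 2, 'docs.example.com': 1})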