Merge remote-tracking branch 'origin/ui/agent-work-order' into feat/agent_work_orders_ui

This commit is contained in:
Developer
2025-10-23 19:59:11 -04:00
25 changed files with 3147 additions and 142 deletions

View File

@@ -0,0 +1,11 @@
---
name: Auto Bug Report
about: Automated bug report from Archon
title: ''
labels: bug, auto-report
assignees: ''
---
<!-- This template is used for automated bug reports submitted through the Archon UI -->
<!-- The form data below is automatically filled by the bug reporter -->

View File

@@ -216,6 +216,16 @@ SUPABASE_SERVICE_KEY=your-service-key-here # Use legacy key format for clou
Optional variables and full configuration:
See `python/.env.example` for complete list
### Repository Configuration
Repository information (owner, name) is centralized in `python/src/server/config/version.py`:
- `GITHUB_REPO_OWNER` - GitHub repository owner (default: "coleam00")
- `GITHUB_REPO_NAME` - GitHub repository name (default: "Archon")
This is the single source of truth for repository configuration. All services (version checking, bug reports, etc.) should import these constants rather than hardcoding repository URLs.
Environment variable override: `GITHUB_REPO="owner/repo"` can be set to override defaults.
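A minimal sketch (not part of this change) of how a service can consume these constants with the override applied, mirroring the bug report service updated in this commit; the relative import path depends on the calling module's location:

```python
import os

from ..config.version import GITHUB_REPO_NAME, GITHUB_REPO_OWNER

# Prefer the GITHUB_REPO="owner/repo" override; otherwise fall back to the
# centralized defaults defined in python/src/server/config/version.py.
default_repo = f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
repo = os.getenv("GITHUB_REPO", default_repo)
repo_url = f"https://github.com/{repo}"
```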
## Common Development Tasks
### Add a new API endpoint

View File

@@ -113,7 +113,7 @@ This new vision for Archon replaces the old one (the agenteer). Archon used to b
Once everything is running:
1. **Test Web Crawling**: Go to http://localhost:3737 → Knowledge Base → "Crawl Website" → Enter a doc URL (such as https://ai.pydantic.dev/llms.txt)
2. **Test Document Upload**: Knowledge Base → Upload a PDF
3. **Test Projects**: Projects → Create a new project and add tasks
4. **Integrate with your AI coding assistant**: MCP Dashboard → Copy connection config for your AI coding assistant

View File

@@ -12,6 +12,7 @@ import { Button } from "../../ui/primitives";
import { cn } from "../../ui/primitives/styles";
import { useCrawlProgressPolling } from "../hooks";
import type { ActiveOperation } from "../types/progress";
import { isValidHttpUrl } from "../utils/urlValidation";
interface CrawlingProgressProps {
onSwitchToBrowse: () => void;
@@ -129,6 +130,7 @@ export const CrawlingProgress: React.FC<CrawlingProgressProps> = ({ onSwitchToBr
"in_progress",
"starting",
"initializing",
"discovery",
"analyzing",
"storing",
"source_creation",
@@ -245,6 +247,63 @@ export const CrawlingProgress: React.FC<CrawlingProgressProps> = ({ onSwitchToBr
)}
</div>
{/* Discovery Information */}
{operation.discovered_file && (
<div className="pt-2 border-t border-white/10">
<div className="flex items-center gap-2 mb-2">
<span className="text-xs font-semibold text-cyan-400">Discovery Result</span>
{operation.discovered_file_type && (
<span className="px-2 py-0.5 text-xs rounded bg-cyan-500/10 border border-cyan-500/20 text-cyan-300">
{operation.discovered_file_type}
</span>
)}
</div>
{isValidHttpUrl(operation.discovered_file) ? (
<a
href={operation.discovered_file}
target="_blank"
rel="noopener noreferrer"
className="text-sm text-gray-400 hover:text-cyan-400 transition-colors truncate block"
>
{operation.discovered_file}
</a>
) : (
<span className="text-sm text-gray-400 truncate block">
{operation.discovered_file}
</span>
)}
</div>
)}
{/* Linked Files */}
{operation.linked_files && operation.linked_files.length > 0 && (
<div className="pt-2 border-t border-white/10">
<div className="text-xs font-semibold text-cyan-400 mb-2">
Following {operation.linked_files.length} Linked File
{operation.linked_files.length > 1 ? "s" : ""}
</div>
<div className="space-y-1 max-h-32 overflow-y-auto">
{operation.linked_files.map((file: string, idx: number) => (
isValidHttpUrl(file) ? (
<a
key={idx}
href={file}
target="_blank"
rel="noopener noreferrer"
className="text-xs text-gray-400 hover:text-cyan-400 transition-colors truncate block"
>
{file}
</a>
) : (
<span key={idx} className="text-xs text-gray-400 truncate block">
{file}
</span>
)
))}
</div>
</div>
)}
{/* Current Action or Operation Type Info */}
{(operation.current_url || operation.operation_type) && (
<div className="pt-2 border-t border-white/10">

View File

@@ -6,6 +6,7 @@
export type ProgressStatus =
| "starting"
| "initializing"
| "discovery"
| "analyzing"
| "crawling"
| "processing"
@@ -24,7 +25,16 @@ export type ProgressStatus =
| "cancelled"
| "stopping";
export type CrawlType =
| "normal"
| "sitemap"
| "llms-txt"
| "text_file"
| "refresh"
| "llms_txt_with_linked_files"
| "llms_txt_linked_files"
| "discovery_single_file"
| "discovery_sitemap";
export type UploadType = "document";
export interface BaseProgressData {
@@ -48,6 +58,10 @@ export interface CrawlProgressData extends BaseProgressData {
codeBlocksFound?: number;
totalSummaries?: number;
completedSummaries?: number;
// Discovery-related fields
discoveredFile?: string;
discoveredFileType?: string;
linkedFiles?: string[];
originalCrawlParams?: {
url: string;
knowledge_type?: string;
@@ -100,6 +114,10 @@ export interface ActiveOperation {
code_examples_found?: number;
current_operation?: string;
};
// Discovery information
discovered_file?: string;
discovered_file_type?: string;
linked_files?: string[];
}
export interface ActiveOperationsResponse {
@@ -127,6 +145,13 @@ export interface ProgressResponse {
codeBlocksFound?: number;
totalSummaries?: number;
completedSummaries?: number;
// Discovery-related fields
discoveredFile?: string;
discovered_file?: string; // Snake case from backend
discoveredFileType?: string;
discovered_file_type?: string; // Snake case from backend
linkedFiles?: string[];
linked_files?: string[]; // Snake case from backend
fileName?: string;
fileSize?: number;
chunksProcessed?: number;

View File

@@ -0,0 +1,44 @@
/**
* Client-side URL validation utility for discovered files.
* Ensures only safe HTTP/HTTPS URLs are rendered as clickable links.
*/
const SAFE_PROTOCOLS = ["http:", "https:"];
/**
* Validates that a URL is safe to render as a clickable link.
* Only allows http: and https: protocols.
*
* @param url - URL string to validate
* @returns true if URL is safe (http/https), false otherwise
*/
export function isValidHttpUrl(url: string | undefined | null): boolean {
if (!url || typeof url !== "string") {
return false;
}
// Trim whitespace
const trimmed = url.trim();
if (!trimmed) {
return false;
}
try {
const parsed = new URL(trimmed);
// Only allow http and https protocols
if (!SAFE_PROTOCOLS.includes(parsed.protocol)) {
return false;
}
// Basic hostname validation (must have at least one dot or be localhost)
if (!parsed.hostname.includes(".") && parsed.hostname !== "localhost") {
return false;
}
return true;
} catch {
// URL parsing failed - not a valid URL
return false;
}
}

View File

@@ -0,0 +1,332 @@
import { AnimatePresence, motion } from "framer-motion";
import { ChevronDown, ChevronUp, ExternalLink, Plus, User } from "lucide-react";
import { useState } from "react";
import { Button } from "@/features/ui/primitives/button";
import { Card } from "@/features/ui/primitives/card";
import { cn } from "@/features/ui/primitives/styles";
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/features/ui/primitives/tooltip";
import { StepHistoryCard } from "./components/StepHistoryCard";
import { WorkflowStepButton } from "./components/WorkflowStepButton";
const MOCK_WORK_ORDER = {
id: "wo-1",
title: "Create comprehensive documentation",
status: "in_progress" as const,
workflow: {
currentStep: 2,
steps: [
{ id: "1", name: "Create Branch", status: "completed", duration: "33s" },
{ id: "2", name: "Planning", status: "in_progress", duration: "2m 11s" },
{ id: "3", name: "Execute", status: "pending", duration: null },
{ id: "4", name: "Commit", status: "pending", duration: null },
{ id: "5", name: "Create PR", status: "pending", duration: null },
],
},
stepHistory: [
{
id: "step-1",
stepName: "Create Branch",
timestamp: "7 minutes ago",
output: "docs/remove-archon-mentions",
session: "Session: a342d9ac-56c4-43ae-95b8-9ddf18143961",
collapsible: true,
},
{
id: "step-2",
stepName: "Planning",
timestamp: "5 minutes ago",
output: `## Report
**Work completed:**
- Conducted comprehensive codebase audit for "archon" and "Archon" mentions
- Verified main README.md is already clean (no archon mentions present)
- Identified 14 subdirectory README files that need verification
- Discovered historical git commits that added "hello from archon" but content has been removed
- Identified 3 remote branches with "archon" in their names (out of scope for this task)
- Created comprehensive PRP plan for documentation cleanup and verification`,
session: "Session: e3889823-b272-43c0-b11d-7a786d7e3c88",
collapsible: true,
isHumanInLoop: true,
},
],
document: {
id: "doc-1",
title: "Planning Document",
content: {
markdown: `# Documentation Cleanup Plan
## Overview
This document outlines the plan to remove all "archon" mentions from the codebase.
## Steps
1. Audit all README files
2. Check git history for sensitive content
3. Verify no configuration files reference "archon"
4. Update documentation
## Progress
- [x] Initial audit complete
- [ ] README updates pending
- [ ] Configuration review pending`,
},
},
};
export const AgentWorkOrderExample = () => {
const [hoveredStepIndex, setHoveredStepIndex] = useState<number | null>(null);
const [expandedSteps, setExpandedSteps] = useState<Set<string>>(new Set(["step-2"]));
const [showDetails, setShowDetails] = useState(false);
const [humanInLoopCheckpoints, setHumanInLoopCheckpoints] = useState<Set<number>>(new Set());
const toggleStepExpansion = (stepId: string) => {
setExpandedSteps((prev) => {
const newSet = new Set(prev);
if (newSet.has(stepId)) {
newSet.delete(stepId);
} else {
newSet.add(stepId);
}
return newSet;
});
};
const addHumanInLoopCheckpoint = (index: number) => {
setHumanInLoopCheckpoints((prev) => {
const newSet = new Set(prev);
newSet.add(index);
return newSet;
});
setHoveredStepIndex(null);
};
const removeHumanInLoopCheckpoint = (index: number) => {
setHumanInLoopCheckpoints((prev) => {
const newSet = new Set(prev);
newSet.delete(index);
return newSet;
});
};
return (
<div className="space-y-6">
{/* Explanation Text */}
<p className="text-sm text-gray-600 dark:text-gray-400">
<strong>Use this layout for:</strong> Agent work order workflows with step-by-step progress tracking,
collapsible history, and integrated document editing for human-in-the-loop approval.
</p>
{/* Workflow Progress Bar */}
<Card blur="md" transparency="light" edgePosition="top" edgeColor="cyan" size="lg" className="overflow-visible">
<div className="flex items-center justify-between mb-6">
<h3 className="text-lg font-semibold text-gray-900 dark:text-white">{MOCK_WORK_ORDER.title}</h3>
<Button
variant="ghost"
size="sm"
onClick={() => setShowDetails(!showDetails)}
className="text-cyan-600 dark:text-cyan-400 hover:bg-cyan-500/10"
aria-label={showDetails ? "Hide details" : "Show details"}
>
{showDetails ? (
<ChevronUp className="w-4 h-4 mr-1" aria-hidden="true" />
) : (
<ChevronDown className="w-4 h-4 mr-1" aria-hidden="true" />
)}
Details
</Button>
</div>
<div className="flex items-center justify-center gap-0">
{MOCK_WORK_ORDER.workflow.steps.map((step, index) => (
<div key={step.id} className="flex items-center">
{/* Step Button */}
<WorkflowStepButton
isCompleted={step.status === "completed"}
isActive={step.status === "in_progress"}
stepName={step.name}
color="cyan"
size={50}
/>
{/* Connecting Line - only show between steps */}
{index < MOCK_WORK_ORDER.workflow.steps.length - 1 && (
// biome-ignore lint/a11y/noStaticElementInteractions: Visual hover effect container for showing plus button
<div
className="relative flex-shrink-0"
style={{ width: "80px", height: "50px" }}
onMouseEnter={() => setHoveredStepIndex(index)}
onMouseLeave={() => setHoveredStepIndex(null)}
>
{/* Neon line */}
<div
className={cn(
"absolute top-1/2 left-0 right-0 h-[2px] transition-all duration-200",
step.status === "completed"
? "border-t-2 border-cyan-400 shadow-[0_0_8px_rgba(34,211,238,0.6)]"
: "border-t-2 border-gray-600 dark:border-gray-700",
hoveredStepIndex === index &&
step.status !== "completed" &&
"border-cyan-400/50 shadow-[0_0_6px_rgba(34,211,238,0.3)]",
)}
/>
{/* Human-in-Loop Checkpoint Indicator */}
{humanInLoopCheckpoints.has(index) && (
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<button
type="button"
onClick={() => removeHumanInLoopCheckpoint(index)}
className="absolute top-1/2 left-1/2 transform -translate-x-1/2 -translate-y-1/2 bg-orange-500 hover:bg-orange-600 rounded-full p-1.5 shadow-lg shadow-orange-500/50 border-2 border-orange-400 transition-colors cursor-pointer"
aria-label="Remove Human-in-Loop checkpoint"
>
<User className="w-3.5 h-3.5 text-white" aria-hidden="true" />
</button>
</TooltipTrigger>
<TooltipContent>Click to remove</TooltipContent>
</Tooltip>
</TooltipProvider>
)}
{/* Plus button on hover - only show if no checkpoint exists */}
{hoveredStepIndex === index && !humanInLoopCheckpoints.has(index) && (
<TooltipProvider>
<Tooltip>
<TooltipTrigger asChild>
<button
type="button"
onClick={() => addHumanInLoopCheckpoint(index)}
className="absolute top-1/2 left-1/2 transform -translate-x-1/2 -translate-y-1/2 w-8 h-8 rounded-full bg-orange-500 hover:bg-orange-600 transition-colors shadow-lg shadow-orange-500/50 flex items-center justify-center text-white"
aria-label="Add Human-in-Loop step"
>
<Plus className="w-4 h-4" aria-hidden="true" />
</button>
</TooltipTrigger>
<TooltipContent>Add Human-in-Loop</TooltipContent>
</Tooltip>
</TooltipProvider>
)}
</div>
)}
</div>
))}
</div>
{/* Collapsible Details Section */}
<AnimatePresence>
{showDetails && (
<motion.div
initial={{ height: 0, opacity: 0 }}
animate={{ height: "auto", opacity: 1 }}
exit={{ height: 0, opacity: 0 }}
transition={{
height: {
duration: 0.3,
ease: [0.04, 0.62, 0.23, 0.98],
},
opacity: {
duration: 0.2,
ease: "easeInOut",
},
}}
style={{ overflow: "hidden" }}
className="mt-6"
>
<motion.div
initial={{ y: -20 }}
animate={{ y: 0 }}
exit={{ y: -20 }}
transition={{
duration: 0.2,
ease: "easeOut",
}}
className="grid grid-cols-1 md:grid-cols-2 gap-6 pt-6 border-t border-gray-200/50 dark:border-gray-700/30"
>
{/* Left Column */}
<div className="space-y-4">
<div>
<h4 className="text-xs font-semibold text-gray-500 dark:text-gray-400 uppercase tracking-wider mb-2">
Details
</h4>
<div className="space-y-3">
<div>
<p className="text-xs text-gray-500 dark:text-gray-400">Status</p>
<p className="text-sm font-medium text-blue-600 dark:text-blue-400 mt-0.5">Running</p>
</div>
<div>
<p className="text-xs text-gray-500 dark:text-gray-400">Sandbox Type</p>
<p className="text-sm font-medium text-gray-900 dark:text-white mt-0.5">git_branch</p>
</div>
<div>
<p className="text-xs text-gray-500 dark:text-gray-400">Repository</p>
<a
href="https://github.com/Wirasm/dylan"
target="_blank"
rel="noopener noreferrer"
className="text-sm font-medium text-cyan-600 dark:text-cyan-400 hover:underline inline-flex items-center gap-1 mt-0.5"
>
https://github.com/Wirasm/dylan
<ExternalLink className="w-3 h-3" aria-hidden="true" />
</a>
</div>
<div>
<p className="text-xs text-gray-500 dark:text-gray-400">Branch</p>
<p className="text-sm font-medium font-mono text-gray-900 dark:text-white mt-0.5">
docs/remove-archon-mentions
</p>
</div>
<div>
<p className="text-xs text-gray-500 dark:text-gray-400">Work Order ID</p>
<p className="text-sm font-medium font-mono text-gray-700 dark:text-gray-300 mt-0.5">
wo-7fd39c8d
</p>
</div>
</div>
</div>
</div>
{/* Right Column */}
<div className="space-y-4">
<div>
<h4 className="text-xs font-semibold text-gray-500 dark:text-gray-400 uppercase tracking-wider mb-2">
Statistics
</h4>
<div className="space-y-3">
<div>
<p className="text-xs text-gray-500 dark:text-gray-400">Commits</p>
<p className="text-2xl font-bold text-gray-900 dark:text-white mt-0.5">0</p>
</div>
<div>
<p className="text-xs text-gray-500 dark:text-gray-400">Files Changed</p>
<p className="text-2xl font-bold text-gray-900 dark:text-white mt-0.5">0</p>
</div>
<div>
<p className="text-xs text-gray-500 dark:text-gray-400">Steps Completed</p>
<p className="text-2xl font-bold text-gray-900 dark:text-white mt-0.5">2 / 2</p>
</div>
</div>
</div>
</div>
</motion.div>
</motion.div>
)}
</AnimatePresence>
</Card>
{/* Step History Section */}
<div className="space-y-4">
<h3 className="text-lg font-semibold text-gray-900 dark:text-white">Step History</h3>
{MOCK_WORK_ORDER.stepHistory.map((step) => (
<StepHistoryCard
key={step.id}
step={step}
isExpanded={expandedSteps.has(step.id)}
onToggle={() => toggleStepExpansion(step.id)}
document={step.isHumanInLoop ? MOCK_WORK_ORDER.document : undefined}
/>
))}
</div>
</div>
);
};

View File

@@ -0,0 +1,265 @@
import { AnimatePresence, motion } from "framer-motion";
import { AlertCircle, CheckCircle2, ChevronDown, ChevronUp, Edit3, Eye } from "lucide-react";
import { useState } from "react";
import ReactMarkdown from "react-markdown";
import { Button } from "@/features/ui/primitives/button";
import { Card } from "@/features/ui/primitives/card";
import { cn } from "@/features/ui/primitives/styles";
interface StepHistoryCardProps {
step: {
id: string;
stepName: string;
timestamp: string;
output: string;
session: string;
collapsible: boolean;
isHumanInLoop?: boolean;
};
isExpanded: boolean;
onToggle: () => void;
document?: {
title: string;
content: {
markdown: string;
};
};
}
export const StepHistoryCard = ({ step, isExpanded, onToggle, document }: StepHistoryCardProps) => {
const [isEditingDocument, setIsEditingDocument] = useState(false);
const [editedContent, setEditedContent] = useState("");
const [hasChanges, setHasChanges] = useState(false);
const handleToggleEdit = () => {
if (!isEditingDocument && document) {
setEditedContent(document.content.markdown);
}
setIsEditingDocument(!isEditingDocument);
setHasChanges(false);
};
const handleContentChange = (value: string) => {
setEditedContent(value);
setHasChanges(document ? value !== document.content.markdown : false);
};
const handleApproveAndContinue = () => {
console.log("Approved and continuing to next step");
setHasChanges(false);
setIsEditingDocument(false);
};
return (
<Card
blur="md"
transparency="light"
edgePosition="left"
edgeColor={step.isHumanInLoop ? "orange" : "blue"}
size="md"
className="overflow-visible"
>
{/* Header */}
<div className="flex items-center justify-between mb-3">
<div className="flex-1">
<div className="flex items-center gap-2">
<h4 className="font-semibold text-gray-900 dark:text-white">{step.stepName}</h4>
{step.isHumanInLoop && (
<span className="inline-flex items-center gap-1 px-2 py-1 text-xs font-medium rounded-md bg-orange-500/10 text-orange-600 dark:text-orange-400 border border-orange-500/20">
<AlertCircle className="w-3 h-3" aria-hidden="true" />
Human-in-Loop
</span>
)}
</div>
<p className="text-xs text-gray-500 dark:text-gray-400 mt-1">{step.timestamp}</p>
</div>
{/* Collapse toggle - only show if collapsible */}
{step.collapsible && (
<Button
variant="ghost"
size="sm"
onClick={onToggle}
className={cn(
"px-2 transition-colors",
step.isHumanInLoop
? "text-orange-500 hover:text-orange-600 dark:hover:text-orange-400"
: "text-cyan-500 hover:text-cyan-600 dark:hover:text-cyan-400",
)}
aria-label={isExpanded ? "Collapse step" : "Expand step"}
aria-expanded={isExpanded}
>
{isExpanded ? <ChevronUp className="w-4 h-4" /> : <ChevronDown className="w-4 h-4" />}
</Button>
)}
</div>
{/* Content - collapsible with animation */}
<AnimatePresence mode="wait">
{(isExpanded || !step.collapsible) && (
<motion.div
initial={{ height: 0, opacity: 0 }}
animate={{ height: "auto", opacity: 1 }}
exit={{ height: 0, opacity: 0 }}
transition={{
height: {
duration: 0.3,
ease: [0.04, 0.62, 0.23, 0.98],
},
opacity: {
duration: 0.2,
ease: "easeInOut",
},
}}
style={{ overflow: "hidden" }}
>
<motion.div
initial={{ y: -20 }}
animate={{ y: 0 }}
exit={{ y: -20 }}
transition={{
duration: 0.2,
ease: "easeOut",
}}
className="space-y-3"
>
{/* Output content */}
<div
className={cn(
"p-4 rounded-lg border",
step.isHumanInLoop
? "bg-orange-50/50 dark:bg-orange-950/10 border-orange-200/50 dark:border-orange-800/30"
: "bg-cyan-50/30 dark:bg-cyan-950/10 border-cyan-200/50 dark:border-cyan-800/30",
)}
>
<pre className="text-xs font-mono text-gray-700 dark:text-gray-300 whitespace-pre-wrap leading-relaxed">
{step.output}
</pre>
</div>
{/* Session info */}
<p
className={cn(
"text-xs font-mono",
step.isHumanInLoop ? "text-orange-600 dark:text-orange-400" : "text-cyan-600 dark:text-cyan-400",
)}
>
{step.session}
</p>
{/* Review and Approve Plan - only for human-in-loop steps with documents */}
{step.isHumanInLoop && document && (
<div className="mt-6 space-y-3">
<h4 className="text-sm font-semibold text-gray-900 dark:text-white">Review and Approve Plan</h4>
{/* Document Card */}
<Card blur="md" transparency="light" size="md" className="overflow-visible">
{/* View/Edit toggle in top right */}
<div className="flex items-center justify-end mb-3">
<Button
variant="ghost"
size="sm"
onClick={handleToggleEdit}
className="text-gray-600 dark:text-gray-400 hover:bg-gray-500/10"
aria-label={isEditingDocument ? "Switch to preview mode" : "Switch to edit mode"}
>
{isEditingDocument ? (
<Eye className="w-4 h-4" aria-hidden="true" />
) : (
<Edit3 className="w-4 h-4" aria-hidden="true" />
)}
</Button>
</div>
{isEditingDocument ? (
<div className="space-y-4">
<textarea
value={editedContent}
onChange={(e) => handleContentChange(e.target.value)}
className={cn(
"w-full min-h-[300px] p-4 rounded-lg",
"bg-white/50 dark:bg-black/30",
"border border-gray-300 dark:border-gray-700",
"text-gray-900 dark:text-white font-mono text-sm",
"focus:outline-none focus:border-orange-400 focus:ring-2 focus:ring-orange-400/20",
"resize-y",
)}
placeholder="Enter markdown content..."
/>
</div>
) : (
<div className="prose prose-sm dark:prose-invert max-w-none">
<ReactMarkdown
components={{
h1: ({ node, ...props }) => (
<h1 className="text-xl font-bold text-gray-900 dark:text-white mb-3 mt-4" {...props} />
),
h2: ({ node, ...props }) => (
<h2
className="text-lg font-semibold text-gray-900 dark:text-white mb-2 mt-3"
{...props}
/>
),
h3: ({ node, ...props }) => (
<h3
className="text-base font-semibold text-gray-900 dark:text-white mb-2 mt-3"
{...props}
/>
),
p: ({ node, ...props }) => (
<p className="text-sm text-gray-700 dark:text-gray-300 mb-2 leading-relaxed" {...props} />
),
ul: ({ node, ...props }) => (
<ul
className="list-disc list-inside text-sm text-gray-700 dark:text-gray-300 mb-2 space-y-1"
{...props}
/>
),
li: ({ node, ...props }) => <li className="ml-4" {...props} />,
code: ({ node, ...props }) => (
<code
className="bg-gray-100 dark:bg-gray-800 px-1.5 py-0.5 rounded text-xs font-mono text-orange-600 dark:text-orange-400"
{...props}
/>
),
}}
>
{document.content.markdown}
</ReactMarkdown>
</div>
)}
{/* Approve button - always visible with glass styling */}
<div className="flex items-center justify-between mt-4 pt-4 border-t border-gray-200/50 dark:border-gray-700/30">
<p className="text-xs text-gray-500 dark:text-gray-400">
{hasChanges ? "Unsaved changes" : "No changes"}
</p>
<Button
onClick={handleApproveAndContinue}
className={cn(
"backdrop-blur-md",
"bg-gradient-to-b from-green-100/80 to-white/60",
"dark:from-green-500/20 dark:to-green-500/10",
"text-green-700 dark:text-green-100",
"border border-green-300/50 dark:border-green-500/50",
"hover:from-green-200/90 hover:to-green-100/70",
"dark:hover:from-green-400/30 dark:hover:to-green-500/20",
"hover:shadow-[0_0_20px_rgba(34,197,94,0.5)]",
"dark:hover:shadow-[0_0_25px_rgba(34,197,94,0.7)]",
"shadow-lg shadow-green-500/20",
)}
>
<CheckCircle2 className="w-4 h-4 mr-2" aria-hidden="true" />
Approve and Move to Next Step
</Button>
</div>
</Card>
</div>
)}
</motion.div>
</motion.div>
)}
</AnimatePresence>
</Card>
);
};

View File

@@ -0,0 +1,170 @@
import { motion } from "framer-motion";
import type React from "react";
interface WorkflowStepButtonProps {
isCompleted: boolean;
isActive: boolean;
stepName: string;
onClick?: () => void;
color?: "cyan" | "green" | "blue" | "purple";
size?: number;
}
// Helper function to get color hex values for animations
const getColorValue = (color: string) => {
const colorValues = {
purple: "rgb(168,85,247)",
green: "rgb(34,197,94)",
blue: "rgb(59,130,246)",
cyan: "rgb(34,211,238)",
};
return colorValues[color as keyof typeof colorValues] || colorValues.blue;
};
export const WorkflowStepButton: React.FC<WorkflowStepButtonProps> = ({
isCompleted,
isActive,
stepName,
onClick,
color = "cyan",
size = 40,
}) => {
const colorMap = {
purple: {
border: "border-purple-400",
glow: "shadow-[0_0_15px_rgba(168,85,247,0.8)]",
glowHover: "hover:shadow-[0_0_25px_rgba(168,85,247,1)]",
fill: "bg-purple-400",
innerGlow: "shadow-[inset_0_0_10px_rgba(168,85,247,0.8)]",
},
green: {
border: "border-green-400",
glow: "shadow-[0_0_15px_rgba(34,197,94,0.8)]",
glowHover: "hover:shadow-[0_0_25px_rgba(34,197,94,1)]",
fill: "bg-green-400",
innerGlow: "shadow-[inset_0_0_10px_rgba(34,197,94,0.8)]",
},
blue: {
border: "border-blue-400",
glow: "shadow-[0_0_15px_rgba(59,130,246,0.8)]",
glowHover: "hover:shadow-[0_0_25px_rgba(59,130,246,1)]",
fill: "bg-blue-400",
innerGlow: "shadow-[inset_0_0_10px_rgba(59,130,246,0.8)]",
},
cyan: {
border: "border-cyan-400",
glow: "shadow-[0_0_15px_rgba(34,211,238,0.8)]",
glowHover: "hover:shadow-[0_0_25px_rgba(34,211,238,1)]",
fill: "bg-cyan-400",
innerGlow: "shadow-[inset_0_0_10px_rgba(34,211,238,0.8)]",
},
};
const styles = colorMap[color];
return (
<div className="flex flex-col items-center gap-2">
<motion.button
onClick={onClick}
className={`
relative rounded-full border-2 transition-all duration-300
${styles.border}
${isCompleted ? styles.glow : "shadow-[0_0_5px_rgba(0,0,0,0.3)]"}
${styles.glowHover}
bg-gradient-to-b from-gray-900 to-black
hover:scale-110
active:scale-95
`}
style={{ width: size, height: size }}
whileHover={{ scale: 1.1 }}
whileTap={{ scale: 0.95 }}
type="button"
aria-label={`${stepName} - ${isCompleted ? "completed" : isActive ? "in progress" : "pending"}`}
>
{/* Outer ring glow effect */}
<motion.div
className={`
absolute inset-[-4px] rounded-full border-2
${isCompleted ? styles.border : "border-transparent"}
blur-sm
`}
animate={{
opacity: isCompleted ? [0.3, 0.6, 0.3] : 0,
}}
transition={{
duration: 2,
repeat: Infinity,
ease: "easeInOut",
}}
/>
{/* Inner glow effect */}
<motion.div
className={`
absolute inset-[2px] rounded-full
${isCompleted ? styles.fill : ""}
blur-md opacity-20
`}
animate={{
opacity: isCompleted ? [0.1, 0.3, 0.1] : 0,
}}
transition={{
duration: 2,
repeat: Infinity,
ease: "easeInOut",
}}
/>
{/* Checkmark icon container */}
<div className="relative w-full h-full flex items-center justify-center">
<motion.svg
width={size * 0.5}
height={size * 0.5}
viewBox="0 0 24 24"
fill="none"
className="relative z-10"
role="img"
aria-label={`${stepName} status indicator`}
animate={{
filter: isCompleted
? [
`drop-shadow(0 0 8px ${getColorValue(color)}) drop-shadow(0 0 12px ${getColorValue(color)})`,
`drop-shadow(0 0 12px ${getColorValue(color)}) drop-shadow(0 0 16px ${getColorValue(color)})`,
`drop-shadow(0 0 8px ${getColorValue(color)}) drop-shadow(0 0 12px ${getColorValue(color)})`,
]
: "none",
}}
transition={{
duration: 2,
repeat: Infinity,
ease: "easeInOut",
}}
>
{/* Checkmark path */}
<path
d="M20 6L9 17l-5-5"
stroke="currentColor"
strokeWidth="3"
strokeLinecap="round"
strokeLinejoin="round"
className={isCompleted ? "text-white" : "text-gray-600"}
/>
</motion.svg>
</div>
</motion.button>
{/* Step name label */}
<span
className={`text-xs font-medium transition-colors ${
isCompleted
? "text-cyan-400 dark:text-cyan-300"
: isActive
? "text-blue-500 dark:text-blue-400"
: "text-gray-500 dark:text-gray-400"
}`}
>
{stepName}
</span>
</div>
);
};

View File

@@ -1,4 +1,7 @@
import { ChevronLeft, ChevronRight } from "lucide-react";
import type { ReactNode } from "react";
import { useState } from "react";
import { Button } from "@/features/ui/primitives/button";
import { cn } from "@/features/ui/primitives/styles";
export interface SideNavigationSection {
@@ -14,9 +17,23 @@ interface SideNavigationProps {
}
export const SideNavigation = ({ sections, activeSection, onSectionClick }: SideNavigationProps) => {
const [isCollapsed, setIsCollapsed] = useState(false);
return (
<div className={cn("flex-shrink-0 transition-all duration-300", isCollapsed ? "w-12" : "w-32")}>
<div className="sticky top-4 space-y-0.5">
{/* Collapse/Expand button */}
<div className="mb-2 flex justify-end">
<Button
variant="ghost"
size="sm"
onClick={() => setIsCollapsed(!isCollapsed)}
className="px-2 py-1 h-auto text-gray-500 hover:text-gray-700 dark:hover:text-gray-300"
aria-label={isCollapsed ? "Expand navigation" : "Collapse navigation"}
>
{isCollapsed ? <ChevronRight className="w-4 h-4" /> : <ChevronLeft className="w-4 h-4" />}
</Button>
</div>
{sections.map((section) => {
const isActive = activeSection === section.id;
return (
@@ -24,16 +41,18 @@ export const SideNavigation = ({ sections, activeSection, onSectionClick }: Side
key={section.id}
type="button"
onClick={() => onSectionClick(section.id)}
title={isCollapsed ? section.label : undefined}
className={cn(
"w-full text-left px-2 py-1.5 rounded-md transition-all duration-200",
"flex items-center gap-1.5",
isActive
? "bg-blue-500/10 dark:bg-blue-400/10 text-blue-700 dark:text-blue-300 border-l-2 border-blue-500"
: "text-gray-600 dark:text-gray-400 hover:bg-white/5 dark:hover:bg-white/5 border-l-2 border-transparent",
isCollapsed && "justify-center",
)}
>
{section.icon && <span className="flex-shrink-0 w-3 h-3">{section.icon}</span>}
{!isCollapsed && <span className="text-xs font-medium truncate">{section.label}</span>}
</button>
);
})}

View File

@@ -1,5 +1,6 @@
import { Briefcase, Database, FileText, FolderKanban, Navigation, Settings } from "lucide-react";
import { useState } from "react";
import { AgentWorkOrderExample } from "../layouts/AgentWorkOrderExample";
import { DocumentBrowserExample } from "../layouts/DocumentBrowserExample";
import { KnowledgeLayoutExample } from "../layouts/KnowledgeLayoutExample";
import { NavigationExplanation } from "../layouts/NavigationExplanation";
@@ -16,6 +17,7 @@ export const LayoutsTab = () => {
{ id: "settings", label: "Settings", icon: <Settings className="w-4 h-4" /> },
{ id: "knowledge", label: "Knowledge", icon: <Database className="w-4 h-4" /> },
{ id: "document-browser", label: "Document Browser", icon: <FileText className="w-4 h-4" /> },
{ id: "agent-work-orders", label: "Agent Work Orders", icon: <Briefcase className="w-4 h-4" /> },
];
// Render content based on active section
@@ -68,6 +70,16 @@ export const LayoutsTab = () => {
<DocumentBrowserExample />
</div>
);
case "agent-work-orders":
return (
<div>
<h2 className="text-2xl font-bold mb-4 text-gray-900 dark:text-white">Agent Work Orders Layout</h2>
<p className="text-gray-600 dark:text-gray-400 mb-4">
Workflow progress visualization with step-by-step history and integrated document editing.
</p>
<AgentWorkOrderExample />
</div>
);
default:
return (
<div>

View File

@@ -59,6 +59,7 @@ server = [
"pydantic>=2.0.0", "pydantic>=2.0.0",
"python-dotenv>=1.0.0", "python-dotenv>=1.0.0",
"docker>=6.1.0", "docker>=6.1.0",
"tldextract>=5.0.0",
# Logging # Logging
"logfire>=0.30.0", "logfire>=0.30.0",
# Testing (needed for UI-triggered tests) # Testing (needed for UI-triggered tests)
@@ -127,6 +128,7 @@ all = [
"cryptography>=41.0.0", "cryptography>=41.0.0",
"slowapi>=0.1.9", "slowapi>=0.1.9",
"docker>=6.1.0", "docker>=6.1.0",
"tldextract>=5.0.0",
"logfire>=0.30.0", "logfire>=0.30.0",
# MCP specific (mcp version) # MCP specific (mcp version)
"mcp==1.12.2", "mcp==1.12.2",

View File

@@ -12,6 +12,7 @@ from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from ..config.logfire_config import get_logger
from ..config.version import GITHUB_REPO_NAME, GITHUB_REPO_OWNER
logger = get_logger(__name__)
@@ -47,7 +48,9 @@ class BugReportResponse(BaseModel):
class GitHubService:
def __init__(self):
self.token = os.getenv("GITHUB_TOKEN")
# Use centralized version config with environment override
default_repo = f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
self.repo = os.getenv("GITHUB_REPO", default_repo)
async def create_issue(self, bug_report: BugReportRequest) -> dict[str, Any]:
"""Create a GitHub issue from a bug report."""
@@ -243,14 +246,14 @@ def _create_manual_submission_response(bug_report: BugReportRequest) -> BugRepor
import urllib.parse
base_url = f"https://github.com/{github_service.repo}/issues/new"
# Use Markdown template for structured layout with URL pre-filling support
# YAML templates don't support URL parameters, but Markdown templates do
params = {
"template": "auto_bug_report.md",
"title": bug_report.title,
"body": issue_body,
}
# Build the URL
query_string = urllib.parse.urlencode(params)
@@ -271,10 +274,13 @@ async def bug_report_health():
github_configured = bool(os.getenv("GITHUB_TOKEN"))
repo_configured = bool(os.getenv("GITHUB_REPO"))
# Use centralized version config with environment override
default_repo = f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
return {
"status": "healthy" if github_configured else "degraded",
"github_token_configured": github_configured,
"github_repo_configured": repo_configured,
"repo": os.getenv("GITHUB_REPO", default_repo),
"message": "Bug reporting is ready" if github_configured else "GitHub token not configured",
}
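As a side note, a minimal sketch (not part of the diff) of the pre-filled issue URL that the manual-submission fallback above builds; the repo assumes the defaults from `version.py`, and the title/body values are placeholders:

```python
import urllib.parse

base_url = "https://github.com/coleam00/Archon/issues/new"  # assumes the default owner/repo
params = {
    "template": "auto_bug_report.md",
    "title": "Example bug title",       # placeholder
    "body": "## What happened\n...",    # placeholder issue body
}
# urlencode percent-escapes the title and body so they survive as query parameters.
print(f"{base_url}?{urllib.parse.urlencode(params)}")
```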

View File

@@ -11,6 +11,8 @@ import uuid
from collections.abc import Awaitable, Callable
from typing import Any, Optional
import tldextract
from ...config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info
from ...utils import get_supabase_client
from ...utils.progress.progress_tracker import ProgressTracker
@@ -18,12 +20,13 @@ from ..credential_service import credential_service
# Import strategies
# Import operations
from .discovery_service import DiscoveryService
from .document_storage_operations import DocumentStorageOperations
from .helpers.site_config import SiteConfig
# Import helpers
from .helpers.url_handler import URLHandler
from .page_storage_operations import PageStorageOperations
from .progress_mapper import ProgressMapper
from .strategies.batch import BatchCrawlStrategy
from .strategies.recursive import RecursiveCrawlStrategy
@@ -37,6 +40,34 @@ _active_orchestrations: dict[str, "CrawlingService"] = {}
_orchestration_lock: asyncio.Lock | None = None
def get_root_domain(host: str) -> str:
"""
Extract the root domain from a hostname using tldextract.
Handles multi-part public suffixes correctly (e.g., .co.uk, .com.au).
Args:
host: Hostname to extract root domain from
Returns:
Root domain (domain + suffix) or original host if extraction fails
Examples:
- "docs.example.com" -> "example.com"
- "api.example.co.uk" -> "example.co.uk"
- "localhost" -> "localhost"
"""
try:
extracted = tldextract.extract(host)
# Return domain.suffix if both are present
if extracted.domain and extracted.suffix:
return f"{extracted.domain}.{extracted.suffix}"
# Fallback to original host if extraction yields no domain or suffix
return host
except Exception:
# If extraction fails, return original host
return host
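For reference, a brief sketch (not part of the diff) of how `tldextract` behaves for the docstring examples above, assuming its standard `ExtractResult` fields:

```python
import tldextract

parts = tldextract.extract("docs.example.co.uk")
# parts.subdomain == "docs", parts.domain == "example", parts.suffix == "co.uk"
print(f"{parts.domain}.{parts.suffix}")  # example.co.uk, matching get_root_domain()

# "localhost" has no registrable suffix, so get_root_domain() falls back to the host itself.
print(tldextract.extract("localhost").suffix == "")  # True
```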
def _ensure_orchestration_lock() -> asyncio.Lock:
global _orchestration_lock
if _orchestration_lock is None:
@@ -99,6 +130,7 @@ class CrawlingService:
# Initialize operations
self.doc_storage_ops = DocumentStorageOperations(self.supabase_client)
self.discovery_service = DiscoveryService()
self.page_storage_ops = PageStorageOperations(self.supabase_client)
# Track progress state across all stages to prevent UI resets
@@ -196,13 +228,16 @@ class CrawlingService:
)
async def crawl_markdown_file(
self, url: str, progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
start_progress: int = 10, end_progress: int = 20
) -> list[dict[str, Any]]:
"""Crawl a .txt or markdown file."""
return await self.single_page_strategy.crawl_markdown_file(
url,
self.url_handler.transform_github_url,
progress_callback,
start_progress,
end_progress,
)
def parse_sitemap(self, sitemap_url: str) -> list[str]:
@@ -351,15 +386,102 @@ class CrawlingService:
# Check for cancellation before proceeding
self._check_cancellation()
# Discovery phase - find the single best related file
discovered_urls = []
# Skip discovery if the URL itself is already a discovery target (sitemap, llms file, etc.)
is_already_discovery_target = (
self.url_handler.is_sitemap(url) or
self.url_handler.is_llms_variant(url) or
self.url_handler.is_robots_txt(url) or
self.url_handler.is_well_known_file(url) or
self.url_handler.is_txt(url) # Also skip for any .txt file that the user provides directly
)
if is_already_discovery_target:
safe_logfire_info(f"Skipping discovery - URL is already a discovery target file: {url}")
if request.get("auto_discovery", True) and not is_already_discovery_target: # Default enabled, but skip if already a discovery file
await update_mapped_progress(
"discovery", 25, f"Discovering best related file for {url}", current_url=url
)
try:
# Offload potential sync I/O to avoid blocking the event loop
discovered_file = await asyncio.to_thread(self.discovery_service.discover_files, url)
# Add the single best discovered file to crawl list
if discovered_file:
safe_logfire_info(f"Discovery found file: {discovered_file}")
# Filter through is_binary_file() check like existing code
if not self.url_handler.is_binary_file(discovered_file):
discovered_urls.append(discovered_file)
safe_logfire_info(f"Adding discovered file to crawl: {discovered_file}")
# Determine file type for user feedback
discovered_file_type = "unknown"
if self.url_handler.is_llms_variant(discovered_file):
discovered_file_type = "llms.txt"
elif self.url_handler.is_sitemap(discovered_file):
discovered_file_type = "sitemap"
elif self.url_handler.is_robots_txt(discovered_file):
discovered_file_type = "robots.txt"
await update_mapped_progress(
"discovery", 100,
f"Discovery completed: found {discovered_file_type} file",
current_url=url,
discovered_file=discovered_file,
discovered_file_type=discovered_file_type
)
else:
safe_logfire_info(f"Skipping binary file: {discovered_file}")
else:
safe_logfire_info(f"Discovery found no files for {url}")
await update_mapped_progress(
"discovery", 100,
"Discovery completed: no special files found, will crawl main URL",
current_url=url
)
except Exception as e:
safe_logfire_error(f"Discovery phase failed: {e}")
# Continue with regular crawl even if discovery fails
await update_mapped_progress(
"discovery", 100, "Discovery phase failed, continuing with regular crawl", current_url=url
)
# Analyzing stage - determine what to crawl
if discovered_urls:
# Discovery found a file - crawl ONLY the discovered file, not the main URL
total_urls_to_crawl = len(discovered_urls)
await update_mapped_progress(
"analyzing", 50, f"Analyzing discovered file: {discovered_urls[0]}",
total_pages=total_urls_to_crawl,
processed_pages=0
)
# Crawl only the discovered file with discovery context
discovered_url = discovered_urls[0]
safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_url}")
# Mark this as a discovery target for domain filtering
discovery_request = request.copy()
discovery_request["is_discovery_target"] = True
discovery_request["original_domain"] = self.url_handler.get_base_url(discovered_url)
crawl_results, crawl_type = await self._crawl_by_url_type(discovered_url, discovery_request)
else:
# No discovery - crawl the main URL normally
total_urls_to_crawl = 1
await update_mapped_progress(
"analyzing", 50, f"Analyzing URL type for {url}",
total_pages=total_urls_to_crawl,
processed_pages=0
)
# Crawl the main URL
safe_logfire_info(f"No discovery file found, crawling main URL: {url}")
crawl_results, crawl_type = await self._crawl_by_url_type(url, request)
# Update progress tracker with crawl type
if self.progress_tracker and crawl_type:
@@ -531,7 +653,7 @@ class CrawlingService:
logger.error("Code extraction failed, continuing crawl without code examples", exc_info=True)
safe_logfire_error(f"Code extraction failed | error={e}")
code_examples_count = 0
# Report code extraction failure to progress tracker
if self.progress_tracker:
await self.progress_tracker.update(
@@ -628,6 +750,66 @@ class CrawlingService:
f"Unregistered orchestration service on error | progress_id={self.progress_id}"
)
def _is_same_domain(self, url: str, base_domain: str) -> bool:
"""
Check if a URL belongs to the same domain as the base domain.
Args:
url: URL to check
base_domain: Base domain URL to compare against
Returns:
True if the URL is from the same domain
"""
try:
from urllib.parse import urlparse
u, b = urlparse(url), urlparse(base_domain)
url_host = (u.hostname or "").lower()
base_host = (b.hostname or "").lower()
return bool(url_host) and url_host == base_host
except Exception:
# If parsing fails, be conservative and exclude the URL
return False
def _is_same_domain_or_subdomain(self, url: str, base_domain: str) -> bool:
"""
Check if a URL belongs to the same root domain or subdomain.
Examples:
- docs.supabase.com matches supabase.com (subdomain)
- api.supabase.com matches supabase.com (subdomain)
- supabase.com matches supabase.com (exact match)
- external.com does NOT match supabase.com
Args:
url: URL to check
base_domain: Base domain URL to compare against
Returns:
True if the URL is from the same root domain or subdomain
"""
try:
from urllib.parse import urlparse
u, b = urlparse(url), urlparse(base_domain)
url_host = (u.hostname or "").lower()
base_host = (b.hostname or "").lower()
if not url_host or not base_host:
return False
# Exact match
if url_host == base_host:
return True
# Check if url_host is a subdomain of base_host using tldextract
url_root = get_root_domain(url_host)
base_root = get_root_domain(base_host)
return url_root == base_root
except Exception:
# If parsing fails, be conservative and exclude the URL
return False
def _is_self_link(self, link: str, base_url: str) -> bool:
"""
Check if a link is a self-referential link to the base URL.
@@ -700,6 +882,63 @@ class CrawlingService:
if crawl_results and len(crawl_results) > 0:
content = crawl_results[0].get('markdown', '')
if self.url_handler.is_link_collection_file(url, content):
# If this file was selected by discovery, check if it's an llms.txt file
if request.get("is_discovery_target"):
# Check if this is an llms.txt file (not sitemap or other discovery targets)
is_llms_file = self.url_handler.is_llms_variant(url)
if is_llms_file:
logger.info(f"Discovery llms.txt mode: following ALL same-domain links from {url}")
# Extract all links from the file
extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url)
# Filter for same-domain links (all types, not just llms.txt)
same_domain_links = []
if extracted_links_with_text:
original_domain = request.get("original_domain")
if original_domain:
for link, text in extracted_links_with_text:
# Check same domain/subdomain for ALL links
if self._is_same_domain_or_subdomain(link, original_domain):
same_domain_links.append((link, text))
logger.debug(f"Found same-domain link: {link}")
if same_domain_links:
# Build mapping and extract just URLs
url_to_link_text = dict(same_domain_links)
extracted_urls = [link for link, _ in same_domain_links]
logger.info(f"Following {len(extracted_urls)} same-domain links from llms.txt")
# Notify user about linked files being crawled
await update_crawl_progress(
60, # 60% of crawling stage
f"Found {len(extracted_urls)} links in llms.txt, crawling them now...",
crawl_type="llms_txt_linked_files",
linked_files=extracted_urls
)
# Crawl all same-domain links from llms.txt (no recursion, just one level)
batch_results = await self.crawl_batch_with_progress(
extracted_urls,
max_concurrent=request.get('max_concurrent'),
progress_callback=await self._create_crawl_progress_callback("crawling"),
link_text_fallbacks=url_to_link_text,
)
# Combine original llms.txt with linked pages
crawl_results.extend(batch_results)
crawl_type = "llms_txt_with_linked_pages"
logger.info(f"llms.txt crawling completed: {len(crawl_results)} total pages (1 llms.txt + {len(batch_results)} linked pages)")
return crawl_results, crawl_type
# For non-llms.txt discovery targets (sitemaps, robots.txt), keep single-file mode
logger.info(f"Discovery single-file mode: skipping link extraction for {url}")
crawl_type = "discovery_single_file"
logger.info(f"Discovery file crawling completed: {len(crawl_results)} result")
return crawl_results, crawl_type
# Extract links WITH text from the content
extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url)
@@ -714,6 +953,19 @@ class CrawlingService:
if self_filtered_count > 0:
logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")
# For discovery targets, only follow same-domain links
if extracted_links_with_text and request.get("is_discovery_target"):
original_domain = request.get("original_domain")
if original_domain:
original_count = len(extracted_links_with_text)
extracted_links_with_text = [
(link, text) for link, text in extracted_links_with_text
if self._is_same_domain(link, original_domain)
]
domain_filtered_count = original_count - len(extracted_links_with_text)
if domain_filtered_count > 0:
safe_logfire_info(f"Discovery mode: filtered out {domain_filtered_count} external links, keeping {len(extracted_links_with_text)} same-domain links")
# Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
if extracted_links_with_text:
original_count = len(extracted_links_with_text)
@@ -724,26 +976,39 @@ class CrawlingService:
if extracted_links_with_text:
# Build mapping of URL -> link text for title fallback
url_to_link_text = dict(extracted_links_with_text)
extracted_links = [link for link, _ in extracted_links_with_text]
# For discovery targets, respect max_depth for same-domain links
max_depth = request.get('max_depth', 2) if request.get("is_discovery_target") else request.get('max_depth', 1)
if max_depth > 1 and request.get("is_discovery_target"):
# Use recursive crawling to respect depth limit for same-domain links
logger.info(f"Crawling {len(extracted_links)} same-domain links with max_depth={max_depth-1}")
batch_results = await self.crawl_recursive_with_progress(
extracted_links,
max_depth=max_depth - 1, # Reduce depth since we're already 1 level deep
max_concurrent=request.get('max_concurrent'),
progress_callback=await self._create_crawl_progress_callback("crawling"),
)
else:
# Use normal batch crawling (with link text fallbacks)
logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")
batch_results = await self.crawl_batch_with_progress(
extracted_links,
max_concurrent=request.get('max_concurrent'), # None -> use DB settings
progress_callback=await self._create_crawl_progress_callback("crawling"),
link_text_fallbacks=url_to_link_text, # Pass link text for title fallback
)
# Combine original text file results with batch results
crawl_results.extend(batch_results)
crawl_type = "link_collection_with_crawled_links"
logger.info(f"Link collection crawling completed: {len(crawl_results)} total results (1 text file + {len(batch_results)} extracted links)")
else:
logger.info(f"No valid links found in link collection file: {url}")
logger.info(f"Text file crawling completed: {len(crawl_results)} results")
elif self.url_handler.is_sitemap(url):
# Handle sitemaps
@@ -753,6 +1018,20 @@ class CrawlingService:
"Detected sitemap, parsing URLs...",
crawl_type=crawl_type
)
# If this sitemap was selected by discovery, just return the sitemap itself (single-file mode)
if request.get("is_discovery_target"):
logger.info(f"Discovery single-file mode: returning sitemap itself without crawling URLs from {url}")
crawl_type = "discovery_sitemap"
# Return the sitemap file as the result
crawl_results = [{
'url': url,
'markdown': f"# Sitemap: {url}\n\nThis is a sitemap file discovered and returned in single-file mode.",
'title': f"Sitemap - {self.url_handler.extract_display_name(url)}",
'crawl_type': crawl_type
}]
return crawl_results, crawl_type
sitemap_urls = self.parse_sitemap(url) sitemap_urls = self.parse_sitemap(url)
if sitemap_urls: if sitemap_urls:
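
For orientation, the branches above key off a handful of fields read from the request dict; a hedged sketch of what such a request might carry (only the keys that actually appear in this hunk are taken from the diff, the values are illustrative):

request = {
    "max_depth": 2,               # honored for same-domain links when the URL came from discovery
    "max_concurrent": None,       # None -> fall back to database settings
    "is_discovery_target": True,  # set when DiscoveryService selected this URL
}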

View File

@@ -0,0 +1,558 @@
"""
Discovery Service for Automatic File Detection
Handles automatic discovery and parsing of llms.txt, sitemap.xml, and related files
to enhance crawling capabilities with priority-based discovery methods.
"""
import ipaddress
import socket
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
import requests
from ...config.logfire_config import get_logger
logger = get_logger(__name__)
class SitemapHTMLParser(HTMLParser):
"""HTML parser for extracting sitemap references from link and meta tags."""
def __init__(self):
super().__init__()
self.sitemaps = []
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
"""Handle start tags to find sitemap references."""
attrs_dict = {k.lower(): v for k, v in attrs if v is not None}
# Check <link rel="sitemap" href="...">
if tag == 'link':
rel = attrs_dict.get('rel', '').lower()
# Handle multi-valued rel attributes (space-separated)
rel_values = rel.split() if rel else []
if 'sitemap' in rel_values:
href = attrs_dict.get('href')
if href:
self.sitemaps.append(('link', href))
# Check <meta name="sitemap" content="...">
elif tag == 'meta':
name = attrs_dict.get('name', '').lower()
if name == 'sitemap':
content = attrs_dict.get('content')
if content:
self.sitemaps.append(('meta', content))
class DiscoveryService:
"""Service for discovering related files automatically during crawls."""
# Maximum response size to prevent memory exhaustion (10MB default)
MAX_RESPONSE_SIZE = 10 * 1024 * 1024 # 10 MB
# Global priority order - select ONE best file from all categories
# Based on actual usage research - only includes files commonly found in the wild
DISCOVERY_PRIORITY = [
# LLMs files (highest priority - most comprehensive AI guidance)
"llms.txt", # Standard llms.txt spec - widely adopted
"llms-full.txt", # Part of llms.txt spec - comprehensive content
# Sitemap files (structural crawling guidance)
"sitemap.xml", # Universal standard for site structure
# Robots file (basic crawling rules)
"robots.txt", # Universal standard for crawl directives
# Well-known variants (alternative locations per RFC 8615)
".well-known/ai.txt",
".well-known/llms.txt",
".well-known/sitemap.xml"
]
# Known file extensions for path detection
FILE_EXTENSIONS = {
'.html', '.htm', '.xml', '.json', '.txt', '.md', '.csv',
'.rss', '.yaml', '.yml', '.pdf', '.zip'
}
def discover_files(self, base_url: str) -> str | None:
"""
Main discovery orchestrator - selects ONE best file across all categories.
All files contain similar AI/crawling guidance, so we only need the best one.
Args:
base_url: Base URL to discover files for
Returns:
Single best URL found, or None if no files discovered
"""
try:
logger.info(f"Starting single-file discovery for {base_url}")
# Extract directory path from base URL
base_dir = self._extract_directory(base_url)
# Try each file in priority order
for filename in self.DISCOVERY_PRIORITY:
discovered_url = self._try_locations(base_url, base_dir, filename)
if discovered_url:
logger.info(f"Discovery found best file: {discovered_url}")
return discovered_url
# Fallback: Check HTML meta tags for sitemap references
html_sitemaps = self._parse_html_meta_tags(base_url)
if html_sitemaps:
best_file = html_sitemaps[0]
logger.info(f"Discovery found best file from HTML meta tags: {best_file}")
return best_file
logger.info(f"Discovery completed for {base_url}: no files found")
return None
except Exception:
logger.exception(f"Unexpected error during discovery for {base_url}")
return None
def _extract_directory(self, base_url: str) -> str:
"""
Extract directory path from URL, handling both file URLs and directory URLs.
Args:
base_url: URL to extract directory from
Returns:
Directory path (without trailing slash)
"""
parsed = urlparse(base_url)
base_path = parsed.path.rstrip('/')
# Check if last segment is a file (has known extension)
last_segment = base_path.split('/')[-1] if base_path else ''
has_file_extension = any(last_segment.lower().endswith(ext) for ext in self.FILE_EXTENSIONS)
if has_file_extension:
# Remove filename to get directory
return '/'.join(base_path.split('/')[:-1])
else:
# Last segment is a directory
return base_path
def _try_locations(self, base_url: str, base_dir: str, filename: str) -> str | None:
"""
Try different locations for a given filename in priority order.
Priority:
1. Same directory as base_url (if not root)
2. Root level
3. Common subdirectories (based on file type)
Args:
base_url: Original base URL
base_dir: Extracted directory path
filename: Filename to search for
Returns:
URL if file found, None otherwise
"""
parsed = urlparse(base_url)
# Priority 1: Check same directory (if not root)
if base_dir and base_dir != '/':
same_dir_url = f"{parsed.scheme}://{parsed.netloc}{base_dir}/{filename}"
if self._check_url_exists(same_dir_url):
return same_dir_url
# Priority 2: Check root level
root_url = urljoin(base_url, filename)
if self._check_url_exists(root_url):
return root_url
# Priority 3: Check common subdirectories
subdirs = self._get_subdirs_for_file(base_dir, filename)
for subdir in subdirs:
subdir_url = urljoin(base_url, f"{subdir}/{filename}")
if self._check_url_exists(subdir_url):
return subdir_url
return None
def _get_subdirs_for_file(self, base_dir: str, filename: str) -> list[str]:
"""
Get relevant subdirectories to check based on file type.
Args:
base_dir: Base directory path
filename: Filename being searched for
Returns:
List of subdirectory names to check
"""
subdirs = []
# Include base directory name if available
if base_dir and base_dir != '/':
base_dir_name = base_dir.split('/')[-1]
if base_dir_name:
subdirs.append(base_dir_name)
# Add type-specific subdirectories
if filename.startswith('llms') or filename.endswith('.txt') or filename.endswith('.md'):
# LLMs files commonly in these locations
subdirs.extend(["docs", "static", "public", "assets", "doc", "api"])
elif filename.endswith('.xml') and not filename.startswith('.well-known'):
# Sitemap files commonly in these locations
subdirs.extend(["docs", "sitemaps", "sitemap", "xml", "feed"])
return subdirs
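# Worked example (comments only, values illustrative): for base_url "https://example.com",
# each filename in DISCOVERY_PRIORITY is probed in order via _try_locations(), so the checks run roughly as:
#   https://example.com/llms.txt        (then docs/, static/, public/, ... fallbacks)
#   https://example.com/llms-full.txt
#   https://example.com/sitemap.xml     (then sitemaps/, sitemap/, xml/, feed/ fallbacks)
#   https://example.com/robots.txt
#   https://example.com/.well-known/ai.txt, .well-known/llms.txt, .well-known/sitemap.xml
# The first URL that answers HTTP 200 from a safe public IP wins; if nothing hits,
# discover_files() falls back to HTML <link rel="sitemap"> / <meta name="sitemap"> tags.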
def _is_safe_ip(self, ip_str: str) -> bool:
"""
Check if an IP address is safe (not private, loopback, link-local, or cloud metadata).
Args:
ip_str: IP address string to check
Returns:
True if IP is safe for outbound requests, False otherwise
"""
try:
ip = ipaddress.ip_address(ip_str)
# Block private networks
if ip.is_private:
logger.warning(f"Blocked private IP address: {ip_str}")
return False
# Block loopback (127.0.0.0/8, ::1)
if ip.is_loopback:
logger.warning(f"Blocked loopback IP address: {ip_str}")
return False
# Block link-local (169.254.0.0/16, fe80::/10)
if ip.is_link_local:
logger.warning(f"Blocked link-local IP address: {ip_str}")
return False
# Block multicast
if ip.is_multicast:
logger.warning(f"Blocked multicast IP address: {ip_str}")
return False
# Block reserved ranges
if ip.is_reserved:
logger.warning(f"Blocked reserved IP address: {ip_str}")
return False
# Additional explicit check for the cloud metadata service
# (AWS and GCP both expose instance metadata at 169.254.169.254)
if str(ip) == "169.254.169.254":
logger.warning(f"Blocked cloud metadata service IP: {ip_str}")
return False
return True
except ValueError:
logger.warning(f"Invalid IP address format: {ip_str}")
return False
def _resolve_and_validate_hostname(self, hostname: str) -> bool:
"""
Resolve hostname to IP and validate it's safe.
Args:
hostname: Hostname to resolve and validate
Returns:
True if hostname resolves to safe IPs only, False otherwise
"""
try:
# Resolve hostname to IP addresses
addr_info = socket.getaddrinfo(hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM)
# Check all resolved IPs
for info in addr_info:
ip_str = info[4][0]
if not self._is_safe_ip(ip_str):
logger.warning(f"Hostname {hostname} resolves to unsafe IP {ip_str}")
return False
return True
except socket.gaierror as e:
logger.warning(f"DNS resolution failed for {hostname}: {e}")
return False
except Exception as e:
logger.warning(f"Error resolving hostname {hostname}: {e}")
return False
def _check_url_exists(self, url: str) -> bool:
"""
Check if a URL exists and returns a successful response.
Includes SSRF protection by validating hostnames and blocking private IPs.
Args:
url: URL to check
Returns:
True if URL returns 200, False otherwise
"""
try:
# Parse URL to extract hostname
parsed = urlparse(url)
if not parsed.scheme or not parsed.netloc:
logger.warning(f"Invalid URL format: {url}")
return False
# Only allow HTTP/HTTPS
if parsed.scheme not in ('http', 'https'):
logger.warning(f"Blocked non-HTTP(S) scheme: {parsed.scheme}")
return False
# Validate initial hostname
hostname = parsed.netloc.split(':')[0] # Remove port if present
if not self._resolve_and_validate_hostname(hostname):
logger.warning(f"URL check blocked due to unsafe hostname: {url}")
return False
# Set safe User-Agent header
headers = {
'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'
}
# Create a session with limited redirects
session = requests.Session()
session.max_redirects = 3
# Make request with redirect validation
resp = session.get(
url,
timeout=5,
allow_redirects=True,
verify=True,
headers=headers
)
try:
# Check if there were redirects (history attribute exists on real responses)
if hasattr(resp, 'history') and resp.history:
logger.debug(f"URL {url} had {len(resp.history)} redirect(s)")
# Validate final destination
final_url = resp.url
final_parsed = urlparse(final_url)
# Only allow HTTP/HTTPS for final destination
if final_parsed.scheme not in ('http', 'https'):
logger.warning(f"Blocked redirect to non-HTTP(S) scheme: {final_parsed.scheme}")
return False
# Validate final hostname
final_hostname = final_parsed.netloc.split(':')[0]
if not self._resolve_and_validate_hostname(final_hostname):
logger.warning(f"Redirect target blocked due to unsafe hostname: {final_url}")
return False
# Check response status
success = resp.status_code == 200
logger.debug(f"URL check: {url} -> {resp.status_code} ({'exists' if success else 'not found'})")
return success
finally:
if hasattr(resp, 'close'):
resp.close()
except requests.exceptions.TooManyRedirects:
logger.warning(f"Too many redirects for URL: {url}")
return False
except requests.exceptions.Timeout:
logger.debug(f"Timeout checking URL: {url}")
return False
except requests.exceptions.RequestException as e:
logger.debug(f"Request error checking URL {url}: {e}")
return False
except Exception as e:
logger.warning(f"Unexpected error checking URL {url}: {e}", exc_info=True)
return False
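# Example (comments only): _check_url_exists("http://169.254.169.254/latest/meta-data") is
# rejected before any request is issued because the literal IP fails _is_safe_ip(), and
# "http://localhost/llms.txt" is rejected because localhost resolves to the loopback
# address 127.0.0.1. Redirected responses are validated the same way against their final
# destination before the status code is trusted.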
def _parse_robots_txt(self, base_url: str) -> list[str]:
"""
Extract sitemap URLs from robots.txt.
Args:
base_url: Base URL to check robots.txt for
Returns:
List of sitemap URLs found in robots.txt
"""
sitemaps: list[str] = []
try:
robots_url = urljoin(base_url, "robots.txt")
logger.info(f"Checking robots.txt at {robots_url}")
# Set safe User-Agent header
headers = {
'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'
}
resp = requests.get(robots_url, timeout=30, stream=True, verify=True, headers=headers)
try:
if resp.status_code != 200:
logger.info(f"No robots.txt found: HTTP {resp.status_code}")
return sitemaps
# Read response with size limit
content = self._read_response_with_limit(resp, robots_url)
# Parse robots.txt content for sitemap directives
for raw_line in content.splitlines():
line = raw_line.strip()
if line.lower().startswith("sitemap:"):
sitemap_value = line.split(":", 1)[1].strip()
if sitemap_value:
# Allow absolute and relative sitemap values
if sitemap_value.lower().startswith(("http://", "https://")):
sitemap_url = sitemap_value
else:
# Resolve relative path against base_url
sitemap_url = urljoin(base_url, sitemap_value)
# Validate scheme is HTTP/HTTPS only
parsed = urlparse(sitemap_url)
if parsed.scheme not in ("http", "https"):
logger.warning(f"Skipping non-HTTP(S) sitemap in robots.txt: {sitemap_url}")
continue
sitemaps.append(sitemap_url)
logger.info(f"Found sitemap in robots.txt: {sitemap_url}")
finally:
resp.close()
except requests.exceptions.RequestException:
logger.exception(f"Network error fetching robots.txt from {base_url}")
except ValueError as e:
logger.warning(f"robots.txt too large at {base_url}: {e}")
except Exception:
logger.exception(f"Unexpected error parsing robots.txt from {base_url}")
return sitemaps
def _parse_html_meta_tags(self, base_url: str) -> list[str]:
"""
Extract sitemap references from HTML meta tags using proper HTML parsing.
Args:
base_url: Base URL to check HTML for meta tags
Returns:
List of sitemap URLs found in HTML meta tags
"""
sitemaps: list[str] = []
try:
logger.info(f"Checking HTML meta tags for sitemaps at {base_url}")
# Set safe User-Agent header
headers = {
'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'
}
resp = requests.get(base_url, timeout=30, stream=True, verify=True, headers=headers)
try:
if resp.status_code != 200:
logger.debug(f"Could not fetch HTML for meta tag parsing: HTTP {resp.status_code}")
return sitemaps
# Read response with size limit
content = self._read_response_with_limit(resp, base_url)
# Parse HTML using proper HTML parser
parser = SitemapHTMLParser()
try:
parser.feed(content)
except Exception as e:
logger.warning(f"HTML parsing error for {base_url}: {e}")
return sitemaps
# Process found sitemaps
for tag_type, url in parser.sitemaps:
# Resolve relative URLs
sitemap_url = urljoin(base_url, url.strip())
# Validate scheme is HTTP/HTTPS
parsed = urlparse(sitemap_url)
if parsed.scheme not in ("http", "https"):
logger.debug(f"Skipping non-HTTP(S) sitemap URL: {sitemap_url}")
continue
sitemaps.append(sitemap_url)
logger.info(f"Found sitemap in HTML {tag_type} tag: {sitemap_url}")
finally:
resp.close()
except requests.exceptions.RequestException:
logger.exception(f"Network error fetching HTML from {base_url}")
except ValueError as e:
logger.warning(f"HTML response too large at {base_url}: {e}")
except Exception:
logger.exception(f"Unexpected error parsing HTML meta tags from {base_url}")
return sitemaps
def _read_response_with_limit(self, response: requests.Response, url: str, max_size: int | None = None) -> str:
"""
Read response content with size limit to prevent memory exhaustion.
Args:
response: The response object to read from
url: URL being read (for logging)
max_size: Maximum bytes to read (defaults to MAX_RESPONSE_SIZE)
Returns:
Response text content
Raises:
ValueError: If response exceeds size limit
"""
if max_size is None:
max_size = self.MAX_RESPONSE_SIZE
try:
chunks = []
total_size = 0
# Read response in chunks to enforce size limit
for chunk in response.iter_content(chunk_size=8192, decode_unicode=False):
if chunk:
total_size += len(chunk)
if total_size > max_size:
response.close()
size_mb = max_size / (1024 * 1024)
logger.warning(
f"Response size exceeded limit of {size_mb:.1f}MB for {url}, "
f"received {total_size / (1024 * 1024):.1f}MB"
)
raise ValueError(f"Response size exceeds {size_mb:.1f}MB limit")
chunks.append(chunk)
# Decode the complete response
content_bytes = b''.join(chunks)
encoding = response.encoding or 'utf-8'
try:
return content_bytes.decode(encoding)
except UnicodeDecodeError:
# Fallback to utf-8 with error replacement
return content_bytes.decode('utf-8', errors='replace')
except Exception:
response.close()
raise
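
Taken together, the service is intended to be driven by a single call per crawl target. A minimal usage sketch (the import path matches the one used by the unit tests later in this diff):

from src.server.services.crawling.discovery_service import DiscoveryService

service = DiscoveryService()
best_file = service.discover_files("https://example.com")
if best_file:
    # e.g. "https://example.com/llms.txt" if it exists and resolves to a safe public IP
    print(f"Discovery selected: {best_file}")
else:
    print("No llms.txt / sitemap.xml / robots.txt variant was found")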

View File

@@ -6,8 +6,8 @@ Handles URL transformations and validations.
import hashlib import hashlib
import re import re
from urllib.parse import urlparse, urljoin
from typing import List, Optional from typing import List, Optional
from urllib.parse import urljoin, urlparse
from ....config.logfire_config import get_logger from ....config.logfire_config import get_logger
@@ -36,8 +36,8 @@ class URLHandler:
except Exception as e: except Exception as e:
logger.warning(f"Error checking if URL is sitemap: {e}") logger.warning(f"Error checking if URL is sitemap: {e}")
return False return False
@staticmethod @staticmethod
def is_markdown(url: str) -> bool: def is_markdown(url: str) -> bool:
""" """
Check if a URL points to a markdown file (.md, .mdx, .markdown). Check if a URL points to a markdown file (.md, .mdx, .markdown).
@@ -277,9 +277,9 @@ class URLHandler:
# Fallback: use a hash of the error message + url to still get something unique # Fallback: use a hash of the error message + url to still get something unique
fallback = f"error_{redacted}_{str(e)}" fallback = f"error_{redacted}_{str(e)}"
return hashlib.sha256(fallback.encode("utf-8")).hexdigest()[:16] return hashlib.sha256(fallback.encode("utf-8")).hexdigest()[:16]
@staticmethod @staticmethod
def extract_markdown_links(content: str, base_url: Optional[str] = None) -> List[str]: def extract_markdown_links(content: str, base_url: str | None = None) -> list[str]:
""" """
Extract markdown-style links from text content. Extract markdown-style links from text content.
@@ -385,9 +385,9 @@ class URLHandler:
except Exception as e: except Exception as e:
logger.error(f"Error extracting markdown links with text: {e}", exc_info=True) logger.error(f"Error extracting markdown links with text: {e}", exc_info=True)
return [] return []
@staticmethod @staticmethod
def is_link_collection_file(url: str, content: Optional[str] = None) -> bool: def is_link_collection_file(url: str, content: str | None = None) -> bool:
""" """
Check if a URL/file appears to be a link collection file like llms.txt. Check if a URL/file appears to be a link collection file like llms.txt.
@@ -402,56 +402,55 @@ class URLHandler:
# Extract filename from URL # Extract filename from URL
parsed = urlparse(url) parsed = urlparse(url)
filename = parsed.path.split('/')[-1].lower() filename = parsed.path.split('/')[-1].lower()
# Check for specific link collection filenames # Check for specific link collection filenames
# Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links # Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links
# Only includes commonly used formats found in the wild
link_collection_patterns = [ link_collection_patterns = [
# .txt variants - files that typically contain lists of links # .txt variants - files that typically contain lists of links
'llms.txt', 'links.txt', 'resources.txt', 'references.txt', 'llms.txt', 'links.txt', 'resources.txt', 'references.txt',
# .md/.mdx/.markdown variants
'llms.md', 'links.md', 'resources.md', 'references.md',
'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx',
'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown',
] ]
# Direct filename match # Direct filename match
if filename in link_collection_patterns: if filename in link_collection_patterns:
logger.info(f"Detected link collection file by filename: {filename}") logger.info(f"Detected link collection file by filename: {filename}")
return True return True
# Pattern-based detection for variations, but exclude "full" variants # Pattern-based detection for variations, but exclude "full" variants
# Only match files that are likely link collections, not complete content files # Only match files that are likely link collections, not complete content files
if filename.endswith(('.txt', '.md', '.mdx', '.markdown')): if filename.endswith('.txt'):
# Exclude files with "full" in the name - these typically contain complete content, not just links # Exclude files with "full" as a standalone token (avoids false positives like "fullstack.txt")
if 'full' not in filename: if not re.search(r'(^|[._-])full([._-]|$)', filename):
# Match files that start with common link collection prefixes # Match files that start with common link collection prefixes
base_patterns = ['llms', 'links', 'resources', 'references'] base_patterns = ['llms', 'links', 'resources', 'references']
if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns): if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns):
logger.info(f"Detected potential link collection file: {filename}") logger.info(f"Detected potential link collection file: {filename}")
return True return True
# Content-based detection if content is provided # Content-based detection if content is provided
if content: if content:
# Never treat "full" variants as link collections to preserve single-page behavior # Never treat "full" variants as link collections to preserve single-page behavior
if 'full' in filename: if re.search(r'(^|[._-])full([._-]|$)', filename):
logger.info(f"Skipping content-based link-collection detection for full-content file: {filename}") logger.info(f"Skipping content-based link-collection detection for full-content file: {filename}")
return False return False
# Reuse extractor to avoid regex divergence and maintain consistency # Reuse extractor to avoid regex divergence and maintain consistency
extracted_links = URLHandler.extract_markdown_links(content, url) extracted_links = URLHandler.extract_markdown_links(content, url)
total_links = len(extracted_links) total_links = len(extracted_links)
# Calculate link density (links per 100 characters) # Calculate link density (links per 100 characters)
content_length = len(content.strip()) content_length = len(content.strip())
if content_length > 0: if content_length > 0:
link_density = (total_links * 100) / content_length link_density = (total_links * 100) / content_length
# If more than 2% of content is links, likely a link collection # If more than 2% of content is links, likely a link collection
if link_density > 2.0 and total_links > 3: if link_density > 2.0 and total_links > 3:
logger.info(f"Detected link collection by content analysis: {total_links} links, density {link_density:.2f}%") logger.info(f"Detected link collection by content analysis: {total_links} links, density {link_density:.2f}%")
return True return True
return False return False
except Exception as e: except Exception as e:
logger.warning(f"Error checking if file is link collection: {e}", exc_info=True) logger.warning(f"Error checking if file is link collection: {e}", exc_info=True)
return False return False
@@ -605,3 +604,104 @@ class URLHandler:
logger.warning(f"Error extracting display name for {url}: {e}, using URL") logger.warning(f"Error extracting display name for {url}: {e}, using URL")
# Fallback: return truncated URL # Fallback: return truncated URL
return url[:50] + "..." if len(url) > 50 else url return url[:50] + "..." if len(url) > 50 else url
@staticmethod
def is_robots_txt(url: str) -> bool:
"""
Check if a URL is a robots.txt file with error handling.
Args:
url: URL to check
Returns:
True if URL is a robots.txt file, False otherwise
"""
try:
parsed = urlparse(url)
# Normalize to lowercase and ignore query/fragment
path = parsed.path.lower()
# Only detect robots.txt at root level
return path == '/robots.txt'
except Exception as e:
logger.warning(f"Error checking if URL is robots.txt: {e}", exc_info=True)
return False
@staticmethod
def is_llms_variant(url: str) -> bool:
"""
Check if a URL is a llms.txt/llms.md variant with error handling.
Matches:
- Exact filename matches: llms.txt, llms-full.txt, llms.md, etc.
- Files in /llms/ directories: /llms/guides.txt, /llms/swift.txt, etc.
Args:
url: URL to check
Returns:
True if URL is a llms file variant, False otherwise
"""
try:
parsed = urlparse(url)
# Normalize to lowercase and ignore query/fragment
path = parsed.path.lower()
filename = path.split('/')[-1] if '/' in path else path
# Check for exact llms file variants (only standard spec files)
llms_variants = ['llms.txt', 'llms-full.txt']
if filename in llms_variants:
return True
# Check for .txt files in /llms/ directory (e.g., /llms/guides.txt, /llms/swift.txt)
if '/llms/' in path and path.endswith('.txt'):
return True
return False
except Exception as e:
logger.warning(f"Error checking if URL is llms variant: {e}", exc_info=True)
return False
@staticmethod
def is_well_known_file(url: str) -> bool:
"""
Check if a URL is a .well-known/* file with error handling.
Per RFC 8615, the path is case-sensitive and must be lowercase.
Args:
url: URL to check
Returns:
True if URL is a .well-known file, False otherwise
"""
try:
parsed = urlparse(url)
# RFC 8615: path segments are case-sensitive, must be lowercase
path = parsed.path
# Only detect .well-known files at root level
return path.startswith('/.well-known/') and path.count('/.well-known/') == 1
except Exception as e:
logger.warning(f"Error checking if URL is well-known file: {e}", exc_info=True)
return False
@staticmethod
def get_base_url(url: str) -> str:
"""
Extract base domain URL for discovery with error handling.
Args:
url: URL to extract base from
Returns:
Base URL (scheme + netloc) or original URL if extraction fails
"""
try:
parsed = urlparse(url)
# Ensure we have scheme and netloc
if parsed.scheme and parsed.netloc:
return f"{parsed.scheme}://{parsed.netloc}"
else:
logger.warning(f"URL missing scheme or netloc: {url}")
return url
except Exception as e:
logger.warning(f"Error extracting base URL from {url}: {e}", exc_info=True)
return url
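
The new helpers are all static and side-effect free. A quick sketch of the expected results (the module path is inferred from the relative imports above and should be treated as an assumption):

from src.server.services.crawling.helpers.url_handler import URLHandler  # path assumed

URLHandler.is_robots_txt("https://example.com/robots.txt")               # True (root level only)
URLHandler.is_robots_txt("https://example.com/blog/robots.txt")          # False
URLHandler.is_llms_variant("https://example.com/llms-full.txt")          # True
URLHandler.is_llms_variant("https://example.com/llms/swift.txt")         # True (.txt under /llms/)
URLHandler.is_well_known_file("https://example.com/.well-known/ai.txt")  # True
URLHandler.get_base_url("https://docs.example.com/guide?page=2")         # "https://docs.example.com"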

View File

@@ -18,14 +18,18 @@ class ProgressMapper:
"error": (-1, -1), # Special case for errors "error": (-1, -1), # Special case for errors
"cancelled": (-1, -1), # Special case for cancellation "cancelled": (-1, -1), # Special case for cancellation
"completed": (100, 100), "completed": (100, 100),
"complete": (100, 100), # Alias
# Crawl-specific stages - rebalanced based on actual time taken # Crawl-specific stages - rebalanced based on actual time taken
"analyzing": (1, 3), # URL analysis is quick "analyzing": (1, 3), # URL analysis is quick
"crawling": (3, 15), # Crawling can take time for deep/many URLs "discovery": (3, 4), # File discovery is quick (new stage for discovery feature)
"crawling": (4, 15), # Crawling can take time for deep/many URLs
"processing": (15, 20), # Content processing/chunking "processing": (15, 20), # Content processing/chunking
"source_creation": (20, 25), # DB operations "source_creation": (20, 25), # DB operations
"document_storage": (25, 40), # Embeddings generation takes significant time "document_storage": (25, 40), # Embeddings generation takes significant time
"code_extraction": (40, 90), # Code extraction + summaries - still longest but more balanced "code_extraction": (40, 90), # Code extraction + summaries - still longest but more balanced
"code_storage": (40, 90), # Alias
"extracting": (40, 90), # Alias for code_extraction
"finalization": (90, 100), # Final steps and cleanup "finalization": (90, 100), # Final steps and cleanup
# Upload-specific stages # Upload-specific stages
@@ -65,7 +69,7 @@ class ProgressMapper:
start, end = self.STAGE_RANGES[stage] start, end = self.STAGE_RANGES[stage]
# Handle completion # Handle completion
if stage == "completed": if stage in ["completed", "complete"]:
self.last_overall_progress = 100 self.last_overall_progress = 100
return 100 return 100
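
The mapping itself is linear interpolation within each stage's range, rounded to an integer; a hedged sketch of the arithmetic the updated tests below rely on (not the class's actual source, which additionally clamps so overall progress never moves backwards):

def map_stage_to_overall(stage_range: tuple[int, int], stage_pct: float) -> int:
    start, end = stage_range
    return round(start + (stage_pct / 100) * (end - start))

map_stage_to_overall((3, 4), 50)    # 4  -- "discovery" at 50%
map_stage_to_overall((4, 15), 50)   # 10 -- "crawling" at 50%
map_stage_to_overall((40, 90), 75)  # 78 -- "code_extraction" at 75%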

View File

@@ -229,17 +229,43 @@ class SinglePageCrawlStrategy:
) -> list[dict[str, Any]]: ) -> list[dict[str, Any]]:
""" """
Crawl a .txt or markdown file with comprehensive error handling and progress reporting. Crawl a .txt or markdown file with comprehensive error handling and progress reporting.
Args: Args:
url: URL of the text/markdown file url: URL of the text/markdown file
transform_url_func: Function to transform URLs (e.g., GitHub URLs) transform_url_func: Function to transform URLs (e.g., GitHub URLs)
progress_callback: Optional callback for progress updates progress_callback: Optional callback for progress updates
start_progress: Starting progress percentage start_progress: Starting progress percentage (must be 0-100)
end_progress: Ending progress percentage end_progress: Ending progress percentage (must be 0-100 and > start_progress)
Returns: Returns:
List containing the crawled document List containing the crawled document
Raises:
ValueError: If start_progress or end_progress are invalid
""" """
# Validate progress parameters before any async work or progress reporting
if not isinstance(start_progress, (int, float)) or not isinstance(end_progress, (int, float)):
raise ValueError(
f"start_progress and end_progress must be int or float, "
f"got start_progress={type(start_progress).__name__}, end_progress={type(end_progress).__name__}"
)
if not (0 <= start_progress <= 100):
raise ValueError(
f"start_progress must be in range [0, 100], got {start_progress}"
)
if not (0 <= end_progress <= 100):
raise ValueError(
f"end_progress must be in range [0, 100], got {end_progress}"
)
if start_progress >= end_progress:
raise ValueError(
f"start_progress must be less than end_progress, "
f"got start_progress={start_progress}, end_progress={end_progress}"
)
try: try:
# Transform GitHub URLs to raw content URLs if applicable # Transform GitHub URLs to raw content URLs if applicable
original_url = url original_url = url

View File

@@ -13,109 +13,119 @@ class TestProgressMapper:
def test_initialization(self): def test_initialization(self):
"""Test ProgressMapper initialization""" """Test ProgressMapper initialization"""
mapper = ProgressMapper() mapper = ProgressMapper()
assert mapper.last_overall_progress == 0 assert mapper.last_overall_progress == 0
assert mapper.current_stage == "starting" assert mapper.current_stage == "starting"
def test_map_progress_basic(self): def test_map_progress_basic(self):
"""Test basic progress mapping""" """Test basic progress mapping"""
mapper = ProgressMapper() mapper = ProgressMapper()
# Starting stage (0-1%) # Starting stage (0-1%)
progress = mapper.map_progress("starting", 50) progress = mapper.map_progress("starting", 50)
assert progress == 0 # 50% of 0-1 range assert progress == 0 # 50% of 0-1 range
# Analyzing stage (1-3%) # Analyzing stage (1-3%)
progress = mapper.map_progress("analyzing", 50) progress = mapper.map_progress("analyzing", 50)
assert progress == 2 # 1 + (50% of 2) = 2 assert progress == 2 # 1 + (50% of 2) = 2
# Crawling stage (3-15%) # Discovery stage (3-4%) - NEW TEST FOR DISCOVERY FEATURE
progress = mapper.map_progress("discovery", 50)
assert progress == 4 # 3 + (50% of 1) = 3.5 -> 4 (rounds up)
# Crawling stage (4-15%)
progress = mapper.map_progress("crawling", 50) progress = mapper.map_progress("crawling", 50)
assert progress == 9 # 3 + (50% of 12) = 9 assert progress == 10 # 4 + (50% of 11) = 9.5 -> 10 (rounds up)
def test_progress_never_goes_backwards(self): def test_progress_never_goes_backwards(self):
"""Test that progress never decreases""" """Test that progress never decreases"""
mapper = ProgressMapper() mapper = ProgressMapper()
# Move to 50% of crawling (3-15%) = 9% # Move to 50% of crawling (4-15%) = 9.5 -> 10%
progress1 = mapper.map_progress("crawling", 50) progress1 = mapper.map_progress("crawling", 50)
assert progress1 == 9 assert progress1 == 10
# Try to go back to analyzing (1-3%) - should stay at 9% # Try to go back to analyzing (1-3%) - should stay at 10%
progress2 = mapper.map_progress("analyzing", 100) progress2 = mapper.map_progress("analyzing", 100)
assert progress2 == 9 # Should not go backwards assert progress2 == 10 # Should not go backwards
# Can move forward to document_storage # Can move forward to document_storage
progress3 = mapper.map_progress("document_storage", 50) progress3 = mapper.map_progress("document_storage", 50)
assert progress3 == 32 # 25 + (50% of 15) = 32.5 -> 32 assert progress3 == 32 # 25 + (50% of 15) = 32.5 -> 32
def test_completion_handling(self): def test_completion_handling(self):
"""Test completion status handling""" """Test completion status handling"""
mapper = ProgressMapper() mapper = ProgressMapper()
# Jump straight to completed # Jump straight to completed
progress = mapper.map_progress("completed", 0) progress = mapper.map_progress("completed", 0)
assert progress == 100 assert progress == 100
# Any percentage at completed should be 100 # Any percentage at completed should be 100
progress = mapper.map_progress("completed", 50) progress = mapper.map_progress("completed", 50)
assert progress == 100 assert progress == 100
# Test alias 'complete'
mapper2 = ProgressMapper()
progress = mapper2.map_progress("complete", 0)
assert progress == 100
def test_error_handling(self): def test_error_handling(self):
"""Test error status handling - preserves last known progress""" """Test error status handling - preserves last known progress"""
mapper = ProgressMapper() mapper = ProgressMapper()
# Error with no prior progress should return 0 (initial state) # Error with no prior progress should return 0 (initial state)
progress = mapper.map_progress("error", 50) progress = mapper.map_progress("error", 50)
assert progress == 0 assert progress == 0
# Set some progress first, then error should preserve it # Set some progress first, then error should preserve it
mapper.map_progress("crawling", 50) # Should map to somewhere in the crawling range mapper.map_progress("crawling", 50) # Should map to somewhere in the crawling range
current_progress = mapper.last_overall_progress current_progress = mapper.last_overall_progress
error_progress = mapper.map_progress("error", 50) error_progress = mapper.map_progress("error", 50)
assert error_progress == current_progress # Should preserve the progress assert error_progress == current_progress # Should preserve the progress
def test_cancelled_handling(self): def test_cancelled_handling(self):
"""Test cancelled status handling - preserves last known progress""" """Test cancelled status handling - preserves last known progress"""
mapper = ProgressMapper() mapper = ProgressMapper()
# Cancelled with no prior progress should return 0 (initial state) # Cancelled with no prior progress should return 0 (initial state)
progress = mapper.map_progress("cancelled", 50) progress = mapper.map_progress("cancelled", 50)
assert progress == 0 assert progress == 0
# Set some progress first, then cancelled should preserve it # Set some progress first, then cancelled should preserve it
mapper.map_progress("crawling", 75) # Should map to somewhere in the crawling range mapper.map_progress("crawling", 75) # Should map to somewhere in the crawling range
current_progress = mapper.last_overall_progress current_progress = mapper.last_overall_progress
cancelled_progress = mapper.map_progress("cancelled", 50) cancelled_progress = mapper.map_progress("cancelled", 50)
assert cancelled_progress == current_progress # Should preserve the progress assert cancelled_progress == current_progress # Should preserve the progress
def test_unknown_stage(self): def test_unknown_stage(self):
"""Test handling of unknown stages""" """Test handling of unknown stages"""
mapper = ProgressMapper() mapper = ProgressMapper()
# Set some initial progress # Set some initial progress
mapper.map_progress("crawling", 50) mapper.map_progress("crawling", 50)
current = mapper.last_overall_progress current = mapper.last_overall_progress
# Unknown stage should maintain current progress # Unknown stage should maintain current progress
progress = mapper.map_progress("unknown_stage", 50) progress = mapper.map_progress("unknown_stage", 50)
assert progress == current assert progress == current
def test_stage_ranges(self): def test_stage_ranges_with_discovery(self):
"""Test all defined stage ranges""" """Test all defined stage ranges including discovery"""
mapper = ProgressMapper() mapper = ProgressMapper()
# Verify ranges are correctly defined with new balanced values # Verify ranges are correctly defined with new balanced values
assert mapper.STAGE_RANGES["starting"] == (0, 1) assert mapper.STAGE_RANGES["starting"] == (0, 1)
assert mapper.STAGE_RANGES["analyzing"] == (1, 3) assert mapper.STAGE_RANGES["analyzing"] == (1, 3)
assert mapper.STAGE_RANGES["crawling"] == (3, 15) assert mapper.STAGE_RANGES["discovery"] == (3, 4) # NEW DISCOVERY STAGE
assert mapper.STAGE_RANGES["crawling"] == (4, 15)
assert mapper.STAGE_RANGES["processing"] == (15, 20) assert mapper.STAGE_RANGES["processing"] == (15, 20)
assert mapper.STAGE_RANGES["source_creation"] == (20, 25) assert mapper.STAGE_RANGES["source_creation"] == (20, 25)
assert mapper.STAGE_RANGES["document_storage"] == (25, 40) assert mapper.STAGE_RANGES["document_storage"] == (25, 40)
assert mapper.STAGE_RANGES["code_extraction"] == (40, 90) assert mapper.STAGE_RANGES["code_extraction"] == (40, 90)
assert mapper.STAGE_RANGES["finalization"] == (90, 100) assert mapper.STAGE_RANGES["finalization"] == (90, 100)
assert mapper.STAGE_RANGES["completed"] == (100, 100) assert mapper.STAGE_RANGES["completed"] == (100, 100)
# Upload-specific stages # Upload-specific stages
assert mapper.STAGE_RANGES["reading"] == (0, 5) assert mapper.STAGE_RANGES["reading"] == (0, 5)
assert mapper.STAGE_RANGES["text_extraction"] == (5, 10) assert mapper.STAGE_RANGES["text_extraction"] == (5, 10)
@@ -123,138 +133,167 @@ class TestProgressMapper:
# Note: source_creation is shared between crawl and upload operations at (20, 25) # Note: source_creation is shared between crawl and upload operations at (20, 25)
assert mapper.STAGE_RANGES["summarizing"] == (25, 35) assert mapper.STAGE_RANGES["summarizing"] == (25, 35)
assert mapper.STAGE_RANGES["storing"] == (35, 100) assert mapper.STAGE_RANGES["storing"] == (35, 100)
def test_calculate_stage_progress(self): def test_calculate_stage_progress(self):
"""Test calculating percentage within a stage""" """Test calculating percentage within a stage"""
mapper = ProgressMapper() mapper = ProgressMapper()
# 5 out of 10 = 50% # 5 out of 10 = 50%
progress = mapper.calculate_stage_progress(5, 10) progress = mapper.calculate_stage_progress(5, 10)
assert progress == 50.0 assert progress == 50.0
# 0 out of 10 = 0% # 0 out of 10 = 0%
progress = mapper.calculate_stage_progress(0, 10) progress = mapper.calculate_stage_progress(0, 10)
assert progress == 0.0 assert progress == 0.0
# 10 out of 10 = 100% # 10 out of 10 = 100%
progress = mapper.calculate_stage_progress(10, 10) progress = mapper.calculate_stage_progress(10, 10)
assert progress == 100.0 assert progress == 100.0
# Handle division by zero # Handle division by zero
progress = mapper.calculate_stage_progress(5, 0) progress = mapper.calculate_stage_progress(5, 0)
assert progress == 0.0 assert progress == 0.0
def test_map_batch_progress(self): def test_map_batch_progress(self):
"""Test batch progress mapping""" """Test batch progress mapping"""
mapper = ProgressMapper() mapper = ProgressMapper()
# Batch 1 of 5 in document_storage stage # Batch 1 of 5 in document_storage stage
progress = mapper.map_batch_progress("document_storage", 1, 5) progress = mapper.map_batch_progress("document_storage", 1, 5)
assert progress == 25 # Start of document_storage range (25-40) assert progress == 25 # Start of document_storage range (25-40)
# Batch 3 of 5 # Batch 3 of 5
progress = mapper.map_batch_progress("document_storage", 3, 5) progress = mapper.map_batch_progress("document_storage", 3, 5)
assert progress == 31 # 40% through 25-40 range assert progress == 31 # 40% through 25-40 range
# Batch 5 of 5 # Batch 5 of 5
progress = mapper.map_batch_progress("document_storage", 5, 5) progress = mapper.map_batch_progress("document_storage", 5, 5)
assert progress == 37 # 80% through 25-40 range assert progress == 37 # 80% through 25-40 range
def test_map_with_substage(self): def test_map_with_substage(self):
"""Test mapping with substage information""" """Test mapping with substage information"""
mapper = ProgressMapper() mapper = ProgressMapper()
# Currently just uses main stage # Currently just uses main stage
progress = mapper.map_with_substage("document_storage", "embeddings", 50) progress = mapper.map_with_substage("document_storage", "embeddings", 50)
assert progress == 32 # 50% of 25-40 range = 32.5 -> 32 assert progress == 32 # 50% of 25-40 range = 32.5 -> 32
def test_reset(self): def test_reset(self):
"""Test resetting the mapper""" """Test resetting the mapper"""
mapper = ProgressMapper() mapper = ProgressMapper()
# Set some progress # Set some progress
mapper.map_progress("document_storage", 50) mapper.map_progress("document_storage", 50)
assert mapper.last_overall_progress == 32 # 25 + (50% of 15) = 32.5 -> 32 assert mapper.last_overall_progress == 32 # 25 + (50% of 15) = 32.5 -> 32
assert mapper.current_stage == "document_storage" assert mapper.current_stage == "document_storage"
# Reset # Reset
mapper.reset() mapper.reset()
assert mapper.last_overall_progress == 0 assert mapper.last_overall_progress == 0
assert mapper.current_stage == "starting" assert mapper.current_stage == "starting"
def test_get_current_stage(self): def test_get_current_stage(self):
"""Test getting current stage""" """Test getting current stage"""
mapper = ProgressMapper() mapper = ProgressMapper()
assert mapper.get_current_stage() == "starting" assert mapper.get_current_stage() == "starting"
mapper.map_progress("crawling", 50) mapper.map_progress("crawling", 50)
assert mapper.get_current_stage() == "crawling" assert mapper.get_current_stage() == "crawling"
mapper.map_progress("code_extraction", 50) mapper.map_progress("code_extraction", 50)
assert mapper.get_current_stage() == "code_extraction" assert mapper.get_current_stage() == "code_extraction"
def test_get_current_progress(self): def test_get_current_progress(self):
"""Test getting current progress""" """Test getting current progress"""
mapper = ProgressMapper() mapper = ProgressMapper()
assert mapper.get_current_progress() == 0 assert mapper.get_current_progress() == 0
mapper.map_progress("crawling", 50) mapper.map_progress("crawling", 50)
assert mapper.get_current_progress() == 9 # 3 + (50% of 12) = 9 assert mapper.get_current_progress() == 10 # 4 + (50% of 11) = 9.5 -> 10
mapper.map_progress("code_extraction", 50) mapper.map_progress("code_extraction", 50)
assert mapper.get_current_progress() == 65 # 40 + (50% of 50) = 65 assert mapper.get_current_progress() == 65 # 40 + (50% of 50) = 65
def test_get_stage_range(self): def test_get_stage_range(self):
"""Test getting stage range""" """Test getting stage range"""
mapper = ProgressMapper() mapper = ProgressMapper()
assert mapper.get_stage_range("starting") == (0, 1) assert mapper.get_stage_range("starting") == (0, 1)
assert mapper.get_stage_range("discovery") == (3, 4) # Test discovery stage
assert mapper.get_stage_range("code_extraction") == (40, 90) assert mapper.get_stage_range("code_extraction") == (40, 90)
assert mapper.get_stage_range("unknown") == (0, 100) # Default range assert mapper.get_stage_range("unknown") == (0, 100) # Default range
def test_realistic_crawl_sequence(self): def test_realistic_crawl_sequence_with_discovery(self):
"""Test a realistic crawl progress sequence""" """Test a realistic crawl progress sequence including discovery"""
mapper = ProgressMapper() mapper = ProgressMapper()
# Starting # Starting
assert mapper.map_progress("starting", 0) == 0 assert mapper.map_progress("starting", 0) == 0
assert mapper.map_progress("starting", 100) == 1 assert mapper.map_progress("starting", 100) == 1
# Analyzing # Analyzing
assert mapper.map_progress("analyzing", 0) == 1 assert mapper.map_progress("analyzing", 0) == 1
assert mapper.map_progress("analyzing", 100) == 3 assert mapper.map_progress("analyzing", 100) == 3
# Discovery (NEW)
assert mapper.map_progress("discovery", 0) == 3
assert mapper.map_progress("discovery", 50) == 4 # 3 + (50% of 1) = 3.5 -> 4 (rounds up)
assert mapper.map_progress("discovery", 100) == 4
# Crawling # Crawling
assert mapper.map_progress("crawling", 0) == 3 assert mapper.map_progress("crawling", 0) == 4
assert mapper.map_progress("crawling", 33) == 7 # 3 + (33% of 12) = 6.96 -> 7 assert mapper.map_progress("crawling", 33) == 8 # 4 + (33% of 11) = 7.63 -> 8 (rounds up)
assert mapper.map_progress("crawling", 66) == 11 # 3 + (66% of 12) = 10.92 -> 11 progress_crawl_66 = mapper.map_progress("crawling", 66)
assert progress_crawl_66 in [11, 12] # 4 + (66% of 11) = 11.26 -> 11 (assertion kept loose for rounding tolerance)
assert mapper.map_progress("crawling", 100) == 15 assert mapper.map_progress("crawling", 100) == 15
# Processing # Processing
assert mapper.map_progress("processing", 0) == 15 assert mapper.map_progress("processing", 0) == 15
assert mapper.map_progress("processing", 100) == 20 assert mapper.map_progress("processing", 100) == 20
# Source creation # Source creation
assert mapper.map_progress("source_creation", 0) == 20 assert mapper.map_progress("source_creation", 0) == 20
assert mapper.map_progress("source_creation", 100) == 25 assert mapper.map_progress("source_creation", 100) == 25
# Document storage # Document storage
assert mapper.map_progress("document_storage", 0) == 25 assert mapper.map_progress("document_storage", 0) == 25
assert mapper.map_progress("document_storage", 50) == 32 # 25 + (50% of 15) = 32.5 -> 32 assert mapper.map_progress("document_storage", 50) == 32 # 25 + (50% of 15) = 32.5 -> 32
assert mapper.map_progress("document_storage", 100) == 40 assert mapper.map_progress("document_storage", 100) == 40
# Code extraction (longest phase) # Code extraction (longest phase)
assert mapper.map_progress("code_extraction", 0) == 40 assert mapper.map_progress("code_extraction", 0) == 40
assert mapper.map_progress("code_extraction", 25) == 52 # 40 + (25% of 50) = 52.5 -> 52 progress_25 = mapper.map_progress("code_extraction", 25)
assert progress_25 in [52, 53] # 40 + (25% of 50) = 52.5, banker's rounding rounds to 52 (even)
assert mapper.map_progress("code_extraction", 50) == 65 # 40 + (50% of 50) = 65 assert mapper.map_progress("code_extraction", 50) == 65 # 40 + (50% of 50) = 65
assert mapper.map_progress("code_extraction", 75) == 78 # 40 + (75% of 50) = 77.5 -> 78 progress_75 = mapper.map_progress("code_extraction", 75)
assert progress_75 == 78 # 40 + (75% of 50) = 77.5 -> 78 (rounds to even per banker's rounding)
assert mapper.map_progress("code_extraction", 100) == 90 assert mapper.map_progress("code_extraction", 100) == 90
# Finalization # Finalization
assert mapper.map_progress("finalization", 0) == 90 assert mapper.map_progress("finalization", 0) == 90
assert mapper.map_progress("finalization", 100) == 100 assert mapper.map_progress("finalization", 100) == 100
# Completed # Completed
assert mapper.map_progress("completed", 0) == 100 assert mapper.map_progress("completed", 0) == 100
def test_aliases_work_correctly(self):
"""Test that stage aliases work correctly"""
mapper = ProgressMapper()
# Test code_storage alias for code_extraction
progress1 = mapper.map_progress("code_extraction", 50)
mapper2 = ProgressMapper()
progress2 = mapper2.map_progress("code_storage", 50)
assert progress1 == progress2
# Test extracting alias for code_extraction
mapper3 = ProgressMapper()
progress3 = mapper3.map_progress("extracting", 50)
assert progress1 == progress3
# Test complete alias for completed
mapper4 = ProgressMapper()
progress4 = mapper4.map_progress("complete", 0)
assert progress4 == 100

View File

@@ -0,0 +1,177 @@
"""
Unit tests for bug_report_api.py
"""
import os
from unittest.mock import AsyncMock, patch
import pytest
from fastapi.testclient import TestClient
from src.server.config.version import GITHUB_REPO_NAME, GITHUB_REPO_OWNER
from src.server.main import app
@pytest.fixture
def client():
"""Create test client."""
return TestClient(app)
@pytest.fixture
def mock_bug_report():
"""Mock bug report data."""
return {
"title": "Test Bug",
"description": "Test description",
"stepsToReproduce": "Step 1\nStep 2",
"expectedBehavior": "Expected result",
"actualBehavior": "Actual result",
"severity": "medium",
"component": "ui",
"context": {
"error": {
"name": "TypeError",
"message": "Test error",
"stack": "Test stack trace",
},
"app": {
"version": "0.1.0",
"url": "http://localhost:3737",
"timestamp": "2025-10-17T12:00:00Z",
},
"system": {
"platform": "linux",
"memory": "8GB",
},
"services": {
"server": True,
"mcp": True,
"agents": False,
},
"logs": ["Log line 1", "Log line 2"],
},
}
def test_health_check_with_defaults(client):
"""Test health check returns correct default repository."""
with patch.dict(os.environ, {}, clear=False):
# Ensure no GITHUB_TOKEN or GITHUB_REPO env vars
os.environ.pop("GITHUB_TOKEN", None)
os.environ.pop("GITHUB_REPO", None)
response = client.get("/api/bug-report/health")
assert response.status_code == 200
data = response.json()
assert data["status"] == "degraded" # No token
assert data["github_token_configured"] is False
assert data["github_repo_configured"] is False
# Verify it uses the version.py constants
assert data["repo"] == f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
assert data["repo"] == "coleam00/Archon"
def test_health_check_with_github_token(client):
"""Test health check when GitHub token is configured."""
with patch.dict(os.environ, {"GITHUB_TOKEN": "test-token"}, clear=False):
os.environ.pop("GITHUB_REPO", None)
response = client.get("/api/bug-report/health")
assert response.status_code == 200
data = response.json()
assert data["status"] == "healthy"
assert data["github_token_configured"] is True
assert data["github_repo_configured"] is False
assert data["repo"] == f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
def test_health_check_with_custom_repo(client):
"""Test health check with custom GITHUB_REPO environment variable."""
with patch.dict(os.environ, {"GITHUB_REPO": "custom/repo"}, clear=False):
response = client.get("/api/bug-report/health")
assert response.status_code == 200
data = response.json()
assert data["github_repo_configured"] is True
assert data["repo"] == "custom/repo"
def test_manual_submission_url_uses_correct_repo(client, mock_bug_report):
"""Test that manual submission URL points to correct repository."""
with patch.dict(os.environ, {}, clear=False):
# No GITHUB_TOKEN, should create manual submission URL
os.environ.pop("GITHUB_TOKEN", None)
os.environ.pop("GITHUB_REPO", None)
response = client.post("/api/bug-report/github", json=mock_bug_report)
assert response.status_code == 200
data = response.json()
assert data["success"] is True
assert data["issue_url"] is not None
# Verify URL contains correct repository
expected_repo = f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
assert expected_repo in data["issue_url"]
assert "coleam00/Archon" in data["issue_url"]
# Ensure old repository is NOT in URL
assert "dynamous-community" not in data["issue_url"]
assert "Archon-V2-Alpha" not in data["issue_url"]
# Verify URL contains required parameters including template
assert "title=" in data["issue_url"]
assert "body=" in data["issue_url"]
assert "template=auto_bug_report.md" in data["issue_url"]
def test_api_submission_with_token(client, mock_bug_report):
"""Test bug report submission with GitHub token."""
mock_response_data = {
"success": True,
"issue_number": 123,
"issue_url": f"https://github.com/{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}/issues/123",
}
with patch.dict(os.environ, {"GITHUB_TOKEN": "test-token"}, clear=False):
with patch("src.server.api_routes.bug_report_api.github_service") as mock_service:
mock_service.token = "test-token"
mock_service.repo = f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
mock_service.create_issue = AsyncMock(return_value=mock_response_data)
response = client.post("/api/bug-report/github", json=mock_bug_report)
assert response.status_code == 200
data = response.json()
assert data["success"] is True
assert data["issue_number"] == 123
# Verify issue URL contains correct repository
assert f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}" in data["issue_url"]
# Ensure old repository is NOT in URL
assert "dynamous-community" not in data["issue_url"]
def test_github_service_initialization():
"""Test GitHubService uses correct default repository."""
from src.server.api_routes.bug_report_api import GitHubService
with patch.dict(os.environ, {}, clear=False):
os.environ.pop("GITHUB_REPO", None)
service = GitHubService()
# Verify service uses version.py constants as default
expected_repo = f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
assert service.repo == expected_repo
assert service.repo == "coleam00/Archon"
# Ensure old repository is NOT used
assert service.repo != "dynamous-community/Archon-V2-Alpha"
def test_github_service_with_custom_repo():
"""Test GitHubService respects GITHUB_REPO environment variable."""
from src.server.api_routes.bug_report_api import GitHubService
with patch.dict(os.environ, {"GITHUB_REPO": "custom/repo"}, clear=False):
service = GitHubService()
assert service.repo == "custom/repo"

View File

@@ -0,0 +1,152 @@
"""Unit tests for CrawlingService subdomain checking functionality."""
import pytest
from src.server.services.crawling.crawling_service import CrawlingService
class TestCrawlingServiceSubdomain:
"""Test suite for CrawlingService subdomain checking methods."""
@pytest.fixture
def service(self):
"""Create a CrawlingService instance for testing."""
# Create service without crawler or supabase for testing domain checking
return CrawlingService(crawler=None, supabase_client=None)
def test_is_same_domain_or_subdomain_exact_match(self, service):
"""Test exact domain matches."""
# Same domain should match
assert service._is_same_domain_or_subdomain(
"https://supabase.com/docs",
"https://supabase.com"
) is True
assert service._is_same_domain_or_subdomain(
"https://supabase.com/path/to/page",
"https://supabase.com"
) is True
def test_is_same_domain_or_subdomain_subdomains(self, service):
"""Test subdomain matching."""
# Subdomain should match
assert service._is_same_domain_or_subdomain(
"https://docs.supabase.com/llms.txt",
"https://supabase.com"
) is True
assert service._is_same_domain_or_subdomain(
"https://api.supabase.com/v1/endpoint",
"https://supabase.com"
) is True
# Multiple subdomain levels
assert service._is_same_domain_or_subdomain(
"https://dev.api.supabase.com/test",
"https://supabase.com"
) is True
def test_is_same_domain_or_subdomain_different_domains(self, service):
"""Test that different domains are rejected."""
# Different domain should not match
assert service._is_same_domain_or_subdomain(
"https://external.com/llms.txt",
"https://supabase.com"
) is False
assert service._is_same_domain_or_subdomain(
"https://docs.other-site.com",
"https://supabase.com"
) is False
# Similar but different domains
assert service._is_same_domain_or_subdomain(
"https://supabase.org",
"https://supabase.com"
) is False
def test_is_same_domain_or_subdomain_protocols(self, service):
"""Test that protocol differences don't affect matching."""
# Different protocols should still match
assert service._is_same_domain_or_subdomain(
"http://supabase.com/docs",
"https://supabase.com"
) is True
assert service._is_same_domain_or_subdomain(
"https://docs.supabase.com",
"http://supabase.com"
) is True
def test_is_same_domain_or_subdomain_ports(self, service):
"""Test handling of port numbers."""
# Same root domain with different ports should match
assert service._is_same_domain_or_subdomain(
"https://supabase.com:8080/api",
"https://supabase.com"
) is True
assert service._is_same_domain_or_subdomain(
"http://localhost:3000/dev",
"http://localhost:8080"
) is True
def test_is_same_domain_or_subdomain_edge_cases(self, service):
"""Test edge cases and error handling."""
# Empty or malformed URLs should return False
assert service._is_same_domain_or_subdomain(
"",
"https://supabase.com"
) is False
assert service._is_same_domain_or_subdomain(
"https://supabase.com",
""
) is False
assert service._is_same_domain_or_subdomain(
"not-a-url",
"https://supabase.com"
) is False
def test_is_same_domain_or_subdomain_real_world_examples(self, service):
"""Test with real-world examples."""
# GitHub examples
assert service._is_same_domain_or_subdomain(
"https://api.github.com/repos",
"https://github.com"
) is True
assert service._is_same_domain_or_subdomain(
"https://raw.githubusercontent.com/owner/repo",
"https://github.com"
) is False # githubusercontent.com is different root domain
# Documentation sites
assert service._is_same_domain_or_subdomain(
"https://docs.python.org/3/library",
"https://python.org"
) is True
assert service._is_same_domain_or_subdomain(
"https://api.stripe.com/v1",
"https://stripe.com"
) is True
def test_is_same_domain_backward_compatibility(self, service):
"""Test that _is_same_domain still works correctly for exact matches."""
# Exact domain match should work
assert service._is_same_domain(
"https://supabase.com/docs",
"https://supabase.com"
) is True
# Subdomain should NOT match with _is_same_domain (only with _is_same_domain_or_subdomain)
assert service._is_same_domain(
"https://docs.supabase.com/llms.txt",
"https://supabase.com"
) is False
# Different domain should not match
assert service._is_same_domain(
"https://external.com/llms.txt",
"https://supabase.com"
) is False
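
For reference, a minimal implementation consistent with the behaviour these tests pin down (the real CrawlingService._is_same_domain_or_subdomain is not shown in this diff and may differ in detail):

from urllib.parse import urlparse

def is_same_domain_or_subdomain(url: str, base_url: str) -> bool:
    try:
        host = urlparse(url).netloc.split(":")[0].lower()
        base = urlparse(base_url).netloc.split(":")[0].lower()
        if not host or not base:
            return False
        return host == base or host.endswith("." + base)
    except Exception:
        return False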

View File

@@ -0,0 +1,353 @@
"""Unit tests for DiscoveryService class."""
import socket
from unittest.mock import Mock, patch
from src.server.services.crawling.discovery_service import DiscoveryService
def create_mock_dns_response():
"""Create mock DNS response for safe public IPs."""
# Return a safe public IP for testing
return [
(socket.AF_INET, socket.SOCK_STREAM, 6, '', ('93.184.216.34', 0)) # example.com's actual IP
]
def create_mock_response(status_code: int, text: str = "", url: str = "https://example.com") -> Mock:
"""Create a mock response object that supports streaming API."""
response = Mock()
response.status_code = status_code
response.text = text
response.encoding = 'utf-8'
response.history = [] # Empty list for no redirects
response.url = url # Mock URL for redirect checks (must be string, not Mock)
# Mock iter_content to yield text in chunks as bytes
text_bytes = text.encode('utf-8')
chunk_size = 8192
chunks = [text_bytes[i:i+chunk_size] for i in range(0, len(text_bytes), chunk_size)]
if not chunks:
chunks = [b''] # Ensure at least one empty chunk
response.iter_content = Mock(return_value=iter(chunks))
# Mock close method
response.close = Mock()
return response
class TestDiscoveryService:
"""Test suite for DiscoveryService class."""
@patch('socket.getaddrinfo', return_value=create_mock_dns_response())
@patch('requests.Session')
@patch('requests.get')
def test_discover_files_basic(self, mock_get, mock_session, mock_dns):
"""Test main discovery method returns single best file."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock robots.txt response (no sitemaps)
robots_response = create_mock_response(200, "User-agent: *\nDisallow: /admin/")
# Mock file existence - llms-full.txt doesn't exist, but llms.txt does
def mock_get_side_effect(url, **kwargs):
if url.endswith('robots.txt'):
return robots_response
elif url.endswith('llms-full.txt'):
return create_mock_response(404)  # llms-full.txt is missing
elif url.endswith('llms.txt'):
return create_mock_response(200)  # llms.txt exists and should be returned
else:
return create_mock_response(404)
mock_get.side_effect = mock_get_side_effect
mock_session.return_value.get.side_effect = mock_get_side_effect
result = service.discover_files(base_url)
# Should return single URL string (not dict, not list)
assert isinstance(result, str)
assert result == 'https://example.com/llms.txt'
@patch('socket.getaddrinfo', return_value=create_mock_dns_response())
@patch('requests.Session')
@patch('requests.get')
def test_discover_files_no_files_found(self, mock_get, mock_session, mock_dns):
"""Test discovery when no files are found."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock all HTTP requests to return 404
mock_get.return_value = create_mock_response(404)
mock_session.return_value.get.return_value = create_mock_response(404)
result = service.discover_files(base_url)
# Should return None when no files found
assert result is None
@patch('socket.getaddrinfo', return_value=create_mock_dns_response())
@patch('requests.Session')
@patch('requests.get')
def test_discover_files_priority_order(self, mock_get, mock_session, mock_dns):
"""Test that discovery follows the correct priority order."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock robots.txt response (no sitemaps declared)
robots_response = create_mock_response(200, "User-agent: *\nDisallow: /admin/")
# Mock file existence - both sitemap.xml and llms.txt exist, but llms.txt has higher priority
def mock_get_side_effect(url, **kwargs):
if url.endswith('robots.txt'):
return robots_response
elif url.endswith('llms.txt') or url.endswith('sitemap.xml'):
return create_mock_response(200) # Both exist
else:
return create_mock_response(404)
mock_get.side_effect = mock_get_side_effect
mock_session.return_value.get.side_effect = mock_get_side_effect
result = service.discover_files(base_url)
# Should return llms.txt since it has higher priority than sitemap.xml
assert result == 'https://example.com/llms.txt'
@patch('socket.getaddrinfo', return_value=create_mock_dns_response())
@patch('requests.Session')
@patch('requests.get')
def test_discover_files_robots_sitemap_priority(self, mock_get, mock_session, mock_dns):
"""Test that llms files have priority over robots.txt sitemap declarations."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock robots.txt response WITH sitemap declaration
robots_response = create_mock_response(200, "User-agent: *\nSitemap: https://example.com/declared-sitemap.xml")
# Mock other files also exist (both llms and sitemap files)
def mock_get_side_effect(url, **kwargs):
if url.endswith('robots.txt'):
return robots_response
elif 'llms' in url or 'sitemap' in url:
return create_mock_response(200)
else:
return create_mock_response(404)
mock_get.side_effect = mock_get_side_effect
mock_session.return_value.get.side_effect = mock_get_side_effect
result = service.discover_files(base_url)
# Should return llms.txt (highest priority llms file) since llms files have priority over sitemaps
# even when sitemaps are declared in robots.txt
assert result == 'https://example.com/llms.txt'
@patch('socket.getaddrinfo', return_value=create_mock_dns_response())
@patch('requests.Session')
@patch('requests.get')
def test_discover_files_subdirectory_fallback(self, mock_get, mock_session, mock_dns):
"""Test discovery falls back to subdirectories for llms files."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock robots.txt response (no sitemaps declared)
robots_response = create_mock_response(200, "User-agent: *\nDisallow: /admin/")
# Mock file existence - no root llms files, but static/llms.txt exists
def mock_get_side_effect(url, **kwargs):
if url.endswith('robots.txt'):
return robots_response
elif '/static/llms.txt' in url:
return create_mock_response(200) # Found in subdirectory
else:
return create_mock_response(404)
mock_get.side_effect = mock_get_side_effect
mock_session.return_value.get.side_effect = mock_get_side_effect
result = service.discover_files(base_url)
# Should find the file in static subdirectory
assert result == 'https://example.com/static/llms.txt'
@patch('socket.getaddrinfo', return_value=create_mock_dns_response())
@patch('requests.Session')
@patch('requests.get')
def test_check_url_exists(self, mock_get, mock_session, mock_dns):
"""Test URL existence checking."""
service = DiscoveryService()
# Test successful response
mock_get.return_value = create_mock_response(200)
mock_session.return_value.get.return_value = create_mock_response(200)
assert service._check_url_exists("https://example.com/exists") is True
# Test 404 response
mock_get.return_value = create_mock_response(404)
mock_session.return_value.get.return_value = create_mock_response(404)
assert service._check_url_exists("https://example.com/not-found") is False
# Test network error
mock_get.side_effect = Exception("Network error")
mock_session.return_value.get.side_effect = Exception("Network error")
assert service._check_url_exists("https://example.com/error") is False
@patch('socket.getaddrinfo', return_value=create_mock_dns_response())
@patch('requests.Session')
@patch('requests.get')
def test_parse_robots_txt_with_sitemap(self, mock_get, mock_session, mock_dns):
"""Test robots.txt parsing with sitemap directives."""
service = DiscoveryService()
# Mock successful robots.txt response
robots_text = """User-agent: *
Disallow: /admin/
Sitemap: https://example.com/sitemap.xml
Sitemap: https://example.com/sitemap-news.xml"""
mock_get.return_value = create_mock_response(200, robots_text)
result = service._parse_robots_txt("https://example.com")
assert len(result) == 2
assert "https://example.com/sitemap.xml" in result
assert "https://example.com/sitemap-news.xml" in result
mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30, stream=True, verify=True, headers={'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'})
@patch('socket.getaddrinfo', return_value=create_mock_dns_response())
@patch('requests.Session')
@patch('requests.get')
def test_parse_robots_txt_no_sitemap(self, mock_get, mock_session, mock_dns):
"""Test robots.txt parsing without sitemap directives."""
service = DiscoveryService()
# Mock robots.txt without sitemaps
robots_text = """User-agent: *
Disallow: /admin/
Allow: /public/"""
mock_get.return_value = create_mock_response(200, robots_text)
result = service._parse_robots_txt("https://example.com")
assert len(result) == 0
mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30, stream=True, verify=True, headers={'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'})
@patch('socket.getaddrinfo', return_value=create_mock_dns_response())
@patch('requests.Session')
@patch('requests.get')
def test_parse_html_meta_tags(self, mock_get, mock_session, mock_dns):
"""Test HTML meta tag parsing for sitemaps."""
service = DiscoveryService()
# Mock HTML with sitemap references
html_content = """
<html>
<head>
<link rel="sitemap" href="/sitemap.xml">
<meta name="sitemap" content="https://example.com/sitemap-meta.xml">
</head>
<body>Content here</body>
</html>
"""
mock_get.return_value = create_mock_response(200, html_content)
result = service._parse_html_meta_tags("https://example.com")
# Should find sitemaps from both link and meta tags
assert len(result) >= 1
assert any('sitemap' in url.lower() for url in result)
mock_get.assert_called_once_with("https://example.com", timeout=30, stream=True, verify=True, headers={'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'})
@patch('socket.getaddrinfo', return_value=create_mock_dns_response())
@patch('requests.Session')
@patch('requests.get')
def test_discovery_priority_behavior(self, mock_get, mock_session, mock_dns):
"""Test that discovery returns highest-priority file when multiple files exist."""
service = DiscoveryService()
base_url = "https://example.com"
# Mock robots.txt response (no sitemaps declared)
robots_response = create_mock_response(200, "User-agent: *\nDisallow: /admin/")
# Scenario 1: All files exist - should return llms.txt (highest priority)
def mock_all_exist(url, **kwargs):
if url.endswith('robots.txt'):
return robots_response
elif any(file in url for file in ['llms.txt', 'llms-full.txt', 'sitemap.xml']):
return create_mock_response(200)
else:
return create_mock_response(404)
mock_get.side_effect = mock_all_exist
mock_session.return_value.get.side_effect = mock_all_exist
result = service.discover_files(base_url)
assert result == 'https://example.com/llms.txt', "Should return llms.txt when all files exist (highest priority)"
# Scenario 2: llms.txt missing, others exist - should return llms-full.txt
def mock_without_txt(url, **kwargs):
if url.endswith('robots.txt'):
return robots_response
elif url.endswith('llms.txt'):
return create_mock_response(404)
elif any(file in url for file in ['llms-full.txt', 'sitemap.xml']):
return create_mock_response(200)
else:
return create_mock_response(404)
mock_get.side_effect = mock_without_txt
mock_session.return_value.get.side_effect = mock_without_txt
result = service.discover_files(base_url)
assert result == 'https://example.com/llms-full.txt', "Should return llms-full.txt when llms.txt is missing"
# Scenario 3: Only sitemap files exist - should return sitemap.xml
def mock_only_sitemaps(url, **kwargs):
if url.endswith('robots.txt'):
return robots_response
elif any(file in url for file in ['llms.txt', 'llms-full.txt']):
return create_mock_response(404)
elif url.endswith('sitemap.xml'):
return create_mock_response(200)
else:
return create_mock_response(404)
mock_get.side_effect = mock_only_sitemaps
mock_session.return_value.get.side_effect = mock_only_sitemaps
result = service.discover_files(base_url)
assert result == 'https://example.com/sitemap.xml', "Should return sitemap.xml when llms files are missing"
# Scenario 4: llms files have priority over sitemap files
def mock_llms_and_sitemap(url, **kwargs):
if url.endswith('robots.txt'):
return robots_response
elif url.endswith('llms.txt') or url.endswith('sitemap.xml'):
return create_mock_response(200)
else:
return create_mock_response(404)
mock_get.side_effect = mock_llms_and_sitemap
mock_session.return_value.get.side_effect = mock_llms_and_sitemap
result = service.discover_files(base_url)
assert result == 'https://example.com/llms.txt', "Should prefer llms.txt over sitemap.xml"
@patch('socket.getaddrinfo', return_value=create_mock_dns_response())
@patch('requests.Session')
@patch('requests.get')
def test_network_error_handling(self, mock_get, mock_session, mock_dns):
"""Test error scenarios with network failures."""
service = DiscoveryService()
# Mock network error
mock_get.side_effect = Exception("Network error")
mock_session.return_value.get.side_effect = Exception("Network error")
# Should not raise exception, but return None
result = service.discover_files("https://example.com")
assert result is None
# Individual methods should also handle errors gracefully
result = service._parse_robots_txt("https://example.com")
assert result == []
result = service._parse_html_meta_tags("https://example.com")
assert result == []
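
Taken together, these tests pin down a simple priority scan: llms.txt beats llms-full.txt, both beat sitemap.xml, a short list of subdirectories is tried when nothing sits at the root, and robots.txt-declared sitemaps are only a fallback. The sketch below is one way to satisfy those constraints; the candidate lists, class name, and request arguments are inferred from the assertions, not copied from the real DiscoveryService (which also adds SSRF protection and HTML meta-tag parsing).

from urllib.parse import urljoin

import requests


class DiscoveryServiceSketch:
    """Behavioral sketch only; illustrates the priority order the tests assert."""

    CANDIDATE_FILES = ("llms.txt", "llms-full.txt", "sitemap.xml")  # priority order
    CANDIDATE_DIRS = ("", "static/", "docs/", ".well-known/")  # root first, then fallbacks

    def discover_files(self, base_url: str) -> str | None:
        base = base_url.rstrip("/") + "/"
        for filename in self.CANDIDATE_FILES:
            for directory in self.CANDIDATE_DIRS:
                candidate = urljoin(base, directory + filename)
                if self._check_url_exists(candidate):
                    return candidate
        # Last resort: sitemaps declared in robots.txt (the real service also checks HTML meta tags).
        for sitemap in self._parse_robots_txt(base_url):
            if self._check_url_exists(sitemap):
                return sitemap
        return None

    def _check_url_exists(self, url: str) -> bool:
        try:
            resp = requests.get(url, timeout=30, stream=True, verify=True,
                                headers={"User-Agent": "Archon-Discovery/1.0 (SSRF-Protected)"})
            try:
                return resp.status_code == 200
            finally:
                resp.close()
        except Exception:
            return False

    def _parse_robots_txt(self, base_url: str) -> list[str]:
        """Collect Sitemap: directives from robots.txt, if present."""
        try:
            resp = requests.get(urljoin(base_url.rstrip("/") + "/", "robots.txt"),
                                timeout=30, stream=True, verify=True,
                                headers={"User-Agent": "Archon-Discovery/1.0 (SSRF-Protected)"})
            try:
                if resp.status_code != 200:
                    return []
                return [line.split(":", 1)[1].strip()
                        for line in resp.text.splitlines()
                        if line.lower().startswith("sitemap:")]
            finally:
                resp.close()
        except Exception:
            return []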

View File

@@ -0,0 +1,217 @@
"""Integration tests for llms.txt link following functionality."""
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from src.server.services.crawling.crawling_service import CrawlingService
class TestLlmsTxtLinkFollowing:
"""Test suite for llms.txt link following feature."""
@pytest.fixture
def service(self):
"""Create a CrawlingService instance for testing."""
return CrawlingService(crawler=None, supabase_client=None)
@pytest.fixture
def supabase_llms_content(self):
"""Return the actual Supabase llms.txt content."""
return """# Supabase Docs
- [Supabase Guides](https://supabase.com/llms/guides.txt)
- [Supabase Reference (JavaScript)](https://supabase.com/llms/js.txt)
- [Supabase Reference (Dart)](https://supabase.com/llms/dart.txt)
- [Supabase Reference (Swift)](https://supabase.com/llms/swift.txt)
- [Supabase Reference (Kotlin)](https://supabase.com/llms/kotlin.txt)
- [Supabase Reference (Python)](https://supabase.com/llms/python.txt)
- [Supabase Reference (C#)](https://supabase.com/llms/csharp.txt)
- [Supabase CLI Reference](https://supabase.com/llms/cli.txt)
"""
def test_extract_links_from_supabase_llms_txt(self, service, supabase_llms_content):
"""Test that links are correctly extracted from Supabase llms.txt."""
url = "https://supabase.com/docs/llms.txt"
extracted_links = service.url_handler.extract_markdown_links_with_text(
supabase_llms_content, url
)
# Should extract 8 links
assert len(extracted_links) == 8
# Verify all extracted links
expected_links = [
"https://supabase.com/llms/guides.txt",
"https://supabase.com/llms/js.txt",
"https://supabase.com/llms/dart.txt",
"https://supabase.com/llms/swift.txt",
"https://supabase.com/llms/kotlin.txt",
"https://supabase.com/llms/python.txt",
"https://supabase.com/llms/csharp.txt",
"https://supabase.com/llms/cli.txt",
]
extracted_urls = [link for link, _ in extracted_links]
assert extracted_urls == expected_links
def test_all_links_are_llms_variants(self, service, supabase_llms_content):
"""Test that all extracted links are recognized as llms.txt variants."""
url = "https://supabase.com/docs/llms.txt"
extracted_links = service.url_handler.extract_markdown_links_with_text(
supabase_llms_content, url
)
# All links should be recognized as llms variants
for link, _ in extracted_links:
is_llms = service.url_handler.is_llms_variant(link)
assert is_llms, f"Link {link} should be recognized as llms.txt variant"
def test_all_links_are_same_domain(self, service, supabase_llms_content):
"""Test that all extracted links are from the same domain."""
url = "https://supabase.com/docs/llms.txt"
original_domain = "https://supabase.com"
extracted_links = service.url_handler.extract_markdown_links_with_text(
supabase_llms_content, url
)
# All links should be from the same domain
for link, _ in extracted_links:
is_same = service._is_same_domain_or_subdomain(link, original_domain)
assert is_same, f"Link {link} should match domain {original_domain}"
def test_filter_llms_links_from_supabase(self, service, supabase_llms_content):
"""Test the complete filtering logic for Supabase llms.txt."""
url = "https://supabase.com/docs/llms.txt"
original_domain = "https://supabase.com"
# Extract all links
extracted_links = service.url_handler.extract_markdown_links_with_text(
supabase_llms_content, url
)
# Filter for llms.txt files on same domain (mimics actual code)
llms_links = []
for link, text in extracted_links:
if service.url_handler.is_llms_variant(link):
if service._is_same_domain_or_subdomain(link, original_domain):
llms_links.append((link, text))
# Should have all 8 links
assert len(llms_links) == 8, f"Expected 8 llms links, got {len(llms_links)}"
@pytest.mark.asyncio
async def test_llms_txt_link_following_integration(self, service, supabase_llms_content):
"""Integration test for the complete llms.txt link following flow."""
url = "https://supabase.com/docs/llms.txt"
# Mock the crawl_batch_with_progress to verify it's called with correct URLs
mock_batch_results = [
{'url': f'https://supabase.com/llms/{name}.txt', 'markdown': f'# {name}', 'title': f'{name}'}
for name in ['guides', 'js', 'dart', 'swift', 'kotlin', 'python', 'csharp', 'cli']
]
service.crawl_batch_with_progress = AsyncMock(return_value=mock_batch_results)
service.crawl_markdown_file = AsyncMock(return_value=[{
'url': url,
'markdown': supabase_llms_content,
'title': 'Supabase Docs'
}])
# Create progress tracker mock
service.progress_tracker = MagicMock()
service.progress_tracker.update = AsyncMock()
# Simulate the request that would come from orchestration
request = {
"is_discovery_target": True,
"original_domain": "https://supabase.com",
"max_concurrent": 5
}
# Call the actual crawl method
crawl_results, crawl_type = await service._crawl_by_url_type(url, request)
# Verify batch crawl was called with the 8 llms.txt URLs
service.crawl_batch_with_progress.assert_called_once()
call_args = service.crawl_batch_with_progress.call_args
crawled_urls = call_args[0][0] # First positional argument
assert len(crawled_urls) == 8, f"Should crawl 8 linked files, got {len(crawled_urls)}"
expected_urls = [
"https://supabase.com/llms/guides.txt",
"https://supabase.com/llms/js.txt",
"https://supabase.com/llms/dart.txt",
"https://supabase.com/llms/swift.txt",
"https://supabase.com/llms/kotlin.txt",
"https://supabase.com/llms/python.txt",
"https://supabase.com/llms/csharp.txt",
"https://supabase.com/llms/cli.txt",
]
assert set(crawled_urls) == set(expected_urls)
# Verify total results include main file + linked pages
assert len(crawl_results) == 9, f"Should have 9 total pages (1 main + 8 linked), got {len(crawl_results)}"
# Verify crawl type
assert crawl_type == "llms_txt_with_linked_pages"
def test_external_llms_links_are_filtered(self, service):
"""Test that external domain llms.txt links are filtered out."""
content = """# Test llms.txt
- [Internal Link](https://supabase.com/llms/internal.txt)
- [External Link](https://external.com/llms/external.txt)
- [Another Internal](https://docs.supabase.com/llms/docs.txt)
"""
url = "https://supabase.com/llms.txt"
original_domain = "https://supabase.com"
extracted_links = service.url_handler.extract_markdown_links_with_text(content, url)
# Filter for same-domain llms links
llms_links = []
for link, text in extracted_links:
if service.url_handler.is_llms_variant(link):
if service._is_same_domain_or_subdomain(link, original_domain):
llms_links.append((link, text))
# Should only have 2 links (internal and subdomain), external filtered out
assert len(llms_links) == 2
urls = [link for link, _ in llms_links]
assert "https://supabase.com/llms/internal.txt" in urls
assert "https://docs.supabase.com/llms/docs.txt" in urls
assert "https://external.com/llms/external.txt" not in urls
def test_non_llms_links_are_filtered(self, service):
"""Test that non-llms.txt links are filtered out."""
content = """# Test llms.txt
- [LLMs Link](https://supabase.com/llms/guide.txt)
- [Regular Doc](https://supabase.com/docs/guide)
- [PDF File](https://supabase.com/docs/guide.pdf)
- [Another LLMs](https://supabase.com/llms/api.txt)
"""
url = "https://supabase.com/llms.txt"
original_domain = "https://supabase.com"
extracted_links = service.url_handler.extract_markdown_links_with_text(content, url)
# Filter for llms links only
llms_links = []
for link, text in extracted_links:
if service.url_handler.is_llms_variant(link):
if service._is_same_domain_or_subdomain(link, original_domain):
llms_links.append((link, text))
# Should only have 2 llms.txt links
assert len(llms_links) == 2
urls = [link for link, _ in llms_links]
assert "https://supabase.com/llms/guide.txt" in urls
assert "https://supabase.com/llms/api.txt" in urls
assert "https://supabase.com/docs/guide" not in urls
assert "https://supabase.com/docs/guide.pdf" not in urls

View File

@@ -122,4 +122,120 @@ class TestURLHandler:
# Should not transform non-GitHub URLs
other = "https://example.com/file"
assert handler.transform_github_url(other) == other
def test_is_robots_txt(self):
"""Test robots.txt detection."""
handler = URLHandler()
# Standard robots.txt URLs
assert handler.is_robots_txt("https://example.com/robots.txt") is True
assert handler.is_robots_txt("http://example.com/robots.txt") is True
assert handler.is_robots_txt("https://sub.example.com/robots.txt") is True
# Case sensitivity
assert handler.is_robots_txt("https://example.com/ROBOTS.TXT") is True
assert handler.is_robots_txt("https://example.com/Robots.Txt") is True
# With query parameters (should still be detected)
assert handler.is_robots_txt("https://example.com/robots.txt?v=1") is True
assert handler.is_robots_txt("https://example.com/robots.txt#section") is True
# Not robots.txt files
assert handler.is_robots_txt("https://example.com/robots") is False
assert handler.is_robots_txt("https://example.com/robots.html") is False
assert handler.is_robots_txt("https://example.com/some-robots.txt") is False
assert handler.is_robots_txt("https://example.com/path/robots.txt") is False
assert handler.is_robots_txt("https://example.com/") is False
# Edge case: malformed URL should not crash
assert handler.is_robots_txt("not-a-url") is False
def test_is_llms_variant(self):
"""Test llms file variant detection."""
handler = URLHandler()
# Standard llms.txt spec variants (only txt files)
assert handler.is_llms_variant("https://example.com/llms.txt") is True
assert handler.is_llms_variant("https://example.com/llms-full.txt") is True
# Case sensitivity
assert handler.is_llms_variant("https://example.com/LLMS.TXT") is True
assert handler.is_llms_variant("https://example.com/LLMS-FULL.TXT") is True
# With paths (should still detect)
assert handler.is_llms_variant("https://example.com/docs/llms.txt") is True
assert handler.is_llms_variant("https://example.com/public/llms-full.txt") is True
# With query parameters
assert handler.is_llms_variant("https://example.com/llms.txt?version=1") is True
assert handler.is_llms_variant("https://example.com/llms-full.txt#section") is True
# Not llms files
assert handler.is_llms_variant("https://example.com/llms") is False
assert handler.is_llms_variant("https://example.com/llms.html") is False
assert handler.is_llms_variant("https://example.com/my-llms.txt") is False
assert handler.is_llms_variant("https://example.com/llms-guide.txt") is False
assert handler.is_llms_variant("https://example.com/readme.txt") is False
# Edge case: malformed URL should not crash
assert handler.is_llms_variant("not-a-url") is False
def test_is_well_known_file(self):
"""Test .well-known file detection."""
handler = URLHandler()
# Standard .well-known files
assert handler.is_well_known_file("https://example.com/.well-known/ai.txt") is True
assert handler.is_well_known_file("https://example.com/.well-known/security.txt") is True
assert handler.is_well_known_file("https://example.com/.well-known/change-password") is True
# Case sensitivity - RFC 8615 requires lowercase .well-known
assert handler.is_well_known_file("https://example.com/.WELL-KNOWN/ai.txt") is False
assert handler.is_well_known_file("https://example.com/.Well-Known/ai.txt") is False
# With query parameters
assert handler.is_well_known_file("https://example.com/.well-known/ai.txt?v=1") is True
assert handler.is_well_known_file("https://example.com/.well-known/ai.txt#top") is True
# Not .well-known files
assert handler.is_well_known_file("https://example.com/well-known/ai.txt") is False
assert handler.is_well_known_file("https://example.com/.wellknown/ai.txt") is False
assert handler.is_well_known_file("https://example.com/docs/.well-known/ai.txt") is False
assert handler.is_well_known_file("https://example.com/ai.txt") is False
assert handler.is_well_known_file("https://example.com/") is False
# Edge case: malformed URL should not crash
assert handler.is_well_known_file("not-a-url") is False
def test_get_base_url(self):
"""Test base URL extraction."""
handler = URLHandler()
# Standard URLs
assert handler.get_base_url("https://example.com") == "https://example.com"
assert handler.get_base_url("https://example.com/") == "https://example.com"
assert handler.get_base_url("https://example.com/path/to/page") == "https://example.com"
assert handler.get_base_url("https://example.com/path/to/page?query=1") == "https://example.com"
assert handler.get_base_url("https://example.com/path/to/page#fragment") == "https://example.com"
# HTTP vs HTTPS
assert handler.get_base_url("http://example.com/path") == "http://example.com"
assert handler.get_base_url("https://example.com/path") == "https://example.com"
# Subdomains and ports
assert handler.get_base_url("https://api.example.com/v1/users") == "https://api.example.com"
assert handler.get_base_url("https://example.com:8080/api") == "https://example.com:8080"
assert handler.get_base_url("http://localhost:3000/dev") == "http://localhost:3000"
# Complex cases
assert handler.get_base_url("https://user:pass@example.com/path") == "https://user:pass@example.com"
# Edge cases - malformed URLs should return original
assert handler.get_base_url("not-a-url") == "not-a-url"
assert handler.get_base_url("") == ""
assert handler.get_base_url("ftp://example.com/file") == "ftp://example.com"
# Missing scheme or netloc
assert handler.get_base_url("//example.com/path") == "//example.com/path" # Should return original
assert handler.get_base_url("/path/to/resource") == "/path/to/resource" # Should return original

29
python/uv.lock generated
View File

@@ -257,6 +257,7 @@ server = [
{ name = "python-multipart" }, { name = "python-multipart" },
{ name = "slowapi" }, { name = "slowapi" },
{ name = "supabase" }, { name = "supabase" },
{ name = "tldextract" },
{ name = "uvicorn" }, { name = "uvicorn" },
{ name = "watchfiles" }, { name = "watchfiles" },
] ]
@@ -362,6 +363,7 @@ server = [
{ name = "python-multipart", specifier = ">=0.0.20" }, { name = "python-multipart", specifier = ">=0.0.20" },
{ name = "slowapi", specifier = ">=0.1.9" }, { name = "slowapi", specifier = ">=0.1.9" },
{ name = "supabase", specifier = "==2.15.1" }, { name = "supabase", specifier = "==2.15.1" },
{ name = "tldextract", specifier = ">=5.0.0" },
{ name = "uvicorn", specifier = ">=0.24.0" }, { name = "uvicorn", specifier = ">=0.24.0" },
{ name = "watchfiles", specifier = ">=0.18" }, { name = "watchfiles", specifier = ">=0.18" },
] ]
@@ -2646,6 +2648,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 },
] ]
[[package]]
name = "requests-file"
version = "3.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/fe/5e/2aca791207e542a16a8cc91fd0e19f5c26f4dff030ee3062deb5606f84ae/requests_file-3.0.0.tar.gz", hash = "sha256:68789589cfde7098e8933fe3e69bbd864f7f0c22f118937b424d94d0e1b7760f", size = 6897 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e4/85/689c218feb21a66919bd667969d4ed60a64db67f6ea5ceb00c9795ae19b0/requests_file-3.0.0-py2.py3-none-any.whl", hash = "sha256:aca222ec94a19310be2a0ed6bdcdebb09058b0f6c3e984af56361c8fca59653c", size = 4486 },
]
[[package]]
name = "rich"
version = "14.0.0"
@@ -3131,6 +3145,21 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/de/a8/8f499c179ec900783ffe133e9aab10044481679bb9aad78436d239eee716/tiktoken-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5ea0edb6f83dc56d794723286215918c1cde03712cbbafa0348b33448faf5b95", size = 894669 }, { url = "https://files.pythonhosted.org/packages/de/a8/8f499c179ec900783ffe133e9aab10044481679bb9aad78436d239eee716/tiktoken-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5ea0edb6f83dc56d794723286215918c1cde03712cbbafa0348b33448faf5b95", size = 894669 },
] ]
[[package]]
name = "tldextract"
version = "5.3.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "filelock" },
{ name = "idna" },
{ name = "requests" },
{ name = "requests-file" },
]
sdist = { url = "https://files.pythonhosted.org/packages/97/78/182641ea38e3cfd56e9c7b3c0d48a53d432eea755003aa544af96403d4ac/tldextract-5.3.0.tar.gz", hash = "sha256:b3d2b70a1594a0ecfa6967d57251527d58e00bb5a91a74387baa0d87a0678609", size = 128502 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/67/7c/ea488ef48f2f544566947ced88541bc45fae9e0e422b2edbf165ee07da99/tldextract-5.3.0-py3-none-any.whl", hash = "sha256:f70f31d10b55c83993f55e91ecb7c5d84532a8972f22ec578ecfbe5ea2292db2", size = 107384 },
]
[[package]]
name = "tokenizers"
version = "0.21.1"