Mirror of https://github.com/coleam00/Archon.git (synced 2025-12-24 02:39:17 -05:00)
Merge remote-tracking branch 'origin/ui/agent-work-order' into feat/agent_work_orders_ui
.github/ISSUE_TEMPLATE/auto_bug_report.md (vendored, new file, 11 lines)
@@ -0,0 +1,11 @@
---
name: Auto Bug Report
about: Automated bug report from Archon
title: ''
labels: bug, auto-report
assignees: ''
---

<!-- This template is used for automated bug reports submitted through the Archon UI -->
<!-- The form data below is automatically filled by the bug reporter -->
CLAUDE.md (10 lines changed)
@@ -216,6 +216,16 @@ SUPABASE_SERVICE_KEY=your-service-key-here # Use legacy key format for clou
 Optional variables and full configuration:
 See `python/.env.example` for complete list
 
+### Repository Configuration
+
+Repository information (owner, name) is centralized in `python/src/server/config/version.py`:
+- `GITHUB_REPO_OWNER` - GitHub repository owner (default: "coleam00")
+- `GITHUB_REPO_NAME` - GitHub repository name (default: "Archon")
+
+This is the single source of truth for repository configuration. All services (version checking, bug reports, etc.) should import these constants rather than hardcoding repository URLs.
+
+Environment variable override: `GITHUB_REPO="owner/repo"` can be set to override defaults.
+
 ## Common Development Tasks
 
 ### Add a new API endpoint
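To make the documented pattern concrete, here is a minimal sketch (not part of this commit) of how a service would consume the centralized constants while still honoring the `GITHUB_REPO` override; the import path and helper name are illustrative assumptions.

```python
# Illustrative sketch: consuming the centralized repository configuration.
import os

# Assumed import path, mirroring python/src/server/config/version.py
from src.server.config.version import GITHUB_REPO_NAME, GITHUB_REPO_OWNER


def get_repo_slug() -> str:
    """Return "owner/repo", preferring the GITHUB_REPO environment override."""
    default_repo = f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
    return os.getenv("GITHUB_REPO", default_repo)
```

This is the same override logic the bug report API adopts further down in this commit.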
@@ -113,7 +113,7 @@ This new vision for Archon replaces the old one (the agenteer). Archon used to b
 Once everything is running:
 
-1. **Test Web Crawling**: Go to http://localhost:3737 → Knowledge Base → "Crawl Website" → Enter a doc URL (such as https://ai.pydantic.dev/llms-full.txt)
+1. **Test Web Crawling**: Go to http://localhost:3737 → Knowledge Base → "Crawl Website" → Enter a doc URL (such as https://ai.pydantic.dev/llms.txt)
 2. **Test Document Upload**: Knowledge Base → Upload a PDF
 3. **Test Projects**: Projects → Create a new project and add tasks
 4. **Integrate with your AI coding assistant**: MCP Dashboard → Copy connection config for your AI coding assistant
@@ -12,6 +12,7 @@ import { Button } from "../../ui/primitives";
 import { cn } from "../../ui/primitives/styles";
 import { useCrawlProgressPolling } from "../hooks";
 import type { ActiveOperation } from "../types/progress";
+import { isValidHttpUrl } from "../utils/urlValidation";
 
 interface CrawlingProgressProps {
   onSwitchToBrowse: () => void;
@@ -129,6 +130,7 @@ export const CrawlingProgress: React.FC<CrawlingProgressProps> = ({ onSwitchToBr
   "in_progress",
   "starting",
   "initializing",
+  "discovery",
   "analyzing",
   "storing",
   "source_creation",
@@ -245,6 +247,63 @@ export const CrawlingProgress: React.FC<CrawlingProgressProps> = ({ onSwitchToBr
 )}
 </div>
+
+{/* Discovery Information */}
+{operation.discovered_file && (
+  <div className="pt-2 border-t border-white/10">
+    <div className="flex items-center gap-2 mb-2">
+      <span className="text-xs font-semibold text-cyan-400">Discovery Result</span>
+      {operation.discovered_file_type && (
+        <span className="px-2 py-0.5 text-xs rounded bg-cyan-500/10 border border-cyan-500/20 text-cyan-300">
+          {operation.discovered_file_type}
+        </span>
+      )}
+    </div>
+    {isValidHttpUrl(operation.discovered_file) ? (
+      <a
+        href={operation.discovered_file}
+        target="_blank"
+        rel="noopener noreferrer"
+        className="text-sm text-gray-400 hover:text-cyan-400 transition-colors truncate block"
+      >
+        {operation.discovered_file}
+      </a>
+    ) : (
+      <span className="text-sm text-gray-400 truncate block">
+        {operation.discovered_file}
+      </span>
+    )}
+  </div>
+)}
+
+{/* Linked Files */}
+{operation.linked_files && operation.linked_files.length > 0 && (
+  <div className="pt-2 border-t border-white/10">
+    <div className="text-xs font-semibold text-cyan-400 mb-2">
+      Following {operation.linked_files.length} Linked File
+      {operation.linked_files.length > 1 ? "s" : ""}
+    </div>
+    <div className="space-y-1 max-h-32 overflow-y-auto">
+      {operation.linked_files.map((file: string, idx: number) => (
+        isValidHttpUrl(file) ? (
+          <a
+            key={idx}
+            href={file}
+            target="_blank"
+            rel="noopener noreferrer"
+            className="text-xs text-gray-400 hover:text-cyan-400 transition-colors truncate block"
+          >
+            • {file}
+          </a>
+        ) : (
+          <span key={idx} className="text-xs text-gray-400 truncate block">
+            • {file}
+          </span>
+        )
+      ))}
+    </div>
+  </div>
+)}
+
 {/* Current Action or Operation Type Info */}
 {(operation.current_url || operation.operation_type) && (
   <div className="pt-2 border-t border-white/10">
@@ -6,6 +6,7 @@
 export type ProgressStatus =
   | "starting"
   | "initializing"
+  | "discovery"
   | "analyzing"
   | "crawling"
   | "processing"
@@ -24,7 +25,16 @@ export type ProgressStatus =
   | "cancelled"
   | "stopping";
 
-export type CrawlType = "normal" | "sitemap" | "llms-txt" | "text_file" | "refresh";
+export type CrawlType =
+  | "normal"
+  | "sitemap"
+  | "llms-txt"
+  | "text_file"
+  | "refresh"
+  | "llms_txt_with_linked_files"
+  | "llms_txt_linked_files"
+  | "discovery_single_file"
+  | "discovery_sitemap";
 export type UploadType = "document";
 
 export interface BaseProgressData {
@@ -48,6 +58,10 @@ export interface CrawlProgressData extends BaseProgressData {
   codeBlocksFound?: number;
   totalSummaries?: number;
   completedSummaries?: number;
+  // Discovery-related fields
+  discoveredFile?: string;
+  discoveredFileType?: string;
+  linkedFiles?: string[];
   originalCrawlParams?: {
     url: string;
     knowledge_type?: string;
@@ -100,6 +114,10 @@ export interface ActiveOperation {
     code_examples_found?: number;
     current_operation?: string;
   };
+  // Discovery information
+  discovered_file?: string;
+  discovered_file_type?: string;
+  linked_files?: string[];
 }
 
 export interface ActiveOperationsResponse {
@@ -127,6 +145,13 @@ export interface ProgressResponse {
   codeBlocksFound?: number;
   totalSummaries?: number;
   completedSummaries?: number;
+  // Discovery-related fields
+  discoveredFile?: string;
+  discovered_file?: string; // Snake case from backend
+  discoveredFileType?: string;
+  discovered_file_type?: string; // Snake case from backend
+  linkedFiles?: string[];
+  linked_files?: string[]; // Snake case from backend
   fileName?: string;
   fileSize?: number;
   chunksProcessed?: number;
archon-ui-main/src/features/progress/utils/urlValidation.ts (new file, 44 lines)
@@ -0,0 +1,44 @@
/**
 * Client-side URL validation utility for discovered files.
 * Ensures only safe HTTP/HTTPS URLs are rendered as clickable links.
 */

const SAFE_PROTOCOLS = ["http:", "https:"];

/**
 * Validates that a URL is safe to render as a clickable link.
 * Only allows http: and https: protocols.
 *
 * @param url - URL string to validate
 * @returns true if URL is safe (http/https), false otherwise
 */
export function isValidHttpUrl(url: string | undefined | null): boolean {
  if (!url || typeof url !== "string") {
    return false;
  }

  // Trim whitespace
  const trimmed = url.trim();
  if (!trimmed) {
    return false;
  }

  try {
    const parsed = new URL(trimmed);

    // Only allow http and https protocols
    if (!SAFE_PROTOCOLS.includes(parsed.protocol)) {
      return false;
    }

    // Basic hostname validation (must have at least one dot or be localhost)
    if (!parsed.hostname.includes(".") && parsed.hostname !== "localhost") {
      return false;
    }

    return true;
  } catch {
    // URL parsing failed - not a valid URL
    return false;
  }
}
@@ -0,0 +1,332 @@
|
|||||||
|
import { AnimatePresence, motion } from "framer-motion";
|
||||||
|
import { ChevronDown, ChevronUp, ExternalLink, Plus, User } from "lucide-react";
|
||||||
|
import { useState } from "react";
|
||||||
|
import { Button } from "@/features/ui/primitives/button";
|
||||||
|
import { Card } from "@/features/ui/primitives/card";
|
||||||
|
import { cn } from "@/features/ui/primitives/styles";
|
||||||
|
import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/features/ui/primitives/tooltip";
|
||||||
|
import { StepHistoryCard } from "./components/StepHistoryCard";
|
||||||
|
import { WorkflowStepButton } from "./components/WorkflowStepButton";
|
||||||
|
|
||||||
|
const MOCK_WORK_ORDER = {
|
||||||
|
id: "wo-1",
|
||||||
|
title: "Create comprehensive documentation",
|
||||||
|
status: "in_progress" as const,
|
||||||
|
workflow: {
|
||||||
|
currentStep: 2,
|
||||||
|
steps: [
|
||||||
|
{ id: "1", name: "Create Branch", status: "completed", duration: "33s" },
|
||||||
|
{ id: "2", name: "Planning", status: "in_progress", duration: "2m 11s" },
|
||||||
|
{ id: "3", name: "Execute", status: "pending", duration: null },
|
||||||
|
{ id: "4", name: "Commit", status: "pending", duration: null },
|
||||||
|
{ id: "5", name: "Create PR", status: "pending", duration: null },
|
||||||
|
],
|
||||||
|
},
|
||||||
|
stepHistory: [
|
||||||
|
{
|
||||||
|
id: "step-1",
|
||||||
|
stepName: "Create Branch",
|
||||||
|
timestamp: "7 minutes ago",
|
||||||
|
output: "docs/remove-archon-mentions",
|
||||||
|
session: "Session: a342d9ac-56c4-43ae-95b8-9ddf18143961",
|
||||||
|
collapsible: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
id: "step-2",
|
||||||
|
stepName: "Planning",
|
||||||
|
timestamp: "5 minutes ago",
|
||||||
|
output: `## Report
|
||||||
|
|
||||||
|
**Work completed:**
|
||||||
|
|
||||||
|
- Conducted comprehensive codebase audit for "archon" and "Archon" mentions
|
||||||
|
- Verified main README.md is already breach (no archon mentions present)
|
||||||
|
- Identified 14 subdirectory README files that need verification
|
||||||
|
- Discovered historical git commits that added "hello from archon" but content has been removed
|
||||||
|
- Identified 3 remote branches with "archon" in their names (out of scope for this task)
|
||||||
|
- Created comprehensive PRP plan for documentation cleanup and verification`,
|
||||||
|
session: "Session: e3889823-b272-43c0-b11d-7a786d7e3c88",
|
||||||
|
collapsible: true,
|
||||||
|
isHumanInLoop: true,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
document: {
|
||||||
|
id: "doc-1",
|
||||||
|
title: "Planning Document",
|
||||||
|
content: {
|
||||||
|
markdown: `# Documentation Cleanup Plan
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
This document outlines the plan to remove all "archon" mentions from the codebase.
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
1. Audit all README files
|
||||||
|
2. Check git history for sensitive content
|
||||||
|
3. Verify no configuration files reference "archon"
|
||||||
|
4. Update documentation
|
||||||
|
|
||||||
|
## Progress
|
||||||
|
- [x] Initial audit complete
|
||||||
|
- [ ] README updates pending
|
||||||
|
- [ ] Configuration review pending`,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
export const AgentWorkOrderExample = () => {
|
||||||
|
const [hoveredStepIndex, setHoveredStepIndex] = useState<number | null>(null);
|
||||||
|
const [expandedSteps, setExpandedSteps] = useState<Set<string>>(new Set(["step-2"]));
|
||||||
|
const [showDetails, setShowDetails] = useState(false);
|
||||||
|
const [humanInLoopCheckpoints, setHumanInLoopCheckpoints] = useState<Set<number>>(new Set());
|
||||||
|
|
||||||
|
const toggleStepExpansion = (stepId: string) => {
|
||||||
|
setExpandedSteps((prev) => {
|
||||||
|
const newSet = new Set(prev);
|
||||||
|
if (newSet.has(stepId)) {
|
||||||
|
newSet.delete(stepId);
|
||||||
|
} else {
|
||||||
|
newSet.add(stepId);
|
||||||
|
}
|
||||||
|
return newSet;
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
const addHumanInLoopCheckpoint = (index: number) => {
|
||||||
|
setHumanInLoopCheckpoints((prev) => {
|
||||||
|
const newSet = new Set(prev);
|
||||||
|
newSet.add(index);
|
||||||
|
return newSet;
|
||||||
|
});
|
||||||
|
setHoveredStepIndex(null);
|
||||||
|
};
|
||||||
|
|
||||||
|
const removeHumanInLoopCheckpoint = (index: number) => {
|
||||||
|
setHumanInLoopCheckpoints((prev) => {
|
||||||
|
const newSet = new Set(prev);
|
||||||
|
newSet.delete(index);
|
||||||
|
return newSet;
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="space-y-6">
|
||||||
|
{/* Explanation Text */}
|
||||||
|
<p className="text-sm text-gray-600 dark:text-gray-400">
|
||||||
|
<strong>Use this layout for:</strong> Agent work order workflows with step-by-step progress tracking,
|
||||||
|
collapsible history, and integrated document editing for human-in-the-loop approval.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
{/* Workflow Progress Bar */}
|
||||||
|
<Card blur="md" transparency="light" edgePosition="top" edgeColor="cyan" size="lg" className="overflow-visible">
|
||||||
|
<div className="flex items-center justify-between mb-6">
|
||||||
|
<h3 className="text-lg font-semibold text-gray-900 dark:text-white">{MOCK_WORK_ORDER.title}</h3>
|
||||||
|
<Button
|
||||||
|
variant="ghost"
|
||||||
|
size="sm"
|
||||||
|
onClick={() => setShowDetails(!showDetails)}
|
||||||
|
className="text-cyan-600 dark:text-cyan-400 hover:bg-cyan-500/10"
|
||||||
|
aria-label={showDetails ? "Hide details" : "Show details"}
|
||||||
|
>
|
||||||
|
{showDetails ? (
|
||||||
|
<ChevronUp className="w-4 h-4 mr-1" aria-hidden="true" />
|
||||||
|
) : (
|
||||||
|
<ChevronDown className="w-4 h-4 mr-1" aria-hidden="true" />
|
||||||
|
)}
|
||||||
|
Details
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div className="flex items-center justify-center gap-0">
|
||||||
|
{MOCK_WORK_ORDER.workflow.steps.map((step, index) => (
|
||||||
|
<div key={step.id} className="flex items-center">
|
||||||
|
{/* Step Button */}
|
||||||
|
<WorkflowStepButton
|
||||||
|
isCompleted={step.status === "completed"}
|
||||||
|
isActive={step.status === "in_progress"}
|
||||||
|
stepName={step.name}
|
||||||
|
color="cyan"
|
||||||
|
size={50}
|
||||||
|
/>
|
||||||
|
|
||||||
|
{/* Connecting Line - only show between steps */}
|
||||||
|
{index < MOCK_WORK_ORDER.workflow.steps.length - 1 && (
|
||||||
|
// biome-ignore lint/a11y/noStaticElementInteractions: Visual hover effect container for showing plus button
|
||||||
|
<div
|
||||||
|
className="relative flex-shrink-0"
|
||||||
|
style={{ width: "80px", height: "50px" }}
|
||||||
|
onMouseEnter={() => setHoveredStepIndex(index)}
|
||||||
|
onMouseLeave={() => setHoveredStepIndex(null)}
|
||||||
|
>
|
||||||
|
{/* Neon line */}
|
||||||
|
<div
|
||||||
|
className={cn(
|
||||||
|
"absolute top-1/2 left-0 right-0 h-[2px] transition-all duration-200",
|
||||||
|
step.status === "completed"
|
||||||
|
? "border-t-2 border-cyan-400 shadow-[0_0_8px_rgba(34,211,238,0.6)]"
|
||||||
|
: "border-t-2 border-gray-600 dark:border-gray-700",
|
||||||
|
hoveredStepIndex === index &&
|
||||||
|
step.status !== "completed" &&
|
||||||
|
"border-cyan-400/50 shadow-[0_0_6px_rgba(34,211,238,0.3)]",
|
||||||
|
)}
|
||||||
|
/>
|
||||||
|
|
||||||
|
{/* Human-in-Loop Checkpoint Indicator */}
|
||||||
|
{humanInLoopCheckpoints.has(index) && (
|
||||||
|
<TooltipProvider>
|
||||||
|
<Tooltip>
|
||||||
|
<TooltipTrigger asChild>
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
onClick={() => removeHumanInLoopCheckpoint(index)}
|
||||||
|
className="absolute top-1/2 left-1/2 transform -translate-x-1/2 -translate-y-1/2 bg-orange-500 hover:bg-orange-600 rounded-full p-1.5 shadow-lg shadow-orange-500/50 border-2 border-orange-400 transition-colors cursor-pointer"
|
||||||
|
aria-label="Remove Human-in-Loop checkpoint"
|
||||||
|
>
|
||||||
|
<User className="w-3.5 h-3.5 text-white" aria-hidden="true" />
|
||||||
|
</button>
|
||||||
|
</TooltipTrigger>
|
||||||
|
<TooltipContent>Click to remove</TooltipContent>
|
||||||
|
</Tooltip>
|
||||||
|
</TooltipProvider>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{/* Plus button on hover - only show if no checkpoint exists */}
|
||||||
|
{hoveredStepIndex === index && !humanInLoopCheckpoints.has(index) && (
|
||||||
|
<TooltipProvider>
|
||||||
|
<Tooltip>
|
||||||
|
<TooltipTrigger asChild>
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
onClick={() => addHumanInLoopCheckpoint(index)}
|
||||||
|
className="absolute top-1/2 left-1/2 transform -translate-x-1/2 -translate-y-1/2 w-8 h-8 rounded-full bg-orange-500 hover:bg-orange-600 transition-colors shadow-lg shadow-orange-500/50 flex items-center justify-center text-white"
|
||||||
|
aria-label="Add Human-in-Loop step"
|
||||||
|
>
|
||||||
|
<Plus className="w-4 h-4" aria-hidden="true" />
|
||||||
|
</button>
|
||||||
|
</TooltipTrigger>
|
||||||
|
<TooltipContent>Add Human-in-Loop</TooltipContent>
|
||||||
|
</Tooltip>
|
||||||
|
</TooltipProvider>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Collapsible Details Section */}
|
||||||
|
<AnimatePresence>
|
||||||
|
{showDetails && (
|
||||||
|
<motion.div
|
||||||
|
initial={{ height: 0, opacity: 0 }}
|
||||||
|
animate={{ height: "auto", opacity: 1 }}
|
||||||
|
exit={{ height: 0, opacity: 0 }}
|
||||||
|
transition={{
|
||||||
|
height: {
|
||||||
|
duration: 0.3,
|
||||||
|
ease: [0.04, 0.62, 0.23, 0.98],
|
||||||
|
},
|
||||||
|
opacity: {
|
||||||
|
duration: 0.2,
|
||||||
|
ease: "easeInOut",
|
||||||
|
},
|
||||||
|
}}
|
||||||
|
style={{ overflow: "hidden" }}
|
||||||
|
className="mt-6"
|
||||||
|
>
|
||||||
|
<motion.div
|
||||||
|
initial={{ y: -20 }}
|
||||||
|
animate={{ y: 0 }}
|
||||||
|
exit={{ y: -20 }}
|
||||||
|
transition={{
|
||||||
|
duration: 0.2,
|
||||||
|
ease: "easeOut",
|
||||||
|
}}
|
||||||
|
className="grid grid-cols-1 md:grid-cols-2 gap-6 pt-6 border-t border-gray-200/50 dark:border-gray-700/30"
|
||||||
|
>
|
||||||
|
{/* Left Column */}
|
||||||
|
<div className="space-y-4">
|
||||||
|
<div>
|
||||||
|
<h4 className="text-xs font-semibold text-gray-500 dark:text-gray-400 uppercase tracking-wider mb-2">
|
||||||
|
Details
|
||||||
|
</h4>
|
||||||
|
<div className="space-y-3">
|
||||||
|
<div>
|
||||||
|
<p className="text-xs text-gray-500 dark:text-gray-400">Status</p>
|
||||||
|
<p className="text-sm font-medium text-blue-600 dark:text-blue-400 mt-0.5">Running</p>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<p className="text-xs text-gray-500 dark:text-gray-400">Sandbox Type</p>
|
||||||
|
<p className="text-sm font-medium text-gray-900 dark:text-white mt-0.5">git_branch</p>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<p className="text-xs text-gray-500 dark:text-gray-400">Repository</p>
|
||||||
|
<a
|
||||||
|
href="https://github.com/Wirasm/dylan"
|
||||||
|
target="_blank"
|
||||||
|
rel="noopener noreferrer"
|
||||||
|
className="text-sm font-medium text-cyan-600 dark:text-cyan-400 hover:underline inline-flex items-center gap-1 mt-0.5"
|
||||||
|
>
|
||||||
|
https://github.com/Wirasm/dylan
|
||||||
|
<ExternalLink className="w-3 h-3" aria-hidden="true" />
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<p className="text-xs text-gray-500 dark:text-gray-400">Branch</p>
|
||||||
|
<p className="text-sm font-medium font-mono text-gray-900 dark:text-white mt-0.5">
|
||||||
|
docs/remove-archon-mentions
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<p className="text-xs text-gray-500 dark:text-gray-400">Work Order ID</p>
|
||||||
|
<p className="text-sm font-medium font-mono text-gray-700 dark:text-gray-300 mt-0.5">
|
||||||
|
wo-7fd39c8d
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Right Column */}
|
||||||
|
<div className="space-y-4">
|
||||||
|
<div>
|
||||||
|
<h4 className="text-xs font-semibold text-gray-500 dark:text-gray-400 uppercase tracking-wider mb-2">
|
||||||
|
Statistics
|
||||||
|
</h4>
|
||||||
|
<div className="space-y-3">
|
||||||
|
<div>
|
||||||
|
<p className="text-xs text-gray-500 dark:text-gray-400">Commits</p>
|
||||||
|
<p className="text-2xl font-bold text-gray-900 dark:text-white mt-0.5">0</p>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<p className="text-xs text-gray-500 dark:text-gray-400">Files Changed</p>
|
||||||
|
<p className="text-2xl font-bold text-gray-900 dark:text-white mt-0.5">0</p>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<p className="text-xs text-gray-500 dark:text-gray-400">Steps Completed</p>
|
||||||
|
<p className="text-2xl font-bold text-gray-900 dark:text-white mt-0.5">2 / 2</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</motion.div>
|
||||||
|
</motion.div>
|
||||||
|
)}
|
||||||
|
</AnimatePresence>
|
||||||
|
</Card>
|
||||||
|
|
||||||
|
{/* Step History Section */}
|
||||||
|
<div className="space-y-4">
|
||||||
|
<h3 className="text-lg font-semibold text-gray-900 dark:text-white">Step History</h3>
|
||||||
|
{MOCK_WORK_ORDER.stepHistory.map((step) => (
|
||||||
|
<StepHistoryCard
|
||||||
|
key={step.id}
|
||||||
|
step={step}
|
||||||
|
isExpanded={expandedSteps.has(step.id)}
|
||||||
|
onToggle={() => toggleStepExpansion(step.id)}
|
||||||
|
document={step.isHumanInLoop ? MOCK_WORK_ORDER.document : undefined}
|
||||||
|
/>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
};
|
||||||
@@ -0,0 +1,265 @@
|
|||||||
|
import { AnimatePresence, motion } from "framer-motion";
|
||||||
|
import { AlertCircle, CheckCircle2, ChevronDown, ChevronUp, Edit3, Eye } from "lucide-react";
|
||||||
|
import { useState } from "react";
|
||||||
|
import ReactMarkdown from "react-markdown";
|
||||||
|
import { Button } from "@/features/ui/primitives/button";
|
||||||
|
import { Card } from "@/features/ui/primitives/card";
|
||||||
|
import { cn } from "@/features/ui/primitives/styles";
|
||||||
|
|
||||||
|
interface StepHistoryCardProps {
|
||||||
|
step: {
|
||||||
|
id: string;
|
||||||
|
stepName: string;
|
||||||
|
timestamp: string;
|
||||||
|
output: string;
|
||||||
|
session: string;
|
||||||
|
collapsible: boolean;
|
||||||
|
isHumanInLoop?: boolean;
|
||||||
|
};
|
||||||
|
isExpanded: boolean;
|
||||||
|
onToggle: () => void;
|
||||||
|
document?: {
|
||||||
|
title: string;
|
||||||
|
content: {
|
||||||
|
markdown: string;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
export const StepHistoryCard = ({ step, isExpanded, onToggle, document }: StepHistoryCardProps) => {
|
||||||
|
const [isEditingDocument, setIsEditingDocument] = useState(false);
|
||||||
|
const [editedContent, setEditedContent] = useState("");
|
||||||
|
const [hasChanges, setHasChanges] = useState(false);
|
||||||
|
|
||||||
|
const handleToggleEdit = () => {
|
||||||
|
if (!isEditingDocument && document) {
|
||||||
|
setEditedContent(document.content.markdown);
|
||||||
|
}
|
||||||
|
setIsEditingDocument(!isEditingDocument);
|
||||||
|
setHasChanges(false);
|
||||||
|
};
|
||||||
|
|
||||||
|
const handleContentChange = (value: string) => {
|
||||||
|
setEditedContent(value);
|
||||||
|
setHasChanges(document ? value !== document.content.markdown : false);
|
||||||
|
};
|
||||||
|
|
||||||
|
const handleApproveAndContinue = () => {
|
||||||
|
console.log("Approved and continuing to next step");
|
||||||
|
setHasChanges(false);
|
||||||
|
setIsEditingDocument(false);
|
||||||
|
};
|
||||||
|
|
||||||
|
return (
|
||||||
|
<Card
|
||||||
|
blur="md"
|
||||||
|
transparency="light"
|
||||||
|
edgePosition="left"
|
||||||
|
edgeColor={step.isHumanInLoop ? "orange" : "blue"}
|
||||||
|
size="md"
|
||||||
|
className="overflow-visible"
|
||||||
|
>
|
||||||
|
{/* Header */}
|
||||||
|
<div className="flex items-center justify-between mb-3">
|
||||||
|
<div className="flex-1">
|
||||||
|
<div className="flex items-center gap-2">
|
||||||
|
<h4 className="font-semibold text-gray-900 dark:text-white">{step.stepName}</h4>
|
||||||
|
{step.isHumanInLoop && (
|
||||||
|
<span className="inline-flex items-center gap-1 px-2 py-1 text-xs font-medium rounded-md bg-orange-500/10 text-orange-600 dark:text-orange-400 border border-orange-500/20">
|
||||||
|
<AlertCircle className="w-3 h-3" aria-hidden="true" />
|
||||||
|
Human-in-Loop
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
<p className="text-xs text-gray-500 dark:text-gray-400 mt-1">{step.timestamp}</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Collapse toggle - only show if collapsible */}
|
||||||
|
{step.collapsible && (
|
||||||
|
<Button
|
||||||
|
variant="ghost"
|
||||||
|
size="sm"
|
||||||
|
onClick={onToggle}
|
||||||
|
className={cn(
|
||||||
|
"px-2 transition-colors",
|
||||||
|
step.isHumanInLoop
|
||||||
|
? "text-orange-500 hover:text-orange-600 dark:hover:text-orange-400"
|
||||||
|
: "text-cyan-500 hover:text-cyan-600 dark:hover:text-cyan-400",
|
||||||
|
)}
|
||||||
|
aria-label={isExpanded ? "Collapse step" : "Expand step"}
|
||||||
|
aria-expanded={isExpanded}
|
||||||
|
>
|
||||||
|
{isExpanded ? <ChevronUp className="w-4 h-4" /> : <ChevronDown className="w-4 h-4" />}
|
||||||
|
</Button>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Content - collapsible with animation */}
|
||||||
|
<AnimatePresence mode="wait">
|
||||||
|
{(isExpanded || !step.collapsible) && (
|
||||||
|
<motion.div
|
||||||
|
initial={{ height: 0, opacity: 0 }}
|
||||||
|
animate={{ height: "auto", opacity: 1 }}
|
||||||
|
exit={{ height: 0, opacity: 0 }}
|
||||||
|
transition={{
|
||||||
|
height: {
|
||||||
|
duration: 0.3,
|
||||||
|
ease: [0.04, 0.62, 0.23, 0.98],
|
||||||
|
},
|
||||||
|
opacity: {
|
||||||
|
duration: 0.2,
|
||||||
|
ease: "easeInOut",
|
||||||
|
},
|
||||||
|
}}
|
||||||
|
style={{ overflow: "hidden" }}
|
||||||
|
>
|
||||||
|
<motion.div
|
||||||
|
initial={{ y: -20 }}
|
||||||
|
animate={{ y: 0 }}
|
||||||
|
exit={{ y: -20 }}
|
||||||
|
transition={{
|
||||||
|
duration: 0.2,
|
||||||
|
ease: "easeOut",
|
||||||
|
}}
|
||||||
|
className="space-y-3"
|
||||||
|
>
|
||||||
|
{/* Output content */}
|
||||||
|
<div
|
||||||
|
className={cn(
|
||||||
|
"p-4 rounded-lg border",
|
||||||
|
step.isHumanInLoop
|
||||||
|
? "bg-orange-50/50 dark:bg-orange-950/10 border-orange-200/50 dark:border-orange-800/30"
|
||||||
|
: "bg-cyan-50/30 dark:bg-cyan-950/10 border-cyan-200/50 dark:border-cyan-800/30",
|
||||||
|
)}
|
||||||
|
>
|
||||||
|
<pre className="text-xs font-mono text-gray-700 dark:text-gray-300 whitespace-pre-wrap leading-relaxed">
|
||||||
|
{step.output}
|
||||||
|
</pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Session info */}
|
||||||
|
<p
|
||||||
|
className={cn(
|
||||||
|
"text-xs font-mono",
|
||||||
|
step.isHumanInLoop ? "text-orange-600 dark:text-orange-400" : "text-cyan-600 dark:text-cyan-400",
|
||||||
|
)}
|
||||||
|
>
|
||||||
|
{step.session}
|
||||||
|
</p>
|
||||||
|
|
||||||
|
{/* Review and Approve Plan - only for human-in-loop steps with documents */}
|
||||||
|
{step.isHumanInLoop && document && (
|
||||||
|
<div className="mt-6 space-y-3">
|
||||||
|
<h4 className="text-sm font-semibold text-gray-900 dark:text-white">Review and Approve Plan</h4>
|
||||||
|
|
||||||
|
{/* Document Card */}
|
||||||
|
<Card blur="md" transparency="light" size="md" className="overflow-visible">
|
||||||
|
{/* View/Edit toggle in top right */}
|
||||||
|
<div className="flex items-center justify-end mb-3">
|
||||||
|
<Button
|
||||||
|
variant="ghost"
|
||||||
|
size="sm"
|
||||||
|
onClick={handleToggleEdit}
|
||||||
|
className="text-gray-600 dark:text-gray-400 hover:bg-gray-500/10"
|
||||||
|
aria-label={isEditingDocument ? "Switch to preview mode" : "Switch to edit mode"}
|
||||||
|
>
|
||||||
|
{isEditingDocument ? (
|
||||||
|
<Eye className="w-4 h-4" aria-hidden="true" />
|
||||||
|
) : (
|
||||||
|
<Edit3 className="w-4 h-4" aria-hidden="true" />
|
||||||
|
)}
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{isEditingDocument ? (
|
||||||
|
<div className="space-y-4">
|
||||||
|
<textarea
|
||||||
|
value={editedContent}
|
||||||
|
onChange={(e) => handleContentChange(e.target.value)}
|
||||||
|
className={cn(
|
||||||
|
"w-full min-h-[300px] p-4 rounded-lg",
|
||||||
|
"bg-white/50 dark:bg-black/30",
|
||||||
|
"border border-gray-300 dark:border-gray-700",
|
||||||
|
"text-gray-900 dark:text-white font-mono text-sm",
|
||||||
|
"focus:outline-none focus:border-orange-400 focus:ring-2 focus:ring-orange-400/20",
|
||||||
|
"resize-y",
|
||||||
|
)}
|
||||||
|
placeholder="Enter markdown content..."
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
) : (
|
||||||
|
<div className="prose prose-sm dark:prose-invert max-w-none">
|
||||||
|
<ReactMarkdown
|
||||||
|
components={{
|
||||||
|
h1: ({ node, ...props }) => (
|
||||||
|
<h1 className="text-xl font-bold text-gray-900 dark:text-white mb-3 mt-4" {...props} />
|
||||||
|
),
|
||||||
|
h2: ({ node, ...props }) => (
|
||||||
|
<h2
|
||||||
|
className="text-lg font-semibold text-gray-900 dark:text-white mb-2 mt-3"
|
||||||
|
{...props}
|
||||||
|
/>
|
||||||
|
),
|
||||||
|
h3: ({ node, ...props }) => (
|
||||||
|
<h3
|
||||||
|
className="text-base font-semibold text-gray-900 dark:text-white mb-2 mt-3"
|
||||||
|
{...props}
|
||||||
|
/>
|
||||||
|
),
|
||||||
|
p: ({ node, ...props }) => (
|
||||||
|
<p className="text-sm text-gray-700 dark:text-gray-300 mb-2 leading-relaxed" {...props} />
|
||||||
|
),
|
||||||
|
ul: ({ node, ...props }) => (
|
||||||
|
<ul
|
||||||
|
className="list-disc list-inside text-sm text-gray-700 dark:text-gray-300 mb-2 space-y-1"
|
||||||
|
{...props}
|
||||||
|
/>
|
||||||
|
),
|
||||||
|
li: ({ node, ...props }) => <li className="ml-4" {...props} />,
|
||||||
|
code: ({ node, ...props }) => (
|
||||||
|
<code
|
||||||
|
className="bg-gray-100 dark:bg-gray-800 px-1.5 py-0.5 rounded text-xs font-mono text-orange-600 dark:text-orange-400"
|
||||||
|
{...props}
|
||||||
|
/>
|
||||||
|
),
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
{document.content.markdown}
|
||||||
|
</ReactMarkdown>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
|
{/* Approve button - always visible with glass styling */}
|
||||||
|
<div className="flex items-center justify-between mt-4 pt-4 border-t border-gray-200/50 dark:border-gray-700/30">
|
||||||
|
<p className="text-xs text-gray-500 dark:text-gray-400">
|
||||||
|
{hasChanges ? "Unsaved changes" : "No changes"}
|
||||||
|
</p>
|
||||||
|
<Button
|
||||||
|
onClick={handleApproveAndContinue}
|
||||||
|
className={cn(
|
||||||
|
"backdrop-blur-md",
|
||||||
|
"bg-gradient-to-b from-green-100/80 to-white/60",
|
||||||
|
"dark:from-green-500/20 dark:to-green-500/10",
|
||||||
|
"text-green-700 dark:text-green-100",
|
||||||
|
"border border-green-300/50 dark:border-green-500/50",
|
||||||
|
"hover:from-green-200/90 hover:to-green-100/70",
|
||||||
|
"dark:hover:from-green-400/30 dark:hover:to-green-500/20",
|
||||||
|
"hover:shadow-[0_0_20px_rgba(34,197,94,0.5)]",
|
||||||
|
"dark:hover:shadow-[0_0_25px_rgba(34,197,94,0.7)]",
|
||||||
|
"shadow-lg shadow-green-500/20",
|
||||||
|
)}
|
||||||
|
>
|
||||||
|
<CheckCircle2 className="w-4 h-4 mr-2" aria-hidden="true" />
|
||||||
|
Approve and Move to Next Step
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
</Card>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</motion.div>
|
||||||
|
</motion.div>
|
||||||
|
)}
|
||||||
|
</AnimatePresence>
|
||||||
|
</Card>
|
||||||
|
);
|
||||||
|
};
|
||||||
@@ -0,0 +1,170 @@
|
|||||||
|
import { motion } from "framer-motion";
|
||||||
|
import type React from "react";
|
||||||
|
|
||||||
|
interface WorkflowStepButtonProps {
|
||||||
|
isCompleted: boolean;
|
||||||
|
isActive: boolean;
|
||||||
|
stepName: string;
|
||||||
|
onClick?: () => void;
|
||||||
|
color?: "cyan" | "green" | "blue" | "purple";
|
||||||
|
size?: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Helper function to get color hex values for animations
|
||||||
|
const getColorValue = (color: string) => {
|
||||||
|
const colorValues = {
|
||||||
|
purple: "rgb(168,85,247)",
|
||||||
|
green: "rgb(34,197,94)",
|
||||||
|
blue: "rgb(59,130,246)",
|
||||||
|
cyan: "rgb(34,211,238)",
|
||||||
|
};
|
||||||
|
return colorValues[color as keyof typeof colorValues] || colorValues.blue;
|
||||||
|
};
|
||||||
|
|
||||||
|
export const WorkflowStepButton: React.FC<WorkflowStepButtonProps> = ({
|
||||||
|
isCompleted,
|
||||||
|
isActive,
|
||||||
|
stepName,
|
||||||
|
onClick,
|
||||||
|
color = "cyan",
|
||||||
|
size = 40,
|
||||||
|
}) => {
|
||||||
|
const colorMap = {
|
||||||
|
purple: {
|
||||||
|
border: "border-purple-400",
|
||||||
|
glow: "shadow-[0_0_15px_rgba(168,85,247,0.8)]",
|
||||||
|
glowHover: "hover:shadow-[0_0_25px_rgba(168,85,247,1)]",
|
||||||
|
fill: "bg-purple-400",
|
||||||
|
innerGlow: "shadow-[inset_0_0_10px_rgba(168,85,247,0.8)]",
|
||||||
|
},
|
||||||
|
green: {
|
||||||
|
border: "border-green-400",
|
||||||
|
glow: "shadow-[0_0_15px_rgba(34,197,94,0.8)]",
|
||||||
|
glowHover: "hover:shadow-[0_0_25px_rgba(34,197,94,1)]",
|
||||||
|
fill: "bg-green-400",
|
||||||
|
innerGlow: "shadow-[inset_0_0_10px_rgba(34,197,94,0.8)]",
|
||||||
|
},
|
||||||
|
blue: {
|
||||||
|
border: "border-blue-400",
|
||||||
|
glow: "shadow-[0_0_15px_rgba(59,130,246,0.8)]",
|
||||||
|
glowHover: "hover:shadow-[0_0_25px_rgba(59,130,246,1)]",
|
||||||
|
fill: "bg-blue-400",
|
||||||
|
innerGlow: "shadow-[inset_0_0_10px_rgba(59,130,246,0.8)]",
|
||||||
|
},
|
||||||
|
cyan: {
|
||||||
|
border: "border-cyan-400",
|
||||||
|
glow: "shadow-[0_0_15px_rgba(34,211,238,0.8)]",
|
||||||
|
glowHover: "hover:shadow-[0_0_25px_rgba(34,211,238,1)]",
|
||||||
|
fill: "bg-cyan-400",
|
||||||
|
innerGlow: "shadow-[inset_0_0_10px_rgba(34,211,238,0.8)]",
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
const styles = colorMap[color];
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="flex flex-col items-center gap-2">
|
||||||
|
<motion.button
|
||||||
|
onClick={onClick}
|
||||||
|
className={`
|
||||||
|
relative rounded-full border-2 transition-all duration-300
|
||||||
|
${styles.border}
|
||||||
|
${isCompleted ? styles.glow : "shadow-[0_0_5px_rgba(0,0,0,0.3)]"}
|
||||||
|
${styles.glowHover}
|
||||||
|
bg-gradient-to-b from-gray-900 to-black
|
||||||
|
hover:scale-110
|
||||||
|
active:scale-95
|
||||||
|
`}
|
||||||
|
style={{ width: size, height: size }}
|
||||||
|
whileHover={{ scale: 1.1 }}
|
||||||
|
whileTap={{ scale: 0.95 }}
|
||||||
|
type="button"
|
||||||
|
aria-label={`${stepName} - ${isCompleted ? "completed" : isActive ? "in progress" : "pending"}`}
|
||||||
|
>
|
||||||
|
{/* Outer ring glow effect */}
|
||||||
|
<motion.div
|
||||||
|
className={`
|
||||||
|
absolute inset-[-4px] rounded-full border-2
|
||||||
|
${isCompleted ? styles.border : "border-transparent"}
|
||||||
|
blur-sm
|
||||||
|
`}
|
||||||
|
animate={{
|
||||||
|
opacity: isCompleted ? [0.3, 0.6, 0.3] : 0,
|
||||||
|
}}
|
||||||
|
transition={{
|
||||||
|
duration: 2,
|
||||||
|
repeat: Infinity,
|
||||||
|
ease: "easeInOut",
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
|
||||||
|
{/* Inner glow effect */}
|
||||||
|
<motion.div
|
||||||
|
className={`
|
||||||
|
absolute inset-[2px] rounded-full
|
||||||
|
${isCompleted ? styles.fill : ""}
|
||||||
|
blur-md opacity-20
|
||||||
|
`}
|
||||||
|
animate={{
|
||||||
|
opacity: isCompleted ? [0.1, 0.3, 0.1] : 0,
|
||||||
|
}}
|
||||||
|
transition={{
|
||||||
|
duration: 2,
|
||||||
|
repeat: Infinity,
|
||||||
|
ease: "easeInOut",
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
|
||||||
|
{/* Checkmark icon container */}
|
||||||
|
<div className="relative w-full h-full flex items-center justify-center">
|
||||||
|
<motion.svg
|
||||||
|
width={size * 0.5}
|
||||||
|
height={size * 0.5}
|
||||||
|
viewBox="0 0 24 24"
|
||||||
|
fill="none"
|
||||||
|
className="relative z-10"
|
||||||
|
role="img"
|
||||||
|
aria-label={`${stepName} status indicator`}
|
||||||
|
animate={{
|
||||||
|
filter: isCompleted
|
||||||
|
? [
|
||||||
|
`drop-shadow(0 0 8px ${getColorValue(color)}) drop-shadow(0 0 12px ${getColorValue(color)})`,
|
||||||
|
`drop-shadow(0 0 12px ${getColorValue(color)}) drop-shadow(0 0 16px ${getColorValue(color)})`,
|
||||||
|
`drop-shadow(0 0 8px ${getColorValue(color)}) drop-shadow(0 0 12px ${getColorValue(color)})`,
|
||||||
|
]
|
||||||
|
: "none",
|
||||||
|
}}
|
||||||
|
transition={{
|
||||||
|
duration: 2,
|
||||||
|
repeat: Infinity,
|
||||||
|
ease: "easeInOut",
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
{/* Checkmark path */}
|
||||||
|
<path
|
||||||
|
d="M20 6L9 17l-5-5"
|
||||||
|
stroke="currentColor"
|
||||||
|
strokeWidth="3"
|
||||||
|
strokeLinecap="round"
|
||||||
|
strokeLinejoin="round"
|
||||||
|
className={isCompleted ? "text-white" : "text-gray-600"}
|
||||||
|
/>
|
||||||
|
</motion.svg>
|
||||||
|
</div>
|
||||||
|
</motion.button>
|
||||||
|
|
||||||
|
{/* Step name label */}
|
||||||
|
<span
|
||||||
|
className={`text-xs font-medium transition-colors ${
|
||||||
|
isCompleted
|
||||||
|
? "text-cyan-400 dark:text-cyan-300"
|
||||||
|
: isActive
|
||||||
|
? "text-blue-500 dark:text-blue-400"
|
||||||
|
: "text-gray-500 dark:text-gray-400"
|
||||||
|
}`}
|
||||||
|
>
|
||||||
|
{stepName}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
};
|
||||||
@@ -1,4 +1,7 @@
+import { ChevronLeft, ChevronRight } from "lucide-react";
 import type { ReactNode } from "react";
+import { useState } from "react";
+import { Button } from "@/features/ui/primitives/button";
 import { cn } from "@/features/ui/primitives/styles";
 
 export interface SideNavigationSection {
@@ -14,9 +17,23 @@ interface SideNavigationProps {
 }
 
 export const SideNavigation = ({ sections, activeSection, onSectionClick }: SideNavigationProps) => {
+  const [isCollapsed, setIsCollapsed] = useState(false);
+
   return (
-    <div className="w-32 flex-shrink-0">
+    <div className={cn("flex-shrink-0 transition-all duration-300", isCollapsed ? "w-12" : "w-32")}>
       <div className="sticky top-4 space-y-0.5">
+        {/* Collapse/Expand button */}
+        <div className="mb-2 flex justify-end">
+          <Button
+            variant="ghost"
+            size="sm"
+            onClick={() => setIsCollapsed(!isCollapsed)}
+            className="px-2 py-1 h-auto text-gray-500 hover:text-gray-700 dark:hover:text-gray-300"
+            aria-label={isCollapsed ? "Expand navigation" : "Collapse navigation"}
+          >
+            {isCollapsed ? <ChevronRight className="w-4 h-4" /> : <ChevronLeft className="w-4 h-4" />}
+          </Button>
+        </div>
         {sections.map((section) => {
           const isActive = activeSection === section.id;
           return (
@@ -24,16 +41,18 @@ export const SideNavigation = ({ sections, activeSection, onSectionClick }: Side
               key={section.id}
               type="button"
               onClick={() => onSectionClick(section.id)}
+              title={isCollapsed ? section.label : undefined}
               className={cn(
                 "w-full text-left px-2 py-1.5 rounded-md transition-all duration-200",
                 "flex items-center gap-1.5",
                 isActive
                   ? "bg-blue-500/10 dark:bg-blue-400/10 text-blue-700 dark:text-blue-300 border-l-2 border-blue-500"
                   : "text-gray-600 dark:text-gray-400 hover:bg-white/5 dark:hover:bg-white/5 border-l-2 border-transparent",
+                isCollapsed && "justify-center",
               )}
             >
               {section.icon && <span className="flex-shrink-0 w-3 h-3">{section.icon}</span>}
-              <span className="text-xs font-medium truncate">{section.label}</span>
+              {!isCollapsed && <span className="text-xs font-medium truncate">{section.label}</span>}
             </button>
           );
         })}
@@ -1,5 +1,6 @@
-import { Database, FileText, FolderKanban, Navigation, Settings } from "lucide-react";
+import { Briefcase, Database, FileText, FolderKanban, Navigation, Settings } from "lucide-react";
 import { useState } from "react";
+import { AgentWorkOrderExample } from "../layouts/AgentWorkOrderExample";
 import { DocumentBrowserExample } from "../layouts/DocumentBrowserExample";
 import { KnowledgeLayoutExample } from "../layouts/KnowledgeLayoutExample";
 import { NavigationExplanation } from "../layouts/NavigationExplanation";
@@ -16,6 +17,7 @@ export const LayoutsTab = () => {
     { id: "settings", label: "Settings", icon: <Settings className="w-4 h-4" /> },
     { id: "knowledge", label: "Knowledge", icon: <Database className="w-4 h-4" /> },
     { id: "document-browser", label: "Document Browser", icon: <FileText className="w-4 h-4" /> },
+    { id: "agent-work-orders", label: "Agent Work Orders", icon: <Briefcase className="w-4 h-4" /> },
   ];
 
   // Render content based on active section
@@ -68,6 +70,16 @@ export const LayoutsTab = () => {
           <DocumentBrowserExample />
         </div>
       );
+    case "agent-work-orders":
+      return (
+        <div>
+          <h2 className="text-2xl font-bold mb-4 text-gray-900 dark:text-white">Agent Work Orders Layout</h2>
+          <p className="text-gray-600 dark:text-gray-400 mb-4">
+            Workflow progress visualization with step-by-step history and integrated document editing.
+          </p>
+          <AgentWorkOrderExample />
+        </div>
+      );
     default:
       return (
         <div>
@@ -59,6 +59,7 @@ server = [
     "pydantic>=2.0.0",
     "python-dotenv>=1.0.0",
     "docker>=6.1.0",
+    "tldextract>=5.0.0",
     # Logging
     "logfire>=0.30.0",
     # Testing (needed for UI-triggered tests)
@@ -127,6 +128,7 @@ all = [
     "cryptography>=41.0.0",
     "slowapi>=0.1.9",
     "docker>=6.1.0",
+    "tldextract>=5.0.0",
     "logfire>=0.30.0",
     # MCP specific (mcp version)
     "mcp==1.12.2",
@@ -12,6 +12,7 @@ from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel
 
 from ..config.logfire_config import get_logger
+from ..config.version import GITHUB_REPO_NAME, GITHUB_REPO_OWNER
 
 logger = get_logger(__name__)
 
@@ -47,7 +48,9 @@ class BugReportResponse(BaseModel):
 class GitHubService:
     def __init__(self):
         self.token = os.getenv("GITHUB_TOKEN")
-        self.repo = os.getenv("GITHUB_REPO", "dynamous-community/Archon-V2-Alpha")
+        # Use centralized version config with environment override
+        default_repo = f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
+        self.repo = os.getenv("GITHUB_REPO", default_repo)
 
     async def create_issue(self, bug_report: BugReportRequest) -> dict[str, Any]:
         """Create a GitHub issue from a bug report."""
@@ -243,14 +246,14 @@ def _create_manual_submission_response(bug_report: BugReportRequest) -> BugRepor
     import urllib.parse
 
     base_url = f"https://github.com/{github_service.repo}/issues/new"
-    params = {
-        "template": "bug_report.yml",
-        "title": bug_report.title,
-        "labels": f"bug,auto-report,severity:{bug_report.severity},component:{bug_report.component}",
-    }
-
-    # Add the formatted body as a parameter
-    params["body"] = issue_body
+    # Use Markdown template for structured layout with URL pre-filling support
+    # YAML templates don't support URL parameters, but Markdown templates do
+    params = {
+        "template": "auto_bug_report.md",
+        "title": bug_report.title,
+        "body": issue_body,
+    }
 
     # Build the URL
     query_string = urllib.parse.urlencode(params)
@@ -271,10 +274,13 @@ async def bug_report_health():
     github_configured = bool(os.getenv("GITHUB_TOKEN"))
     repo_configured = bool(os.getenv("GITHUB_REPO"))
 
+    # Use centralized version config with environment override
+    default_repo = f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
+
     return {
         "status": "healthy" if github_configured else "degraded",
         "github_token_configured": github_configured,
        "github_repo_configured": repo_configured,
-        "repo": os.getenv("GITHUB_REPO", "dynamous-community/Archon-V2-Alpha"),
+        "repo": os.getenv("GITHUB_REPO", default_repo),
         "message": "Bug reporting is ready" if github_configured else "GitHub token not configured",
     }
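As an illustration of the manual-submission change above, the sketch below shows, with made-up placeholder values, the shape of the pre-filled GitHub issue URL this code produces; it is not part of the commit.

```python
# Illustrative only: shape of the pre-filled GitHub issue URL built above.
import urllib.parse

repo = "coleam00/Archon"                  # whatever github_service.repo resolves to
issue_body = "## What happened\n..."      # placeholder formatted body

params = {
    "template": "auto_bug_report.md",     # Markdown template supports URL pre-filling
    "title": "Example bug title",
    "body": issue_body,
}
url = f"https://github.com/{repo}/issues/new?{urllib.parse.urlencode(params)}"
print(url)
# -> https://github.com/coleam00/Archon/issues/new?template=auto_bug_report.md&title=Example+bug+title&body=...
```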
@@ -11,6 +11,8 @@ import uuid
 from collections.abc import Awaitable, Callable
 from typing import Any, Optional
 
+import tldextract
+
 from ...config.logfire_config import get_logger, safe_logfire_error, safe_logfire_info
 from ...utils import get_supabase_client
 from ...utils.progress.progress_tracker import ProgressTracker
@@ -18,12 +20,13 @@ from ..credential_service import credential_service
 
 # Import strategies
 # Import operations
+from .discovery_service import DiscoveryService
 from .document_storage_operations import DocumentStorageOperations
-from .page_storage_operations import PageStorageOperations
 from .helpers.site_config import SiteConfig
 
 # Import helpers
 from .helpers.url_handler import URLHandler
+from .page_storage_operations import PageStorageOperations
 from .progress_mapper import ProgressMapper
 from .strategies.batch import BatchCrawlStrategy
 from .strategies.recursive import RecursiveCrawlStrategy
@@ -37,6 +40,34 @@ _active_orchestrations: dict[str, "CrawlingService"] = {}
 _orchestration_lock: asyncio.Lock | None = None
 
 
+def get_root_domain(host: str) -> str:
+    """
+    Extract the root domain from a hostname using tldextract.
+    Handles multi-part public suffixes correctly (e.g., .co.uk, .com.au).
+
+    Args:
+        host: Hostname to extract root domain from
+
+    Returns:
+        Root domain (domain + suffix) or original host if extraction fails
+
+    Examples:
+        - "docs.example.com" -> "example.com"
+        - "api.example.co.uk" -> "example.co.uk"
+        - "localhost" -> "localhost"
+    """
+    try:
+        extracted = tldextract.extract(host)
+        # Return domain.suffix if both are present
+        if extracted.domain and extracted.suffix:
+            return f"{extracted.domain}.{extracted.suffix}"
+        # Fallback to original host if extraction yields no domain or suffix
+        return host
+    except Exception:
+        # If extraction fails, return original host
+        return host
+
+
 def _ensure_orchestration_lock() -> asyncio.Lock:
     global _orchestration_lock
     if _orchestration_lock is None:
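A brief usage sketch (not part of the diff) of the `get_root_domain` helper added above; the expected values restate the docstring examples and assume `tldextract`'s public-suffix data is available.

```python
# Illustrative usage of get_root_domain() as defined above (requires tldextract).
assert get_root_domain("docs.example.com") == "example.com"
assert get_root_domain("api.example.co.uk") == "example.co.uk"  # multi-part public suffix
assert get_root_domain("localhost") == "localhost"              # no suffix -> returned unchanged

# One plausible application (an assumption, not shown in this hunk): checking
# whether a discovered file lives on the same site as the original crawl URL.
from urllib.parse import urlparse

original = urlparse("https://example.com/").hostname or ""
discovered = urlparse("https://docs.example.com/llms.txt").hostname or ""
same_site = get_root_domain(original) == get_root_domain(discovered)  # True
```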
@@ -99,6 +130,7 @@ class CrawlingService:
|
|||||||
|
|
||||||
# Initialize operations
|
# Initialize operations
|
||||||
self.doc_storage_ops = DocumentStorageOperations(self.supabase_client)
|
self.doc_storage_ops = DocumentStorageOperations(self.supabase_client)
|
||||||
|
self.discovery_service = DiscoveryService()
|
||||||
self.page_storage_ops = PageStorageOperations(self.supabase_client)
|
self.page_storage_ops = PageStorageOperations(self.supabase_client)
|
||||||
|
|
||||||
# Track progress state across all stages to prevent UI resets
|
# Track progress state across all stages to prevent UI resets
|
||||||
@@ -196,13 +228,16 @@ class CrawlingService:
|
|||||||
)
|
)
|
||||||
|
|
||||||
async def crawl_markdown_file(
|
async def crawl_markdown_file(
|
||||||
self, url: str, progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None
|
self, url: str, progress_callback: Callable[[str, int, str], Awaitable[None]] | None = None,
|
||||||
|
start_progress: int = 10, end_progress: int = 20
|
||||||
) -> list[dict[str, Any]]:
|
) -> list[dict[str, Any]]:
|
||||||
"""Crawl a .txt or markdown file."""
|
"""Crawl a .txt or markdown file."""
|
||||||
return await self.single_page_strategy.crawl_markdown_file(
|
return await self.single_page_strategy.crawl_markdown_file(
|
||||||
url,
|
url,
|
||||||
self.url_handler.transform_github_url,
|
self.url_handler.transform_github_url,
|
||||||
progress_callback,
|
progress_callback,
|
||||||
|
start_progress,
|
||||||
|
end_progress,
|
||||||
)
|
)
|
||||||
|
|
||||||
def parse_sitemap(self, sitemap_url: str) -> list[str]:
|
def parse_sitemap(self, sitemap_url: str) -> list[str]:
|
||||||
@@ -351,15 +386,102 @@ class CrawlingService:
        # Check for cancellation before proceeding
        self._check_cancellation()

-        # Analyzing stage - report initial page count (at least 1)
-        await update_mapped_progress(
-            "analyzing", 50, f"Analyzing URL type for {url}",
-            total_pages=1,  # We know we have at least the start URL
-            processed_pages=0
+        # Discovery phase - find the single best related file
+        discovered_urls = []
+        # Skip discovery if the URL itself is already a discovery target (sitemap, llms file, etc.)
+        is_already_discovery_target = (
+            self.url_handler.is_sitemap(url) or
+            self.url_handler.is_llms_variant(url) or
+            self.url_handler.is_robots_txt(url) or
+            self.url_handler.is_well_known_file(url) or
+            self.url_handler.is_txt(url)  # Also skip for any .txt file that user provides directly
        )

-        # Detect URL type and perform crawl
-        crawl_results, crawl_type = await self._crawl_by_url_type(url, request)
+        if is_already_discovery_target:
+            safe_logfire_info(f"Skipping discovery - URL is already a discovery target file: {url}")
+
+        if request.get("auto_discovery", True) and not is_already_discovery_target:  # Default enabled, but skip if already a discovery file
+            await update_mapped_progress(
+                "discovery", 25, f"Discovering best related file for {url}", current_url=url
+            )
+            try:
+                # Offload potential sync I/O to avoid blocking the event loop
+                discovered_file = await asyncio.to_thread(self.discovery_service.discover_files, url)
+
+                # Add the single best discovered file to crawl list
+                if discovered_file:
+                    safe_logfire_info(f"Discovery found file: {discovered_file}")
+                    # Filter through is_binary_file() check like existing code
+                    if not self.url_handler.is_binary_file(discovered_file):
+                        discovered_urls.append(discovered_file)
+                        safe_logfire_info(f"Adding discovered file to crawl: {discovered_file}")
+
+                        # Determine file type for user feedback
+                        discovered_file_type = "unknown"
+                        if self.url_handler.is_llms_variant(discovered_file):
+                            discovered_file_type = "llms.txt"
+                        elif self.url_handler.is_sitemap(discovered_file):
+                            discovered_file_type = "sitemap"
+                        elif self.url_handler.is_robots_txt(discovered_file):
+                            discovered_file_type = "robots.txt"
+
+                        await update_mapped_progress(
+                            "discovery", 100,
+                            f"Discovery completed: found {discovered_file_type} file",
+                            current_url=url,
+                            discovered_file=discovered_file,
+                            discovered_file_type=discovered_file_type
+                        )
+                    else:
+                        safe_logfire_info(f"Skipping binary file: {discovered_file}")
+                else:
+                    safe_logfire_info(f"Discovery found no files for {url}")
+                    await update_mapped_progress(
+                        "discovery", 100,
+                        "Discovery completed: no special files found, will crawl main URL",
+                        current_url=url
+                    )
+
+            except Exception as e:
+                safe_logfire_error(f"Discovery phase failed: {e}")
+                # Continue with regular crawl even if discovery fails
+                await update_mapped_progress(
+                    "discovery", 100, "Discovery phase failed, continuing with regular crawl", current_url=url
+                )
+
+        # Analyzing stage - determine what to crawl
+        if discovered_urls:
+            # Discovery found a file - crawl ONLY the discovered file, not the main URL
+            total_urls_to_crawl = len(discovered_urls)
+            await update_mapped_progress(
+                "analyzing", 50, f"Analyzing discovered file: {discovered_urls[0]}",
+                total_pages=total_urls_to_crawl,
+                processed_pages=0
+            )
+
+            # Crawl only the discovered file with discovery context
+            discovered_url = discovered_urls[0]
+            safe_logfire_info(f"Crawling discovered file instead of main URL: {discovered_url}")
+
+            # Mark this as a discovery target for domain filtering
+            discovery_request = request.copy()
+            discovery_request["is_discovery_target"] = True
+            discovery_request["original_domain"] = self.url_handler.get_base_url(discovered_url)
+
+            crawl_results, crawl_type = await self._crawl_by_url_type(discovered_url, discovery_request)
+
+        else:
+            # No discovery - crawl the main URL normally
+            total_urls_to_crawl = 1
+            await update_mapped_progress(
+                "analyzing", 50, f"Analyzing URL type for {url}",
+                total_pages=total_urls_to_crawl,
+                processed_pages=0
+            )
+
+            # Crawl the main URL
+            safe_logfire_info(f"No discovery file found, crawling main URL: {url}")
+            crawl_results, crawl_type = await self._crawl_by_url_type(url, request)

        # Update progress tracker with crawl type
        if self.progress_tracker and crawl_type:
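The discovery call above does blocking, requests-based I/O, so it is pushed onto a worker thread with asyncio.to_thread to keep the event loop responsive. A small self-contained sketch of that pattern, with a stand-in blocking function (names are illustrative, not from the diff):

    import asyncio
    import time

    def blocking_discovery(url: str) -> str | None:
        # Stand-in for the synchronous network probing done by DiscoveryService.discover_files
        time.sleep(0.1)
        return f"{url.rstrip('/')}/llms.txt"

    async def main() -> None:
        discovered = await asyncio.to_thread(blocking_discovery, "https://example.com")
        print(discovered)

    asyncio.run(main())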
@@ -531,7 +653,7 @@ class CrawlingService:
            logger.error("Code extraction failed, continuing crawl without code examples", exc_info=True)
            safe_logfire_error(f"Code extraction failed | error={e}")
            code_examples_count = 0

            # Report code extraction failure to progress tracker
            if self.progress_tracker:
                await self.progress_tracker.update(
@@ -628,6 +750,66 @@ class CrawlingService:
                    f"Unregistered orchestration service on error | progress_id={self.progress_id}"
                )

+    def _is_same_domain(self, url: str, base_domain: str) -> bool:
+        """
+        Check if a URL belongs to the same domain as the base domain.
+
+        Args:
+            url: URL to check
+            base_domain: Base domain URL to compare against
+
+        Returns:
+            True if the URL is from the same domain
+        """
+        try:
+            from urllib.parse import urlparse
+            u, b = urlparse(url), urlparse(base_domain)
+            url_host = (u.hostname or "").lower()
+            base_host = (b.hostname or "").lower()
+            return bool(url_host) and url_host == base_host
+        except Exception:
+            # If parsing fails, be conservative and exclude the URL
+            return False
+
+    def _is_same_domain_or_subdomain(self, url: str, base_domain: str) -> bool:
+        """
+        Check if a URL belongs to the same root domain or subdomain.
+
+        Examples:
+        - docs.supabase.com matches supabase.com (subdomain)
+        - api.supabase.com matches supabase.com (subdomain)
+        - supabase.com matches supabase.com (exact match)
+        - external.com does NOT match supabase.com
+
+        Args:
+            url: URL to check
+            base_domain: Base domain URL to compare against
+
+        Returns:
+            True if the URL is from the same root domain or subdomain
+        """
+        try:
+            from urllib.parse import urlparse
+            u, b = urlparse(url), urlparse(base_domain)
+            url_host = (u.hostname or "").lower()
+            base_host = (b.hostname or "").lower()
+
+            if not url_host or not base_host:
+                return False
+
+            # Exact match
+            if url_host == base_host:
+                return True
+
+            # Check if url_host is a subdomain of base_host using tldextract
+            url_root = get_root_domain(url_host)
+            base_root = get_root_domain(base_host)
+
+            return url_root == base_root
+        except Exception:
+            # If parsing fails, be conservative and exclude the URL
+            return False
+
    def _is_self_link(self, link: str, base_url: str) -> bool:
        """
        Check if a link is a self-referential link to the base URL.
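The subdomain check above reduces to comparing registered domains, which tldextract derives from the public suffix list. A short sketch of that comparison, assuming the tldextract package is installed:

    import tldextract

    def root(host: str) -> str:
        ext = tldextract.extract(host)
        return f"{ext.domain}.{ext.suffix}" if ext.domain and ext.suffix else host

    print(root("docs.supabase.com") == root("supabase.com"))  # True: subdomain of the same root
    print(root("external.com") == root("supabase.com"))       # False: different root domain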
@@ -700,6 +882,63 @@ class CrawlingService:
            if crawl_results and len(crawl_results) > 0:
                content = crawl_results[0].get('markdown', '')
                if self.url_handler.is_link_collection_file(url, content):
+                    # If this file was selected by discovery, check if it's an llms.txt file
+                    if request.get("is_discovery_target"):
+                        # Check if this is an llms.txt file (not sitemap or other discovery targets)
+                        is_llms_file = self.url_handler.is_llms_variant(url)
+
+                        if is_llms_file:
+                            logger.info(f"Discovery llms.txt mode: following ALL same-domain links from {url}")
+
+                            # Extract all links from the file
+                            extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url)
+
+                            # Filter for same-domain links (all types, not just llms.txt)
+                            same_domain_links = []
+                            if extracted_links_with_text:
+                                original_domain = request.get("original_domain")
+                                if original_domain:
+                                    for link, text in extracted_links_with_text:
+                                        # Check same domain/subdomain for ALL links
+                                        if self._is_same_domain_or_subdomain(link, original_domain):
+                                            same_domain_links.append((link, text))
+                                            logger.debug(f"Found same-domain link: {link}")
+
+                            if same_domain_links:
+                                # Build mapping and extract just URLs
+                                url_to_link_text = dict(same_domain_links)
+                                extracted_urls = [link for link, _ in same_domain_links]
+
+                                logger.info(f"Following {len(extracted_urls)} same-domain links from llms.txt")
+
+                                # Notify user about linked files being crawled
+                                await update_crawl_progress(
+                                    60,  # 60% of crawling stage
+                                    f"Found {len(extracted_urls)} links in llms.txt, crawling them now...",
+                                    crawl_type="llms_txt_linked_files",
+                                    linked_files=extracted_urls
+                                )
+
+                                # Crawl all same-domain links from llms.txt (no recursion, just one level)
+                                batch_results = await self.crawl_batch_with_progress(
+                                    extracted_urls,
+                                    max_concurrent=request.get('max_concurrent'),
+                                    progress_callback=await self._create_crawl_progress_callback("crawling"),
+                                    link_text_fallbacks=url_to_link_text,
+                                )
+
+                                # Combine original llms.txt with linked pages
+                                crawl_results.extend(batch_results)
+                                crawl_type = "llms_txt_with_linked_pages"
+                                logger.info(f"llms.txt crawling completed: {len(crawl_results)} total pages (1 llms.txt + {len(batch_results)} linked pages)")
+                                return crawl_results, crawl_type
+
+                        # For non-llms.txt discovery targets (sitemaps, robots.txt), keep single-file mode
+                        logger.info(f"Discovery single-file mode: skipping link extraction for {url}")
+                        crawl_type = "discovery_single_file"
+                        logger.info(f"Discovery file crawling completed: {len(crawl_results)} result")
+                        return crawl_results, crawl_type
+
                    # Extract links WITH text from the content
                    extracted_links_with_text = self.url_handler.extract_markdown_links_with_text(content, url)

@@ -714,6 +953,19 @@ class CrawlingService:
                    if self_filtered_count > 0:
                        logger.info(f"Filtered out {self_filtered_count} self-referential links from {original_count} extracted links")

+                    # For discovery targets, only follow same-domain links
+                    if extracted_links_with_text and request.get("is_discovery_target"):
+                        original_domain = request.get("original_domain")
+                        if original_domain:
+                            original_count = len(extracted_links_with_text)
+                            extracted_links_with_text = [
+                                (link, text) for link, text in extracted_links_with_text
+                                if self._is_same_domain(link, original_domain)
+                            ]
+                            domain_filtered_count = original_count - len(extracted_links_with_text)
+                            if domain_filtered_count > 0:
+                                safe_logfire_info(f"Discovery mode: filtered out {domain_filtered_count} external links, keeping {len(extracted_links_with_text)} same-domain links")
+
                    # Filter out binary files (PDFs, images, archives, etc.) to avoid wasteful crawling
                    if extracted_links_with_text:
                        original_count = len(extracted_links_with_text)
@@ -724,26 +976,39 @@ class CrawlingService:

                    if extracted_links_with_text:
                        # Build mapping of URL -> link text for title fallback
-                        url_to_link_text = {link: text for link, text in extracted_links_with_text}
+                        url_to_link_text = dict(extracted_links_with_text)
                        extracted_links = [link for link, _ in extracted_links_with_text]

-                        # Crawl the extracted links using batch crawling
-                        logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")
-                        batch_results = await self.crawl_batch_with_progress(
-                            extracted_links,
-                            max_concurrent=request.get('max_concurrent'),  # None -> use DB settings
-                            progress_callback=await self._create_crawl_progress_callback("crawling"),
-                            link_text_fallbacks=url_to_link_text,  # Pass link text for title fallback
-                        )
+                        # For discovery targets, respect max_depth for same-domain links
+                        max_depth = request.get('max_depth', 2) if request.get("is_discovery_target") else request.get('max_depth', 1)
+
+                        if max_depth > 1 and request.get("is_discovery_target"):
+                            # Use recursive crawling to respect depth limit for same-domain links
+                            logger.info(f"Crawling {len(extracted_links)} same-domain links with max_depth={max_depth-1}")
+                            batch_results = await self.crawl_recursive_with_progress(
+                                extracted_links,
+                                max_depth=max_depth - 1,  # Reduce depth since we're already 1 level deep
+                                max_concurrent=request.get('max_concurrent'),
+                                progress_callback=await self._create_crawl_progress_callback("crawling"),
+                            )
+                        else:
+                            # Use normal batch crawling (with link text fallbacks)
+                            logger.info(f"Crawling {len(extracted_links)} extracted links from {url}")
+                            batch_results = await self.crawl_batch_with_progress(
+                                extracted_links,
+                                max_concurrent=request.get('max_concurrent'),  # None -> use DB settings
+                                progress_callback=await self._create_crawl_progress_callback("crawling"),
+                                link_text_fallbacks=url_to_link_text,  # Pass link text for title fallback
+                            )

                        # Combine original text file results with batch results
                        crawl_results.extend(batch_results)
                        crawl_type = "link_collection_with_crawled_links"

                        logger.info(f"Link collection crawling completed: {len(crawl_results)} total results (1 text file + {len(batch_results)} extracted links)")
                    else:
                        logger.info(f"No valid links found in link collection file: {url}")
                    logger.info(f"Text file crawling completed: {len(crawl_results)} results")

            elif self.url_handler.is_sitemap(url):
                # Handle sitemaps
@@ -753,6 +1018,20 @@ class CrawlingService:
                    "Detected sitemap, parsing URLs...",
                    crawl_type=crawl_type
                )

+                # If this sitemap was selected by discovery, just return the sitemap itself (single-file mode)
+                if request.get("is_discovery_target"):
+                    logger.info(f"Discovery single-file mode: returning sitemap itself without crawling URLs from {url}")
+                    crawl_type = "discovery_sitemap"
+                    # Return the sitemap file as the result
+                    crawl_results = [{
+                        'url': url,
+                        'markdown': f"# Sitemap: {url}\n\nThis is a sitemap file discovered and returned in single-file mode.",
+                        'title': f"Sitemap - {self.url_handler.extract_display_name(url)}",
+                        'crawl_type': crawl_type
+                    }]
+                    return crawl_results, crawl_type
+
                sitemap_urls = self.parse_sitemap(url)

                if sitemap_urls:
558
python/src/server/services/crawling/discovery_service.py
Normal file
@@ -0,0 +1,558 @@
"""
Discovery Service for Automatic File Detection

Handles automatic discovery and parsing of llms.txt, sitemap.xml, and related files
to enhance crawling capabilities with priority-based discovery methods.
"""

import ipaddress
import socket
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse

import requests

from ...config.logfire_config import get_logger

logger = get_logger(__name__)


class SitemapHTMLParser(HTMLParser):
    """HTML parser for extracting sitemap references from link and meta tags."""

    def __init__(self):
        super().__init__()
        self.sitemaps = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]):
        """Handle start tags to find sitemap references."""
        attrs_dict = {k.lower(): v for k, v in attrs if v is not None}

        # Check <link rel="sitemap" href="...">
        if tag == 'link':
            rel = attrs_dict.get('rel', '').lower()
            # Handle multi-valued rel attributes (space-separated)
            rel_values = rel.split() if rel else []
            if 'sitemap' in rel_values:
                href = attrs_dict.get('href')
                if href:
                    self.sitemaps.append(('link', href))

        # Check <meta name="sitemap" content="...">
        elif tag == 'meta':
            name = attrs_dict.get('name', '').lower()
            if name == 'sitemap':
                content = attrs_dict.get('content')
                if content:
                    self.sitemaps.append(('meta', content))


class DiscoveryService:
    """Service for discovering related files automatically during crawls."""

    # Maximum response size to prevent memory exhaustion (10MB default)
    MAX_RESPONSE_SIZE = 10 * 1024 * 1024  # 10 MB

    # Global priority order - select ONE best file from all categories
    # Based on actual usage research - only includes files commonly found in the wild
    DISCOVERY_PRIORITY = [
        # LLMs files (highest priority - most comprehensive AI guidance)
        "llms.txt",  # Standard llms.txt spec - widely adopted
        "llms-full.txt",  # Part of llms.txt spec - comprehensive content
        # Sitemap files (structural crawling guidance)
        "sitemap.xml",  # Universal standard for site structure
        # Robots file (basic crawling rules)
        "robots.txt",  # Universal standard for crawl directives
        # Well-known variants (alternative locations per RFC 8615)
        ".well-known/ai.txt",
        ".well-known/llms.txt",
        ".well-known/sitemap.xml"
    ]

    # Known file extensions for path detection
    FILE_EXTENSIONS = {
        '.html', '.htm', '.xml', '.json', '.txt', '.md', '.csv',
        '.rss', '.yaml', '.yml', '.pdf', '.zip'
    }

    def discover_files(self, base_url: str) -> str | None:
        """
        Main discovery orchestrator - selects ONE best file across all categories.
        All files contain similar AI/crawling guidance, so we only need the best one.

        Args:
            base_url: Base URL to discover files for

        Returns:
            Single best URL found, or None if no files discovered
        """
        try:
            logger.info(f"Starting single-file discovery for {base_url}")

            # Extract directory path from base URL
            base_dir = self._extract_directory(base_url)

            # Try each file in priority order
            for filename in self.DISCOVERY_PRIORITY:
                discovered_url = self._try_locations(base_url, base_dir, filename)
                if discovered_url:
                    logger.info(f"Discovery found best file: {discovered_url}")
                    return discovered_url

            # Fallback: Check HTML meta tags for sitemap references
            html_sitemaps = self._parse_html_meta_tags(base_url)
            if html_sitemaps:
                best_file = html_sitemaps[0]
                logger.info(f"Discovery found best file from HTML meta tags: {best_file}")
                return best_file

            logger.info(f"Discovery completed for {base_url}: no files found")
            return None

        except Exception:
            logger.exception(f"Unexpected error during discovery for {base_url}")
            return None

    def _extract_directory(self, base_url: str) -> str:
        """
        Extract directory path from URL, handling both file URLs and directory URLs.

        Args:
            base_url: URL to extract directory from

        Returns:
            Directory path (without trailing slash)
        """
        parsed = urlparse(base_url)
        base_path = parsed.path.rstrip('/')

        # Check if last segment is a file (has known extension)
        last_segment = base_path.split('/')[-1] if base_path else ''
        has_file_extension = any(last_segment.lower().endswith(ext) for ext in self.FILE_EXTENSIONS)

        if has_file_extension:
            # Remove filename to get directory
            return '/'.join(base_path.split('/')[:-1])
        else:
            # Last segment is a directory
            return base_path

    def _try_locations(self, base_url: str, base_dir: str, filename: str) -> str | None:
        """
        Try different locations for a given filename in priority order.

        Priority:
        1. Same directory as base_url (if not root)
        2. Root level
        3. Common subdirectories (based on file type)

        Args:
            base_url: Original base URL
            base_dir: Extracted directory path
            filename: Filename to search for

        Returns:
            URL if file found, None otherwise
        """
        parsed = urlparse(base_url)

        # Priority 1: Check same directory (if not root)
        if base_dir and base_dir != '/':
            same_dir_url = f"{parsed.scheme}://{parsed.netloc}{base_dir}/{filename}"
            if self._check_url_exists(same_dir_url):
                return same_dir_url

        # Priority 2: Check root level
        root_url = urljoin(base_url, filename)
        if self._check_url_exists(root_url):
            return root_url

        # Priority 3: Check common subdirectories
        subdirs = self._get_subdirs_for_file(base_dir, filename)
        for subdir in subdirs:
            subdir_url = urljoin(base_url, f"{subdir}/{filename}")
            if self._check_url_exists(subdir_url):
                return subdir_url

        return None

    def _get_subdirs_for_file(self, base_dir: str, filename: str) -> list[str]:
        """
        Get relevant subdirectories to check based on file type.

        Args:
            base_dir: Base directory path
            filename: Filename being searched for

        Returns:
            List of subdirectory names to check
        """
        subdirs = []

        # Include base directory name if available
        if base_dir and base_dir != '/':
            base_dir_name = base_dir.split('/')[-1]
            if base_dir_name:
                subdirs.append(base_dir_name)

        # Add type-specific subdirectories
        if filename.startswith('llms') or filename.endswith('.txt') or filename.endswith('.md'):
            # LLMs files commonly in these locations
            subdirs.extend(["docs", "static", "public", "assets", "doc", "api"])
        elif filename.endswith('.xml') and not filename.startswith('.well-known'):
            # Sitemap files commonly in these locations
            subdirs.extend(["docs", "sitemaps", "sitemap", "xml", "feed"])

        return subdirs

    def _is_safe_ip(self, ip_str: str) -> bool:
        """
        Check if an IP address is safe (not private, loopback, link-local, or cloud metadata).

        Args:
            ip_str: IP address string to check

        Returns:
            True if IP is safe for outbound requests, False otherwise
        """
        try:
            ip = ipaddress.ip_address(ip_str)

            # Block private networks
            if ip.is_private:
                logger.warning(f"Blocked private IP address: {ip_str}")
                return False

            # Block loopback (127.0.0.0/8, ::1)
            if ip.is_loopback:
                logger.warning(f"Blocked loopback IP address: {ip_str}")
                return False

            # Block link-local (169.254.0.0/16, fe80::/10)
            if ip.is_link_local:
                logger.warning(f"Blocked link-local IP address: {ip_str}")
                return False

            # Block multicast
            if ip.is_multicast:
                logger.warning(f"Blocked multicast IP address: {ip_str}")
                return False

            # Block reserved ranges
            if ip.is_reserved:
                logger.warning(f"Blocked reserved IP address: {ip_str}")
                return False

            # Additional explicit checks for cloud metadata services
            # AWS metadata service
            if str(ip) == "169.254.169.254":
                logger.warning(f"Blocked AWS metadata service IP: {ip_str}")
                return False

            # GCP metadata service
            if str(ip) == "169.254.169.254":
                logger.warning(f"Blocked GCP metadata service IP: {ip_str}")
                return False

            return True

        except ValueError:
            logger.warning(f"Invalid IP address format: {ip_str}")
            return False

    def _resolve_and_validate_hostname(self, hostname: str) -> bool:
        """
        Resolve hostname to IP and validate it's safe.

        Args:
            hostname: Hostname to resolve and validate

        Returns:
            True if hostname resolves to safe IPs only, False otherwise
        """
        try:
            # Resolve hostname to IP addresses
            addr_info = socket.getaddrinfo(hostname, None, socket.AF_UNSPEC, socket.SOCK_STREAM)

            # Check all resolved IPs
            for info in addr_info:
                ip_str = info[4][0]
                if not self._is_safe_ip(ip_str):
                    logger.warning(f"Hostname {hostname} resolves to unsafe IP {ip_str}")
                    return False

            return True

        except socket.gaierror as e:
            logger.warning(f"DNS resolution failed for {hostname}: {e}")
            return False
        except Exception as e:
            logger.warning(f"Error resolving hostname {hostname}: {e}")
            return False

    def _check_url_exists(self, url: str) -> bool:
        """
        Check if a URL exists and returns a successful response.
        Includes SSRF protection by validating hostnames and blocking private IPs.

        Args:
            url: URL to check

        Returns:
            True if URL returns 200, False otherwise
        """
        try:
            # Parse URL to extract hostname
            parsed = urlparse(url)
            if not parsed.scheme or not parsed.netloc:
                logger.warning(f"Invalid URL format: {url}")
                return False

            # Only allow HTTP/HTTPS
            if parsed.scheme not in ('http', 'https'):
                logger.warning(f"Blocked non-HTTP(S) scheme: {parsed.scheme}")
                return False

            # Validate initial hostname
            hostname = parsed.netloc.split(':')[0]  # Remove port if present
            if not self._resolve_and_validate_hostname(hostname):
                logger.warning(f"URL check blocked due to unsafe hostname: {url}")
                return False

            # Set safe User-Agent header
            headers = {
                'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'
            }

            # Create a session with limited redirects
            session = requests.Session()
            session.max_redirects = 3

            # Make request with redirect validation
            resp = session.get(
                url,
                timeout=5,
                allow_redirects=True,
                verify=True,
                headers=headers
            )

            try:
                # Check if there were redirects (history attribute exists on real responses)
                if hasattr(resp, 'history') and resp.history:
                    logger.debug(f"URL {url} had {len(resp.history)} redirect(s)")

                    # Validate final destination
                    final_url = resp.url
                    final_parsed = urlparse(final_url)

                    # Only allow HTTP/HTTPS for final destination
                    if final_parsed.scheme not in ('http', 'https'):
                        logger.warning(f"Blocked redirect to non-HTTP(S) scheme: {final_parsed.scheme}")
                        return False

                    # Validate final hostname
                    final_hostname = final_parsed.netloc.split(':')[0]
                    if not self._resolve_and_validate_hostname(final_hostname):
                        logger.warning(f"Redirect target blocked due to unsafe hostname: {final_url}")
                        return False

                # Check response status
                success = resp.status_code == 200
                logger.debug(f"URL check: {url} -> {resp.status_code} ({'exists' if success else 'not found'})")
                return success

            finally:
                if hasattr(resp, 'close'):
                    resp.close()

        except requests.exceptions.TooManyRedirects:
            logger.warning(f"Too many redirects for URL: {url}")
            return False
        except requests.exceptions.Timeout:
            logger.debug(f"Timeout checking URL: {url}")
            return False
        except requests.exceptions.RequestException as e:
            logger.debug(f"Request error checking URL {url}: {e}")
            return False
        except Exception as e:
            logger.warning(f"Unexpected error checking URL {url}: {e}", exc_info=True)
            return False

    def _parse_robots_txt(self, base_url: str) -> list[str]:
        """
        Extract sitemap URLs from robots.txt.

        Args:
            base_url: Base URL to check robots.txt for

        Returns:
            List of sitemap URLs found in robots.txt
        """
        sitemaps: list[str] = []

        try:
            robots_url = urljoin(base_url, "robots.txt")
            logger.info(f"Checking robots.txt at {robots_url}")

            # Set safe User-Agent header
            headers = {
                'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'
            }

            resp = requests.get(robots_url, timeout=30, stream=True, verify=True, headers=headers)

            try:
                if resp.status_code != 200:
                    logger.info(f"No robots.txt found: HTTP {resp.status_code}")
                    return sitemaps

                # Read response with size limit
                content = self._read_response_with_limit(resp, robots_url)

                # Parse robots.txt content for sitemap directives
                for raw_line in content.splitlines():
                    line = raw_line.strip()
                    if line.lower().startswith("sitemap:"):
                        sitemap_value = line.split(":", 1)[1].strip()
                        if sitemap_value:
                            # Allow absolute and relative sitemap values
                            if sitemap_value.lower().startswith(("http://", "https://")):
                                sitemap_url = sitemap_value
                            else:
                                # Resolve relative path against base_url
                                sitemap_url = urljoin(base_url, sitemap_value)

                            # Validate scheme is HTTP/HTTPS only
                            parsed = urlparse(sitemap_url)
                            if parsed.scheme not in ("http", "https"):
                                logger.warning(f"Skipping non-HTTP(S) sitemap in robots.txt: {sitemap_url}")
                                continue

                            sitemaps.append(sitemap_url)
                            logger.info(f"Found sitemap in robots.txt: {sitemap_url}")

            finally:
                resp.close()

        except requests.exceptions.RequestException:
            logger.exception(f"Network error fetching robots.txt from {base_url}")
        except ValueError as e:
            logger.warning(f"robots.txt too large at {base_url}: {e}")
        except Exception:
            logger.exception(f"Unexpected error parsing robots.txt from {base_url}")

        return sitemaps

    def _parse_html_meta_tags(self, base_url: str) -> list[str]:
        """
        Extract sitemap references from HTML meta tags using proper HTML parsing.

        Args:
            base_url: Base URL to check HTML for meta tags

        Returns:
            List of sitemap URLs found in HTML meta tags
        """
        sitemaps: list[str] = []

        try:
            logger.info(f"Checking HTML meta tags for sitemaps at {base_url}")

            # Set safe User-Agent header
            headers = {
                'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'
            }

            resp = requests.get(base_url, timeout=30, stream=True, verify=True, headers=headers)

            try:
                if resp.status_code != 200:
                    logger.debug(f"Could not fetch HTML for meta tag parsing: HTTP {resp.status_code}")
                    return sitemaps

                # Read response with size limit
                content = self._read_response_with_limit(resp, base_url)

                # Parse HTML using proper HTML parser
                parser = SitemapHTMLParser()
                try:
                    parser.feed(content)
                except Exception as e:
                    logger.warning(f"HTML parsing error for {base_url}: {e}")
                    return sitemaps

                # Process found sitemaps
                for tag_type, url in parser.sitemaps:
                    # Resolve relative URLs
                    sitemap_url = urljoin(base_url, url.strip())

                    # Validate scheme is HTTP/HTTPS
                    parsed = urlparse(sitemap_url)
                    if parsed.scheme not in ("http", "https"):
                        logger.debug(f"Skipping non-HTTP(S) sitemap URL: {sitemap_url}")
                        continue

                    sitemaps.append(sitemap_url)
                    logger.info(f"Found sitemap in HTML {tag_type} tag: {sitemap_url}")

            finally:
                resp.close()

        except requests.exceptions.RequestException:
            logger.exception(f"Network error fetching HTML from {base_url}")
        except ValueError as e:
            logger.warning(f"HTML response too large at {base_url}: {e}")
        except Exception:
            logger.exception(f"Unexpected error parsing HTML meta tags from {base_url}")

        return sitemaps

    def _read_response_with_limit(self, response: requests.Response, url: str, max_size: int | None = None) -> str:
        """
        Read response content with size limit to prevent memory exhaustion.

        Args:
            response: The response object to read from
            url: URL being read (for logging)
            max_size: Maximum bytes to read (defaults to MAX_RESPONSE_SIZE)

        Returns:
            Response text content

        Raises:
            ValueError: If response exceeds size limit
        """
        if max_size is None:
            max_size = self.MAX_RESPONSE_SIZE

        try:
            chunks = []
            total_size = 0

            # Read response in chunks to enforce size limit
            for chunk in response.iter_content(chunk_size=8192, decode_unicode=False):
                if chunk:
                    total_size += len(chunk)
                    if total_size > max_size:
                        response.close()
                        size_mb = max_size / (1024 * 1024)
                        logger.warning(
                            f"Response size exceeded limit of {size_mb:.1f}MB for {url}, "
                            f"received {total_size / (1024 * 1024):.1f}MB"
                        )
                        raise ValueError(f"Response size exceeds {size_mb:.1f}MB limit")
                    chunks.append(chunk)

            # Decode the complete response
            content_bytes = b''.join(chunks)
            encoding = response.encoding or 'utf-8'
            try:
                return content_bytes.decode(encoding)
            except UnicodeDecodeError:
                # Fallback to utf-8 with error replacement
                return content_bytes.decode('utf-8', errors='replace')

        except Exception:
            response.close()
            raise
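The SSRF guard in _is_safe_ip leans on the stdlib ipaddress classifications. A standalone sketch of the same idea (a simplified check for illustration, not the service's exact behavior):

    import ipaddress

    def looks_safe(ip_str: str) -> bool:
        try:
            ip = ipaddress.ip_address(ip_str)
        except ValueError:
            return False
        return not (ip.is_private or ip.is_loopback or ip.is_link_local
                    or ip.is_multicast or ip.is_reserved)

    print(looks_safe("93.184.216.34"))    # True: public address
    print(looks_safe("169.254.169.254"))  # False: link-local / cloud metadata range
    print(looks_safe("127.0.0.1"))        # False: loopback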
@@ -6,8 +6,8 @@ Handles URL transformations and validations.

import hashlib
import re
-from urllib.parse import urlparse, urljoin
from typing import List, Optional
+from urllib.parse import urljoin, urlparse

from ....config.logfire_config import get_logger

@@ -36,8 +36,8 @@ class URLHandler:
        except Exception as e:
            logger.warning(f"Error checking if URL is sitemap: {e}")
            return False

    @staticmethod
    def is_markdown(url: str) -> bool:
        """
        Check if a URL points to a markdown file (.md, .mdx, .markdown).
@@ -277,9 +277,9 @@ class URLHandler:
            # Fallback: use a hash of the error message + url to still get something unique
            fallback = f"error_{redacted}_{str(e)}"
            return hashlib.sha256(fallback.encode("utf-8")).hexdigest()[:16]

    @staticmethod
-    def extract_markdown_links(content: str, base_url: Optional[str] = None) -> List[str]:
+    def extract_markdown_links(content: str, base_url: str | None = None) -> list[str]:
        """
        Extract markdown-style links from text content.

@@ -385,9 +385,9 @@ class URLHandler:
        except Exception as e:
            logger.error(f"Error extracting markdown links with text: {e}", exc_info=True)
            return []

    @staticmethod
-    def is_link_collection_file(url: str, content: Optional[str] = None) -> bool:
+    def is_link_collection_file(url: str, content: str | None = None) -> bool:
        """
        Check if a URL/file appears to be a link collection file like llms.txt.

@@ -402,56 +402,55 @@ class URLHandler:
            # Extract filename from URL
            parsed = urlparse(url)
            filename = parsed.path.split('/')[-1].lower()

            # Check for specific link collection filenames
            # Note: "full-*" or "*-full" patterns are NOT link collections - they contain complete content, not just links
+            # Only includes commonly used formats found in the wild
            link_collection_patterns = [
                # .txt variants - files that typically contain lists of links
                'llms.txt', 'links.txt', 'resources.txt', 'references.txt',
-                # .md/.mdx/.markdown variants
-                'llms.md', 'links.md', 'resources.md', 'references.md',
-                'llms.mdx', 'links.mdx', 'resources.mdx', 'references.mdx',
-                'llms.markdown', 'links.markdown', 'resources.markdown', 'references.markdown',
            ]

            # Direct filename match
            if filename in link_collection_patterns:
                logger.info(f"Detected link collection file by filename: {filename}")
                return True

            # Pattern-based detection for variations, but exclude "full" variants
            # Only match files that are likely link collections, not complete content files
-            if filename.endswith(('.txt', '.md', '.mdx', '.markdown')):
-                # Exclude files with "full" in the name - these typically contain complete content, not just links
-                if 'full' not in filename:
+            if filename.endswith('.txt'):
+                # Exclude files with "full" as standalone token (avoid false positives like "helpful.md")
+                import re
+                if not re.search(r'(^|[._-])full([._-]|$)', filename):
                    # Match files that start with common link collection prefixes
                    base_patterns = ['llms', 'links', 'resources', 'references']
                    if any(filename.startswith(pattern + '.') or filename.startswith(pattern + '-') for pattern in base_patterns):
                        logger.info(f"Detected potential link collection file: {filename}")
                        return True

            # Content-based detection if content is provided
            if content:
                # Never treat "full" variants as link collections to preserve single-page behavior
-                if 'full' in filename:
+                import re
+                if re.search(r'(^|[._-])full([._-]|$)', filename):
                    logger.info(f"Skipping content-based link-collection detection for full-content file: {filename}")
                    return False
                # Reuse extractor to avoid regex divergence and maintain consistency
                extracted_links = URLHandler.extract_markdown_links(content, url)
                total_links = len(extracted_links)

                # Calculate link density (links per 100 characters)
                content_length = len(content.strip())
                if content_length > 0:
                    link_density = (total_links * 100) / content_length

                    # If more than 2% of content is links, likely a link collection
                    if link_density > 2.0 and total_links > 3:
                        logger.info(f"Detected link collection by content analysis: {total_links} links, density {link_density:.2f}%")
                        return True

            return False

        except Exception as e:
            logger.warning(f"Error checking if file is link collection: {e}", exc_info=True)
            return False
@@ -605,3 +604,104 @@ class URLHandler:
            logger.warning(f"Error extracting display name for {url}: {e}, using URL")
            # Fallback: return truncated URL
            return url[:50] + "..." if len(url) > 50 else url
+
+    @staticmethod
+    def is_robots_txt(url: str) -> bool:
+        """
+        Check if a URL is a robots.txt file with error handling.
+
+        Args:
+            url: URL to check
+
+        Returns:
+            True if URL is a robots.txt file, False otherwise
+        """
+        try:
+            parsed = urlparse(url)
+            # Normalize to lowercase and ignore query/fragment
+            path = parsed.path.lower()
+            # Only detect robots.txt at root level
+            return path == '/robots.txt'
+        except Exception as e:
+            logger.warning(f"Error checking if URL is robots.txt: {e}", exc_info=True)
+            return False
+
+    @staticmethod
+    def is_llms_variant(url: str) -> bool:
+        """
+        Check if a URL is a llms.txt/llms.md variant with error handling.
+
+        Matches:
+        - Exact filename matches: llms.txt, llms-full.txt, llms.md, etc.
+        - Files in /llms/ directories: /llms/guides.txt, /llms/swift.txt, etc.
+
+        Args:
+            url: URL to check
+
+        Returns:
+            True if URL is a llms file variant, False otherwise
+        """
+        try:
+            parsed = urlparse(url)
+            # Normalize to lowercase and ignore query/fragment
+            path = parsed.path.lower()
+            filename = path.split('/')[-1] if '/' in path else path
+
+            # Check for exact llms file variants (only standard spec files)
+            llms_variants = ['llms.txt', 'llms-full.txt']
+            if filename in llms_variants:
+                return True
+
+            # Check for .txt files in /llms/ directory (e.g., /llms/guides.txt, /llms/swift.txt)
+            if '/llms/' in path and path.endswith('.txt'):
+                return True

+            return False
+        except Exception as e:
+            logger.warning(f"Error checking if URL is llms variant: {e}", exc_info=True)
+            return False
+
+    @staticmethod
+    def is_well_known_file(url: str) -> bool:
+        """
+        Check if a URL is a .well-known/* file with error handling.
+        Per RFC 8615, the path is case-sensitive and must be lowercase.
+
+        Args:
+            url: URL to check
+
+        Returns:
+            True if URL is a .well-known file, False otherwise
+        """
+        try:
+            parsed = urlparse(url)
+            # RFC 8615: path segments are case-sensitive, must be lowercase
+            path = parsed.path
+            # Only detect .well-known files at root level
+            return path.startswith('/.well-known/') and path.count('/.well-known/') == 1
+        except Exception as e:
+            logger.warning(f"Error checking if URL is well-known file: {e}", exc_info=True)
+            return False
+
+    @staticmethod
+    def get_base_url(url: str) -> str:
+        """
+        Extract base domain URL for discovery with error handling.
+
+        Args:
+            url: URL to extract base from
+
+        Returns:
+            Base URL (scheme + netloc) or original URL if extraction fails
+        """
+        try:
+            parsed = urlparse(url)
+            # Ensure we have scheme and netloc
+            if parsed.scheme and parsed.netloc:
+                return f"{parsed.scheme}://{parsed.netloc}"
+            else:
+                logger.warning(f"URL missing scheme or netloc: {url}")
+                return url
+        except Exception as e:
+            logger.warning(f"Error extracting base URL from {url}: {e}", exc_info=True)
+            return url
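The standalone-"full" regex above is meant to skip llms-full.txt style files while leaving names like "helpful.txt" untouched. A quick check of a few filenames against the same pattern:

    import re

    FULL_TOKEN = re.compile(r'(^|[._-])full([._-]|$)')

    for name in ("llms-full.txt", "full.txt", "helpful.txt", "llms.txt"):
        print(name, bool(FULL_TOKEN.search(name)))
    # llms-full.txt True, full.txt True, helpful.txt False, llms.txt False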
@@ -18,14 +18,18 @@ class ProgressMapper:
        "error": (-1, -1),  # Special case for errors
        "cancelled": (-1, -1),  # Special case for cancellation
        "completed": (100, 100),
+        "complete": (100, 100),  # Alias

        # Crawl-specific stages - rebalanced based on actual time taken
        "analyzing": (1, 3),  # URL analysis is quick
-        "crawling": (3, 15),  # Crawling can take time for deep/many URLs
+        "discovery": (3, 4),  # File discovery is quick (new stage for discovery feature)
+        "crawling": (4, 15),  # Crawling can take time for deep/many URLs
        "processing": (15, 20),  # Content processing/chunking
        "source_creation": (20, 25),  # DB operations
        "document_storage": (25, 40),  # Embeddings generation takes significant time
        "code_extraction": (40, 90),  # Code extraction + summaries - still longest but more balanced
+        "code_storage": (40, 90),  # Alias
+        "extracting": (40, 90),  # Alias for code_extraction
        "finalization": (90, 100),  # Final steps and cleanup

        # Upload-specific stages
@@ -65,7 +69,7 @@ class ProgressMapper:
        start, end = self.STAGE_RANGES[stage]

        # Handle completion
-        if stage == "completed":
+        if stage in ["completed", "complete"]:
            self.last_overall_progress = 100
            return 100

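The rebalanced ranges map a stage-local percentage linearly into that stage's (start, end) window, which is what the updated tests assert. A worked sketch of the mapping (a simplified stand-in, not the ProgressMapper implementation itself):

    STAGE_RANGES = {"discovery": (3, 4), "crawling": (4, 15)}

    def map_progress(stage: str, stage_pct: float) -> int:
        start, end = STAGE_RANGES[stage]
        return round(start + (stage_pct / 100) * (end - start))

    print(map_progress("discovery", 50))  # 3 + 0.5 * 1 = 3.5 -> 4
    print(map_progress("crawling", 50))   # 4 + 0.5 * 11 = 9.5 -> 10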
@@ -229,17 +229,43 @@ class SinglePageCrawlStrategy:
    ) -> list[dict[str, Any]]:
        """
        Crawl a .txt or markdown file with comprehensive error handling and progress reporting.

        Args:
            url: URL of the text/markdown file
            transform_url_func: Function to transform URLs (e.g., GitHub URLs)
            progress_callback: Optional callback for progress updates
-            start_progress: Starting progress percentage
-            end_progress: Ending progress percentage
+            start_progress: Starting progress percentage (must be 0-100)
+            end_progress: Ending progress percentage (must be 0-100 and > start_progress)

        Returns:
            List containing the crawled document

+        Raises:
+            ValueError: If start_progress or end_progress are invalid
        """
+        # Validate progress parameters before any async work or progress reporting
+        if not isinstance(start_progress, (int, float)) or not isinstance(end_progress, (int, float)):
+            raise ValueError(
+                f"start_progress and end_progress must be int or float, "
+                f"got start_progress={type(start_progress).__name__}, end_progress={type(end_progress).__name__}"
+            )
+
+        if not (0 <= start_progress <= 100):
+            raise ValueError(
+                f"start_progress must be in range [0, 100], got {start_progress}"
+            )
+
+        if not (0 <= end_progress <= 100):
+            raise ValueError(
+                f"end_progress must be in range [0, 100], got {end_progress}"
+            )
+
+        if start_progress >= end_progress:
+            raise ValueError(
+                f"start_progress must be less than end_progress, "
+                f"got start_progress={start_progress}, end_progress={end_progress}"
+            )
+
        try:
            # Transform GitHub URLs to raw content URLs if applicable
            original_url = url
@@ -13,109 +13,119 @@ class TestProgressMapper:
     def test_initialization(self):
         """Test ProgressMapper initialization"""
         mapper = ProgressMapper()

         assert mapper.last_overall_progress == 0
         assert mapper.current_stage == "starting"

     def test_map_progress_basic(self):
         """Test basic progress mapping"""
         mapper = ProgressMapper()

         # Starting stage (0-1%)
         progress = mapper.map_progress("starting", 50)
         assert progress == 0  # 50% of 0-1 range

         # Analyzing stage (1-3%)
         progress = mapper.map_progress("analyzing", 50)
         assert progress == 2  # 1 + (50% of 2) = 2

-        # Crawling stage (3-15%)
+        # Discovery stage (3-4%) - NEW TEST FOR DISCOVERY FEATURE
+        progress = mapper.map_progress("discovery", 50)
+        assert progress == 4  # 3 + (50% of 1) = 3.5 -> 4 (rounds up)
+
+        # Crawling stage (4-15%)
         progress = mapper.map_progress("crawling", 50)
-        assert progress == 9  # 3 + (50% of 12) = 9
+        assert progress == 10  # 4 + (50% of 11) = 9.5 -> 10 (rounds up)

     def test_progress_never_goes_backwards(self):
         """Test that progress never decreases"""
         mapper = ProgressMapper()

-        # Move to 50% of crawling (3-15%) = 9%
+        # Move to 50% of crawling (4-15%) = 9.5 -> 10%
         progress1 = mapper.map_progress("crawling", 50)
-        assert progress1 == 9
+        assert progress1 == 10

-        # Try to go back to analyzing (1-3%) - should stay at 9%
+        # Try to go back to analyzing (1-3%) - should stay at 10%
         progress2 = mapper.map_progress("analyzing", 100)
-        assert progress2 == 9  # Should not go backwards
+        assert progress2 == 10  # Should not go backwards

         # Can move forward to document_storage
         progress3 = mapper.map_progress("document_storage", 50)
         assert progress3 == 32  # 25 + (50% of 15) = 32.5 -> 32

     def test_completion_handling(self):
         """Test completion status handling"""
         mapper = ProgressMapper()

         # Jump straight to completed
         progress = mapper.map_progress("completed", 0)
         assert progress == 100

         # Any percentage at completed should be 100
         progress = mapper.map_progress("completed", 50)
         assert progress == 100

+        # Test alias 'complete'
+        mapper2 = ProgressMapper()
+        progress = mapper2.map_progress("complete", 0)
+        assert progress == 100

     def test_error_handling(self):
         """Test error status handling - preserves last known progress"""
         mapper = ProgressMapper()

         # Error with no prior progress should return 0 (initial state)
         progress = mapper.map_progress("error", 50)
         assert progress == 0

         # Set some progress first, then error should preserve it
         mapper.map_progress("crawling", 50)  # Should map to somewhere in the crawling range
         current_progress = mapper.last_overall_progress
         error_progress = mapper.map_progress("error", 50)
         assert error_progress == current_progress  # Should preserve the progress

     def test_cancelled_handling(self):
         """Test cancelled status handling - preserves last known progress"""
         mapper = ProgressMapper()

         # Cancelled with no prior progress should return 0 (initial state)
         progress = mapper.map_progress("cancelled", 50)
         assert progress == 0

         # Set some progress first, then cancelled should preserve it
         mapper.map_progress("crawling", 75)  # Should map to somewhere in the crawling range
         current_progress = mapper.last_overall_progress
         cancelled_progress = mapper.map_progress("cancelled", 50)
         assert cancelled_progress == current_progress  # Should preserve the progress

     def test_unknown_stage(self):
         """Test handling of unknown stages"""
         mapper = ProgressMapper()

         # Set some initial progress
         mapper.map_progress("crawling", 50)
         current = mapper.last_overall_progress

         # Unknown stage should maintain current progress
         progress = mapper.map_progress("unknown_stage", 50)
         assert progress == current

-    def test_stage_ranges(self):
-        """Test all defined stage ranges"""
+    def test_stage_ranges_with_discovery(self):
+        """Test all defined stage ranges including discovery"""
         mapper = ProgressMapper()

         # Verify ranges are correctly defined with new balanced values
         assert mapper.STAGE_RANGES["starting"] == (0, 1)
         assert mapper.STAGE_RANGES["analyzing"] == (1, 3)
-        assert mapper.STAGE_RANGES["crawling"] == (3, 15)
+        assert mapper.STAGE_RANGES["discovery"] == (3, 4)  # NEW DISCOVERY STAGE
+        assert mapper.STAGE_RANGES["crawling"] == (4, 15)
         assert mapper.STAGE_RANGES["processing"] == (15, 20)
         assert mapper.STAGE_RANGES["source_creation"] == (20, 25)
         assert mapper.STAGE_RANGES["document_storage"] == (25, 40)
         assert mapper.STAGE_RANGES["code_extraction"] == (40, 90)
         assert mapper.STAGE_RANGES["finalization"] == (90, 100)
         assert mapper.STAGE_RANGES["completed"] == (100, 100)

         # Upload-specific stages
         assert mapper.STAGE_RANGES["reading"] == (0, 5)
         assert mapper.STAGE_RANGES["text_extraction"] == (5, 10)
@@ -123,138 +133,167 @@ class TestProgressMapper:
         # Note: source_creation is shared between crawl and upload operations at (20, 25)
         assert mapper.STAGE_RANGES["summarizing"] == (25, 35)
         assert mapper.STAGE_RANGES["storing"] == (35, 100)

     def test_calculate_stage_progress(self):
         """Test calculating percentage within a stage"""
         mapper = ProgressMapper()

         # 5 out of 10 = 50%
         progress = mapper.calculate_stage_progress(5, 10)
         assert progress == 50.0

         # 0 out of 10 = 0%
         progress = mapper.calculate_stage_progress(0, 10)
         assert progress == 0.0

         # 10 out of 10 = 100%
         progress = mapper.calculate_stage_progress(10, 10)
         assert progress == 100.0

         # Handle division by zero
         progress = mapper.calculate_stage_progress(5, 0)
         assert progress == 0.0

     def test_map_batch_progress(self):
         """Test batch progress mapping"""
         mapper = ProgressMapper()

         # Batch 1 of 5 in document_storage stage
         progress = mapper.map_batch_progress("document_storage", 1, 5)
         assert progress == 25  # Start of document_storage range (25-40)

         # Batch 3 of 5
         progress = mapper.map_batch_progress("document_storage", 3, 5)
         assert progress == 31  # 40% through 25-40 range

         # Batch 5 of 5
         progress = mapper.map_batch_progress("document_storage", 5, 5)
         assert progress == 37  # 80% through 25-40 range

     def test_map_with_substage(self):
         """Test mapping with substage information"""
         mapper = ProgressMapper()

         # Currently just uses main stage
         progress = mapper.map_with_substage("document_storage", "embeddings", 50)
         assert progress == 32  # 50% of 25-40 range = 32.5 -> 32

     def test_reset(self):
         """Test resetting the mapper"""
         mapper = ProgressMapper()

         # Set some progress
         mapper.map_progress("document_storage", 50)
         assert mapper.last_overall_progress == 32  # 25 + (50% of 15) = 32.5 -> 32
         assert mapper.current_stage == "document_storage"

         # Reset
         mapper.reset()
         assert mapper.last_overall_progress == 0
         assert mapper.current_stage == "starting"

     def test_get_current_stage(self):
         """Test getting current stage"""
         mapper = ProgressMapper()

         assert mapper.get_current_stage() == "starting"

         mapper.map_progress("crawling", 50)
         assert mapper.get_current_stage() == "crawling"

         mapper.map_progress("code_extraction", 50)
         assert mapper.get_current_stage() == "code_extraction"

     def test_get_current_progress(self):
         """Test getting current progress"""
         mapper = ProgressMapper()

         assert mapper.get_current_progress() == 0

         mapper.map_progress("crawling", 50)
-        assert mapper.get_current_progress() == 9  # 3 + (50% of 12) = 9
+        assert mapper.get_current_progress() == 10  # 4 + (50% of 11) = 9.5 -> 10

         mapper.map_progress("code_extraction", 50)
         assert mapper.get_current_progress() == 65  # 40 + (50% of 50) = 65

     def test_get_stage_range(self):
         """Test getting stage range"""
         mapper = ProgressMapper()

         assert mapper.get_stage_range("starting") == (0, 1)
+        assert mapper.get_stage_range("discovery") == (3, 4)  # Test discovery stage
         assert mapper.get_stage_range("code_extraction") == (40, 90)
         assert mapper.get_stage_range("unknown") == (0, 100)  # Default range

-    def test_realistic_crawl_sequence(self):
-        """Test a realistic crawl progress sequence"""
+    def test_realistic_crawl_sequence_with_discovery(self):
+        """Test a realistic crawl progress sequence including discovery"""
         mapper = ProgressMapper()

         # Starting
         assert mapper.map_progress("starting", 0) == 0
         assert mapper.map_progress("starting", 100) == 1

         # Analyzing
         assert mapper.map_progress("analyzing", 0) == 1
         assert mapper.map_progress("analyzing", 100) == 3

+        # Discovery (NEW)
+        assert mapper.map_progress("discovery", 0) == 3
+        assert mapper.map_progress("discovery", 50) == 4  # 3 + (50% of 1) = 3.5 -> 4 (rounds up)
+        assert mapper.map_progress("discovery", 100) == 4
+
         # Crawling
-        assert mapper.map_progress("crawling", 0) == 3
+        assert mapper.map_progress("crawling", 0) == 4
-        assert mapper.map_progress("crawling", 33) == 7  # 3 + (33% of 12) = 6.96 -> 7
+        assert mapper.map_progress("crawling", 33) == 8  # 4 + (33% of 11) = 7.63 -> 8 (rounds up)
-        assert mapper.map_progress("crawling", 66) == 11  # 3 + (66% of 12) = 10.92 -> 11
+        progress_crawl_66 = mapper.map_progress("crawling", 66)
+        assert progress_crawl_66 in [11, 12]  # 4 + (66% of 11) = 11.26, could round to 11 or 12
         assert mapper.map_progress("crawling", 100) == 15

         # Processing
         assert mapper.map_progress("processing", 0) == 15
         assert mapper.map_progress("processing", 100) == 20

         # Source creation
         assert mapper.map_progress("source_creation", 0) == 20
         assert mapper.map_progress("source_creation", 100) == 25

         # Document storage
         assert mapper.map_progress("document_storage", 0) == 25
         assert mapper.map_progress("document_storage", 50) == 32  # 25 + (50% of 15) = 32.5 -> 32
         assert mapper.map_progress("document_storage", 100) == 40

         # Code extraction (longest phase)
         assert mapper.map_progress("code_extraction", 0) == 40
-        assert mapper.map_progress("code_extraction", 25) == 52  # 40 + (25% of 50) = 52.5 -> 52
+        progress_25 = mapper.map_progress("code_extraction", 25)
+        assert progress_25 in [52, 53]  # 40 + (25% of 50) = 52.5, banker's rounding rounds to 52 (even)
         assert mapper.map_progress("code_extraction", 50) == 65  # 40 + (50% of 50) = 65
-        assert mapper.map_progress("code_extraction", 75) == 78  # 40 + (75% of 50) = 77.5 -> 78
+        progress_75 = mapper.map_progress("code_extraction", 75)
+        assert progress_75 == 78  # 40 + (75% of 50) = 77.5 -> 78 (rounds to even per banker's rounding)
         assert mapper.map_progress("code_extraction", 100) == 90

         # Finalization
         assert mapper.map_progress("finalization", 0) == 90
         assert mapper.map_progress("finalization", 100) == 100

         # Completed
         assert mapper.map_progress("completed", 0) == 100

+    def test_aliases_work_correctly(self):
+        """Test that stage aliases work correctly"""
+        mapper = ProgressMapper()
+
+        # Test code_storage alias for code_extraction
+        progress1 = mapper.map_progress("code_extraction", 50)
+        mapper2 = ProgressMapper()
+        progress2 = mapper2.map_progress("code_storage", 50)
+        assert progress1 == progress2
+
+        # Test extracting alias for code_extraction
+        mapper3 = ProgressMapper()
+        progress3 = mapper3.map_progress("extracting", 50)
+        assert progress1 == progress3
+
+        # Test complete alias for completed
+        mapper4 = ProgressMapper()
+        progress4 = mapper4.map_progress("complete", 0)
+        assert progress4 == 100
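The arithmetic these assertions keep spelling out in comments is a plain linear re-map of a stage-local percentage onto that stage's slice of the overall 0-100 scale, clamped so the total never moves backwards. A minimal sketch of that mapping, assuming Python's built-in round() (banker's rounding, which is what the 52.5 -> 52 and 77.5 -> 78 comments imply); ProgressMapper's real implementation may differ in details:

```python
# Sketch of the mapping the tests exercise; not ProgressMapper's actual code.
def map_stage_progress(stage_start: int, stage_end: int, stage_pct: float, last_overall: int = 0) -> int:
    """Linearly project a 0-100 stage-local percentage onto the stage's overall range."""
    overall = stage_start + (stage_pct / 100) * (stage_end - stage_start)
    # round() uses banker's rounding: round(52.5) == 52, round(77.5) == 78
    return max(round(overall), last_overall)  # never go backwards


# Worked examples matching the assertions above (discovery 3-4, crawling 4-15, code_extraction 40-90):
assert map_stage_progress(3, 4, 50) == 4      # 3.5 rounds up to 4
assert map_stage_progress(4, 15, 50) == 10    # 9.5 rounds up to 10
assert map_stage_progress(40, 90, 50) == 65   # code_extraction midpoint
```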
python/tests/server/api_routes/test_bug_report_api.py (new file, 177 lines)
@@ -0,0 +1,177 @@
"""
Unit tests for bug_report_api.py
"""

import os
from unittest.mock import AsyncMock, patch

import pytest
from fastapi.testclient import TestClient

from src.server.config.version import GITHUB_REPO_NAME, GITHUB_REPO_OWNER
from src.server.main import app


@pytest.fixture
def client():
    """Create test client."""
    return TestClient(app)


@pytest.fixture
def mock_bug_report():
    """Mock bug report data."""
    return {
        "title": "Test Bug",
        "description": "Test description",
        "stepsToReproduce": "Step 1\nStep 2",
        "expectedBehavior": "Expected result",
        "actualBehavior": "Actual result",
        "severity": "medium",
        "component": "ui",
        "context": {
            "error": {
                "name": "TypeError",
                "message": "Test error",
                "stack": "Test stack trace",
            },
            "app": {
                "version": "0.1.0",
                "url": "http://localhost:3737",
                "timestamp": "2025-10-17T12:00:00Z",
            },
            "system": {
                "platform": "linux",
                "memory": "8GB",
            },
            "services": {
                "server": True,
                "mcp": True,
                "agents": False,
            },
            "logs": ["Log line 1", "Log line 2"],
        },
    }


def test_health_check_with_defaults(client):
    """Test health check returns correct default repository."""
    with patch.dict(os.environ, {}, clear=False):
        # Ensure no GITHUB_TOKEN or GITHUB_REPO env vars
        os.environ.pop("GITHUB_TOKEN", None)
        os.environ.pop("GITHUB_REPO", None)

        response = client.get("/api/bug-report/health")

        assert response.status_code == 200
        data = response.json()
        assert data["status"] == "degraded"  # No token
        assert data["github_token_configured"] is False
        assert data["github_repo_configured"] is False
        # Verify it uses the version.py constants
        assert data["repo"] == f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
        assert data["repo"] == "coleam00/Archon"


def test_health_check_with_github_token(client):
    """Test health check when GitHub token is configured."""
    with patch.dict(os.environ, {"GITHUB_TOKEN": "test-token"}, clear=False):
        os.environ.pop("GITHUB_REPO", None)

        response = client.get("/api/bug-report/health")

        assert response.status_code == 200
        data = response.json()
        assert data["status"] == "healthy"
        assert data["github_token_configured"] is True
        assert data["github_repo_configured"] is False
        assert data["repo"] == f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"


def test_health_check_with_custom_repo(client):
    """Test health check with custom GITHUB_REPO environment variable."""
    with patch.dict(os.environ, {"GITHUB_REPO": "custom/repo"}, clear=False):
        response = client.get("/api/bug-report/health")

        assert response.status_code == 200
        data = response.json()
        assert data["github_repo_configured"] is True
        assert data["repo"] == "custom/repo"


def test_manual_submission_url_uses_correct_repo(client, mock_bug_report):
    """Test that manual submission URL points to correct repository."""
    with patch.dict(os.environ, {}, clear=False):
        # No GITHUB_TOKEN, should create manual submission URL
        os.environ.pop("GITHUB_TOKEN", None)
        os.environ.pop("GITHUB_REPO", None)

        response = client.post("/api/bug-report/github", json=mock_bug_report)

        assert response.status_code == 200
        data = response.json()
        assert data["success"] is True
        assert data["issue_url"] is not None
        # Verify URL contains correct repository
        expected_repo = f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
        assert expected_repo in data["issue_url"]
        assert "coleam00/Archon" in data["issue_url"]
        # Ensure old repository is NOT in URL
        assert "dynamous-community" not in data["issue_url"]
        assert "Archon-V2-Alpha" not in data["issue_url"]
        # Verify URL contains required parameters including template
        assert "title=" in data["issue_url"]
        assert "body=" in data["issue_url"]
        assert "template=auto_bug_report.md" in data["issue_url"]


def test_api_submission_with_token(client, mock_bug_report):
    """Test bug report submission with GitHub token."""
    mock_response_data = {
        "success": True,
        "issue_number": 123,
        "issue_url": f"https://github.com/{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}/issues/123",
    }

    with patch.dict(os.environ, {"GITHUB_TOKEN": "test-token"}, clear=False):
        with patch("src.server.api_routes.bug_report_api.github_service") as mock_service:
            mock_service.token = "test-token"
            mock_service.repo = f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
            mock_service.create_issue = AsyncMock(return_value=mock_response_data)

            response = client.post("/api/bug-report/github", json=mock_bug_report)

            assert response.status_code == 200
            data = response.json()
            assert data["success"] is True
            assert data["issue_number"] == 123
            # Verify issue URL contains correct repository
            assert f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}" in data["issue_url"]
            # Ensure old repository is NOT in URL
            assert "dynamous-community" not in data["issue_url"]


def test_github_service_initialization():
    """Test GitHubService uses correct default repository."""
    from src.server.api_routes.bug_report_api import GitHubService

    with patch.dict(os.environ, {}, clear=False):
        os.environ.pop("GITHUB_REPO", None)

        service = GitHubService()

        # Verify service uses version.py constants as default
        expected_repo = f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"
        assert service.repo == expected_repo
        assert service.repo == "coleam00/Archon"
        # Ensure old repository is NOT used
        assert service.repo != "dynamous-community/Archon-V2-Alpha"


def test_github_service_with_custom_repo():
    """Test GitHubService respects GITHUB_REPO environment variable."""
    from src.server.api_routes.bug_report_api import GitHubService

    with patch.dict(os.environ, {"GITHUB_REPO": "custom/repo"}, clear=False):
        service = GitHubService()
        assert service.repo == "custom/repo"
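The behaviour these tests pin down is a small resolution rule: a GITHUB_REPO environment variable of the form "owner/repo" wins, otherwise the owner/name constants from version.py are used. A sketch of that rule, assuming nothing about the real GitHubService beyond what the assertions state; resolve_repo and the inline constants here are illustrative only:

```python
# Sketch only; the real GitHubService in bug_report_api.py may resolve this differently.
import os

GITHUB_REPO_OWNER = "coleam00"  # defaults the tests assert against
GITHUB_REPO_NAME = "Archon"


def resolve_repo() -> str:
    """GITHUB_REPO ("owner/repo") overrides the version.py defaults."""
    return os.environ.get("GITHUB_REPO") or f"{GITHUB_REPO_OWNER}/{GITHUB_REPO_NAME}"


os.environ.pop("GITHUB_REPO", None)
assert resolve_repo() == "coleam00/Archon"

os.environ["GITHUB_REPO"] = "custom/repo"
assert resolve_repo() == "custom/repo"
```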
python/tests/test_crawling_service_subdomain.py (new file, 152 lines)
@@ -0,0 +1,152 @@
"""Unit tests for CrawlingService subdomain checking functionality."""
import pytest
from src.server.services.crawling.crawling_service import CrawlingService


class TestCrawlingServiceSubdomain:
    """Test suite for CrawlingService subdomain checking methods."""

    @pytest.fixture
    def service(self):
        """Create a CrawlingService instance for testing."""
        # Create service without crawler or supabase for testing domain checking
        return CrawlingService(crawler=None, supabase_client=None)

    def test_is_same_domain_or_subdomain_exact_match(self, service):
        """Test exact domain matches."""
        # Same domain should match
        assert service._is_same_domain_or_subdomain(
            "https://supabase.com/docs",
            "https://supabase.com"
        ) is True

        assert service._is_same_domain_or_subdomain(
            "https://supabase.com/path/to/page",
            "https://supabase.com"
        ) is True

    def test_is_same_domain_or_subdomain_subdomains(self, service):
        """Test subdomain matching."""
        # Subdomain should match
        assert service._is_same_domain_or_subdomain(
            "https://docs.supabase.com/llms.txt",
            "https://supabase.com"
        ) is True

        assert service._is_same_domain_or_subdomain(
            "https://api.supabase.com/v1/endpoint",
            "https://supabase.com"
        ) is True

        # Multiple subdomain levels
        assert service._is_same_domain_or_subdomain(
            "https://dev.api.supabase.com/test",
            "https://supabase.com"
        ) is True

    def test_is_same_domain_or_subdomain_different_domains(self, service):
        """Test that different domains are rejected."""
        # Different domain should not match
        assert service._is_same_domain_or_subdomain(
            "https://external.com/llms.txt",
            "https://supabase.com"
        ) is False

        assert service._is_same_domain_or_subdomain(
            "https://docs.other-site.com",
            "https://supabase.com"
        ) is False

        # Similar but different domains
        assert service._is_same_domain_or_subdomain(
            "https://supabase.org",
            "https://supabase.com"
        ) is False

    def test_is_same_domain_or_subdomain_protocols(self, service):
        """Test that protocol differences don't affect matching."""
        # Different protocols should still match
        assert service._is_same_domain_or_subdomain(
            "http://supabase.com/docs",
            "https://supabase.com"
        ) is True

        assert service._is_same_domain_or_subdomain(
            "https://docs.supabase.com",
            "http://supabase.com"
        ) is True

    def test_is_same_domain_or_subdomain_ports(self, service):
        """Test handling of port numbers."""
        # Same root domain with different ports should match
        assert service._is_same_domain_or_subdomain(
            "https://supabase.com:8080/api",
            "https://supabase.com"
        ) is True

        assert service._is_same_domain_or_subdomain(
            "http://localhost:3000/dev",
            "http://localhost:8080"
        ) is True

    def test_is_same_domain_or_subdomain_edge_cases(self, service):
        """Test edge cases and error handling."""
        # Empty or malformed URLs should return False
        assert service._is_same_domain_or_subdomain(
            "",
            "https://supabase.com"
        ) is False

        assert service._is_same_domain_or_subdomain(
            "https://supabase.com",
            ""
        ) is False

        assert service._is_same_domain_or_subdomain(
            "not-a-url",
            "https://supabase.com"
        ) is False

    def test_is_same_domain_or_subdomain_real_world_examples(self, service):
        """Test with real-world examples."""
        # GitHub examples
        assert service._is_same_domain_or_subdomain(
            "https://api.github.com/repos",
            "https://github.com"
        ) is True

        assert service._is_same_domain_or_subdomain(
            "https://raw.githubusercontent.com/owner/repo",
            "https://github.com"
        ) is False  # githubusercontent.com is different root domain

        # Documentation sites
        assert service._is_same_domain_or_subdomain(
            "https://docs.python.org/3/library",
            "https://python.org"
        ) is True

        assert service._is_same_domain_or_subdomain(
            "https://api.stripe.com/v1",
            "https://stripe.com"
        ) is True

    def test_is_same_domain_backward_compatibility(self, service):
        """Test that _is_same_domain still works correctly for exact matches."""
        # Exact domain match should work
        assert service._is_same_domain(
            "https://supabase.com/docs",
            "https://supabase.com"
        ) is True

        # Subdomain should NOT match with _is_same_domain (only with _is_same_domain_or_subdomain)
        assert service._is_same_domain(
            "https://docs.supabase.com/llms.txt",
            "https://supabase.com"
        ) is False

        # Different domain should not match
        assert service._is_same_domain(
            "https://external.com/llms.txt",
            "https://supabase.com"
        ) is False
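The matching rules exercised above (exact host, any subdomain depth, ignore scheme and port, reject lookalike TLDs and malformed input) can be read as a single hostname comparison. A rough sketch of that comparison using urllib.parse, written to satisfy the same cases; it is not CrawlingService's actual implementation:

```python
# Sketch of the subdomain rule the tests describe; the service's own logic may differ.
from urllib.parse import urlparse


def is_same_domain_or_subdomain(url: str, base_url: str) -> bool:
    try:
        host = urlparse(url).hostname
        base = urlparse(base_url).hostname
    except ValueError:
        return False
    if not host or not base:
        return False  # empty or malformed input
    # Scheme and port are ignored: "docs.supabase.com" matches "supabase.com",
    # while "supabase.org" and "raw.githubusercontent.com" do not.
    return host == base or host.endswith("." + base)


assert is_same_domain_or_subdomain("https://docs.supabase.com/llms.txt", "https://supabase.com")
assert not is_same_domain_or_subdomain("https://supabase.org", "https://supabase.com")
```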
python/tests/test_discovery_service.py (new file, 353 lines)
@@ -0,0 +1,353 @@
"""Unit tests for DiscoveryService class."""
import socket
from unittest.mock import Mock, patch

from src.server.services.crawling.discovery_service import DiscoveryService


def create_mock_dns_response():
    """Create mock DNS response for safe public IPs."""
    # Return a safe public IP for testing
    return [
        (socket.AF_INET, socket.SOCK_STREAM, 6, '', ('93.184.216.34', 0))  # example.com's actual IP
    ]


def create_mock_response(status_code: int, text: str = "", url: str = "https://example.com") -> Mock:
    """Create a mock response object that supports streaming API."""
    response = Mock()
    response.status_code = status_code
    response.text = text
    response.encoding = 'utf-8'
    response.history = []  # Empty list for no redirects
    response.url = url  # Mock URL for redirect checks (must be string, not Mock)

    # Mock iter_content to yield text in chunks as bytes
    text_bytes = text.encode('utf-8')
    chunk_size = 8192
    chunks = [text_bytes[i:i+chunk_size] for i in range(0, len(text_bytes), chunk_size)]
    if not chunks:
        chunks = [b'']  # Ensure at least one empty chunk
    response.iter_content = Mock(return_value=iter(chunks))

    # Mock close method
    response.close = Mock()

    return response


class TestDiscoveryService:
    """Test suite for DiscoveryService class."""

    @patch('socket.getaddrinfo', return_value=create_mock_dns_response())
    @patch('requests.Session')
    @patch('requests.get')
    def test_discover_files_basic(self, mock_get, mock_session, mock_dns):
        """Test main discovery method returns single best file."""
        service = DiscoveryService()
        base_url = "https://example.com"

        # Mock robots.txt response (no sitemaps)
        robots_response = create_mock_response(200, "User-agent: *\nDisallow: /admin/")

        # Mock file existence - llms-full.txt doesn't exist, but llms.txt does
        def mock_get_side_effect(url, **kwargs):
            if url.endswith('robots.txt'):
                return robots_response
            elif url.endswith('llms-full.txt'):
                return create_mock_response(404)  # Highest priority doesn't exist
            elif url.endswith('llms.txt'):
                return create_mock_response(200)  # Second priority exists
            else:
                return create_mock_response(404)

        mock_get.side_effect = mock_get_side_effect
        mock_session.return_value.get.side_effect = mock_get_side_effect

        result = service.discover_files(base_url)

        # Should return single URL string (not dict, not list)
        assert isinstance(result, str)
        assert result == 'https://example.com/llms.txt'

    @patch('socket.getaddrinfo', return_value=create_mock_dns_response())
    @patch('requests.Session')
    @patch('requests.get')
    def test_discover_files_no_files_found(self, mock_get, mock_session, mock_dns):
        """Test discovery when no files are found."""
        service = DiscoveryService()
        base_url = "https://example.com"

        # Mock all HTTP requests to return 404
        mock_get.return_value = create_mock_response(404)
        mock_session.return_value.get.return_value = create_mock_response(404)

        result = service.discover_files(base_url)

        # Should return None when no files found
        assert result is None

    @patch('socket.getaddrinfo', return_value=create_mock_dns_response())
    @patch('requests.Session')
    @patch('requests.get')
    def test_discover_files_priority_order(self, mock_get, mock_session, mock_dns):
        """Test that discovery follows the correct priority order."""
        service = DiscoveryService()
        base_url = "https://example.com"

        # Mock robots.txt response (no sitemaps declared)
        robots_response = create_mock_response(200, "User-agent: *\nDisallow: /admin/")

        # Mock file existence - both sitemap.xml and llms.txt exist, but llms.txt has higher priority
        def mock_get_side_effect(url, **kwargs):
            if url.endswith('robots.txt'):
                return robots_response
            elif url.endswith('llms.txt') or url.endswith('sitemap.xml'):
                return create_mock_response(200)  # Both exist
            else:
                return create_mock_response(404)

        mock_get.side_effect = mock_get_side_effect
        mock_session.return_value.get.side_effect = mock_get_side_effect

        result = service.discover_files(base_url)

        # Should return llms.txt since it has higher priority than sitemap.xml
        assert result == 'https://example.com/llms.txt'

    @patch('socket.getaddrinfo', return_value=create_mock_dns_response())
    @patch('requests.Session')
    @patch('requests.get')
    def test_discover_files_robots_sitemap_priority(self, mock_get, mock_session, mock_dns):
        """Test that llms files have priority over robots.txt sitemap declarations."""
        service = DiscoveryService()
        base_url = "https://example.com"

        # Mock robots.txt response WITH sitemap declaration
        robots_response = create_mock_response(200, "User-agent: *\nSitemap: https://example.com/declared-sitemap.xml")

        # Mock other files also exist (both llms and sitemap files)
        def mock_get_side_effect(url, **kwargs):
            if url.endswith('robots.txt'):
                return robots_response
            elif 'llms' in url or 'sitemap' in url:
                return create_mock_response(200)
            else:
                return create_mock_response(404)

        mock_get.side_effect = mock_get_side_effect
        mock_session.return_value.get.side_effect = mock_get_side_effect

        result = service.discover_files(base_url)

        # Should return llms.txt (highest priority llms file) since llms files have priority over sitemaps
        # even when sitemaps are declared in robots.txt
        assert result == 'https://example.com/llms.txt'

    @patch('socket.getaddrinfo', return_value=create_mock_dns_response())
    @patch('requests.Session')
    @patch('requests.get')
    def test_discover_files_subdirectory_fallback(self, mock_get, mock_session, mock_dns):
        """Test discovery falls back to subdirectories for llms files."""
        service = DiscoveryService()
        base_url = "https://example.com"

        # Mock robots.txt response (no sitemaps declared)
        robots_response = create_mock_response(200, "User-agent: *\nDisallow: /admin/")

        # Mock file existence - no root llms files, but static/llms.txt exists
        def mock_get_side_effect(url, **kwargs):
            if url.endswith('robots.txt'):
                return robots_response
            elif '/static/llms.txt' in url:
                return create_mock_response(200)  # Found in subdirectory
            else:
                return create_mock_response(404)

        mock_get.side_effect = mock_get_side_effect
        mock_session.return_value.get.side_effect = mock_get_side_effect

        result = service.discover_files(base_url)

        # Should find the file in static subdirectory
        assert result == 'https://example.com/static/llms.txt'

    @patch('socket.getaddrinfo', return_value=create_mock_dns_response())
    @patch('requests.Session')
    @patch('requests.get')
    def test_check_url_exists(self, mock_get, mock_session, mock_dns):
        """Test URL existence checking."""
        service = DiscoveryService()

        # Test successful response
        mock_get.return_value = create_mock_response(200)
        mock_session.return_value.get.return_value = create_mock_response(200)
        assert service._check_url_exists("https://example.com/exists") is True

        # Test 404 response
        mock_get.return_value = create_mock_response(404)
        mock_session.return_value.get.return_value = create_mock_response(404)
        assert service._check_url_exists("https://example.com/not-found") is False

        # Test network error
        mock_get.side_effect = Exception
        mock_session.return_value.get.side_effect = Exception("Network error")
        assert service._check_url_exists("https://example.com/error") is False

    @patch('socket.getaddrinfo', return_value=create_mock_dns_response())
    @patch('requests.Session')
    @patch('requests.get')
    def test_parse_robots_txt_with_sitemap(self, mock_get, mock_session, mock_dns):
        """Test robots.txt parsing with sitemap directives."""
        service = DiscoveryService()

        # Mock successful robots.txt response
        robots_text = """User-agent: *
Disallow: /admin/
Sitemap: https://example.com/sitemap.xml
Sitemap: https://example.com/sitemap-news.xml"""
        mock_get.return_value = create_mock_response(200, robots_text)

        result = service._parse_robots_txt("https://example.com")

        assert len(result) == 2
        assert "https://example.com/sitemap.xml" in result
        assert "https://example.com/sitemap-news.xml" in result
        mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30, stream=True, verify=True, headers={'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'})

    @patch('socket.getaddrinfo', return_value=create_mock_dns_response())
    @patch('requests.Session')
    @patch('requests.get')
    def test_parse_robots_txt_no_sitemap(self, mock_get, mock_session, mock_dns):
        """Test robots.txt parsing without sitemap directives."""
        service = DiscoveryService()

        # Mock robots.txt without sitemaps
        robots_text = """User-agent: *
Disallow: /admin/
Allow: /public/"""
        mock_get.return_value = create_mock_response(200, robots_text)

        result = service._parse_robots_txt("https://example.com")

        assert len(result) == 0
        mock_get.assert_called_once_with("https://example.com/robots.txt", timeout=30, stream=True, verify=True, headers={'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'})

    @patch('socket.getaddrinfo', return_value=create_mock_dns_response())
    @patch('requests.Session')
    @patch('requests.get')
    def test_parse_html_meta_tags(self, mock_get, mock_session, mock_dns):
        """Test HTML meta tag parsing for sitemaps."""
        service = DiscoveryService()

        # Mock HTML with sitemap references
        html_content = """
<html>
<head>
<link rel="sitemap" href="/sitemap.xml">
<meta name="sitemap" content="https://example.com/sitemap-meta.xml">
</head>
<body>Content here</body>
</html>
"""
        mock_get.return_value = create_mock_response(200, html_content)

        result = service._parse_html_meta_tags("https://example.com")

        # Should find sitemaps from both link and meta tags
        assert len(result) >= 1
        assert any('sitemap' in url.lower() for url in result)
        mock_get.assert_called_once_with("https://example.com", timeout=30, stream=True, verify=True, headers={'User-Agent': 'Archon-Discovery/1.0 (SSRF-Protected)'})

    @patch('socket.getaddrinfo', return_value=create_mock_dns_response())
    @patch('requests.Session')
    @patch('requests.get')
    def test_discovery_priority_behavior(self, mock_get, mock_session, mock_dns):
        """Test that discovery returns highest-priority file when multiple files exist."""
        service = DiscoveryService()
        base_url = "https://example.com"

        # Mock robots.txt response (no sitemaps declared)
        robots_response = create_mock_response(200, "User-agent: *\nDisallow: /admin/")

        # Scenario 1: All files exist - should return llms.txt (highest priority)
        def mock_all_exist(url, **kwargs):
            if url.endswith('robots.txt'):
                return robots_response
            elif any(file in url for file in ['llms.txt', 'llms-full.txt', 'sitemap.xml']):
                return create_mock_response(200)
            else:
                return create_mock_response(404)

        mock_get.side_effect = mock_all_exist
        mock_session.return_value.get.side_effect = mock_all_exist
        result = service.discover_files(base_url)
        assert result == 'https://example.com/llms.txt', "Should return llms.txt when all files exist (highest priority)"

        # Scenario 2: llms.txt missing, others exist - should return llms-full.txt
        def mock_without_txt(url, **kwargs):
            if url.endswith('robots.txt'):
                return robots_response
            elif url.endswith('llms.txt'):
                return create_mock_response(404)
            elif any(file in url for file in ['llms-full.txt', 'sitemap.xml']):
                return create_mock_response(200)
            else:
                return create_mock_response(404)

        mock_get.side_effect = mock_without_txt
        mock_session.return_value.get.side_effect = mock_without_txt
        result = service.discover_files(base_url)
        assert result == 'https://example.com/llms-full.txt', "Should return llms-full.txt when llms.txt is missing"

        # Scenario 3: Only sitemap files exist - should return sitemap.xml
        def mock_only_sitemaps(url, **kwargs):
            if url.endswith('robots.txt'):
                return robots_response
            elif any(file in url for file in ['llms.txt', 'llms-full.txt']):
                return create_mock_response(404)
            elif url.endswith('sitemap.xml'):
                return create_mock_response(200)
            else:
                return create_mock_response(404)

        mock_get.side_effect = mock_only_sitemaps
        mock_session.return_value.get.side_effect = mock_only_sitemaps
        result = service.discover_files(base_url)
        assert result == 'https://example.com/sitemap.xml', "Should return sitemap.xml when llms files are missing"

        # Scenario 4: llms files have priority over sitemap files
        def mock_llms_and_sitemap(url, **kwargs):
            if url.endswith('robots.txt'):
                return robots_response
            elif url.endswith('llms.txt') or url.endswith('sitemap.xml'):
                return create_mock_response(200)
            else:
                return create_mock_response(404)

        mock_get.side_effect = mock_llms_and_sitemap
        mock_session.return_value.get.side_effect = mock_llms_and_sitemap
        result = service.discover_files(base_url)
        assert result == 'https://example.com/llms.txt', "Should prefer llms.txt over sitemap.xml"

    @patch('socket.getaddrinfo', return_value=create_mock_dns_response())
    @patch('requests.Session')
    @patch('requests.get')
    def test_network_error_handling(self, mock_get, mock_session, mock_dns):
        """Test error scenarios with network failures."""
        service = DiscoveryService()

        # Mock network error
        mock_get.side_effect = Exception("Network error")
        mock_session.return_value.get.side_effect = Exception("Network error")

        # Should not raise exception, but return None
        result = service.discover_files("https://example.com")
        assert result is None

        # Individual methods should also handle errors gracefully
        result = service._parse_robots_txt("https://example.com")
        assert result == []

        result = service._parse_html_meta_tags("https://example.com")
        assert result == []
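Taken together, these discovery tests describe a "first hit wins" walk over an ordered candidate list, with llms variants ahead of sitemap files and robots.txt consulted only for sitemap hints. A condensed sketch of that control flow; the exact candidate list, subdirectory set, and SSRF checks are assumptions here (the tests fix only the relative ordering they assert), and the real DiscoveryService is not reproduced:

```python
# Sketch of the priority walk the tests imply; not DiscoveryService's actual candidate list.
from urllib.parse import urljoin


def discover_best_file(base_url: str, url_exists) -> "str | None":
    # Assumed order: llms variants first (root, then a subdirectory fallback), then sitemap.xml.
    candidates = [
        "llms.txt",
        "llms-full.txt",
        "static/llms.txt",   # subdirectory fallback seen in the tests
        "sitemap.xml",
    ]
    for path in candidates:
        candidate = urljoin(base_url.rstrip("/") + "/", path)
        if url_exists(candidate):  # a HEAD/GET probe, e.g. something like _check_url_exists
            return candidate
    return None                    # mirrors discover_files() returning None when nothing is found
```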
python/tests/test_llms_txt_link_following.py (new file, 217 lines)
@@ -0,0 +1,217 @@
"""Integration tests for llms.txt link following functionality."""
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from src.server.services.crawling.crawling_service import CrawlingService


class TestLlmsTxtLinkFollowing:
    """Test suite for llms.txt link following feature."""

    @pytest.fixture
    def service(self):
        """Create a CrawlingService instance for testing."""
        return CrawlingService(crawler=None, supabase_client=None)

    @pytest.fixture
    def supabase_llms_content(self):
        """Return the actual Supabase llms.txt content."""
        return """# Supabase Docs

- [Supabase Guides](https://supabase.com/llms/guides.txt)
- [Supabase Reference (JavaScript)](https://supabase.com/llms/js.txt)
- [Supabase Reference (Dart)](https://supabase.com/llms/dart.txt)
- [Supabase Reference (Swift)](https://supabase.com/llms/swift.txt)
- [Supabase Reference (Kotlin)](https://supabase.com/llms/kotlin.txt)
- [Supabase Reference (Python)](https://supabase.com/llms/python.txt)
- [Supabase Reference (C#)](https://supabase.com/llms/csharp.txt)
- [Supabase CLI Reference](https://supabase.com/llms/cli.txt)
"""

    def test_extract_links_from_supabase_llms_txt(self, service, supabase_llms_content):
        """Test that links are correctly extracted from Supabase llms.txt."""
        url = "https://supabase.com/docs/llms.txt"

        extracted_links = service.url_handler.extract_markdown_links_with_text(
            supabase_llms_content, url
        )

        # Should extract 8 links
        assert len(extracted_links) == 8

        # Verify all extracted links
        expected_links = [
            "https://supabase.com/llms/guides.txt",
            "https://supabase.com/llms/js.txt",
            "https://supabase.com/llms/dart.txt",
            "https://supabase.com/llms/swift.txt",
            "https://supabase.com/llms/kotlin.txt",
            "https://supabase.com/llms/python.txt",
            "https://supabase.com/llms/csharp.txt",
            "https://supabase.com/llms/cli.txt",
        ]

        extracted_urls = [link for link, _ in extracted_links]
        assert extracted_urls == expected_links

    def test_all_links_are_llms_variants(self, service, supabase_llms_content):
        """Test that all extracted links are recognized as llms.txt variants."""
        url = "https://supabase.com/docs/llms.txt"

        extracted_links = service.url_handler.extract_markdown_links_with_text(
            supabase_llms_content, url
        )

        # All links should be recognized as llms variants
        for link, _ in extracted_links:
            is_llms = service.url_handler.is_llms_variant(link)
            assert is_llms, f"Link {link} should be recognized as llms.txt variant"

    def test_all_links_are_same_domain(self, service, supabase_llms_content):
        """Test that all extracted links are from the same domain."""
        url = "https://supabase.com/docs/llms.txt"
        original_domain = "https://supabase.com"

        extracted_links = service.url_handler.extract_markdown_links_with_text(
            supabase_llms_content, url
        )

        # All links should be from the same domain
        for link, _ in extracted_links:
            is_same = service._is_same_domain_or_subdomain(link, original_domain)
            assert is_same, f"Link {link} should match domain {original_domain}"

    def test_filter_llms_links_from_supabase(self, service, supabase_llms_content):
        """Test the complete filtering logic for Supabase llms.txt."""
        url = "https://supabase.com/docs/llms.txt"
        original_domain = "https://supabase.com"

        # Extract all links
        extracted_links = service.url_handler.extract_markdown_links_with_text(
            supabase_llms_content, url
        )

        # Filter for llms.txt files on same domain (mimics actual code)
        llms_links = []
        for link, text in extracted_links:
            if service.url_handler.is_llms_variant(link):
                if service._is_same_domain_or_subdomain(link, original_domain):
                    llms_links.append((link, text))

        # Should have all 8 links
        assert len(llms_links) == 8, f"Expected 8 llms links, got {len(llms_links)}"

    @pytest.mark.asyncio
    async def test_llms_txt_link_following_integration(self, service, supabase_llms_content):
        """Integration test for the complete llms.txt link following flow."""
        url = "https://supabase.com/docs/llms.txt"

        # Mock the crawl_batch_with_progress to verify it's called with correct URLs
        mock_batch_results = [
            {'url': f'https://supabase.com/llms/{name}.txt', 'markdown': f'# {name}', 'title': f'{name}'}
            for name in ['guides', 'js', 'dart', 'swift', 'kotlin', 'python', 'csharp', 'cli']
        ]

        service.crawl_batch_with_progress = AsyncMock(return_value=mock_batch_results)
        service.crawl_markdown_file = AsyncMock(return_value=[{
            'url': url,
            'markdown': supabase_llms_content,
            'title': 'Supabase Docs'
        }])

        # Create progress tracker mock
        service.progress_tracker = MagicMock()
        service.progress_tracker.update = AsyncMock()

        # Simulate the request that would come from orchestration
        request = {
            "is_discovery_target": True,
            "original_domain": "https://supabase.com",
            "max_concurrent": 5
        }

        # Call the actual crawl method
        crawl_results, crawl_type = await service._crawl_by_url_type(url, request)

        # Verify batch crawl was called with the 8 llms.txt URLs
        service.crawl_batch_with_progress.assert_called_once()
        call_args = service.crawl_batch_with_progress.call_args
        crawled_urls = call_args[0][0]  # First positional argument

        assert len(crawled_urls) == 8, f"Should crawl 8 linked files, got {len(crawled_urls)}"

        expected_urls = [
            "https://supabase.com/llms/guides.txt",
            "https://supabase.com/llms/js.txt",
            "https://supabase.com/llms/dart.txt",
            "https://supabase.com/llms/swift.txt",
            "https://supabase.com/llms/kotlin.txt",
            "https://supabase.com/llms/python.txt",
            "https://supabase.com/llms/csharp.txt",
            "https://supabase.com/llms/cli.txt",
        ]

        assert set(crawled_urls) == set(expected_urls)

        # Verify total results include main file + linked pages
        assert len(crawl_results) == 9, f"Should have 9 total pages (1 main + 8 linked), got {len(crawl_results)}"

        # Verify crawl type
        assert crawl_type == "llms_txt_with_linked_pages"

    def test_external_llms_links_are_filtered(self, service):
        """Test that external domain llms.txt links are filtered out."""
        content = """# Test llms.txt

- [Internal Link](https://supabase.com/llms/internal.txt)
- [External Link](https://external.com/llms/external.txt)
- [Another Internal](https://docs.supabase.com/llms/docs.txt)
"""
        url = "https://supabase.com/llms.txt"
        original_domain = "https://supabase.com"

        extracted_links = service.url_handler.extract_markdown_links_with_text(content, url)

        # Filter for same-domain llms links
        llms_links = []
        for link, text in extracted_links:
            if service.url_handler.is_llms_variant(link):
                if service._is_same_domain_or_subdomain(link, original_domain):
                    llms_links.append((link, text))

        # Should only have 2 links (internal and subdomain), external filtered out
        assert len(llms_links) == 2

        urls = [link for link, _ in llms_links]
        assert "https://supabase.com/llms/internal.txt" in urls
        assert "https://docs.supabase.com/llms/docs.txt" in urls
        assert "https://external.com/llms/external.txt" not in urls

    def test_non_llms_links_are_filtered(self, service):
        """Test that non-llms.txt links are filtered out."""
        content = """# Test llms.txt

- [LLMs Link](https://supabase.com/llms/guide.txt)
- [Regular Doc](https://supabase.com/docs/guide)
- [PDF File](https://supabase.com/docs/guide.pdf)
- [Another LLMs](https://supabase.com/llms/api.txt)
"""
        url = "https://supabase.com/llms.txt"
        original_domain = "https://supabase.com"

        extracted_links = service.url_handler.extract_markdown_links_with_text(content, url)

        # Filter for llms links only
        llms_links = []
        for link, text in extracted_links:
            if service.url_handler.is_llms_variant(link):
                if service._is_same_domain_or_subdomain(link, original_domain):
                    llms_links.append((link, text))

        # Should only have 2 llms.txt links
|
||||||
|
assert len(llms_links) == 2
|
||||||
|
|
||||||
|
urls = [link for link, _ in llms_links]
|
||||||
|
assert "https://supabase.com/llms/guide.txt" in urls
|
||||||
|
assert "https://supabase.com/llms/api.txt" in urls
|
||||||
|
assert "https://supabase.com/docs/guide" not in urls
|
||||||
|
assert "https://supabase.com/docs/guide.pdf" not in urls
|
||||||
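Taken together, these tests pin down the llms.txt discovery behavior: crawl the main llms.txt file, extract its markdown links, keep only llms.txt variants on the same domain or a subdomain, batch-crawl those, and return the main page plus the linked pages under the crawl type "llms_txt_with_linked_pages". The sketch below only illustrates that flow under those assumptions; `follow_llms_txt_links` and `_same_domain_or_subdomain` are hypothetical names, not the service's actual `_crawl_by_url_type` internals.

```python
from urllib.parse import urlparse


def _same_domain_or_subdomain(link: str, original_domain: str) -> bool:
    # Hypothetical stand-in for the service's private helper: exact host match
    # or a subdomain of the original host (e.g. docs.supabase.com under supabase.com).
    link_host = urlparse(link).netloc.lower()
    base_host = urlparse(original_domain).netloc.lower()
    return link_host == base_host or link_host.endswith("." + base_host)


async def follow_llms_txt_links(url, content, url_handler, crawl_batch, original_domain):
    """Illustrative sketch of the link-following flow the tests above assert."""
    links = url_handler.extract_markdown_links_with_text(content, url)
    targets = [
        link
        for link, _text in links
        if url_handler.is_llms_variant(link)
        and _same_domain_or_subdomain(link, original_domain)
    ]
    linked_pages = await crawl_batch(targets)  # e.g. crawl_batch_with_progress
    main_page = {"url": url, "markdown": content}
    return [main_page, *linked_pages], "llms_txt_with_linked_pages"
```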
@@ -122,4 +122,120 @@ class TestURLHandler:
        # Should not transform non-GitHub URLs
        other = "https://example.com/file"
        assert handler.transform_github_url(other) == other

    def test_is_robots_txt(self):
        """Test robots.txt detection."""
        handler = URLHandler()

        # Standard robots.txt URLs
        assert handler.is_robots_txt("https://example.com/robots.txt") is True
        assert handler.is_robots_txt("http://example.com/robots.txt") is True
        assert handler.is_robots_txt("https://sub.example.com/robots.txt") is True

        # Case sensitivity
        assert handler.is_robots_txt("https://example.com/ROBOTS.TXT") is True
        assert handler.is_robots_txt("https://example.com/Robots.Txt") is True

        # With query parameters (should still be detected)
        assert handler.is_robots_txt("https://example.com/robots.txt?v=1") is True
        assert handler.is_robots_txt("https://example.com/robots.txt#section") is True

        # Not robots.txt files
        assert handler.is_robots_txt("https://example.com/robots") is False
        assert handler.is_robots_txt("https://example.com/robots.html") is False
        assert handler.is_robots_txt("https://example.com/some-robots.txt") is False
        assert handler.is_robots_txt("https://example.com/path/robots.txt") is False
        assert handler.is_robots_txt("https://example.com/") is False

        # Edge case: malformed URL should not crash
        assert handler.is_robots_txt("not-a-url") is False

    def test_is_llms_variant(self):
        """Test llms file variant detection."""
        handler = URLHandler()

        # Standard llms.txt spec variants (only txt files)
        assert handler.is_llms_variant("https://example.com/llms.txt") is True
        assert handler.is_llms_variant("https://example.com/llms-full.txt") is True

        # Case sensitivity
        assert handler.is_llms_variant("https://example.com/LLMS.TXT") is True
        assert handler.is_llms_variant("https://example.com/LLMS-FULL.TXT") is True

        # With paths (should still detect)
        assert handler.is_llms_variant("https://example.com/docs/llms.txt") is True
        assert handler.is_llms_variant("https://example.com/public/llms-full.txt") is True

        # With query parameters
        assert handler.is_llms_variant("https://example.com/llms.txt?version=1") is True
        assert handler.is_llms_variant("https://example.com/llms-full.txt#section") is True

        # Not llms files
        assert handler.is_llms_variant("https://example.com/llms") is False
        assert handler.is_llms_variant("https://example.com/llms.html") is False
        assert handler.is_llms_variant("https://example.com/my-llms.txt") is False
        assert handler.is_llms_variant("https://example.com/llms-guide.txt") is False
        assert handler.is_llms_variant("https://example.com/readme.txt") is False

        # Edge case: malformed URL should not crash
        assert handler.is_llms_variant("not-a-url") is False

    def test_is_well_known_file(self):
        """Test .well-known file detection."""
        handler = URLHandler()

        # Standard .well-known files
        assert handler.is_well_known_file("https://example.com/.well-known/ai.txt") is True
        assert handler.is_well_known_file("https://example.com/.well-known/security.txt") is True
        assert handler.is_well_known_file("https://example.com/.well-known/change-password") is True

        # Case sensitivity - RFC 8615 requires lowercase .well-known
        assert handler.is_well_known_file("https://example.com/.WELL-KNOWN/ai.txt") is False
        assert handler.is_well_known_file("https://example.com/.Well-Known/ai.txt") is False

        # With query parameters
        assert handler.is_well_known_file("https://example.com/.well-known/ai.txt?v=1") is True
        assert handler.is_well_known_file("https://example.com/.well-known/ai.txt#top") is True

        # Not .well-known files
        assert handler.is_well_known_file("https://example.com/well-known/ai.txt") is False
        assert handler.is_well_known_file("https://example.com/.wellknown/ai.txt") is False
        assert handler.is_well_known_file("https://example.com/docs/.well-known/ai.txt") is False
        assert handler.is_well_known_file("https://example.com/ai.txt") is False
        assert handler.is_well_known_file("https://example.com/") is False

        # Edge case: malformed URL should not crash
        assert handler.is_well_known_file("not-a-url") is False

    def test_get_base_url(self):
        """Test base URL extraction."""
        handler = URLHandler()

        # Standard URLs
        assert handler.get_base_url("https://example.com") == "https://example.com"
        assert handler.get_base_url("https://example.com/") == "https://example.com"
        assert handler.get_base_url("https://example.com/path/to/page") == "https://example.com"
        assert handler.get_base_url("https://example.com/path/to/page?query=1") == "https://example.com"
        assert handler.get_base_url("https://example.com/path/to/page#fragment") == "https://example.com"

        # HTTP vs HTTPS
        assert handler.get_base_url("http://example.com/path") == "http://example.com"
        assert handler.get_base_url("https://example.com/path") == "https://example.com"

        # Subdomains and ports
        assert handler.get_base_url("https://api.example.com/v1/users") == "https://api.example.com"
        assert handler.get_base_url("https://example.com:8080/api") == "https://example.com:8080"
        assert handler.get_base_url("http://localhost:3000/dev") == "http://localhost:3000"

        # Complex cases
        assert handler.get_base_url("https://user:pass@example.com/path") == "https://user:pass@example.com"

        # Edge cases - malformed URLs should return original
        assert handler.get_base_url("not-a-url") == "not-a-url"
        assert handler.get_base_url("") == ""
        assert handler.get_base_url("ftp://example.com/file") == "ftp://example.com"

        # Missing scheme or netloc
        assert handler.get_base_url("//example.com/path") == "//example.com/path"  # Should return original
        assert handler.get_base_url("/path/to/resource") == "/path/to/resource"  # Should return original
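These assertions effectively specify the URL classification helpers: root-level robots.txt detection (case-insensitive, query and fragment ignored), llms.txt / llms-full.txt variants matched on the final path segment only, lowercase root-level /.well-known/ paths per RFC 8615, and a base-URL extractor that falls back to the original string when scheme or host is missing. A minimal sketch that satisfies these cases, built on `urllib.parse`, is shown below; it is an assumed illustration, not necessarily the shipped `URLHandler` implementation.

```python
from urllib.parse import urlparse


class URLHandlerSketch:
    """Illustrative equivalents of the helpers exercised above (assumed, not the real class)."""

    def is_robots_txt(self, url: str) -> bool:
        # Root-level /robots.txt only; case-insensitive; query/fragment ignored.
        return urlparse(url).path.lower() == "/robots.txt"

    def is_llms_variant(self, url: str) -> bool:
        # Final path segment must be exactly llms.txt or llms-full.txt (case-insensitive).
        filename = urlparse(url).path.rsplit("/", 1)[-1].lower()
        return filename in {"llms.txt", "llms-full.txt"}

    def is_well_known_file(self, url: str) -> bool:
        # RFC 8615: lowercase, root-level /.well-known/ prefix with something after it.
        path = urlparse(url).path
        return path.startswith("/.well-known/") and len(path) > len("/.well-known/")

    def get_base_url(self, url: str) -> str:
        # scheme://netloc when both are present; otherwise return the input unchanged.
        parsed = urlparse(url)
        if parsed.scheme and parsed.netloc:
            return f"{parsed.scheme}://{parsed.netloc}"
        return url
```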
29 python/uv.lock generated
@@ -257,6 +257,7 @@ server = [
    { name = "python-multipart" },
    { name = "slowapi" },
    { name = "supabase" },
    { name = "tldextract" },
    { name = "uvicorn" },
    { name = "watchfiles" },
]
@@ -362,6 +363,7 @@ server = [
    { name = "python-multipart", specifier = ">=0.0.20" },
    { name = "slowapi", specifier = ">=0.1.9" },
    { name = "supabase", specifier = "==2.15.1" },
    { name = "tldextract", specifier = ">=5.0.0" },
    { name = "uvicorn", specifier = ">=0.24.0" },
    { name = "watchfiles", specifier = ">=0.18" },
]
@@ -2646,6 +2648,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 },
]

[[package]]
name = "requests-file"
version = "3.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/fe/5e/2aca791207e542a16a8cc91fd0e19f5c26f4dff030ee3062deb5606f84ae/requests_file-3.0.0.tar.gz", hash = "sha256:68789589cfde7098e8933fe3e69bbd864f7f0c22f118937b424d94d0e1b7760f", size = 6897 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/e4/85/689c218feb21a66919bd667969d4ed60a64db67f6ea5ceb00c9795ae19b0/requests_file-3.0.0-py2.py3-none-any.whl", hash = "sha256:aca222ec94a19310be2a0ed6bdcdebb09058b0f6c3e984af56361c8fca59653c", size = 4486 },
]

[[package]]
name = "rich"
version = "14.0.0"
@@ -3131,6 +3145,21 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/de/a8/8f499c179ec900783ffe133e9aab10044481679bb9aad78436d239eee716/tiktoken-0.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:5ea0edb6f83dc56d794723286215918c1cde03712cbbafa0348b33448faf5b95", size = 894669 },
]

[[package]]
name = "tldextract"
version = "5.3.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
    { name = "filelock" },
    { name = "idna" },
    { name = "requests" },
    { name = "requests-file" },
]
sdist = { url = "https://files.pythonhosted.org/packages/97/78/182641ea38e3cfd56e9c7b3c0d48a53d432eea755003aa544af96403d4ac/tldextract-5.3.0.tar.gz", hash = "sha256:b3d2b70a1594a0ecfa6967d57251527d58e00bb5a91a74387baa0d87a0678609", size = 128502 }
wheels = [
    { url = "https://files.pythonhosted.org/packages/67/7c/ea488ef48f2f544566947ced88541bc45fae9e0e422b2edbf165ee07da99/tldextract-5.3.0-py3-none-any.whl", hash = "sha256:f70f31d10b55c83993f55e91ecb7c5d84532a8972f22ec578ecfbe5ea2292db2", size = 107384 },
]

[[package]]
name = "tokenizers"
version = "0.21.1"