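"""Streamlit "Documentation" tab for the Archon UI.

Lets the user crawl and index the Pydantic AI documentation into Supabase,
monitor crawl progress, clear previously indexed records, and view basic
database statistics.
"""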
import streamlit as st
import time
import sys
import os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from archon.crawl_pydantic_ai_docs import start_crawl_with_requests, clear_existing_records
from utils.utils import get_env_var, create_new_tab_button

def documentation_tab(supabase_client):
    """Display the documentation interface"""
    st.header("Documentation")

    # Create tabs for different documentation sources
    doc_tabs = st.tabs(["Pydantic AI Docs", "Future Sources"])

    with doc_tabs[0]:
        st.subheader("Pydantic AI Documentation")
        st.markdown("""
        This section allows you to crawl and index the Pydantic AI documentation.
        The crawler will:

        1. Fetch URLs from the Pydantic AI sitemap
        2. Crawl each page and extract content
        3. Split content into chunks
        4. Generate embeddings for each chunk
        5. Store the chunks in the Supabase database

        This process may take several minutes depending on the number of pages.
        """)

        # Check if the database is configured
        supabase_url = get_env_var("SUPABASE_URL")
        supabase_key = get_env_var("SUPABASE_SERVICE_KEY")

        if not supabase_url or not supabase_key:
            st.warning("⚠️ Supabase is not configured. Please set up your environment variables first.")
            create_new_tab_button("Go to Environment Section", "Environment", key="goto_env_from_docs")
        else:
            # Initialize session state for tracking crawl progress
            if "crawl_tracker" not in st.session_state:
                st.session_state.crawl_tracker = None

            if "crawl_status" not in st.session_state:
                st.session_state.crawl_status = None

            if "last_update_time" not in st.session_state:
                st.session_state.last_update_time = time.time()

            # Create columns for the buttons
            col1, col2 = st.columns(2)

            with col1:
                # Button to start crawling
                if st.button("Crawl Pydantic AI Docs", key="crawl_pydantic") and not (st.session_state.crawl_tracker and st.session_state.crawl_tracker.is_running):
                    try:
                        # Define a callback function to update the session state
                        def update_progress(status):
                            st.session_state.crawl_status = status

                        # Start the crawling process in a separate thread
                        st.session_state.crawl_tracker = start_crawl_with_requests(update_progress)
                        st.session_state.crawl_status = st.session_state.crawl_tracker.get_status()

                        # Force a rerun to start showing progress
                        st.rerun()
                    except Exception as e:
                        st.error(f"❌ Error starting crawl: {str(e)}")

            with col2:
                # Button to clear existing Pydantic AI docs
                if st.button("Clear Pydantic AI Docs", key="clear_pydantic"):
                    with st.spinner("Clearing existing Pydantic AI docs..."):
                        try:
                            # Run the function to clear records
                            clear_existing_records()
                            st.success("✅ Successfully cleared existing Pydantic AI docs from the database.")

                            # Force a rerun to update the UI
                            st.rerun()
                        except Exception as e:
                            st.error(f"❌ Error clearing Pydantic AI docs: {str(e)}")

            # Display crawling progress if a crawl is in progress or has completed
            if st.session_state.crawl_tracker:
                # Create a container for the progress information
                progress_container = st.container()

                with progress_container:
                    # Get the latest status
                    current_time = time.time()
                    # Update status every second
                    if current_time - st.session_state.last_update_time >= 1:
                        st.session_state.crawl_status = st.session_state.crawl_tracker.get_status()
                        st.session_state.last_update_time = current_time
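                    # The tracker's status is expected to be a dict with at least:
                    # urls_found, urls_processed, urls_succeeded, urls_failed, logs,
                    # is_running and end_time (these are the keys read below).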
                    status = st.session_state.crawl_status

                    # Display a progress bar
                    if status and status["urls_found"] > 0:
                        progress = status["urls_processed"] / status["urls_found"]
                        st.progress(progress)

                    # Display status metrics
                    col1, col2, col3, col4 = st.columns(4)
                    if status:
                        col1.metric("URLs Found", status["urls_found"])
                        col2.metric("URLs Processed", status["urls_processed"])
                        col3.metric("Successful", status["urls_succeeded"])
                        col4.metric("Failed", status["urls_failed"])
                    else:
                        col1.metric("URLs Found", 0)
                        col2.metric("URLs Processed", 0)
                        col3.metric("Successful", 0)
                        col4.metric("Failed", 0)

                    # Display logs in an expander
                    with st.expander("Crawling Logs", expanded=True):
                        if status and "logs" in status:
                            logs_text = "\n".join(status["logs"][-20:])  # Show last 20 logs
                            st.code(logs_text)
                        else:
                            st.code("No logs available yet...")

                    # Show completion message
                    if status and not status["is_running"] and status["end_time"]:
                        if status["urls_failed"] == 0:
                            st.success("✅ Crawling process completed successfully!")
                        else:
                            st.warning(f"⚠️ Crawling process completed with {status['urls_failed']} failed URLs.")

                    # Auto-refresh while crawling is in progress
                    if not status or status["is_running"]:
                        st.rerun()

            # Display database statistics
            st.subheader("Database Statistics")
            try:
                # Query the count of Pydantic AI docs
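                # "metadata->>source" uses the Postgres ->> JSON operator to match rows
                # whose metadata JSON has source equal to "pydantic_ai_docs".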
                result = supabase_client.table("site_pages").select("count", count="exact").eq("metadata->>source", "pydantic_ai_docs").execute()
                count = result.count if hasattr(result, "count") else 0

                # Display the count
                st.metric("Pydantic AI Docs Chunks", count)

                # Add a button to view the data
                if count > 0 and st.button("View Indexed Data", key="view_pydantic_data"):
                    # Query a sample of the data
                    sample_data = supabase_client.table("site_pages").select("url,title,summary,chunk_number").eq("metadata->>source", "pydantic_ai_docs").limit(10).execute()

                    # Display the sample data
                    st.dataframe(sample_data.data)
                    st.info("Showing up to 10 sample records. The database contains more records.")
            except Exception as e:
                st.error(f"Error querying database: {str(e)}")

    with doc_tabs[1]:
        st.info("Additional documentation sources will be available in future updates.")