# archon/original_archon/streamlit_pages/documentation.py

import streamlit as st
import time
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from archon.crawl_pydantic_ai_docs import start_crawl_with_requests, clear_existing_records
from utils.utils import get_env_var, create_new_tab_button


def documentation_tab(supabase_client):
    """Display the documentation interface"""
    st.header("Documentation")

    # Create tabs for different documentation sources
    doc_tabs = st.tabs(["Pydantic AI Docs", "Future Sources"])

    with doc_tabs[0]:
        st.subheader("Pydantic AI Documentation")
        st.markdown("""
        This section allows you to crawl and index the Pydantic AI documentation.
        The crawler will:

        1. Fetch URLs from the Pydantic AI sitemap
        2. Crawl each page and extract content
        3. Split content into chunks
        4. Generate embeddings for each chunk
        5. Store the chunks in the Supabase database

        This process may take several minutes depending on the number of pages.
        """)

        # Check if the database is configured
        supabase_url = get_env_var("SUPABASE_URL")
        supabase_key = get_env_var("SUPABASE_SERVICE_KEY")

        if not supabase_url or not supabase_key:
            st.warning("⚠️ Supabase is not configured. Please set up your environment variables first.")
            create_new_tab_button("Go to Environment Section", "Environment", key="goto_env_from_docs")
        else:
            # Initialize session state for tracking crawl progress
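            # st.session_state persists across Streamlit reruns, so the tracker and its
            # latest status survive each st.rerun() triggered while the crawl is running.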
if "crawl_tracker" not in st.session_state:
st.session_state.crawl_tracker = None
if "crawl_status" not in st.session_state:
st.session_state.crawl_status = None
if "last_update_time" not in st.session_state:
st.session_state.last_update_time = time.time()
# Create columns for the buttons
col1, col2 = st.columns(2)
with col1:
# Button to start crawling
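                # The is_running check prevents launching a second crawl while one is active.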
if st.button("Crawl Pydantic AI Docs", key="crawl_pydantic") and not (st.session_state.crawl_tracker and st.session_state.crawl_tracker.is_running):
try:
# Define a callback function to update the session state
                        def update_progress(status):
                            st.session_state.crawl_status = status

                        # Start the crawling process in a separate thread
                        st.session_state.crawl_tracker = start_crawl_with_requests(update_progress)
                        st.session_state.crawl_status = st.session_state.crawl_tracker.get_status()

                        # Force a rerun to start showing progress
                        st.rerun()
                    except Exception as e:
                        st.error(f"❌ Error starting crawl: {str(e)}")
            with col2:
                # Button to clear existing Pydantic AI docs
                if st.button("Clear Pydantic AI Docs", key="clear_pydantic"):
                    with st.spinner("Clearing existing Pydantic AI docs..."):
                        try:
                            # Run the function to clear records
                            clear_existing_records()
                            st.success("✅ Successfully cleared existing Pydantic AI docs from the database.")

                            # Force a rerun to update the UI
                            st.rerun()
                        except Exception as e:
                            st.error(f"❌ Error clearing Pydantic AI docs: {str(e)}")
            # Display crawling progress if a crawl is in progress or has completed
            if st.session_state.crawl_tracker:
                # Create a container for the progress information
                progress_container = st.container()
                with progress_container:
                    # Get the latest status
                    current_time = time.time()

                    # Update status every second
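                    # so repeated reruns don't call get_status() on every script execution.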
                    if current_time - st.session_state.last_update_time >= 1:
                        st.session_state.crawl_status = st.session_state.crawl_tracker.get_status()
                        st.session_state.last_update_time = current_time

                    status = st.session_state.crawl_status

                    # Display a progress bar
                    if status and status["urls_found"] > 0:
                        progress = status["urls_processed"] / status["urls_found"]
                        st.progress(progress)

                    # Display status metrics
                    col1, col2, col3, col4 = st.columns(4)
                    if status:
                        col1.metric("URLs Found", status["urls_found"])
                        col2.metric("URLs Processed", status["urls_processed"])
                        col3.metric("Successful", status["urls_succeeded"])
                        col4.metric("Failed", status["urls_failed"])
                    else:
                        col1.metric("URLs Found", 0)
                        col2.metric("URLs Processed", 0)
                        col3.metric("Successful", 0)
                        col4.metric("Failed", 0)

                    # Display logs in an expander
                    with st.expander("Crawling Logs", expanded=True):
                        if status and "logs" in status:
                            logs_text = "\n".join(status["logs"][-20:])  # Show last 20 logs
                            st.code(logs_text)
                        else:
                            st.code("No logs available yet...")
                    # Show completion message
                    if status and not status["is_running"] and status["end_time"]:
                        if status["urls_failed"] == 0:
                            st.success("✅ Crawling process completed successfully!")
                        else:
                            st.warning(f"⚠️ Crawling process completed with {status['urls_failed']} failed URLs.")

                    # Auto-refresh while crawling is in progress
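                    # st.rerun() re-executes the script from the top; combined with the
                    # 1-second status poll above, this forms a simple polling loop.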
                    if not status or status["is_running"]:
                        st.rerun()

            # Display database statistics
            st.subheader("Database Statistics")
            try:
                # Query the count of Pydantic AI docs
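                # "metadata->>source" uses PostgreSQL's ->> operator to read the JSONB
                # "source" field as text, and count="exact" asks Supabase/PostgREST to
                # return an exact row count for the filter.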
                result = supabase_client.table("site_pages").select("count", count="exact").eq("metadata->>source", "pydantic_ai_docs").execute()
                count = result.count if hasattr(result, "count") else 0

                # Display the count
                st.metric("Pydantic AI Docs Chunks", count)

                # Add a button to view the data
                if count > 0 and st.button("View Indexed Data", key="view_pydantic_data"):
                    # Query a sample of the data
                    sample_data = supabase_client.table("site_pages").select("url,title,summary,chunk_number").eq("metadata->>source", "pydantic_ai_docs").limit(10).execute()

                    # Display the sample data
                    st.dataframe(sample_data.data)
                    st.info("Showing up to 10 sample records; the database may contain more.")
            except Exception as e:
                st.error(f"Error querying database: {str(e)}")
    with doc_tabs[1]:
        st.info("Additional documentation sources will be available in future updates.")