import axios from "axios";
import { useEffect, useState } from "react";
import { useNavigate } from "react-router-dom";
import StatsStyling from "../styles/stats_styling";

const styles = StatsStyling;
const API_BASE_URL = import.meta.env.VITE_BACKEND_URL;

/**
 * Shape this page expects for each entry of the `/datasets/sources` response.
 * The capability flags are read in both snake_case and camelCase spellings.
 */
type SourceOption = {
  id: string;
  label: string;
  search_enabled?: boolean;
  categories_enabled?: boolean;
  searchEnabled?: boolean;
  categoriesEnabled?: boolean;
};

/**
 * Form state for one source row. Every field is held as a string (including
 * `limit`) and only converted when the scrape request is built.
 */
type SourceConfig = {
  sourceName: string;
  limit: string;
  search: string;
  category: string;
};

/** Creates a source row with the default limit of "100" and no search/category filter. */
const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({
  sourceName,
  limit: "100",
  search: "",
  category: "",
});

/**
 * Whether the source advertises search support. The snake_case flag wins
 * whenever it is present (even when `false`); camelCase is only a fallback.
 */
const supportsSearch = (source?: SourceOption): boolean =>
  Boolean(source?.search_enabled ?? source?.searchEnabled);

/**
 * Whether the source advertises category support. The snake_case flag wins
 * whenever it is present (even when `false`); camelCase is only a fallback.
 */
const supportsCategories = (source?: SourceOption): boolean =>
  Boolean(source?.categories_enabled ?? source?.categoriesEnabled);

/**
 * Extracts a displayable detail from a failed axios request.
 *
 * @returns the response body's `error` field when it is a truthy primitive,
 *   otherwise axios' own message; `undefined` when the value is not an axios error.
 */
const describeAxiosError = (requestError: unknown): string | undefined => {
  if (!axios.isAxiosError(requestError)) {
    return undefined;
  }

  const serverError: unknown = requestError.response?.data?.error;
  // Structured (object) errors would stringify to "[object Object]", so only
  // primitives are shown; anything else falls back to the axios message.
  if (serverError && typeof serverError !== "object") {
    return String(serverError);
  }
  return requestError.message;
};

/**
 * Page that lets the user name a dataset, configure one or more scrape
 * sources, and queue an auto-scrape job via `POST /datasets/scrape`.
 */
const AutoScrapePage = () => {
  const navigate = useNavigate();
  const [datasetName, setDatasetName] = useState("");
  // Explicit element types: a bare `useState([])` infers `never[]` under strict mode.
  const [sourceOptions, setSourceOptions] = useState<SourceOption[]>([]);
  const [sourceConfigs, setSourceConfigs] = useState<SourceConfig[]>([]);
  const [returnMessage, setReturnMessage] = useState("");
  const [isLoadingSources, setIsLoadingSources] = useState(true);
  const [isSubmitting, setIsSubmitting] = useState(false);
  const [hasError, setHasError] = useState(false);

  // Load the available sources once on mount and seed the form with one row.
  useEffect(() => {
    // Flipped by the cleanup so a response that arrives after unmount (or after
    // a StrictMode re-run of this effect) does not touch state.
    let isActive = true;

    axios
      .get(`${API_BASE_URL}/datasets/sources`)
      .then((response) => {
        if (!isActive) {
          return;
        }
        // Anything that is not an array is treated as "no sources" so later
        // `.find` / `.length` calls on the state stay safe.
        const options: SourceOption[] = Array.isArray(response.data) ? response.data : [];
        setSourceOptions(options);
        setSourceConfigs([buildEmptySourceConfig(options[0]?.id ?? "")]);
      })
      .catch((requestError: unknown) => {
        if (!isActive) {
          return;
        }
        setHasError(true);
        const detail = describeAxiosError(requestError);
        if (detail === undefined) {
          setReturnMessage("Failed to load available sources.");
        } else {
          setReturnMessage(`Failed to load available sources: ${detail}`);
        }
      })
      .finally(() => {
        if (isActive) {
          setIsLoadingSources(false);
        }
      });

    return () => {
      isActive = false;
    };
  }, []);

  /**
   * Updates one field of the source row at `index`. Switching `sourceName`
   * also clears `search` and `category`, since they belonged to the old source.
   */
  const updateSourceConfig = (index: number, field: keyof SourceConfig, value: string) => {
    setSourceConfigs((previous) =>
      previous.map((config, configIndex) => {
        if (configIndex !== index) {
          return config;
        }
        return field === "sourceName"
          ? { ...config, sourceName: value, search: "", category: "" }
          : { ...config, [field]: value };
      })
    );
  };

  /** Looks up the loaded source option whose `id` matches `sourceName`. */
  const getSourceOption = (sourceName: string) =>
    sourceOptions.find((option) => option.id === sourceName);

  /** Appends a new row preselected to the first available source. */
  const addSourceConfig = () => {
    setSourceConfigs((previous) => [
      ...previous,
      buildEmptySourceConfig(sourceOptions[0]?.id ?? ""),
    ]);
  };

  /** Removes the row at `index`. */
  const removeSourceConfig = (index: number) => {
    setSourceConfigs((previous) => previous.filter((_, configIndex) => configIndex !== index));
  };

  /**
   * Validates the form, posts the scrape request with the stored access token,
   * and on success redirects to the dataset's status page after a short delay.
   * Validation and request failures are reported through `returnMessage` / `hasError`.
   */
  const autoScrape = async () => {
    const token = localStorage.getItem("access_token");
    if (!token) {
      setHasError(true);
      setReturnMessage("You must be signed in to auto scrape a dataset.");
      return;
    }

    const normalizedDatasetName = datasetName.trim();
    if (!normalizedDatasetName) {
      setHasError(true);
      setReturnMessage("Please add a dataset name before continuing.");
      return;
    }

    if (sourceConfigs.length === 0) {
      setHasError(true);
      setReturnMessage("Please add at least one source.");
      return;
    }

    const normalizedSources = sourceConfigs.map((source) => {
      const sourceOption = getSourceOption(source.sourceName);
      return {
        name: source.sourceName,
        // `||` (not `??`) is deliberate: an empty limit field falls back to 100.
        limit: Number(source.limit || 100),
        // Filters are only sent when the source supports them and they are non-blank.
        search: supportsSearch(sourceOption) ? source.search.trim() || undefined : undefined,
        category: supportsCategories(sourceOption)
          ? source.category.trim() || undefined
          : undefined,
      };
    });

    const invalidSource = normalizedSources.find(
      (source) => !source.name || !Number.isFinite(source.limit) || source.limit <= 0
    );
    if (invalidSource) {
      setHasError(true);
      setReturnMessage("Every source needs a name and a limit greater than zero.");
      return;
    }

    try {
      setIsSubmitting(true);
      setHasError(false);
      setReturnMessage("");

      const response = await axios.post(
        `${API_BASE_URL}/datasets/scrape`,
        {
          name: normalizedDatasetName,
          sources: normalizedSources,
        },
        {
          headers: {
            Authorization: `Bearer ${token}`,
          },
        }
      );

      // Guard against a missing/non-numeric id so we never show "dataset #NaN"
      // or redirect to `/dataset/NaN/status`.
      const rawDatasetId: unknown = response.data?.dataset_id;
      const datasetId = Number(rawDatasetId);
      if (rawDatasetId == null || !Number.isFinite(datasetId)) {
        setHasError(true);
        setReturnMessage(
          "Auto scrape request succeeded, but the response did not include a valid dataset id."
        );
        return;
      }

      setReturnMessage(
        `Auto scrape queued successfully (dataset #${datasetId}). Redirecting to processing status...`
      );
      // Brief delay so the success message is visible before navigating away.
      setTimeout(() => {
        navigate(`/dataset/${datasetId}/status`);
      }, 400);
    } catch (requestError: unknown) {
      setHasError(true);
      const detail = describeAxiosError(requestError);
      if (detail === undefined) {
        setReturnMessage("Auto scrape failed due to an unexpected error.");
      } else {
        setReturnMessage(`Auto scrape failed: ${detail || "Auto scrape failed."}`);
      }
    } finally {
      setIsSubmitting(false);
    }
  };

  // NOTE(review): the JSX element tags in the markup below are missing from this
  // copy of the file (only text nodes and handler fragments remain). The content
  // is left untouched and must be restored from the original markup.
  return (

Auto Scrape Dataset

Select sources and scrape settings, then queue processing automatically.

Dataset Name

Use a clear label so you can identify this run later.

setDatasetName(event.target.value)} />

Sources

Configure source, limit, optional search, and optional category.

{isLoadingSources &&

Loading sources...

} {!isLoadingSources && sourceOptions.length === 0 && (

No source connectors are currently available.

)} {!isLoadingSources && sourceOptions.length > 0 && (
{sourceConfigs.map((source, index) => { const sourceOption = getSourceOption(source.sourceName); const searchEnabled = supportsSearch(sourceOption); const categoriesEnabled = supportsCategories(sourceOption); return (
updateSourceConfig(index, "limit", event.target.value)} /> updateSourceConfig(index, "search", event.target.value)} /> updateSourceConfig(index, "category", event.target.value)} /> {sourceConfigs.length > 1 && ( )}
); })}
)}
{returnMessage || "After queueing, your dataset is fetched and processed in the background automatically."}
  );
};

export default AutoScrapePage;