Compare commits
4 Commits
01d6bd0164
...
d96f459104
| Author | SHA1 | Date | |
|---|---|---|---|
| d96f459104 | |||
| 162a4de64e | |||
| 6684780d23 | |||
| c12f1b4371 |
@@ -9,6 +9,10 @@ const API_BASE_URL = import.meta.env.VITE_BACKEND_URL;
|
|||||||
type SourceOption = {
|
type SourceOption = {
|
||||||
id: string;
|
id: string;
|
||||||
label: string;
|
label: string;
|
||||||
|
search_enabled?: boolean;
|
||||||
|
categories_enabled?: boolean;
|
||||||
|
searchEnabled?: boolean;
|
||||||
|
categoriesEnabled?: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
type SourceConfig = {
|
type SourceConfig = {
|
||||||
@@ -25,6 +29,12 @@ const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({
|
|||||||
category: "",
|
category: "",
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const supportsSearch = (source?: SourceOption): boolean =>
|
||||||
|
Boolean(source?.search_enabled ?? source?.searchEnabled);
|
||||||
|
|
||||||
|
const supportsCategories = (source?: SourceOption): boolean =>
|
||||||
|
Boolean(source?.categories_enabled ?? source?.categoriesEnabled);
|
||||||
|
|
||||||
const AutoScrapePage = () => {
|
const AutoScrapePage = () => {
|
||||||
const navigate = useNavigate();
|
const navigate = useNavigate();
|
||||||
const [datasetName, setDatasetName] = useState("");
|
const [datasetName, setDatasetName] = useState("");
|
||||||
@@ -63,11 +73,18 @@ const AutoScrapePage = () => {
|
|||||||
const updateSourceConfig = (index: number, field: keyof SourceConfig, value: string) => {
|
const updateSourceConfig = (index: number, field: keyof SourceConfig, value: string) => {
|
||||||
setSourceConfigs((previous) =>
|
setSourceConfigs((previous) =>
|
||||||
previous.map((config, configIndex) =>
|
previous.map((config, configIndex) =>
|
||||||
configIndex === index ? { ...config, [field]: value } : config
|
configIndex === index
|
||||||
|
? field === "sourceName"
|
||||||
|
? { ...config, sourceName: value, search: "", category: "" }
|
||||||
|
: { ...config, [field]: value }
|
||||||
|
: config
|
||||||
)
|
)
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const getSourceOption = (sourceName: string) =>
|
||||||
|
sourceOptions.find((option) => option.id === sourceName);
|
||||||
|
|
||||||
const addSourceConfig = () => {
|
const addSourceConfig = () => {
|
||||||
setSourceConfigs((previous) => [
|
setSourceConfigs((previous) => [
|
||||||
...previous,
|
...previous,
|
||||||
@@ -100,12 +117,18 @@ const AutoScrapePage = () => {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const normalizedSources = sourceConfigs.map((source) => ({
|
const normalizedSources = sourceConfigs.map((source) => {
|
||||||
name: source.sourceName,
|
const sourceOption = getSourceOption(source.sourceName);
|
||||||
limit: Number(source.limit || 100),
|
|
||||||
search: source.search.trim() || undefined,
|
return {
|
||||||
category: source.category.trim() || undefined,
|
name: source.sourceName,
|
||||||
}));
|
limit: Number(source.limit || 100),
|
||||||
|
search: supportsSearch(sourceOption) ? source.search.trim() || undefined : undefined,
|
||||||
|
category: supportsCategories(sourceOption)
|
||||||
|
? source.category.trim() || undefined
|
||||||
|
: undefined,
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
const invalidSource = normalizedSources.find(
|
const invalidSource = normalizedSources.find(
|
||||||
(source) => !source.name || !Number.isFinite(source.limit) || source.limit <= 0
|
(source) => !source.name || !Number.isFinite(source.limit) || source.limit <= 0
|
||||||
@@ -212,7 +235,12 @@ const AutoScrapePage = () => {
|
|||||||
|
|
||||||
{!isLoadingSources && sourceOptions.length > 0 && (
|
{!isLoadingSources && sourceOptions.length > 0 && (
|
||||||
<div style={{ display: "flex", flexDirection: "column", gap: 10 }}>
|
<div style={{ display: "flex", flexDirection: "column", gap: 10 }}>
|
||||||
{sourceConfigs.map((source, index) => (
|
{sourceConfigs.map((source, index) => {
|
||||||
|
const sourceOption = getSourceOption(source.sourceName);
|
||||||
|
const searchEnabled = supportsSearch(sourceOption);
|
||||||
|
const categoriesEnabled = supportsCategories(sourceOption);
|
||||||
|
|
||||||
|
return (
|
||||||
<div
|
<div
|
||||||
key={`source-${index}`}
|
key={`source-${index}`}
|
||||||
style={{
|
style={{
|
||||||
@@ -248,16 +276,26 @@ const AutoScrapePage = () => {
|
|||||||
<input
|
<input
|
||||||
type="text"
|
type="text"
|
||||||
value={source.search}
|
value={source.search}
|
||||||
placeholder="Search term (optional)"
|
placeholder={
|
||||||
|
searchEnabled
|
||||||
|
? "Search term (optional)"
|
||||||
|
: "Search not supported for this source"
|
||||||
|
}
|
||||||
style={{ ...styles.input, ...styles.inputFullWidth }}
|
style={{ ...styles.input, ...styles.inputFullWidth }}
|
||||||
|
disabled={!searchEnabled}
|
||||||
onChange={(event) => updateSourceConfig(index, "search", event.target.value)}
|
onChange={(event) => updateSourceConfig(index, "search", event.target.value)}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
<input
|
<input
|
||||||
type="text"
|
type="text"
|
||||||
value={source.category}
|
value={source.category}
|
||||||
placeholder="Category (optional)"
|
placeholder={
|
||||||
|
categoriesEnabled
|
||||||
|
? "Category (optional)"
|
||||||
|
: "Categories not supported for this source"
|
||||||
|
}
|
||||||
style={{ ...styles.input, ...styles.inputFullWidth }}
|
style={{ ...styles.input, ...styles.inputFullWidth }}
|
||||||
|
disabled={!categoriesEnabled}
|
||||||
onChange={(event) => updateSourceConfig(index, "category", event.target.value)}
|
onChange={(event) => updateSourceConfig(index, "category", event.target.value)}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
@@ -271,7 +309,8 @@ const AutoScrapePage = () => {
|
|||||||
</button>
|
</button>
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
))}
|
);
|
||||||
|
})}
|
||||||
|
|
||||||
<button type="button" style={styles.buttonSecondary} onClick={addSourceConfig}>
|
<button type="button" style={styles.buttonSecondary} onClick={addSourceConfig}>
|
||||||
Add another source
|
Add another source
|
||||||
|
|||||||
@@ -119,50 +119,82 @@ def get_user_datasets():
|
|||||||
|
|
||||||
@app.route("/datasets/sources", methods=["GET"])
|
@app.route("/datasets/sources", methods=["GET"])
|
||||||
def get_dataset_sources():
|
def get_dataset_sources():
|
||||||
return jsonify(get_connector_metadata())
|
list_metadata = list(get_connector_metadata().values())
|
||||||
|
return jsonify(list_metadata)
|
||||||
|
|
||||||
@app.route("/datasets/scrape", methods=["POST"])
|
@app.route("/datasets/scrape", methods=["POST"])
|
||||||
@jwt_required()
|
@jwt_required()
|
||||||
def scrape_data():
|
def scrape_data():
|
||||||
data = request.get_json()
|
data = request.get_json()
|
||||||
|
connector_metadata = get_connector_metadata()
|
||||||
|
|
||||||
|
# Strong validation needed, otherwise data goes to Celery and crashes silently
|
||||||
if not data or "sources" not in data:
|
if not data or "sources" not in data:
|
||||||
return jsonify({"error": "Sources must be provided"}), 400
|
return jsonify({"error": "Sources must be provided"}), 400
|
||||||
|
|
||||||
user_id = int(get_jwt_identity())
|
if "name" not in data or not str(data["name"]).strip():
|
||||||
|
return jsonify({"error": "Dataset name is required"}), 400
|
||||||
|
|
||||||
dataset_name = data["name"].strip()
|
dataset_name = data["name"].strip()
|
||||||
|
user_id = int(get_jwt_identity())
|
||||||
|
|
||||||
source_configs = data["sources"]
|
source_configs = data["sources"]
|
||||||
|
|
||||||
if not isinstance(source_configs, list) or len(source_configs) == 0:
|
if not isinstance(source_configs, list) or len(source_configs) == 0:
|
||||||
return jsonify({"error": "Sources must be a non-empty list"}), 400
|
return jsonify({"error": "Sources must be a non-empty list"}), 400
|
||||||
|
|
||||||
# Light Validation
|
|
||||||
for source in source_configs:
|
for source in source_configs:
|
||||||
|
if not isinstance(source, dict):
|
||||||
|
return jsonify({"error": "Each source must be an object"}), 400
|
||||||
|
|
||||||
if "name" not in source:
|
if "name" not in source:
|
||||||
return jsonify({"error": "Each source must contain a name"}), 400
|
return jsonify({"error": "Each source must contain a name"}), 400
|
||||||
if "limit" in source:
|
|
||||||
source["limit"] = int(source["limit"])
|
|
||||||
|
|
||||||
dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list)
|
|
||||||
dataset_manager.set_dataset_status(
|
|
||||||
dataset_id,
|
|
||||||
"fetching",
|
|
||||||
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
|
||||||
fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list)
|
|
||||||
|
|
||||||
return jsonify(
|
if "limit" in source:
|
||||||
{
|
try:
|
||||||
"message": "Dataset queued for processing",
|
source["limit"] = int(source["limit"])
|
||||||
"dataset_id": dataset_id,
|
except (ValueError, TypeError):
|
||||||
"status": "processing",
|
return jsonify({"error": "Limit must be an integer"}), 400
|
||||||
}
|
|
||||||
), 202
|
name = source["name"]
|
||||||
|
|
||||||
|
if name not in connector_metadata:
|
||||||
|
return jsonify({"error": "Source not supported"}), 400
|
||||||
|
|
||||||
|
if "search" in source and not connector_metadata[name]["search_enabled"]:
|
||||||
|
return jsonify({"error": f"Source {name} does not support search"}), 400
|
||||||
|
|
||||||
|
if "category" in source and not connector_metadata[name]["categories_enabled"]:
|
||||||
|
return jsonify({"error": f"Source {name} does not support categories"}), 400
|
||||||
|
|
||||||
|
try:
|
||||||
|
dataset_id = dataset_manager.save_dataset_info(
|
||||||
|
user_id,
|
||||||
|
dataset_name,
|
||||||
|
default_topic_list
|
||||||
|
)
|
||||||
|
|
||||||
|
dataset_manager.set_dataset_status(
|
||||||
|
dataset_id,
|
||||||
|
"fetching",
|
||||||
|
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}"
|
||||||
|
)
|
||||||
|
|
||||||
|
fetch_and_process_dataset.delay(
|
||||||
|
dataset_id,
|
||||||
|
source_configs,
|
||||||
|
default_topic_list
|
||||||
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
return jsonify({"error": "An unexpected error occurred"}), 500
|
return jsonify({"error": "Failed to queue dataset processing"}), 500
|
||||||
|
|
||||||
|
|
||||||
|
return jsonify({
|
||||||
|
"message": "Dataset queued for processing",
|
||||||
|
"dataset_id": dataset_id,
|
||||||
|
"status": "processing"
|
||||||
|
}), 202
|
||||||
|
|
||||||
@app.route("/datasets/upload", methods=["POST"])
|
@app.route("/datasets/upload", methods=["POST"])
|
||||||
@jwt_required()
|
@jwt_required()
|
||||||
|
|||||||
@@ -7,6 +7,9 @@ class BaseConnector(ABC):
|
|||||||
display_name: str # human-readable: "Reddit", "YouTube"
|
display_name: str # human-readable: "Reddit", "YouTube"
|
||||||
required_env: list[str] = [] # env vars needed to activate
|
required_env: list[str] = [] # env vars needed to activate
|
||||||
|
|
||||||
|
search_enabled: bool
|
||||||
|
categories_enabled: bool
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def is_available(cls) -> bool:
|
def is_available(cls) -> bool:
|
||||||
"""Returns True if all required env vars are set."""
|
"""Returns True if all required env vars are set."""
|
||||||
|
|||||||
@@ -19,8 +19,11 @@ class BoardsAPI(BaseConnector):
|
|||||||
source_name: str = "boards.ie"
|
source_name: str = "boards.ie"
|
||||||
display_name: str = "Boards.ie"
|
display_name: str = "Boards.ie"
|
||||||
|
|
||||||
|
categories_enabled: bool = True
|
||||||
|
search_enabled: bool = False
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.url = "https://www.boards.ie"
|
self.base_url = "https://www.boards.ie"
|
||||||
|
|
||||||
def get_new_posts_by_search(self,
|
def get_new_posts_by_search(self,
|
||||||
search: str,
|
search: str,
|
||||||
@@ -31,9 +34,9 @@ class BoardsAPI(BaseConnector):
|
|||||||
raise NotImplementedError("Search not compatible with boards.ie")
|
raise NotImplementedError("Search not compatible with boards.ie")
|
||||||
|
|
||||||
if category:
|
if category:
|
||||||
return self._get_posts(f"{self.url}/categories/{category}", post_limit)
|
return self._get_posts(f"{self.base_url}/categories/{category}", post_limit)
|
||||||
else:
|
else:
|
||||||
return self._get_posts(f"{self.url}/discussions", post_limit)
|
return self._get_posts(f"{self.base_url}/discussions", post_limit)
|
||||||
|
|
||||||
## Private
|
## Private
|
||||||
def _get_posts(self, url, limit) -> list[Post]:
|
def _get_posts(self, url, limit) -> list[Post]:
|
||||||
@@ -41,7 +44,7 @@ class BoardsAPI(BaseConnector):
|
|||||||
current_page = 1
|
current_page = 1
|
||||||
|
|
||||||
while len(urls) < limit:
|
while len(urls) < limit:
|
||||||
url = f"{self.url}/p{current_page}"
|
url = f"{url}/p{current_page}"
|
||||||
html = self._fetch_page(url)
|
html = self._fetch_page(url)
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
|
||||||
@@ -145,7 +148,7 @@ class BoardsAPI(BaseConnector):
|
|||||||
|
|
||||||
if next_link and next_link.get('href'):
|
if next_link and next_link.get('href'):
|
||||||
href = next_link.get('href')
|
href = next_link.get('href')
|
||||||
current_url = href if href.startswith('http') else self.url + href
|
current_url = href if href.startswith('http') else url + href
|
||||||
else:
|
else:
|
||||||
current_url = None
|
current_url = None
|
||||||
|
|
||||||
|
|||||||
@@ -10,8 +10,10 @@ from server.connectors.base import BaseConnector
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class RedditAPI(BaseConnector):
|
class RedditAPI(BaseConnector):
|
||||||
source_name = "reddit"
|
source_name: str = "reddit"
|
||||||
display_name = "Reddit"
|
display_name: str = "Reddit"
|
||||||
|
search_enabled: bool = True
|
||||||
|
categories_enabled: bool = True
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.url = "https://www.reddit.com/"
|
self.url = "https://www.reddit.com/"
|
||||||
|
|||||||
@@ -18,8 +18,13 @@ def _discover_connectors() -> list[type[BaseConnector]]:
|
|||||||
def get_available_connectors() -> dict[str, type[BaseConnector]]:
|
def get_available_connectors() -> dict[str, type[BaseConnector]]:
|
||||||
return {c.source_name: c for c in _discover_connectors() if c.is_available()}
|
return {c.source_name: c for c in _discover_connectors() if c.is_available()}
|
||||||
|
|
||||||
def get_connector_metadata() -> list[dict]:
|
def get_connector_metadata() -> dict[str, dict]:
|
||||||
return [
|
res = {}
|
||||||
{"id": id, "label": obj.display_name}
|
for id, obj in get_available_connectors().items():
|
||||||
for id, obj in get_available_connectors().items()
|
res[id] = {"id": id,
|
||||||
]
|
"label": obj.display_name,
|
||||||
|
"search_enabled": obj.search_enabled,
|
||||||
|
"categories_enabled": obj.categories_enabled
|
||||||
|
}
|
||||||
|
|
||||||
|
return res
|
||||||
@@ -13,8 +13,10 @@ load_dotenv()
|
|||||||
API_KEY = os.getenv("YOUTUBE_API_KEY")
|
API_KEY = os.getenv("YOUTUBE_API_KEY")
|
||||||
|
|
||||||
class YouTubeAPI(BaseConnector):
|
class YouTubeAPI(BaseConnector):
|
||||||
source_name = "youtube"
|
source_name: str = "youtube"
|
||||||
display_name = "YouTube"
|
display_name: str = "YouTube"
|
||||||
|
search_enabled: bool = True
|
||||||
|
categories_enabled: bool = False
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.youtube = build('youtube', 'v3', developerKey=API_KEY)
|
self.youtube = build('youtube', 'v3', developerKey=API_KEY)
|
||||||
|
|||||||
Reference in New Issue
Block a user