Compare commits

..

4 Commits

Author SHA1 Message Date
d96f459104 fix(connectors): update URL references to use base_url in BoardsAPI 2026-03-13 21:59:17 +00:00
162a4de64e fix(frontend): detect which sources support category or search 2026-03-12 10:07:28 +00:00
6684780d23 fix(connectors): add stronger validation to scrape endpoint
Strong validation is needed; otherwise data goes to Celery and crashes silently. In addition, it checks whether the specific source supports search or categories.
2026-03-12 09:59:07 +00:00
c12f1b4371 chore(connectors): add category and search validation fields 2026-03-12 09:56:34 +00:00
7 changed files with 136 additions and 50 deletions

View File

@@ -9,6 +9,10 @@ const API_BASE_URL = import.meta.env.VITE_BACKEND_URL;
type SourceOption = { type SourceOption = {
id: string; id: string;
label: string; label: string;
search_enabled?: boolean;
categories_enabled?: boolean;
searchEnabled?: boolean;
categoriesEnabled?: boolean;
}; };
type SourceConfig = { type SourceConfig = {
@@ -25,6 +29,12 @@ const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({
category: "", category: "",
}); });
const supportsSearch = (source?: SourceOption): boolean =>
Boolean(source?.search_enabled ?? source?.searchEnabled);
const supportsCategories = (source?: SourceOption): boolean =>
Boolean(source?.categories_enabled ?? source?.categoriesEnabled);
const AutoScrapePage = () => { const AutoScrapePage = () => {
const navigate = useNavigate(); const navigate = useNavigate();
const [datasetName, setDatasetName] = useState(""); const [datasetName, setDatasetName] = useState("");
@@ -63,11 +73,18 @@ const AutoScrapePage = () => {
const updateSourceConfig = (index: number, field: keyof SourceConfig, value: string) => { const updateSourceConfig = (index: number, field: keyof SourceConfig, value: string) => {
setSourceConfigs((previous) => setSourceConfigs((previous) =>
previous.map((config, configIndex) => previous.map((config, configIndex) =>
configIndex === index ? { ...config, [field]: value } : config configIndex === index
? field === "sourceName"
? { ...config, sourceName: value, search: "", category: "" }
: { ...config, [field]: value }
: config
) )
); );
}; };
const getSourceOption = (sourceName: string) =>
sourceOptions.find((option) => option.id === sourceName);
const addSourceConfig = () => { const addSourceConfig = () => {
setSourceConfigs((previous) => [ setSourceConfigs((previous) => [
...previous, ...previous,
@@ -100,12 +117,18 @@ const AutoScrapePage = () => {
return; return;
} }
const normalizedSources = sourceConfigs.map((source) => ({ const normalizedSources = sourceConfigs.map((source) => {
name: source.sourceName, const sourceOption = getSourceOption(source.sourceName);
limit: Number(source.limit || 100),
search: source.search.trim() || undefined, return {
category: source.category.trim() || undefined, name: source.sourceName,
})); limit: Number(source.limit || 100),
search: supportsSearch(sourceOption) ? source.search.trim() || undefined : undefined,
category: supportsCategories(sourceOption)
? source.category.trim() || undefined
: undefined,
};
});
const invalidSource = normalizedSources.find( const invalidSource = normalizedSources.find(
(source) => !source.name || !Number.isFinite(source.limit) || source.limit <= 0 (source) => !source.name || !Number.isFinite(source.limit) || source.limit <= 0
@@ -212,7 +235,12 @@ const AutoScrapePage = () => {
{!isLoadingSources && sourceOptions.length > 0 && ( {!isLoadingSources && sourceOptions.length > 0 && (
<div style={{ display: "flex", flexDirection: "column", gap: 10 }}> <div style={{ display: "flex", flexDirection: "column", gap: 10 }}>
{sourceConfigs.map((source, index) => ( {sourceConfigs.map((source, index) => {
const sourceOption = getSourceOption(source.sourceName);
const searchEnabled = supportsSearch(sourceOption);
const categoriesEnabled = supportsCategories(sourceOption);
return (
<div <div
key={`source-${index}`} key={`source-${index}`}
style={{ style={{
@@ -248,16 +276,26 @@ const AutoScrapePage = () => {
<input <input
type="text" type="text"
value={source.search} value={source.search}
placeholder="Search term (optional)" placeholder={
searchEnabled
? "Search term (optional)"
: "Search not supported for this source"
}
style={{ ...styles.input, ...styles.inputFullWidth }} style={{ ...styles.input, ...styles.inputFullWidth }}
disabled={!searchEnabled}
onChange={(event) => updateSourceConfig(index, "search", event.target.value)} onChange={(event) => updateSourceConfig(index, "search", event.target.value)}
/> />
<input <input
type="text" type="text"
value={source.category} value={source.category}
placeholder="Category (optional)" placeholder={
categoriesEnabled
? "Category (optional)"
: "Categories not supported for this source"
}
style={{ ...styles.input, ...styles.inputFullWidth }} style={{ ...styles.input, ...styles.inputFullWidth }}
disabled={!categoriesEnabled}
onChange={(event) => updateSourceConfig(index, "category", event.target.value)} onChange={(event) => updateSourceConfig(index, "category", event.target.value)}
/> />
@@ -271,7 +309,8 @@ const AutoScrapePage = () => {
</button> </button>
)} )}
</div> </div>
))} );
})}
<button type="button" style={styles.buttonSecondary} onClick={addSourceConfig}> <button type="button" style={styles.buttonSecondary} onClick={addSourceConfig}>
Add another source Add another source

View File

@@ -119,50 +119,82 @@ def get_user_datasets():
@app.route("/datasets/sources", methods=["GET"]) @app.route("/datasets/sources", methods=["GET"])
def get_dataset_sources(): def get_dataset_sources():
return jsonify(get_connector_metadata()) list_metadata = list(get_connector_metadata().values())
return jsonify(list_metadata)
@app.route("/datasets/scrape", methods=["POST"]) @app.route("/datasets/scrape", methods=["POST"])
@jwt_required() @jwt_required()
def scrape_data(): def scrape_data():
data = request.get_json() data = request.get_json()
connector_metadata = get_connector_metadata()
# Strong validation needed, otherwise data goes to Celery and crashes silently
if not data or "sources" not in data: if not data or "sources" not in data:
return jsonify({"error": "Sources must be provided"}), 400 return jsonify({"error": "Sources must be provided"}), 400
if "name" not in data or not str(data["name"]).strip():
return jsonify({"error": "Dataset name is required"}), 400
user_id = int(get_jwt_identity())
dataset_name = data["name"].strip() dataset_name = data["name"].strip()
user_id = int(get_jwt_identity())
source_configs = data["sources"] source_configs = data["sources"]
if not isinstance(source_configs, list) or len(source_configs) == 0: if not isinstance(source_configs, list) or len(source_configs) == 0:
return jsonify({"error": "Sources must be a non-empty list"}), 400 return jsonify({"error": "Sources must be a non-empty list"}), 400
# Light Validation
for source in source_configs: for source in source_configs:
if not isinstance(source, dict):
return jsonify({"error": "Each source must be an object"}), 400
if "name" not in source: if "name" not in source:
return jsonify({"error": "Each source must contain a name"}), 400 return jsonify({"error": "Each source must contain a name"}), 400
if "limit" in source:
source["limit"] = int(source["limit"])
dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list) if "limit" in source:
dataset_manager.set_dataset_status( try:
dataset_id, source["limit"] = int(source["limit"])
"fetching", except (ValueError, TypeError):
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}" return jsonify({"error": "Limit must be an integer"}), 400
)
name = source["name"]
if name not in connector_metadata:
return jsonify({"error": "Source not supported"}), 400
if "search" in source and not connector_metadata[name]["search_enabled"]:
return jsonify({"error": f"Source {name} does not support search"}), 400
if "category" in source and not connector_metadata[name]["categories_enabled"]:
return jsonify({"error": f"Source {name} does not support categories"}), 400
try: try:
fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list) dataset_id = dataset_manager.save_dataset_info(
user_id,
dataset_name,
default_topic_list
)
return jsonify( dataset_manager.set_dataset_status(
{ dataset_id,
"message": "Dataset queued for processing", "fetching",
"dataset_id": dataset_id, f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}"
"status": "processing", )
}
), 202 fetch_and_process_dataset.delay(
dataset_id,
source_configs,
default_topic_list
)
except Exception: except Exception:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": "An unexpected error occurred"}), 500 return jsonify({"error": "Failed to queue dataset processing"}), 500
return jsonify({
"message": "Dataset queued for processing",
"dataset_id": dataset_id,
"status": "processing"
}), 202
@app.route("/datasets/upload", methods=["POST"]) @app.route("/datasets/upload", methods=["POST"])
@jwt_required() @jwt_required()

View File

@@ -7,6 +7,9 @@ class BaseConnector(ABC):
display_name: str # human-readable: "Reddit", "YouTube" display_name: str # human-readable: "Reddit", "YouTube"
required_env: list[str] = [] # env vars needed to activate required_env: list[str] = [] # env vars needed to activate
search_enabled: bool
categories_enabled: bool
@classmethod @classmethod
def is_available(cls) -> bool: def is_available(cls) -> bool:
"""Returns True if all required env vars are set.""" """Returns True if all required env vars are set."""

View File

@@ -19,8 +19,11 @@ class BoardsAPI(BaseConnector):
source_name: str = "boards.ie" source_name: str = "boards.ie"
display_name: str = "Boards.ie" display_name: str = "Boards.ie"
categories_enabled: bool = True
search_enabled: bool = False
def __init__(self): def __init__(self):
self.url = "https://www.boards.ie" self.base_url = "https://www.boards.ie"
def get_new_posts_by_search(self, def get_new_posts_by_search(self,
search: str, search: str,
@@ -31,9 +34,9 @@ class BoardsAPI(BaseConnector):
raise NotImplementedError("Search not compatible with boards.ie") raise NotImplementedError("Search not compatible with boards.ie")
if category: if category:
return self._get_posts(f"{self.url}/categories/{category}", post_limit) return self._get_posts(f"{self.base_url}/categories/{category}", post_limit)
else: else:
return self._get_posts(f"{self.url}/discussions", post_limit) return self._get_posts(f"{self.base_url}/discussions", post_limit)
## Private ## Private
def _get_posts(self, url, limit) -> list[Post]: def _get_posts(self, url, limit) -> list[Post]:
@@ -41,7 +44,7 @@ class BoardsAPI(BaseConnector):
current_page = 1 current_page = 1
while len(urls) < limit: while len(urls) < limit:
url = f"{self.url}/p{current_page}" url = f"{url}/p{current_page}"
html = self._fetch_page(url) html = self._fetch_page(url)
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
@@ -145,7 +148,7 @@ class BoardsAPI(BaseConnector):
if next_link and next_link.get('href'): if next_link and next_link.get('href'):
href = next_link.get('href') href = next_link.get('href')
current_url = href if href.startswith('http') else self.url + href current_url = href if href.startswith('http') else url + href
else: else:
current_url = None current_url = None

View File

@@ -10,8 +10,10 @@ from server.connectors.base import BaseConnector
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class RedditAPI(BaseConnector): class RedditAPI(BaseConnector):
source_name = "reddit" source_name: str = "reddit"
display_name = "Reddit" display_name: str = "Reddit"
search_enabled: bool = True
categories_enabled: bool = True
def __init__(self): def __init__(self):
self.url = "https://www.reddit.com/" self.url = "https://www.reddit.com/"

View File

@@ -18,8 +18,13 @@ def _discover_connectors() -> list[type[BaseConnector]]:
def get_available_connectors() -> dict[str, type[BaseConnector]]: def get_available_connectors() -> dict[str, type[BaseConnector]]:
return {c.source_name: c for c in _discover_connectors() if c.is_available()} return {c.source_name: c for c in _discover_connectors() if c.is_available()}
def get_connector_metadata() -> list[dict]: def get_connector_metadata() -> dict[str, dict]:
return [ res = {}
{"id": id, "label": obj.display_name} for id, obj in get_available_connectors().items():
for id, obj in get_available_connectors().items() res[id] = {"id": id,
] "label": obj.display_name,
"search_enabled": obj.search_enabled,
"categories_enabled": obj.categories_enabled
}
return res

View File

@@ -13,8 +13,10 @@ load_dotenv()
API_KEY = os.getenv("YOUTUBE_API_KEY") API_KEY = os.getenv("YOUTUBE_API_KEY")
class YouTubeAPI(BaseConnector): class YouTubeAPI(BaseConnector):
source_name = "youtube" source_name: str = "youtube"
display_name = "YouTube" display_name: str = "YouTube"
search_enabled: bool = True
categories_enabled: bool = False
def __init__(self): def __init__(self):
self.youtube = build('youtube', 'v3', developerKey=API_KEY) self.youtube = build('youtube', 'v3', developerKey=API_KEY)