Compare commits

..

4 Commits

Author SHA1 Message Date
d96f459104 fix(connectors): update URL references to use base_url in BoardsAPI 2026-03-13 21:59:17 +00:00
162a4de64e fix(frontend): detects which sources support category or search 2026-03-12 10:07:28 +00:00
6684780d23 fix(connectors): add stronger validation to scrape endpoint
Strong validation is needed, otherwise invalid data goes to Celery and crashes silently. In addition, it checks whether the specific source supports search or categories.
2026-03-12 09:59:07 +00:00
c12f1b4371 chore(connectors): add category and search validation fields 2026-03-12 09:56:34 +00:00
7 changed files with 136 additions and 50 deletions

View File

@@ -9,6 +9,10 @@ const API_BASE_URL = import.meta.env.VITE_BACKEND_URL;
type SourceOption = {
id: string;
label: string;
search_enabled?: boolean;
categories_enabled?: boolean;
searchEnabled?: boolean;
categoriesEnabled?: boolean;
};
type SourceConfig = {
@@ -25,6 +29,12 @@ const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({
category: "",
});
// Capability checks for a source option. The backend emits snake_case
// flags (`search_enabled`, `categories_enabled`); the camelCase spellings
// are also honoured, with snake_case taking precedence when present.
const supportsSearch = (source?: SourceOption): boolean => {
  const enabled = source?.search_enabled ?? source?.searchEnabled;
  return Boolean(enabled);
};
const supportsCategories = (source?: SourceOption): boolean => {
  const enabled = source?.categories_enabled ?? source?.categoriesEnabled;
  return Boolean(enabled);
};
const AutoScrapePage = () => {
const navigate = useNavigate();
const [datasetName, setDatasetName] = useState("");
@@ -63,11 +73,18 @@ const AutoScrapePage = () => {
const updateSourceConfig = (index: number, field: keyof SourceConfig, value: string) => {
setSourceConfigs((previous) =>
previous.map((config, configIndex) =>
configIndex === index ? { ...config, [field]: value } : config
configIndex === index
? field === "sourceName"
? { ...config, sourceName: value, search: "", category: "" }
: { ...config, [field]: value }
: config
)
);
};
const getSourceOption = (sourceName: string) =>
sourceOptions.find((option) => option.id === sourceName);
const addSourceConfig = () => {
setSourceConfigs((previous) => [
...previous,
@@ -100,12 +117,18 @@ const AutoScrapePage = () => {
return;
}
const normalizedSources = sourceConfigs.map((source) => ({
name: source.sourceName,
limit: Number(source.limit || 100),
search: source.search.trim() || undefined,
category: source.category.trim() || undefined,
}));
const normalizedSources = sourceConfigs.map((source) => {
const sourceOption = getSourceOption(source.sourceName);
return {
name: source.sourceName,
limit: Number(source.limit || 100),
search: supportsSearch(sourceOption) ? source.search.trim() || undefined : undefined,
category: supportsCategories(sourceOption)
? source.category.trim() || undefined
: undefined,
};
});
const invalidSource = normalizedSources.find(
(source) => !source.name || !Number.isFinite(source.limit) || source.limit <= 0
@@ -212,7 +235,12 @@ const AutoScrapePage = () => {
{!isLoadingSources && sourceOptions.length > 0 && (
<div style={{ display: "flex", flexDirection: "column", gap: 10 }}>
{sourceConfigs.map((source, index) => (
{sourceConfigs.map((source, index) => {
const sourceOption = getSourceOption(source.sourceName);
const searchEnabled = supportsSearch(sourceOption);
const categoriesEnabled = supportsCategories(sourceOption);
return (
<div
key={`source-${index}`}
style={{
@@ -248,16 +276,26 @@ const AutoScrapePage = () => {
<input
type="text"
value={source.search}
placeholder="Search term (optional)"
placeholder={
searchEnabled
? "Search term (optional)"
: "Search not supported for this source"
}
style={{ ...styles.input, ...styles.inputFullWidth }}
disabled={!searchEnabled}
onChange={(event) => updateSourceConfig(index, "search", event.target.value)}
/>
<input
type="text"
value={source.category}
placeholder="Category (optional)"
placeholder={
categoriesEnabled
? "Category (optional)"
: "Categories not supported for this source"
}
style={{ ...styles.input, ...styles.inputFullWidth }}
disabled={!categoriesEnabled}
onChange={(event) => updateSourceConfig(index, "category", event.target.value)}
/>
@@ -271,7 +309,8 @@ const AutoScrapePage = () => {
</button>
)}
</div>
))}
);
})}
<button type="button" style={styles.buttonSecondary} onClick={addSourceConfig}>
Add another source

View File

@@ -119,50 +119,82 @@ def get_user_datasets():
@app.route("/datasets/sources", methods=["GET"])
def get_dataset_sources():
return jsonify(get_connector_metadata())
list_metadata = list(get_connector_metadata().values())
return jsonify(list_metadata)
@app.route("/datasets/scrape", methods=["POST"])
@jwt_required()
def scrape_data():
data = request.get_json()
connector_metadata = get_connector_metadata()
# Strong validation needed, otherwise data goes to Celery and crashes silently
if not data or "sources" not in data:
return jsonify({"error": "Sources must be provided"}), 400
user_id = int(get_jwt_identity())
return jsonify({"error": "Sources must be provided"}), 400
if "name" not in data or not str(data["name"]).strip():
return jsonify({"error": "Dataset name is required"}), 400
dataset_name = data["name"].strip()
user_id = int(get_jwt_identity())
source_configs = data["sources"]
if not isinstance(source_configs, list) or len(source_configs) == 0:
return jsonify({"error": "Sources must be a non-empty list"}), 400
# Light Validation
for source in source_configs:
if not isinstance(source, dict):
return jsonify({"error": "Each source must be an object"}), 400
if "name" not in source:
return jsonify({"error": "Each source must contain a name"}), 400
if "limit" in source:
source["limit"] = int(source["limit"])
dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list)
dataset_manager.set_dataset_status(
dataset_id,
"fetching",
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}"
)
try:
fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list)
return jsonify(
{
"message": "Dataset queued for processing",
"dataset_id": dataset_id,
"status": "processing",
}
), 202
if "limit" in source:
try:
source["limit"] = int(source["limit"])
except (ValueError, TypeError):
return jsonify({"error": "Limit must be an integer"}), 400
name = source["name"]
if name not in connector_metadata:
return jsonify({"error": "Source not supported"}), 400
if "search" in source and not connector_metadata[name]["search_enabled"]:
return jsonify({"error": f"Source {name} does not support search"}), 400
if "category" in source and not connector_metadata[name]["categories_enabled"]:
return jsonify({"error": f"Source {name} does not support categories"}), 400
try:
dataset_id = dataset_manager.save_dataset_info(
user_id,
dataset_name,
default_topic_list
)
dataset_manager.set_dataset_status(
dataset_id,
"fetching",
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}"
)
fetch_and_process_dataset.delay(
dataset_id,
source_configs,
default_topic_list
)
except Exception:
print(traceback.format_exc())
return jsonify({"error": "An unexpected error occurred"}), 500
return jsonify({"error": "Failed to queue dataset processing"}), 500
return jsonify({
"message": "Dataset queued for processing",
"dataset_id": dataset_id,
"status": "processing"
}), 202
@app.route("/datasets/upload", methods=["POST"])
@jwt_required()

View File

@@ -7,6 +7,9 @@ class BaseConnector(ABC):
display_name: str # human-readable: "Reddit", "YouTube"
required_env: list[str] = [] # env vars needed to activate
search_enabled: bool
categories_enabled: bool
@classmethod
def is_available(cls) -> bool:
"""Returns True if all required env vars are set."""

View File

@@ -19,8 +19,11 @@ class BoardsAPI(BaseConnector):
source_name: str = "boards.ie"
display_name: str = "Boards.ie"
categories_enabled: bool = True
search_enabled: bool = False
def __init__(self):
self.url = "https://www.boards.ie"
self.base_url = "https://www.boards.ie"
def get_new_posts_by_search(self,
search: str,
@@ -31,9 +34,9 @@ class BoardsAPI(BaseConnector):
raise NotImplementedError("Search not compatible with boards.ie")
if category:
return self._get_posts(f"{self.url}/categories/{category}", post_limit)
return self._get_posts(f"{self.base_url}/categories/{category}", post_limit)
else:
return self._get_posts(f"{self.url}/discussions", post_limit)
return self._get_posts(f"{self.base_url}/discussions", post_limit)
## Private
def _get_posts(self, url, limit) -> list[Post]:
@@ -41,7 +44,7 @@ class BoardsAPI(BaseConnector):
current_page = 1
while len(urls) < limit:
url = f"{self.url}/p{current_page}"
url = f"{url}/p{current_page}"
html = self._fetch_page(url)
soup = BeautifulSoup(html, "html.parser")
@@ -145,7 +148,7 @@ class BoardsAPI(BaseConnector):
if next_link and next_link.get('href'):
href = next_link.get('href')
current_url = href if href.startswith('http') else self.url + href
current_url = href if href.startswith('http') else url + href
else:
current_url = None

View File

@@ -10,8 +10,10 @@ from server.connectors.base import BaseConnector
logger = logging.getLogger(__name__)
class RedditAPI(BaseConnector):
source_name = "reddit"
display_name = "Reddit"
source_name: str = "reddit"
display_name: str = "Reddit"
search_enabled: bool = True
categories_enabled: bool = True
def __init__(self):
self.url = "https://www.reddit.com/"

View File

@@ -18,8 +18,13 @@ def _discover_connectors() -> list[type[BaseConnector]]:
def get_available_connectors() -> dict[str, type[BaseConnector]]:
    """Map each active connector's source name to its connector class.

    A connector counts as active when its ``is_available()`` check passes
    (per BaseConnector, all required environment variables are set).
    """
    available: dict[str, type[BaseConnector]] = {}
    for connector in _discover_connectors():
        if connector.is_available():
            available[connector.source_name] = connector
    return available
def get_connector_metadata() -> list[dict]:
return [
{"id": id, "label": obj.display_name}
for id, obj in get_available_connectors().items()
]
def get_connector_metadata() -> dict[str, dict]:
    """Return metadata for every available connector, keyed by source id.

    Each entry carries the fields the frontend needs to render a source
    option: the ``id`` itself, a human-readable ``label``, and the
    ``search_enabled`` / ``categories_enabled`` capability flags read
    from the connector class attributes.
    """
    # Dict comprehension instead of an incremental build; the loop
    # variable is renamed so it no longer shadows the builtin `id`.
    return {
        source_id: {
            "id": source_id,
            "label": connector.display_name,
            "search_enabled": connector.search_enabled,
            "categories_enabled": connector.categories_enabled,
        }
        for source_id, connector in get_available_connectors().items()
    }

View File

@@ -13,8 +13,10 @@ load_dotenv()
API_KEY = os.getenv("YOUTUBE_API_KEY")
class YouTubeAPI(BaseConnector):
source_name = "youtube"
display_name = "YouTube"
source_name: str = "youtube"
display_name: str = "YouTube"
search_enabled: bool = True
categories_enabled: bool = False
def __init__(self):
self.youtube = build('youtube', 'v3', developerKey=API_KEY)