From ca444e9cb0da2742395130098469d913c515a491 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 9 Mar 2026 20:53:13 +0000 Subject: [PATCH 01/36] refactor: move connectors to backend dir They will now be more used in the backend. --- {connectors => server/connectors}/boards_api.py | 0 {connectors => server/connectors}/reddit_api.py | 0 {connectors => server/connectors}/youtube_api.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename {connectors => server/connectors}/boards_api.py (100%) rename {connectors => server/connectors}/reddit_api.py (100%) rename {connectors => server/connectors}/youtube_api.py (100%) diff --git a/connectors/boards_api.py b/server/connectors/boards_api.py similarity index 100% rename from connectors/boards_api.py rename to server/connectors/boards_api.py diff --git a/connectors/reddit_api.py b/server/connectors/reddit_api.py similarity index 100% rename from connectors/reddit_api.py rename to server/connectors/reddit_api.py diff --git a/connectors/youtube_api.py b/server/connectors/youtube_api.py similarity index 100% rename from connectors/youtube_api.py rename to server/connectors/youtube_api.py From 262a70dbf331247ce2bcc76903870ec656b85582 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 9 Mar 2026 20:55:12 +0000 Subject: [PATCH 02/36] refactor(api): rename /upload endpoint Ensures consistency with the other dataset-based endpoints and follows the REST-API rules more cleanly. 
--- frontend/src/pages/Upload.tsx | 2 +- server/app.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend/src/pages/Upload.tsx b/frontend/src/pages/Upload.tsx index 93383dc..0799f9b 100644 --- a/frontend/src/pages/Upload.tsx +++ b/frontend/src/pages/Upload.tsx @@ -40,7 +40,7 @@ const UploadPage = () => { setHasError(false); setReturnMessage(""); - const response = await axios.post(`${API_BASE_URL}/upload`, formData, { + const response = await axios.post(`${API_BASE_URL}/datasets/upload`, formData, { headers: { "Content-Type": "multipart/form-data", }, diff --git a/server/app.py b/server/app.py index 7cbf9d3..eb27e70 100644 --- a/server/app.py +++ b/server/app.py @@ -111,7 +111,7 @@ def get_user_datasets(): current_user = int(get_jwt_identity()) return jsonify(dataset_manager.get_user_datasets(current_user)), 200 -@app.route("/upload", methods=["POST"]) +@app.route("/datasets/upload", methods=["POST"]) @jwt_required() def upload_data(): if "posts" not in request.files or "topics" not in request.files: From cc799f736875d7e8326423b78c4b650c8ebee359 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 9 Mar 2026 21:28:44 +0000 Subject: [PATCH 03/36] feat(connectors): add base connector and registry for detection Idea is to have a "plugin-type" system, where new connectors can extend the `BaseConnector` class and implement the fetch posts method. These are automatically detected by the registry, and automatically used in new Flask endpoints that give a list of possible sources. Allows for an open-ended system where new data scrapers / API consumers can be added dynamically. 
--- server/app.py | 14 ++++++++++++++ server/connectors/base.py | 23 +++++++++++++++++++++++ server/connectors/boards_api.py | 13 ++++++++++--- server/connectors/registry.py | 25 +++++++++++++++++++++++++ server/core/datasets.py | 2 +- 5 files changed, 73 insertions(+), 4 deletions(-) create mode 100644 server/connectors/base.py create mode 100644 server/connectors/registry.py diff --git a/server/app.py b/server/app.py index eb27e70..23c095f 100644 --- a/server/app.py +++ b/server/app.py @@ -111,6 +111,20 @@ def get_user_datasets(): current_user = int(get_jwt_identity()) return jsonify(dataset_manager.get_user_datasets(current_user)), 200 +@app.route("/datasets/sources", methods=["GET"]) +@jwt_required() +def get_dataset_sources(): + return jsonify({""}) + +@app.route("/datasets/scrape", methods=["POST"]) +@jwt_required() +def scrape_data(): + if "sources" not in request.form: + return jsonify({"error": "Data source names are required."}), 400 + + sources = request.form.get("sources") + + @app.route("/datasets/upload", methods=["POST"]) @jwt_required() def upload_data(): diff --git a/server/connectors/base.py b/server/connectors/base.py new file mode 100644 index 0000000..f555769 --- /dev/null +++ b/server/connectors/base.py @@ -0,0 +1,23 @@ +from abc import ABC, abstractmethod +from dto.post import Post + +class BaseConnector(ABC): + # Each subclass declares these at the class level + source_name: str # machine-readable: "reddit", "youtube" + display_name: str # human-readable: "Reddit", "YouTube" + required_env: list[str] = [] # env vars needed to activate + + @classmethod + def is_available(cls) -> bool: + """Returns True if all required env vars are set.""" + import os + return all(os.getenv(var) for var in cls.required_env) + + @abstractmethod + def get_new_posts_by_search(self, + search: str = None, + category: str = None, + post_limit: int = 10, + comment_limit: int = 10 + ) -> list[Post]: + ... 
\ No newline at end of file diff --git a/server/connectors/boards_api.py b/server/connectors/boards_api.py index 1b63aa9..e714048 100644 --- a/server/connectors/boards_api.py +++ b/server/connectors/boards_api.py @@ -7,6 +7,7 @@ from dto.post import Post from dto.comment import Comment from bs4 import BeautifulSoup from concurrent.futures import ThreadPoolExecutor, as_completed +from server.connectors.base import BaseConnector logger = logging.getLogger(__name__) @@ -14,12 +15,18 @@ HEADERS = { "User-Agent": "Mozilla/5.0 (compatible; ForumScraper/1.0)" } -class BoardsAPI: +class BoardsAPI(BaseConnector): def __init__(self): self.url = "https://www.boards.ie" - self.source_name = "Boards.ie" + self.source_name = "boards.ie" + self.display_name = "Boards.ie" - def get_new_category_posts(self, category: str, post_limit: int, comment_limit: int) -> list[Post]: + def get_new_posts_by_search(self, + search: str, + category: str, + post_limit: int, + comment_limit: int + ) -> list[Post]: urls = [] current_page = 1 diff --git a/server/connectors/registry.py b/server/connectors/registry.py new file mode 100644 index 0000000..0883476 --- /dev/null +++ b/server/connectors/registry.py @@ -0,0 +1,25 @@ +import pkgutil +import importlib +import connectors +from connectors.base import BaseConnector + +def _discover_connectors() -> list[type[BaseConnector]]: + """Walk the connectors package and collect all BaseConnector subclasses.""" + for _, module_name, _ in pkgutil.iter_modules(connectors.__path__): + if module_name in ("base", "registry"): + continue + importlib.import_module(f"connectors.{module_name}") + + return [ + cls for cls in BaseConnector.__subclasses__() + if cls.source_name # guard against abstract intermediaries + ] + +def get_available_connectors() -> list[type[BaseConnector]]: + return [c for c in _discover_connectors() if c.is_available()] + +def get_connector_metadata() -> list[dict]: + return [ + {"id": c.source_name, "label": c.display_name} + for c in 
get_available_connectors() + ] \ No newline at end of file diff --git a/server/core/datasets.py b/server/core/datasets.py index 5886cfc..3a62fc9 100644 --- a/server/core/datasets.py +++ b/server/core/datasets.py @@ -1,7 +1,7 @@ import pandas as pd from server.db.database import PostgresConnector from psycopg2.extras import Json -from server.exceptions import NotAuthorisedException, NonExistentDatasetException +from server.exceptions import NonExistentDatasetException class DatasetManager: def __init__(self, db: PostgresConnector): From e7a8c17be4d524fd5cb7f57435ed7709396b0838 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:08:01 +0000 Subject: [PATCH 04/36] chore(connectors): add base connector inheritance --- server/connectors/reddit_api.py | 3 ++- server/connectors/youtube_api.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py index 0ec6100..61f3656 100644 --- a/server/connectors/reddit_api.py +++ b/server/connectors/reddit_api.py @@ -5,10 +5,11 @@ import time from dto.post import Post from dto.user import User from dto.comment import Comment +from server.connectors.base import BaseConnector logger = logging.getLogger(__name__) -class RedditAPI: +class RedditAPI(BaseConnector): def __init__(self): self.url = "https://www.reddit.com/" self.source_name = "Reddit" diff --git a/server/connectors/youtube_api.py b/server/connectors/youtube_api.py index d0e00a3..71ce6ed 100644 --- a/server/connectors/youtube_api.py +++ b/server/connectors/youtube_api.py @@ -6,12 +6,13 @@ from googleapiclient.discovery import build from googleapiclient.errors import HttpError from dto.post import Post from dto.comment import Comment +from server.connectors.base import BaseConnector load_dotenv() API_KEY = os.getenv("YOUTUBE_API_KEY") -class YouTubeAPI: +class YouTubeAPI(BaseConnector): def __init__(self): self.youtube = build('youtube', 'v3', developerKey=API_KEY) From 
2a8d7c797237cbccd62ed35d32b1f53fcb15bf49 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:11:33 +0000 Subject: [PATCH 05/36] refactor(connectors): Youtube & Reddit connectors implement BaseConnector --- server/connectors/reddit_api.py | 12 ++++- server/connectors/youtube_api.py | 91 +++++++++++++++++--------------- 2 files changed, 58 insertions(+), 45 deletions(-) diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py index 61f3656..13e5e7b 100644 --- a/server/connectors/reddit_api.py +++ b/server/connectors/reddit_api.py @@ -15,7 +15,15 @@ class RedditAPI(BaseConnector): self.source_name = "Reddit" # Public Methods # - def search_new_subreddit_posts(self, search: str, subreddit: str, limit: int) -> list[Post]: + def get_new_posts_by_search(self, + search: str, + subreddit: str, + limit: int + ) -> list[Post]: + + if not search: + return self._get_new_subreddit_posts(subreddit, limit=limit) + params = { 'q': search, 'limit': limit, @@ -43,7 +51,7 @@ class RedditAPI(BaseConnector): return posts - def get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]: + def _get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]: posts = [] after = None url = f"r/{subreddit}/new.json" diff --git a/server/connectors/youtube_api.py b/server/connectors/youtube_api.py index 71ce6ed..691f53d 100644 --- a/server/connectors/youtube_api.py +++ b/server/connectors/youtube_api.py @@ -16,6 +16,54 @@ class YouTubeAPI(BaseConnector): def __init__(self): self.youtube = build('youtube', 'v3', developerKey=API_KEY) + def get_new_posts_by_search(self, + search: str, + category: str, + post_limit: int, + comment_limit: int + ) -> list[Post]: + videos = self.search_videos(search, post_limit) + posts = [] + + for video in videos: + video_id = video['id']['videoId'] + snippet = video['snippet'] + title = snippet['title'] + description = snippet['description'] + published_at = 
datetime.datetime.strptime(snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp() + channel_title = snippet['channelTitle'] + + comments = [] + comments_data = self.get_video_comments(video_id, comment_limit) + for comment_thread in comments_data: + comment_snippet = comment_thread['snippet']['topLevelComment']['snippet'] + comment = Comment( + id=comment_thread['id'], + post_id=video_id, + content=comment_snippet['textDisplay'], + author=comment_snippet['authorDisplayName'], + timestamp=datetime.datetime.strptime(comment_snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp(), + reply_to=None, + source="YouTube" + ) + + comments.append(comment) + + post = Post( + id=video_id, + content=f"{title}\n\n{description}", + author=channel_title, + timestamp=published_at, + url=f"https://www.youtube.com/watch?v={video_id}", + title=title, + source="YouTube", + comments=comments + ) + + posts.append(post) + + return posts + def search_videos(self, query, limit): request = self.youtube.search().list( q=query, @@ -40,46 +88,3 @@ class YouTubeAPI(BaseConnector): print(f"Error fetching comments for video {video_id}: {e}") return [] return response.get('items', []) - - def fetch_videos(self, query, video_limit, comment_limit) -> list[Post]: - videos = self.search_videos(query, video_limit) - posts = [] - - for video in videos: - video_id = video['id']['videoId'] - snippet = video['snippet'] - title = snippet['title'] - description = snippet['description'] - published_at = datetime.datetime.strptime(snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp() - channel_title = snippet['channelTitle'] - - comments = [] - comments_data = self.get_video_comments(video_id, comment_limit) - for comment_thread in comments_data: - comment_snippet = comment_thread['snippet']['topLevelComment']['snippet'] - comment = Comment( - id=comment_thread['id'], - post_id=video_id, - content=comment_snippet['textDisplay'], - author=comment_snippet['authorDisplayName'], - 
timestamp=datetime.datetime.strptime(comment_snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp(), - reply_to=None, - source="YouTube" - ) - - comments.append(comment) - - post = Post( - id=video_id, - content=f"{title}\n\n{description}", - author=channel_title, - timestamp=published_at, - url=f"https://www.youtube.com/watch?v={video_id}", - title=title, - source="YouTube", - comments=comments - ) - - posts.append(post) - - return posts \ No newline at end of file From 5ccb2e73cd0512fd0ee95712e2d9e001b63fd405 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:18:42 +0000 Subject: [PATCH 06/36] fix(connectors): incorrect registry location Registry paths were using the incorrect connector path locations. --- server/app.py | 4 ++-- server/connectors/boards_api.py | 5 +++-- server/connectors/reddit_api.py | 4 +++- server/connectors/registry.py | 8 ++++---- server/connectors/youtube_api.py | 5 ++++- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/server/app.py b/server/app.py index 23c095f..e5a8037 100644 --- a/server/app.py +++ b/server/app.py @@ -21,6 +21,7 @@ from server.core.auth import AuthManager from server.core.datasets import DatasetManager from server.utils import get_request_filters from server.queue.tasks import process_dataset +from server.connectors.registry import get_connector_metadata app = Flask(__name__) @@ -112,9 +113,8 @@ def get_user_datasets(): return jsonify(dataset_manager.get_user_datasets(current_user)), 200 @app.route("/datasets/sources", methods=["GET"]) -@jwt_required() def get_dataset_sources(): - return jsonify({""}) + return jsonify(get_connector_metadata()) @app.route("/datasets/scrape", methods=["POST"]) @jwt_required() diff --git a/server/connectors/boards_api.py b/server/connectors/boards_api.py index e714048..9109e71 100644 --- a/server/connectors/boards_api.py +++ b/server/connectors/boards_api.py @@ -16,10 +16,11 @@ HEADERS = { } class BoardsAPI(BaseConnector): + source_name: str = 
"boards.ie" + display_name: str = "Boards.ie" + def __init__(self): self.url = "https://www.boards.ie" - self.source_name = "boards.ie" - self.display_name = "Boards.ie" def get_new_posts_by_search(self, search: str, diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py index 13e5e7b..2107ded 100644 --- a/server/connectors/reddit_api.py +++ b/server/connectors/reddit_api.py @@ -10,9 +10,11 @@ from server.connectors.base import BaseConnector logger = logging.getLogger(__name__) class RedditAPI(BaseConnector): + source_name = "reddit" + display_name = "Reddit" + def __init__(self): self.url = "https://www.reddit.com/" - self.source_name = "Reddit" # Public Methods # def get_new_posts_by_search(self, diff --git a/server/connectors/registry.py b/server/connectors/registry.py index 0883476..dfc1fda 100644 --- a/server/connectors/registry.py +++ b/server/connectors/registry.py @@ -1,14 +1,14 @@ import pkgutil import importlib -import connectors -from connectors.base import BaseConnector +import server.connectors +from server.connectors.base import BaseConnector def _discover_connectors() -> list[type[BaseConnector]]: """Walk the connectors package and collect all BaseConnector subclasses.""" - for _, module_name, _ in pkgutil.iter_modules(connectors.__path__): + for _, module_name, _ in pkgutil.iter_modules(server.connectors.__path__): if module_name in ("base", "registry"): continue - importlib.import_module(f"connectors.{module_name}") + importlib.import_module(f"server.connectors.{module_name}") return [ cls for cls in BaseConnector.__subclasses__() diff --git a/server/connectors/youtube_api.py b/server/connectors/youtube_api.py index 691f53d..323d3f8 100644 --- a/server/connectors/youtube_api.py +++ b/server/connectors/youtube_api.py @@ -13,6 +13,9 @@ load_dotenv() API_KEY = os.getenv("YOUTUBE_API_KEY") class YouTubeAPI(BaseConnector): + source_name = "youtube" + display_name = "YouTube" + def __init__(self): self.youtube = build('youtube', 
'v3', developerKey=API_KEY) @@ -44,7 +47,7 @@ class YouTubeAPI(BaseConnector): author=comment_snippet['authorDisplayName'], timestamp=datetime.datetime.strptime(comment_snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp(), reply_to=None, - source="YouTube" + source=self.source_name ) comments.append(comment) From 0866dda8b3fe23312c9efc9d03998765b880eddd Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:25:05 +0000 Subject: [PATCH 07/36] chore: add util to always split evenly --- server/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/utils.py b/server/utils.py index 815739f..5a783d2 100644 --- a/server/utils.py +++ b/server/utils.py @@ -48,3 +48,7 @@ def get_request_filters() -> dict: filters["data_sources"] = data_sources return filters + +def split_limit(limit: int, n: int) -> list[int]: + base, remainder = divmod(limit, n) + return [base + (1 if i < remainder else 0) for i in range(n)] From 53cb5c2ea52bedcd5c84b45551aa43aa7fbe9e6a Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:36:08 +0000 Subject: [PATCH 08/36] feat(topics): add generalised topic list This is easier and quicker compared to deriving a topics list based on the dataset that has been scraped. While using LLMs to create a personalised topic list based on the query, category or dataset itself would yield better results for most, it is beyond the scope of this project. 
--- server/topics.json | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 server/topics.json diff --git a/server/topics.json b/server/topics.json new file mode 100644 index 0000000..271913a --- /dev/null +++ b/server/topics.json @@ -0,0 +1,67 @@ +{ + "Personal Life": "daily life, life updates, what happened today, personal stories, life events, reflections", + + "Relationships": "dating, relationships, breakups, friendships, family relationships, marriage, relationship advice", + + "Family & Parenting": "parents, parenting, children, raising kids, family dynamics, family stories", + + "Work & Careers": "jobs, workplaces, office life, promotions, quitting jobs, career advice, workplace drama", + + "Education": "school, studying, exams, university, homework, academic pressure, learning experiences", + + "Money & Finance": "saving money, debt, budgeting, cost of living, financial advice, personal finance", + + "Health & Fitness": "exercise, gym, workouts, running, diet, fitness routines, weight loss", + + "Mental Health": "stress, anxiety, depression, burnout, therapy, emotional wellbeing", + + "Food & Cooking": "meals, cooking, recipes, restaurants, snacks, food opinions", + + "Travel": "holidays, trips, tourism, travel experiences, airports, flights, travel tips", + + "Entertainment": "movies, TV shows, streaming services, celebrities, pop culture", + + "Music": "songs, albums, artists, concerts, music opinions", + + "Gaming": "video games, gaming culture, consoles, PC gaming, esports", + + "Sports": "sports matches, teams, players, competitions, sports opinions", + + "Technology": "phones, gadgets, apps, AI, software, tech trends", + + "Internet Culture": "memes, viral trends, online jokes, internet drama, trending topics", + + "Social Media": "platforms, influencers, content creators, algorithms, online communities", + + "News & Current Events": "breaking news, world events, major incidents, public discussions", 
+ + "Politics": "political debates, elections, government policies, ideology", + + "Culture & Society": "social issues, cultural trends, generational debates, societal changes", + + "Identity & Lifestyle": "personal identity, lifestyle choices, values, self-expression", + + "Hobbies & Interests": "art, photography, crafts, collecting, hobbies", + + "Fashion & Beauty": "clothing, style, makeup, skincare, fashion trends", + + "Animals & Pets": "pets, animal videos, pet care, wildlife", + + "Humour": "jokes, funny stories, sarcasm, memes", + + "Opinions & Debates": "hot takes, controversial opinions, arguments, discussions", + + "Advice & Tips": "life advice, tutorials, how-to tips, recommendations", + + "Product Reviews": "reviews, recommendations, experiences with products", + + "Complaints & Rants": "frustrations, complaining, venting about things", + + "Motivation & Inspiration": "motivational quotes, success stories, encouragement", + + "Questions & Curiosity": "asking questions, seeking opinions, curiosity posts", + + "Celebrations & Achievements": "birthdays, milestones, achievements, good news", + + "Random Thoughts": "shower thoughts, observations, random ideas" +} \ No newline at end of file From 17bd4702b22a5bce43a68a9efc1c2c1ed51df8db Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:36:40 +0000 Subject: [PATCH 09/36] fix(connectors): connector detectors returning name of ID alongside connector obj --- server/connectors/registry.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/connectors/registry.py b/server/connectors/registry.py index dfc1fda..47b1d6a 100644 --- a/server/connectors/registry.py +++ b/server/connectors/registry.py @@ -15,11 +15,11 @@ def _discover_connectors() -> list[type[BaseConnector]]: if cls.source_name # guard against abstract intermediaries ] -def get_available_connectors() -> list[type[BaseConnector]]: - return [c for c in _discover_connectors() if c.is_available()] +def 
get_available_connectors() -> dict[str, type[BaseConnector]]: + return {c.source_name: c for c in _discover_connectors() if c.is_available()} def get_connector_metadata() -> list[dict]: return [ - {"id": c.source_name, "label": c.display_name} - for c in get_available_connectors() + {"id": id, "label": obj.display_name} + for id, obj in get_available_connectors().items() ] \ No newline at end of file From 2572664e2694c638b0bbdc4d1b94caeb806740de Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:50:53 +0000 Subject: [PATCH 10/36] chore(utils): add env getter that fails if env not found --- server/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/server/utils.py b/server/utils.py index 5a783d2..1a507bb 100644 --- a/server/utils.py +++ b/server/utils.py @@ -1,4 +1,5 @@ import datetime +import os from flask import request def parse_datetime_filter(value): @@ -52,3 +53,9 @@ def get_request_filters() -> dict: def split_limit(limit: int, n: int) -> list[int]: base, remainder = divmod(limit, n) return [base + (1 if i < remainder else 0) for i in range(n)] + +def get_env(name: str) -> str: + value = os.getenv(name) + if not value: + raise RuntimeError(f"Missing required environment variable: {name}") + return value From 6ec47256d038ce7b4a30fea6812977cd8fdcf6dd Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 19:04:33 +0000 Subject: [PATCH 11/36] feat(api): add database scraping endpoints --- server/app.py | 60 ++++++++++++++++++++++++++++----- server/connectors/reddit_api.py | 19 ++++++----- 2 files changed, 61 insertions(+), 18 deletions(-) diff --git a/server/app.py b/server/app.py index e5a8037..d896ac2 100644 --- a/server/app.py +++ b/server/app.py @@ -19,19 +19,18 @@ from server.exceptions import NotAuthorisedException, NonExistentDatasetExceptio from server.db.database import PostgresConnector from server.core.auth import AuthManager from server.core.datasets import DatasetManager -from server.utils import 
get_request_filters +from server.utils import get_request_filters, split_limit, get_env from server.queue.tasks import process_dataset -from server.connectors.registry import get_connector_metadata +from server.connectors.registry import get_available_connectors, get_connector_metadata app = Flask(__name__) # Env Variables load_dotenv() -frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173") -jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this") -jwt_access_token_expires = int( - os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200) -) # Default to 20 minutes +max_fetch_limit = int(get_env("MAX_FETCH_LIMIT")) +frontend_url = get_env("FRONTEND_URL") +jwt_secret_key = get_env("JWT_SECRET_KEY") +jwt_access_token_expires = int(os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)) # Default to 20 minutes # Flask Configuration CORS(app, resources={r"/*": {"origins": frontend_url}}) @@ -45,7 +44,8 @@ db = PostgresConnector() auth_manager = AuthManager(db, bcrypt) dataset_manager = DatasetManager(db) stat_gen = StatGen() - +connectors = get_available_connectors() +default_topic_list = json.load(open("server/topics.json")) @app.route("/register", methods=["POST"]) def register_user(): @@ -122,8 +122,50 @@ def scrape_data(): if "sources" not in request.form: return jsonify({"error": "Data source names are required."}), 400 - sources = request.form.get("sources") + user_id = int(get_jwt_identity()) + sources = request.form.getlist("sources") + limit = int(request.form.get("limit", max_fetch_limit)) + dataset_name = request.form.get("name", "").strip() + search = request.form.get("search") + category = request.form.get("category") + + print(sources) + + if limit > max_fetch_limit: + return jsonify({"error": f"Due to API limitations, we cannot receive more than ${max_fetch_limit} posts"}), 400 + + for source in sources: + if source not in connectors.keys(): + return jsonify({"error": "Source must exist"}), 400 + + limits = split_limit(limit, len(sources)) + 
per_source = dict(zip(sources, limits)) + + try: + posts = [] + for source_name, source_limit in per_source.items(): + connector = connectors[source_name]() + posts.extend(connector.get_new_posts_by_search( + search=search, + category=category, + post_limit=source_limit, + comment_limit=source_limit + )) + + dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, {}) + process_dataset.delay(dataset_id, [p.to_dict() for p in posts], default_topic_list) + + return jsonify( + { + "message": "Dataset queued for processing", + "dataset_id": dataset_id, + "status": "processing", + } + ), 202 + except Exception: + print(traceback.format_exc()) + return jsonify({"error": "An unexpected error occurred"}), 500 @app.route("/datasets/upload", methods=["POST"]) @jwt_required() diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py index 2107ded..444326a 100644 --- a/server/connectors/reddit_api.py +++ b/server/connectors/reddit_api.py @@ -19,32 +19,33 @@ class RedditAPI(BaseConnector): # Public Methods # def get_new_posts_by_search(self, search: str, - subreddit: str, - limit: int + category: str, + post_limit: int, + comment_limit: int ) -> list[Post]: if not search: - return self._get_new_subreddit_posts(subreddit, limit=limit) + return self._get_new_subreddit_posts(category, limit=post_limit) params = { 'q': search, - 'limit': limit, + 'limit': post_limit, 'restrict_sr': 'on', 'sort': 'new' } - logger.info(f"Searching subreddit '{subreddit}' for '{search}' with limit {limit}") - url = f"r/{subreddit}/search.json" + logger.info(f"Searching subreddit '{category}' for '{search}' with limit {post_limit}") + url = f"r/{category}/search.json" posts = [] - while len(posts) < limit: - batch_limit = min(100, limit - len(posts)) + while len(posts) < post_limit: + batch_limit = min(100, post_limit - len(posts)) params['limit'] = batch_limit data = self._fetch_post_overviews(url, params) batch_posts = self._parse_posts(data) - logger.debug(f"Fetched 
{len(batch_posts)} posts from search in subreddit {subreddit}") + logger.debug(f"Fetched {len(batch_posts)} posts from search in subreddit {category}") if not batch_posts: break From 15704a07823f6b1fd5ca40142f66d3b9e4e5acda Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 19:17:08 +0000 Subject: [PATCH 12/36] chore(db): update db schema to include "fetching" status --- server/core/datasets.py | 2 +- server/db/schema.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/server/core/datasets.py b/server/core/datasets.py index 3a62fc9..4690454 100644 --- a/server/core/datasets.py +++ b/server/core/datasets.py @@ -114,7 +114,7 @@ class DatasetManager: self.db.execute_batch(query, values) def set_dataset_status(self, dataset_id: int, status: str, status_message: str | None = None): - if status not in ["processing", "complete", "error"]: + if status not in ["fetching", "processing", "complete", "error"]: raise ValueError("Invalid status") query = """ diff --git a/server/db/schema.sql b/server/db/schema.sql index 051a396..4550633 100644 --- a/server/db/schema.sql +++ b/server/db/schema.sql @@ -23,7 +23,7 @@ CREATE TABLE datasets ( -- Enforce valid states CONSTRAINT datasets_status_check - CHECK (status IN ('processing', 'complete', 'error')) + CHECK (status IN ('fetching', 'processing', 'complete', 'error')) ); CREATE TABLE events ( From a65c4a461c8e10f278e12431985c999af97e4d8a Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 19:17:41 +0000 Subject: [PATCH 13/36] fix(api): flask delegates dataset fetch to celery --- server/app.py | 19 ++++--------------- server/queue/tasks.py | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/server/app.py b/server/app.py index d896ac2..16d6f39 100644 --- a/server/app.py +++ b/server/app.py @@ -20,7 +20,7 @@ from server.db.database import PostgresConnector from server.core.auth import AuthManager from server.core.datasets import 
DatasetManager from server.utils import get_request_filters, split_limit, get_env -from server.queue.tasks import process_dataset +from server.queue.tasks import process_dataset, fetch_and_process_dataset from server.connectors.registry import get_available_connectors, get_connector_metadata app = Flask(__name__) @@ -130,8 +130,6 @@ def scrape_data(): search = request.form.get("search") category = request.form.get("category") - print(sources) - if limit > max_fetch_limit: return jsonify({"error": f"Due to API limitations, we cannot receive more than ${max_fetch_limit} posts"}), 400 @@ -141,20 +139,11 @@ def scrape_data(): limits = split_limit(limit, len(sources)) per_source = dict(zip(sources, limits)) + dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list) + dataset_manager.set_dataset_status(dataset_id, "fetching", f"Data is being fetched from {str(sources)}") try: - posts = [] - for source_name, source_limit in per_source.items(): - connector = connectors[source_name]() - posts.extend(connector.get_new_posts_by_search( - search=search, - category=category, - post_limit=source_limit, - comment_limit=source_limit - )) - - dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, {}) - process_dataset.delay(dataset_id, [p.to_dict() for p in posts], default_topic_list) + fetch_and_process_dataset.delay(dataset_id, per_source, search, category, default_topic_list) return jsonify( { diff --git a/server/queue/tasks.py b/server/queue/tasks.py index a089596..8a71680 100644 --- a/server/queue/tasks.py +++ b/server/queue/tasks.py @@ -4,6 +4,7 @@ from server.queue.celery_app import celery from server.analysis.enrichment import DatasetEnrichment from server.db.database import PostgresConnector from server.core.datasets import DatasetManager +from server.connectors.registry import get_available_connectors @celery.task(bind=True, max_retries=3) def process_dataset(self, dataset_id: int, posts: list, topics: dict): @@ -18,5 +19,31 
@@ def process_dataset(self, dataset_id: int, posts: list, topics: dict): dataset_manager.save_dataset_content(dataset_id, enriched_df) dataset_manager.set_dataset_status(dataset_id, "complete", "NLP Processing Completed Successfully") + except Exception as e: + dataset_manager.set_dataset_status(dataset_id, "error", f"An error occurred: {e}") + +@celery.task(bind=True, max_retries=3) +def fetch_and_process_dataset(self, + dataset_id: int, + per_source: dict[str, int], + search: str, + category: str, + topics: dict): + connectors = get_available_connectors() + db = PostgresConnector() + dataset_manager = DatasetManager(db) + posts = [] + + try: + for source_name, source_limit in per_source.items(): + connector = connectors[source_name]() + posts.extend(connector.get_new_posts_by_search( + search=search, + category=category, + post_limit=source_limit, + comment_limit=source_limit + )) + + process_dataset.delay(dataset_id, [p.to_dict() for p in posts], topics) except Exception as e: dataset_manager.set_dataset_status(dataset_id, "error", f"An error occurred: {e}") \ No newline at end of file From a3dbe04a5715f2d5d0c7a7ef8ff605e29c229e03 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 19:23:48 +0000 Subject: [PATCH 14/36] fix(frontend): option to delete dataset not shown after fail --- frontend/src/pages/Datasets.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend/src/pages/Datasets.tsx b/frontend/src/pages/Datasets.tsx index 4c79cdc..ede2317 100644 --- a/frontend/src/pages/Datasets.tsx +++ b/frontend/src/pages/Datasets.tsx @@ -9,7 +9,7 @@ const API_BASE_URL = import.meta.env.VITE_BACKEND_URL; type DatasetItem = { id: number; name?: string; - status?: "processing" | "complete" | "error" | string; + status?: "processing" | "complete" | "error" | "fetching" | string; status_message?: string | null; completed_at?: string | null; created_at?: string | null; @@ -93,7 +93,7 @@ const DatasetsPage = () => {
    {datasets.map((dataset) => { - const isComplete = dataset.status === "complete"; + const isComplete = dataset.status === "complete" || dataset.status === "error"; const editPath = `/dataset/${dataset.id}/edit`; const targetPath = isComplete ? `/dataset/${dataset.id}/stats` From 7ccc934f7102138b3f413e6577182e12a9f7f7e4 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 22:14:45 +0000 Subject: [PATCH 15/36] build: change celery to debug mode --- docker-compose.dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 96c3430..dc3edb2 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -43,7 +43,7 @@ services: - .env command: > celery -A server.queue.celery_app.celery worker - --loglevel=info + --loglevel=debug --pool=solo depends_on: - postgres From dc330b87b9696a0bfaa78310b6eeb22674b81df3 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 22:17:00 +0000 Subject: [PATCH 16/36] fix(celery): process dataset directly in fetch task Calling the original `process_dataset` function led to issues with JSON serialisation. 
--- server/queue/tasks.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/server/queue/tasks.py b/server/queue/tasks.py index 8a71680..7feaf9a 100644 --- a/server/queue/tasks.py +++ b/server/queue/tasks.py @@ -1,4 +1,5 @@ import pandas as pd +import json from server.queue.celery_app import celery from server.analysis.enrichment import DatasetEnrichment @@ -37,13 +38,20 @@ def fetch_and_process_dataset(self, try: for source_name, source_limit in per_source.items(): connector = connectors[source_name]() - posts.extend(connector.get_new_posts_by_search( + raw_posts = connector.get_new_posts_by_search( search=search, category=category, post_limit=source_limit, comment_limit=source_limit - )) + ) + posts.extend(post.to_dict() for post in raw_posts) - process_dataset.delay(dataset_id, [p.to_dict() for p in posts], topics) + df = pd.DataFrame(posts) + + processor = DatasetEnrichment(df, topics) + enriched_df = processor.enrich() + + dataset_manager.save_dataset_content(dataset_id, enriched_df) + dataset_manager.set_dataset_status(dataset_id, "complete", "NLP Processing Completed Successfully") except Exception as e: dataset_manager.set_dataset_status(dataset_id, "error", f"An error occurred: {e}") \ No newline at end of file From 8fe84a30f6c1cf29c38ea4e4044ffd9d90756e68 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 22:45:07 +0000 Subject: [PATCH 17/36] fix: data leak when opening topics file --- server/app.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/server/app.py b/server/app.py index 16d6f39..3efa6eb 100644 --- a/server/app.py +++ b/server/app.py @@ -37,15 +37,20 @@ CORS(app, resources={r"/*": {"origins": frontend_url}}) app.config["JWT_SECRET_KEY"] = jwt_secret_key app.config["JWT_ACCESS_TOKEN_EXPIRES"] = jwt_access_token_expires +# Security bcrypt = Bcrypt(app) jwt = JWTManager(app) +# Helper Objects db = PostgresConnector() auth_manager = AuthManager(db, bcrypt) dataset_manager = 
DatasetManager(db) stat_gen = StatGen() connectors = get_available_connectors() -default_topic_list = json.load(open("server/topics.json")) + +# Default Files +with open("server/topics.json") as f: + default_topic_list = json.load(f) @app.route("/register", methods=["POST"]) def register_user(): From d520e2af98be4f2c0ab076af3b3126ba1d261f4d Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 22:48:04 +0000 Subject: [PATCH 18/36] fix(auth): missing email and username business rules --- server/core/auth.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/server/core/auth.py b/server/core/auth.py index 625c3c2..34bb93c 100644 --- a/server/core/auth.py +++ b/server/core/auth.py @@ -1,6 +1,10 @@ +import re + from server.db.database import PostgresConnector from flask_bcrypt import Bcrypt +EMAIL_REGEX = re.compile(r"[^@]+@[^@]+\.[^@]+") + class AuthManager: def __init__(self, db: PostgresConnector, bcrypt: Bcrypt): self.db = db @@ -18,6 +22,12 @@ class AuthManager: def register_user(self, username, email, password): hashed_password = self.bcrypt.generate_password_hash(password).decode("utf-8") + if len(username) < 3: + raise ValueError("Username must be longer than 3 characters") + + if not EMAIL_REGEX.match(email): + raise ValueError("Please enter a valid email address") + if self.get_user_by_email(email): raise ValueError("Email already registered") From 2ab74d922ab61b709c42cade087cbe1ff36747eb Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 23:15:33 +0000 Subject: [PATCH 19/36] feat(api): support per-source search, category and limit configuration --- server/app.py | 38 ++++++++++++++++++++------------------ server/queue/tasks.py | 21 +++++++++++++-------- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/server/app.py b/server/app.py index 3efa6eb..550537f 100644 --- a/server/app.py +++ b/server/app.py @@ -124,31 +124,33 @@ def get_dataset_sources(): @app.route("/datasets/scrape", methods=["POST"]) 
@jwt_required() def scrape_data(): - if "sources" not in request.form: - return jsonify({"error": "Data source names are required."}), 400 + data = request.get_json() + + if not data or "sources" not in data: + return jsonify({"error": "Sources must be provided"}), 400 user_id = int(get_jwt_identity()) - sources = request.form.getlist("sources") - limit = int(request.form.get("limit", max_fetch_limit)) + dataset_name = data["name"].strip() + source_configs = data["sources"] - dataset_name = request.form.get("name", "").strip() - search = request.form.get("search") - category = request.form.get("category") + if not isinstance(source_configs, list) or len(source_configs) == 0: + return jsonify({"error": "Sources must be a non-empty list"}), 400 - if limit > max_fetch_limit: - return jsonify({"error": f"Due to API limitations, we cannot receive more than ${max_fetch_limit} posts"}), 400 - - for source in sources: - if source not in connectors.keys(): - return jsonify({"error": "Source must exist"}), 400 - - limits = split_limit(limit, len(sources)) - per_source = dict(zip(sources, limits)) + # Light Validation + for source in source_configs: + if "name" not in source: + return jsonify({"error": "Each source must contain a name"}), 400 + if "limit" in source: + source["limit"] = int(source["limit"]) + dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list) - dataset_manager.set_dataset_status(dataset_id, "fetching", f"Data is being fetched from {str(sources)}") + dataset_manager.set_dataset_status(dataset_id, + "fetching", + f"Data is being fetched from {str(source["name"] + "," for source in source_configs)}" + ) try: - fetch_and_process_dataset.delay(dataset_id, per_source, search, category, default_topic_list) + fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list) return jsonify( { diff --git a/server/queue/tasks.py b/server/queue/tasks.py index 7feaf9a..fd5237f 100644 --- a/server/queue/tasks.py +++ 
b/server/queue/tasks.py @@ -1,5 +1,5 @@ import pandas as pd -import json +import logging from server.queue.celery_app import celery from server.analysis.enrichment import DatasetEnrichment @@ -7,6 +7,8 @@ from server.db.database import PostgresConnector from server.core.datasets import DatasetManager from server.connectors.registry import get_available_connectors +logger = logging.getLogger(__name__) + @celery.task(bind=True, max_retries=3) def process_dataset(self, dataset_id: int, posts: list, topics: dict): db = PostgresConnector() @@ -26,9 +28,7 @@ def process_dataset(self, dataset_id: int, posts: list, topics: dict): @celery.task(bind=True, max_retries=3) def fetch_and_process_dataset(self, dataset_id: int, - per_source: dict[str, int], - search: str, - category: str, + source_info: list[dict], topics: dict): connectors = get_available_connectors() db = PostgresConnector() @@ -36,13 +36,18 @@ def fetch_and_process_dataset(self, posts = [] try: - for source_name, source_limit in per_source.items(): - connector = connectors[source_name]() + for metadata in source_info: + name = metadata["name"] + search = metadata.get("search") + category = metadata.get("category") + limit = metadata.get("limit", 100) + + connector = connectors[name]() raw_posts = connector.get_new_posts_by_search( search=search, category=category, - post_limit=source_limit, - comment_limit=source_limit + post_limit=limit, + comment_limit=limit ) posts.extend(post.to_dict() for post in raw_posts) From 524c9c50a08e38880b4ae985e633b532513915db Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 23:23:22 +0000 Subject: [PATCH 20/36] fix(api): incorrect dataset status update message --- server/app.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/server/app.py b/server/app.py index 550537f..b187fb0 100644 --- a/server/app.py +++ b/server/app.py @@ -144,11 +144,12 @@ def scrape_data(): source["limit"] = int(source["limit"]) dataset_id = 
dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list) - dataset_manager.set_dataset_status(dataset_id, - "fetching", - f"Data is being fetched from {str(source["name"] + "," for source in source_configs)}" - ) - + dataset_manager.set_dataset_status( + dataset_id, + "fetching", + f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}" + ) + try: fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list) From eff416c34eef605b75b6804fdbb6ad13e8a46f86 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 23:36:09 +0000 Subject: [PATCH 21/36] fix(connectors): hardcoded source name in Youtube connector --- server/connectors/youtube_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/connectors/youtube_api.py b/server/connectors/youtube_api.py index 323d3f8..a3047d7 100644 --- a/server/connectors/youtube_api.py +++ b/server/connectors/youtube_api.py @@ -59,7 +59,7 @@ class YouTubeAPI(BaseConnector): timestamp=published_at, url=f"https://www.youtube.com/watch?v={video_id}", title=title, - source="YouTube", + source=self.source_name, comments=comments ) From b2ae1a9f7013755387b3d3b9704992a5181abe94 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Wed, 11 Mar 2026 19:41:34 +0000 Subject: [PATCH 22/36] feat(frontend): add page for scraping endpoint --- frontend/src/App.tsx | 2 + frontend/src/pages/AutoScrape.tsx | 299 ++++++++++++++++++++++++++++ frontend/src/pages/Datasets.tsx | 15 +- frontend/src/utils/documentTitle.ts | 1 + 4 files changed, 314 insertions(+), 3 deletions(-) create mode 100644 frontend/src/pages/AutoScrape.tsx diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index b1e6045..1b10f61 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -5,6 +5,7 @@ import DatasetsPage from "./pages/Datasets"; import DatasetStatusPage from "./pages/DatasetStatus"; import LoginPage from "./pages/Login"; import UploadPage from 
"./pages/Upload"; +import AutoScrapePage from "./pages/AutoScrape"; import StatPage from "./pages/Stats"; import { getDocumentTitle } from "./utils/documentTitle"; import DatasetEditPage from "./pages/DatasetEdit"; @@ -22,6 +23,7 @@ function App() { } /> } /> } /> + } /> } /> } /> } /> diff --git a/frontend/src/pages/AutoScrape.tsx b/frontend/src/pages/AutoScrape.tsx new file mode 100644 index 0000000..7e8e754 --- /dev/null +++ b/frontend/src/pages/AutoScrape.tsx @@ -0,0 +1,299 @@ +import axios from "axios"; +import { useEffect, useState } from "react"; +import { useNavigate } from "react-router-dom"; +import StatsStyling from "../styles/stats_styling"; + +const styles = StatsStyling; +const API_BASE_URL = import.meta.env.VITE_BACKEND_URL; + +type SourceOption = { + id: string; + label: string; +}; + +type SourceConfig = { + sourceName: string; + limit: string; + search: string; + category: string; +}; + +const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({ + sourceName, + limit: "100", + search: "", + category: "", +}); + +const AutoScrapePage = () => { + const navigate = useNavigate(); + const [datasetName, setDatasetName] = useState(""); + const [sourceOptions, setSourceOptions] = useState([]); + const [sourceConfigs, setSourceConfigs] = useState([]); + const [returnMessage, setReturnMessage] = useState(""); + const [isLoadingSources, setIsLoadingSources] = useState(true); + const [isSubmitting, setIsSubmitting] = useState(false); + const [hasError, setHasError] = useState(false); + + useEffect(() => { + axios + .get(`${API_BASE_URL}/datasets/sources`) + .then((response) => { + const options = response.data || []; + setSourceOptions(options); + setSourceConfigs([buildEmptySourceConfig(options[0]?.id || "")]); + }) + .catch((requestError: unknown) => { + setHasError(true); + if (axios.isAxiosError(requestError)) { + setReturnMessage( + `Failed to load available sources: ${String( + requestError.response?.data?.error || requestError.message + )}` + 
); + } else { + setReturnMessage("Failed to load available sources."); + } + }) + .finally(() => { + setIsLoadingSources(false); + }); + }, []); + + const updateSourceConfig = (index: number, field: keyof SourceConfig, value: string) => { + setSourceConfigs((previous) => + previous.map((config, configIndex) => + configIndex === index ? { ...config, [field]: value } : config + ) + ); + }; + + const addSourceConfig = () => { + setSourceConfigs((previous) => [ + ...previous, + buildEmptySourceConfig(sourceOptions[0]?.id || ""), + ]); + }; + + const removeSourceConfig = (index: number) => { + setSourceConfigs((previous) => previous.filter((_, configIndex) => configIndex !== index)); + }; + + const autoScrape = async () => { + const token = localStorage.getItem("access_token"); + if (!token) { + setHasError(true); + setReturnMessage("You must be signed in to auto scrape a dataset."); + return; + } + + const normalizedDatasetName = datasetName.trim(); + if (!normalizedDatasetName) { + setHasError(true); + setReturnMessage("Please add a dataset name before continuing."); + return; + } + + if (sourceConfigs.length === 0) { + setHasError(true); + setReturnMessage("Please add at least one source."); + return; + } + + const normalizedSources = sourceConfigs.map((source) => ({ + name: source.sourceName, + limit: Number(source.limit || 100), + search: source.search.trim() || undefined, + category: source.category.trim() || undefined, + })); + + const invalidSource = normalizedSources.find( + (source) => !source.name || !Number.isFinite(source.limit) || source.limit <= 0 + ); + + if (invalidSource) { + setHasError(true); + setReturnMessage("Every source needs a name and a limit greater than zero."); + return; + } + + try { + setIsSubmitting(true); + setHasError(false); + setReturnMessage(""); + + const response = await axios.post( + `${API_BASE_URL}/datasets/scrape`, + { + name: normalizedDatasetName, + sources: normalizedSources, + }, + { + headers: { + Authorization: `Bearer 
${token}`, + }, + } + ); + + const datasetId = Number(response.data.dataset_id); + + setReturnMessage( + `Auto scrape queued successfully (dataset #${datasetId}). Redirecting to processing status...` + ); + + setTimeout(() => { + navigate(`/dataset/${datasetId}/status`); + }, 400); + } catch (requestError: unknown) { + setHasError(true); + if (axios.isAxiosError(requestError)) { + const message = String( + requestError.response?.data?.error || requestError.message || "Auto scrape failed." + ); + setReturnMessage(`Auto scrape failed: ${message}`); + } else { + setReturnMessage("Auto scrape failed due to an unexpected error."); + } + } finally { + setIsSubmitting(false); + } + }; + + return ( +
    +
    +
    +
    +

    Auto Scrape Dataset

    +

    + Select sources and scrape settings, then queue processing automatically. +

    +
    + +
    + +
    +
    +

    Dataset Name

    +

    Use a clear label so you can identify this run later.

    + setDatasetName(event.target.value)} + /> +
    + +
    +

    Sources

    +

    + Configure source, limit, optional search, and optional category. +

    + + {isLoadingSources &&

    Loading sources...

    } + + {!isLoadingSources && sourceOptions.length === 0 && ( +

    No source connectors are currently available.

    + )} + + {!isLoadingSources && sourceOptions.length > 0 && ( +
    + {sourceConfigs.map((source, index) => ( +
    + + + updateSourceConfig(index, "limit", event.target.value)} + /> + + updateSourceConfig(index, "search", event.target.value)} + /> + + updateSourceConfig(index, "category", event.target.value)} + /> + + {sourceConfigs.length > 1 && ( + + )} +
    + ))} + + +
    + )} +
    +
    + +
    + {returnMessage || + "After queueing, your dataset is fetched and processed in the background automatically."} +
    +
    +
    + ); +}; + +export default AutoScrapePage; diff --git a/frontend/src/pages/Datasets.tsx b/frontend/src/pages/Datasets.tsx index ede2317..daffaf2 100644 --- a/frontend/src/pages/Datasets.tsx +++ b/frontend/src/pages/Datasets.tsx @@ -63,9 +63,18 @@ const DatasetsPage = () => { View and reopen datasets you previously uploaded.

- +
+ + +
{error && ( diff --git a/frontend/src/utils/documentTitle.ts b/frontend/src/utils/documentTitle.ts index 904a6a8..5c7d00d 100644 --- a/frontend/src/utils/documentTitle.ts +++ b/frontend/src/utils/documentTitle.ts @@ -3,6 +3,7 @@ const DEFAULT_TITLE = "Ethnograph View"; const STATIC_TITLES: Record = { "/login": "Sign In", "/upload": "Upload Dataset", + "/auto-scrape": "Auto Scrape Dataset", "/datasets": "My Datasets", }; From 0658713f422fa92f9ccf343543fdfee951c15c17 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Wed, 11 Mar 2026 19:44:38 +0000 Subject: [PATCH 23/36] chore: remove unused dataset creation script --- create_dataset.py | 43 ------------------------------------------- 1 file changed, 43 deletions(-) delete mode 100644 create_dataset.py diff --git a/create_dataset.py b/create_dataset.py deleted file mode 100644 index 791b2bd..0000000 --- a/create_dataset.py +++ /dev/null @@ -1,43 +0,0 @@ -import json -import logging -from connectors.reddit_api import RedditAPI -from connectors.boards_api import BoardsAPI -from connectors.youtube_api import YouTubeAPI - -posts_file = 'posts_test.jsonl' - -reddit_connector = RedditAPI() -boards_connector = BoardsAPI() -youtube_connector = YouTubeAPI() - -logging.basicConfig(level=logging.DEBUG) -logging.getLogger("urllib3").setLevel(logging.WARNING) - -def remove_empty_posts(posts): - return [post for post in posts if post.content.strip() != ""] - -def save_to_jsonl(filename, posts): - with open(filename, 'a', encoding='utf-8') as f: - for post in posts: - # Convert post object to dict if it's a dataclass - data = post.to_dict() - f.write(json.dumps(data) + '\n') - - -def main(): - boards_posts = boards_connector.get_new_category_posts('cork-city', 1200, 1200) - save_to_jsonl(posts_file, boards_posts) - - reddit_posts = reddit_connector.get_new_subreddit_posts('cork', 1200) - reddit_posts = remove_empty_posts(reddit_posts) - save_to_jsonl(posts_file, reddit_posts) - - ireland_posts = 
reddit_connector.search_new_subreddit_posts('cork', 'ireland', 1200) - ireland_posts = remove_empty_posts(ireland_posts) - save_to_jsonl(posts_file, ireland_posts) - - youtube_videos = youtube_connector.fetch_videos('cork city', 1200, 1200) - save_to_jsonl(posts_file, youtube_videos) - -if __name__ == "__main__": - main() \ No newline at end of file From 12cbc240748e41087a4af57e9ea99fbf963f2ce2 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Wed, 11 Mar 2026 19:47:44 +0000 Subject: [PATCH 24/36] chore(utils): remove `split_limit` function --- server/app.py | 2 +- server/utils.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/server/app.py b/server/app.py index b187fb0..460cb77 100644 --- a/server/app.py +++ b/server/app.py @@ -19,7 +19,7 @@ from server.exceptions import NotAuthorisedException, NonExistentDatasetExceptio from server.db.database import PostgresConnector from server.core.auth import AuthManager from server.core.datasets import DatasetManager -from server.utils import get_request_filters, split_limit, get_env +from server.utils import get_request_filters, get_env from server.queue.tasks import process_dataset, fetch_and_process_dataset from server.connectors.registry import get_available_connectors, get_connector_metadata diff --git a/server/utils.py b/server/utils.py index 1a507bb..fb42953 100644 --- a/server/utils.py +++ b/server/utils.py @@ -50,10 +50,6 @@ def get_request_filters() -> dict: return filters -def split_limit(limit: int, n: int) -> list[int]: - base, remainder = divmod(limit, n) - return [base + (1 if i < remainder else 0) for i in range(n)] - def get_env(name: str) -> str: value = os.getenv(name) if not value: From 01d6bd01640cba4126ea262418ea366b5b4262eb Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Wed, 11 Mar 2026 21:16:26 +0000 Subject: [PATCH 25/36] fix(connectors): category / search fields breaking Ideally category and search are fully optional, however some sites break if one or the other is 
not provided. Unfortunately `boards.ie` has a different page type for searches and I'm not bothered to implement a scraper from scratch. In addition, removed comment limit options. --- server/connectors/base.py | 3 +-- server/connectors/boards_api.py | 35 +++++++++++++++---------- server/connectors/reddit_api.py | 45 ++++++++++++++++++--------------- server/queue/tasks.py | 3 +-- 4 files changed, 48 insertions(+), 38 deletions(-) diff --git a/server/connectors/base.py b/server/connectors/base.py index f555769..bad73c5 100644 --- a/server/connectors/base.py +++ b/server/connectors/base.py @@ -17,7 +17,6 @@ class BaseConnector(ABC): def get_new_posts_by_search(self, search: str = None, category: str = None, - post_limit: int = 10, - comment_limit: int = 10 + post_limit: int = 10 ) -> list[Post]: ... \ No newline at end of file diff --git a/server/connectors/boards_api.py b/server/connectors/boards_api.py index 9109e71..600c864 100644 --- a/server/connectors/boards_api.py +++ b/server/connectors/boards_api.py @@ -25,22 +25,29 @@ class BoardsAPI(BaseConnector): def get_new_posts_by_search(self, search: str, category: str, - post_limit: int, - comment_limit: int + post_limit: int ) -> list[Post]: + if search: + raise NotImplementedError("Search not compatible with boards.ie") + + if category: + return self._get_posts(f"{self.url}/categories/{category}", post_limit) + else: + return self._get_posts(f"{self.url}/discussions", post_limit) + + ## Private + def _get_posts(self, url, limit) -> list[Post]: urls = [] current_page = 1 - logger.info(f"Fetching posts from category: {category}") - - while len(urls) < post_limit: - url = f"{self.url}/categories/{category}/p{current_page}" + while len(urls) < limit: + url = f"{self.url}/p{current_page}" html = self._fetch_page(url) soup = BeautifulSoup(html, "html.parser") - logger.debug(f"Processing page {current_page} for category {category}") + logger.debug(f"Processing page {current_page} for link: {url}") for a in
soup.select("a.threadbit-threadlink"): - if len(urls) >= post_limit: + if len(urls) >= limit: break href = a.get("href") @@ -49,14 +56,14 @@ class BoardsAPI(BaseConnector): current_page += 1 - logger.debug(f"Fetched {len(urls)} post URLs from category {category}") + logger.debug(f"Fetched {len(urls)} post URLs") # Fetch post details for each URL and create Post objects posts = [] def fetch_and_parse(post_url): html = self._fetch_page(post_url) - post = self._parse_thread(html, post_url, comment_limit) + post = self._parse_thread(html, post_url) return post with ThreadPoolExecutor(max_workers=30) as executor: @@ -79,7 +86,7 @@ class BoardsAPI(BaseConnector): response.raise_for_status() return response.text - def _parse_thread(self, html: str, post_url: str, comment_limit: int) -> Post: + def _parse_thread(self, html: str, post_url: str) -> Post: soup = BeautifulSoup(html, "html.parser") # Author @@ -108,7 +115,7 @@ class BoardsAPI(BaseConnector): title = title_tag.text.strip() if title_tag else None # Comments - comments = self._parse_comments(post_url, post_num, comment_limit) + comments = self._parse_comments(post_url, post_num) post = Post( id=post_num, @@ -123,11 +130,11 @@ class BoardsAPI(BaseConnector): return post - def _parse_comments(self, url: str, post_id: str, comment_limit: int) -> list[Comment]: + def _parse_comments(self, url: str, post_id: str) -> list[Comment]: comments = [] current_url = url - while current_url and len(comments) < comment_limit: + while current_url: html = self._fetch_page(current_url) page_comments = self._parse_page_comments(html, post_id) comments.extend(page_comments) diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py index 444326a..9042c6c 100644 --- a/server/connectors/reddit_api.py +++ b/server/connectors/reddit_api.py @@ -20,39 +20,44 @@ class RedditAPI(BaseConnector): def get_new_posts_by_search(self, search: str, category: str, - post_limit: int, - comment_limit: int + post_limit: int ) -> 
list[Post]: - if not search: - return self._get_new_subreddit_posts(category, limit=post_limit) + prefix = f"r/{category}/" if category else "" + params = {'limit': post_limit} - params = { - 'q': search, - 'limit': post_limit, - 'restrict_sr': 'on', - 'sort': 'new' - } + if search: + endpoint = f"{prefix}search.json" + params.update({ + 'q': search, + 'sort': 'new', + 'restrict_sr': 'on' if category else 'off' + }) + else: + endpoint = f"{prefix}new.json" - logger.info(f"Searching subreddit '{category}' for '{search}' with limit {post_limit}") - url = f"r/{category}/search.json" posts = [] - + after = None + while len(posts) < post_limit: batch_limit = min(100, post_limit - len(posts)) params['limit'] = batch_limit + if after: + params['after'] = after - data = self._fetch_post_overviews(url, params) - batch_posts = self._parse_posts(data) - - logger.debug(f"Fetched {len(batch_posts)} posts from search in subreddit {category}") - - if not batch_posts: + data = self._fetch_post_overviews(endpoint, params) + + if not data or 'data' not in data or not data['data'].get('children'): break + batch_posts = self._parse_posts(data) posts.extend(batch_posts) - return posts + after = data['data'].get('after') + if not after: + break + + return posts[:post_limit] def _get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]: posts = [] diff --git a/server/queue/tasks.py b/server/queue/tasks.py index fd5237f..95248d1 100644 --- a/server/queue/tasks.py +++ b/server/queue/tasks.py @@ -46,8 +46,7 @@ def fetch_and_process_dataset(self, raw_posts = connector.get_new_posts_by_search( search=search, category=category, - post_limit=limit, - comment_limit=limit + post_limit=limit ) posts.extend(post.to_dict() for post in raw_posts) From c12f1b437109e9606b5888069e277a5a21a0ce2e Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Thu, 12 Mar 2026 09:56:34 +0000 Subject: [PATCH 26/36] chore(connectors): add category and search validation fields --- 
server/connectors/base.py | 3 +++ server/connectors/boards_api.py | 3 +++ server/connectors/reddit_api.py | 6 ++++-- server/connectors/registry.py | 15 ++++++++++----- server/connectors/youtube_api.py | 6 ++++-- 5 files changed, 24 insertions(+), 9 deletions(-) diff --git a/server/connectors/base.py b/server/connectors/base.py index bad73c5..3614c8a 100644 --- a/server/connectors/base.py +++ b/server/connectors/base.py @@ -7,6 +7,9 @@ class BaseConnector(ABC): display_name: str # human-readable: "Reddit", "YouTube" required_env: list[str] = [] # env vars needed to activate + search_enabled: bool + categories_enabled: bool + @classmethod def is_available(cls) -> bool: """Returns True if all required env vars are set.""" diff --git a/server/connectors/boards_api.py b/server/connectors/boards_api.py index 600c864..6cded92 100644 --- a/server/connectors/boards_api.py +++ b/server/connectors/boards_api.py @@ -19,6 +19,9 @@ class BoardsAPI(BaseConnector): source_name: str = "boards.ie" display_name: str = "Boards.ie" + categories_enabled: bool = True + search_enabled: bool = False + def __init__(self): self.url = "https://www.boards.ie" diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py index 9042c6c..dd62119 100644 --- a/server/connectors/reddit_api.py +++ b/server/connectors/reddit_api.py @@ -10,8 +10,10 @@ from server.connectors.base import BaseConnector logger = logging.getLogger(__name__) class RedditAPI(BaseConnector): - source_name = "reddit" - display_name = "Reddit" + source_name: str = "reddit" + display_name: str = "Reddit" + search_enabled: bool = True + categories_enabled: bool = True def __init__(self): self.url = "https://www.reddit.com/" diff --git a/server/connectors/registry.py b/server/connectors/registry.py index 47b1d6a..f2371e6 100644 --- a/server/connectors/registry.py +++ b/server/connectors/registry.py @@ -18,8 +18,13 @@ def _discover_connectors() -> list[type[BaseConnector]]: def get_available_connectors() -> 
dict[str, type[BaseConnector]]: return {c.source_name: c for c in _discover_connectors() if c.is_available()} -def get_connector_metadata() -> list[dict]: - return [ - {"id": id, "label": obj.display_name} - for id, obj in get_available_connectors().items() - ] \ No newline at end of file +def get_connector_metadata() -> dict[str, dict]: + res = {} + for id, obj in get_available_connectors().items(): + res[id] = {"id": id, + "label": obj.display_name, + "search_enabled": obj.search_enabled, + "categories_enabled": obj.categories_enabled + } + + return res \ No newline at end of file diff --git a/server/connectors/youtube_api.py b/server/connectors/youtube_api.py index a3047d7..bcf5fe8 100644 --- a/server/connectors/youtube_api.py +++ b/server/connectors/youtube_api.py @@ -13,8 +13,10 @@ load_dotenv() API_KEY = os.getenv("YOUTUBE_API_KEY") class YouTubeAPI(BaseConnector): - source_name = "youtube" - display_name = "YouTube" + source_name: str = "youtube" + display_name: str = "YouTube" + search_enabled: bool = True + categories_enabled: bool = False def __init__(self): self.youtube = build('youtube', 'v3', developerKey=API_KEY) From 6684780d233ae34e63cc6759ee0ea3d4f28177a7 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Thu, 12 Mar 2026 09:59:07 +0000 Subject: [PATCH 27/36] fix(connectors): add stronger validation to scrape endpoint Strong validation needed, otherwise data goes to Celery and crashes silently. In addition it checks if that specific source supports search or category. 
--- server/app.py | 82 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 25 deletions(-) diff --git a/server/app.py b/server/app.py index 460cb77..b315a60 100644 --- a/server/app.py +++ b/server/app.py @@ -119,50 +119,82 @@ def get_user_datasets(): @app.route("/datasets/sources", methods=["GET"]) def get_dataset_sources(): - return jsonify(get_connector_metadata()) + list_metadata = list(get_connector_metadata().values()) + return jsonify(list_metadata) @app.route("/datasets/scrape", methods=["POST"]) @jwt_required() def scrape_data(): data = request.get_json() + connector_metadata = get_connector_metadata() + # Strong validation needed, otherwise data goes to Celery and crashes silently if not data or "sources" not in data: - return jsonify({"error": "Sources must be provided"}), 400 - - user_id = int(get_jwt_identity()) + return jsonify({"error": "Sources must be provided"}), 400 + + if "name" not in data or not str(data["name"]).strip(): + return jsonify({"error": "Dataset name is required"}), 400 + dataset_name = data["name"].strip() + user_id = int(get_jwt_identity()) + source_configs = data["sources"] if not isinstance(source_configs, list) or len(source_configs) == 0: return jsonify({"error": "Sources must be a non-empty list"}), 400 - # Light Validation for source in source_configs: + if not isinstance(source, dict): + return jsonify({"error": "Each source must be an object"}), 400 + if "name" not in source: return jsonify({"error": "Each source must contain a name"}), 400 - if "limit" in source: - source["limit"] = int(source["limit"]) - - dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list) - dataset_manager.set_dataset_status( - dataset_id, - "fetching", - f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}" - ) - - try: - fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list) - return jsonify( - { - "message": "Dataset 
queued for processing", - "dataset_id": dataset_id, - "status": "processing", - } - ), 202 + if "limit" in source: + try: + source["limit"] = int(source["limit"]) + except (ValueError, TypeError): + return jsonify({"error": "Limit must be an integer"}), 400 + + name = source["name"] + + if name not in connector_metadata: + return jsonify({"error": "Source not supported"}), 400 + + if "search" in source and not connector_metadata[name]["search_enabled"]: + return jsonify({"error": f"Source {name} does not support search"}), 400 + + if "category" in source and not connector_metadata[name]["categories_enabled"]: + return jsonify({"error": f"Source {name} does not support categories"}), 400 + + try: + dataset_id = dataset_manager.save_dataset_info( + user_id, + dataset_name, + default_topic_list + ) + + dataset_manager.set_dataset_status( + dataset_id, + "fetching", + f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}" + ) + + fetch_and_process_dataset.delay( + dataset_id, + source_configs, + default_topic_list + ) except Exception: print(traceback.format_exc()) - return jsonify({"error": "An unexpected error occurred"}), 500 + return jsonify({"error": "Failed to queue dataset processing"}), 500 + + + return jsonify({ + "message": "Dataset queued for processing", + "dataset_id": dataset_id, + "status": "processing" + }), 202 @app.route("/datasets/upload", methods=["POST"]) @jwt_required() From 162a4de64e078f8543b307ba70dc7c633ccbb73f Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Thu, 12 Mar 2026 10:07:28 +0000 Subject: [PATCH 28/36] fix(frontend): detects which sources support category or search --- frontend/src/pages/AutoScrape.tsx | 61 +++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/frontend/src/pages/AutoScrape.tsx b/frontend/src/pages/AutoScrape.tsx index 7e8e754..9e9d336 100644 --- a/frontend/src/pages/AutoScrape.tsx +++ b/frontend/src/pages/AutoScrape.tsx @@ -9,6 +9,10 @@ 
const API_BASE_URL = import.meta.env.VITE_BACKEND_URL; type SourceOption = { id: string; label: string; + search_enabled?: boolean; + categories_enabled?: boolean; + searchEnabled?: boolean; + categoriesEnabled?: boolean; }; type SourceConfig = { @@ -25,6 +29,12 @@ const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({ category: "", }); +const supportsSearch = (source?: SourceOption): boolean => + Boolean(source?.search_enabled ?? source?.searchEnabled); + +const supportsCategories = (source?: SourceOption): boolean => + Boolean(source?.categories_enabled ?? source?.categoriesEnabled); + const AutoScrapePage = () => { const navigate = useNavigate(); const [datasetName, setDatasetName] = useState(""); @@ -63,11 +73,18 @@ const AutoScrapePage = () => { const updateSourceConfig = (index: number, field: keyof SourceConfig, value: string) => { setSourceConfigs((previous) => previous.map((config, configIndex) => - configIndex === index ? { ...config, [field]: value } : config + configIndex === index + ? field === "sourceName" + ? { ...config, sourceName: value, search: "", category: "" } + : { ...config, [field]: value } + : config ) ); }; + const getSourceOption = (sourceName: string) => + sourceOptions.find((option) => option.id === sourceName); + const addSourceConfig = () => { setSourceConfigs((previous) => [ ...previous, @@ -100,12 +117,18 @@ const AutoScrapePage = () => { return; } - const normalizedSources = sourceConfigs.map((source) => ({ - name: source.sourceName, - limit: Number(source.limit || 100), - search: source.search.trim() || undefined, - category: source.category.trim() || undefined, - })); + const normalizedSources = sourceConfigs.map((source) => { + const sourceOption = getSourceOption(source.sourceName); + + return { + name: source.sourceName, + limit: Number(source.limit || 100), + search: supportsSearch(sourceOption) ? source.search.trim() || undefined : undefined, + category: supportsCategories(sourceOption) + ? 
source.category.trim() || undefined + : undefined, + }; + }); const invalidSource = normalizedSources.find( (source) => !source.name || !Number.isFinite(source.limit) || source.limit <= 0 @@ -212,7 +235,12 @@ const AutoScrapePage = () => { {!isLoadingSources && sourceOptions.length > 0 && (
- {sourceConfigs.map((source, index) => ( + {sourceConfigs.map((source, index) => { + const sourceOption = getSourceOption(source.sourceName); + const searchEnabled = supportsSearch(sourceOption); + const categoriesEnabled = supportsCategories(sourceOption); + + return (
{ updateSourceConfig(index, "search", event.target.value)} /> updateSourceConfig(index, "category", event.target.value)} /> @@ -271,7 +309,8 @@ const AutoScrapePage = () => { )}
- ))} + ); + })}