From ca444e9cb0da2742395130098469d913c515a491 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 9 Mar 2026 20:53:13 +0000 Subject: [PATCH 01/36] refactor: move connectors to backend dir They will now be more used in the backend. --- {connectors => server/connectors}/boards_api.py | 0 {connectors => server/connectors}/reddit_api.py | 0 {connectors => server/connectors}/youtube_api.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename {connectors => server/connectors}/boards_api.py (100%) rename {connectors => server/connectors}/reddit_api.py (100%) rename {connectors => server/connectors}/youtube_api.py (100%) diff --git a/connectors/boards_api.py b/server/connectors/boards_api.py similarity index 100% rename from connectors/boards_api.py rename to server/connectors/boards_api.py diff --git a/connectors/reddit_api.py b/server/connectors/reddit_api.py similarity index 100% rename from connectors/reddit_api.py rename to server/connectors/reddit_api.py diff --git a/connectors/youtube_api.py b/server/connectors/youtube_api.py similarity index 100% rename from connectors/youtube_api.py rename to server/connectors/youtube_api.py From 262a70dbf331247ce2bcc76903870ec656b85582 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 9 Mar 2026 20:55:12 +0000 Subject: [PATCH 02/36] refactor(api): rename /upload endpoint Ensures consistency with the other dataset-based endpoints and follows the REST-API rules more cleanly. 
--- frontend/src/pages/Upload.tsx | 2 +- server/app.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend/src/pages/Upload.tsx b/frontend/src/pages/Upload.tsx index 93383dc..0799f9b 100644 --- a/frontend/src/pages/Upload.tsx +++ b/frontend/src/pages/Upload.tsx @@ -40,7 +40,7 @@ const UploadPage = () => { setHasError(false); setReturnMessage(""); - const response = await axios.post(`${API_BASE_URL}/upload`, formData, { + const response = await axios.post(`${API_BASE_URL}/datasets/upload`, formData, { headers: { "Content-Type": "multipart/form-data", }, diff --git a/server/app.py b/server/app.py index 7cbf9d3..eb27e70 100644 --- a/server/app.py +++ b/server/app.py @@ -111,7 +111,7 @@ def get_user_datasets(): current_user = int(get_jwt_identity()) return jsonify(dataset_manager.get_user_datasets(current_user)), 200 -@app.route("/upload", methods=["POST"]) +@app.route("/datasets/upload", methods=["POST"]) @jwt_required() def upload_data(): if "posts" not in request.files or "topics" not in request.files: From cc799f736875d7e8326423b78c4b650c8ebee359 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 9 Mar 2026 21:28:44 +0000 Subject: [PATCH 03/36] feat(connectors): add base connector and registry for detection Idea is to have a "plugin-type" system, where new connectors can extend the `BaseConnector` class and implement the fetch posts method. These are automatically detected by the registry, and automatically used in new Flask endpoints that give a list of possible sources. Allows for an open-ended system where new data scrapers / API consumers can be added dynamically. 
--- server/app.py | 14 ++++++++++++++ server/connectors/base.py | 23 +++++++++++++++++++++++ server/connectors/boards_api.py | 13 ++++++++++--- server/connectors/registry.py | 25 +++++++++++++++++++++++++ server/core/datasets.py | 2 +- 5 files changed, 73 insertions(+), 4 deletions(-) create mode 100644 server/connectors/base.py create mode 100644 server/connectors/registry.py diff --git a/server/app.py b/server/app.py index eb27e70..23c095f 100644 --- a/server/app.py +++ b/server/app.py @@ -111,6 +111,20 @@ def get_user_datasets(): current_user = int(get_jwt_identity()) return jsonify(dataset_manager.get_user_datasets(current_user)), 200 +@app.route("/datasets/sources", methods=["GET"]) +@jwt_required() +def get_dataset_sources(): + return jsonify({""}) + +@app.route("/datasets/scrape", methods=["POST"]) +@jwt_required() +def scrape_data(): + if "sources" not in request.form: + return jsonify({"error": "Data source names are required."}), 400 + + sources = request.form.get("sources") + + @app.route("/datasets/upload", methods=["POST"]) @jwt_required() def upload_data(): diff --git a/server/connectors/base.py b/server/connectors/base.py new file mode 100644 index 0000000..f555769 --- /dev/null +++ b/server/connectors/base.py @@ -0,0 +1,23 @@ +from abc import ABC, abstractmethod +from dto.post import Post + +class BaseConnector(ABC): + # Each subclass declares these at the class level + source_name: str # machine-readable: "reddit", "youtube" + display_name: str # human-readable: "Reddit", "YouTube" + required_env: list[str] = [] # env vars needed to activate + + @classmethod + def is_available(cls) -> bool: + """Returns True if all required env vars are set.""" + import os + return all(os.getenv(var) for var in cls.required_env) + + @abstractmethod + def get_new_posts_by_search(self, + search: str = None, + category: str = None, + post_limit: int = 10, + comment_limit: int = 10 + ) -> list[Post]: + ... 
\ No newline at end of file diff --git a/server/connectors/boards_api.py b/server/connectors/boards_api.py index 1b63aa9..e714048 100644 --- a/server/connectors/boards_api.py +++ b/server/connectors/boards_api.py @@ -7,6 +7,7 @@ from dto.post import Post from dto.comment import Comment from bs4 import BeautifulSoup from concurrent.futures import ThreadPoolExecutor, as_completed +from server.connectors.base import BaseConnector logger = logging.getLogger(__name__) @@ -14,12 +15,18 @@ HEADERS = { "User-Agent": "Mozilla/5.0 (compatible; ForumScraper/1.0)" } -class BoardsAPI: +class BoardsAPI(BaseConnector): def __init__(self): self.url = "https://www.boards.ie" - self.source_name = "Boards.ie" + self.source_name = "boards.ie" + self.display_name = "Boards.ie" - def get_new_category_posts(self, category: str, post_limit: int, comment_limit: int) -> list[Post]: + def get_new_posts_by_search(self, + search: str, + category: str, + post_limit: int, + comment_limit: int + ) -> list[Post]: urls = [] current_page = 1 diff --git a/server/connectors/registry.py b/server/connectors/registry.py new file mode 100644 index 0000000..0883476 --- /dev/null +++ b/server/connectors/registry.py @@ -0,0 +1,25 @@ +import pkgutil +import importlib +import connectors +from connectors.base import BaseConnector + +def _discover_connectors() -> list[type[BaseConnector]]: + """Walk the connectors package and collect all BaseConnector subclasses.""" + for _, module_name, _ in pkgutil.iter_modules(connectors.__path__): + if module_name in ("base", "registry"): + continue + importlib.import_module(f"connectors.{module_name}") + + return [ + cls for cls in BaseConnector.__subclasses__() + if cls.source_name # guard against abstract intermediaries + ] + +def get_available_connectors() -> list[type[BaseConnector]]: + return [c for c in _discover_connectors() if c.is_available()] + +def get_connector_metadata() -> list[dict]: + return [ + {"id": c.source_name, "label": c.display_name} + for c in 
get_available_connectors() + ] \ No newline at end of file diff --git a/server/core/datasets.py b/server/core/datasets.py index 5886cfc..3a62fc9 100644 --- a/server/core/datasets.py +++ b/server/core/datasets.py @@ -1,7 +1,7 @@ import pandas as pd from server.db.database import PostgresConnector from psycopg2.extras import Json -from server.exceptions import NotAuthorisedException, NonExistentDatasetException +from server.exceptions import NonExistentDatasetException class DatasetManager: def __init__(self, db: PostgresConnector): From e7a8c17be4d524fd5cb7f57435ed7709396b0838 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:08:01 +0000 Subject: [PATCH 04/36] chore(connectors): add base connector inheritance --- server/connectors/reddit_api.py | 3 ++- server/connectors/youtube_api.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py index 0ec6100..61f3656 100644 --- a/server/connectors/reddit_api.py +++ b/server/connectors/reddit_api.py @@ -5,10 +5,11 @@ import time from dto.post import Post from dto.user import User from dto.comment import Comment +from server.connectors.base import BaseConnector logger = logging.getLogger(__name__) -class RedditAPI: +class RedditAPI(BaseConnector): def __init__(self): self.url = "https://www.reddit.com/" self.source_name = "Reddit" diff --git a/server/connectors/youtube_api.py b/server/connectors/youtube_api.py index d0e00a3..71ce6ed 100644 --- a/server/connectors/youtube_api.py +++ b/server/connectors/youtube_api.py @@ -6,12 +6,13 @@ from googleapiclient.discovery import build from googleapiclient.errors import HttpError from dto.post import Post from dto.comment import Comment +from server.connectors.base import BaseConnector load_dotenv() API_KEY = os.getenv("YOUTUBE_API_KEY") -class YouTubeAPI: +class YouTubeAPI(BaseConnector): def __init__(self): self.youtube = build('youtube', 'v3', developerKey=API_KEY) From 
2a8d7c797237cbccd62ed35d32b1f53fcb15bf49 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:11:33 +0000 Subject: [PATCH 05/36] refactor(connectors): Youtube & Reddit connectors implement BaseConnector --- server/connectors/reddit_api.py | 12 ++++- server/connectors/youtube_api.py | 91 +++++++++++++++++--------------- 2 files changed, 58 insertions(+), 45 deletions(-) diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py index 61f3656..13e5e7b 100644 --- a/server/connectors/reddit_api.py +++ b/server/connectors/reddit_api.py @@ -15,7 +15,15 @@ class RedditAPI(BaseConnector): self.source_name = "Reddit" # Public Methods # - def search_new_subreddit_posts(self, search: str, subreddit: str, limit: int) -> list[Post]: + def get_new_posts_by_search(self, + search: str, + subreddit: str, + limit: int + ) -> list[Post]: + + if not search: + return self._get_new_subreddit_posts(subreddit, limit=limit) + params = { 'q': search, 'limit': limit, @@ -43,7 +51,7 @@ class RedditAPI(BaseConnector): return posts - def get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]: + def _get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]: posts = [] after = None url = f"r/{subreddit}/new.json" diff --git a/server/connectors/youtube_api.py b/server/connectors/youtube_api.py index 71ce6ed..691f53d 100644 --- a/server/connectors/youtube_api.py +++ b/server/connectors/youtube_api.py @@ -16,6 +16,54 @@ class YouTubeAPI(BaseConnector): def __init__(self): self.youtube = build('youtube', 'v3', developerKey=API_KEY) + def get_new_posts_by_search(self, + search: str, + category: str, + post_limit: int, + comment_limit: int + ) -> list[Post]: + videos = self.search_videos(search, post_limit) + posts = [] + + for video in videos: + video_id = video['id']['videoId'] + snippet = video['snippet'] + title = snippet['title'] + description = snippet['description'] + published_at = 
datetime.datetime.strptime(snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp() + channel_title = snippet['channelTitle'] + + comments = [] + comments_data = self.get_video_comments(video_id, comment_limit) + for comment_thread in comments_data: + comment_snippet = comment_thread['snippet']['topLevelComment']['snippet'] + comment = Comment( + id=comment_thread['id'], + post_id=video_id, + content=comment_snippet['textDisplay'], + author=comment_snippet['authorDisplayName'], + timestamp=datetime.datetime.strptime(comment_snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp(), + reply_to=None, + source="YouTube" + ) + + comments.append(comment) + + post = Post( + id=video_id, + content=f"{title}\n\n{description}", + author=channel_title, + timestamp=published_at, + url=f"https://www.youtube.com/watch?v={video_id}", + title=title, + source="YouTube", + comments=comments + ) + + posts.append(post) + + return posts + def search_videos(self, query, limit): request = self.youtube.search().list( q=query, @@ -40,46 +88,3 @@ class YouTubeAPI(BaseConnector): print(f"Error fetching comments for video {video_id}: {e}") return [] return response.get('items', []) - - def fetch_videos(self, query, video_limit, comment_limit) -> list[Post]: - videos = self.search_videos(query, video_limit) - posts = [] - - for video in videos: - video_id = video['id']['videoId'] - snippet = video['snippet'] - title = snippet['title'] - description = snippet['description'] - published_at = datetime.datetime.strptime(snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp() - channel_title = snippet['channelTitle'] - - comments = [] - comments_data = self.get_video_comments(video_id, comment_limit) - for comment_thread in comments_data: - comment_snippet = comment_thread['snippet']['topLevelComment']['snippet'] - comment = Comment( - id=comment_thread['id'], - post_id=video_id, - content=comment_snippet['textDisplay'], - author=comment_snippet['authorDisplayName'], - 
timestamp=datetime.datetime.strptime(comment_snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp(), - reply_to=None, - source="YouTube" - ) - - comments.append(comment) - - post = Post( - id=video_id, - content=f"{title}\n\n{description}", - author=channel_title, - timestamp=published_at, - url=f"https://www.youtube.com/watch?v={video_id}", - title=title, - source="YouTube", - comments=comments - ) - - posts.append(post) - - return posts \ No newline at end of file From 5ccb2e73cd0512fd0ee95712e2d9e001b63fd405 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:18:42 +0000 Subject: [PATCH 06/36] fix(connectors): incorrect registry location Registry paths were using the incorrect connector path locations. --- server/app.py | 4 ++-- server/connectors/boards_api.py | 5 +++-- server/connectors/reddit_api.py | 4 +++- server/connectors/registry.py | 8 ++++---- server/connectors/youtube_api.py | 5 ++++- 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/server/app.py b/server/app.py index 23c095f..e5a8037 100644 --- a/server/app.py +++ b/server/app.py @@ -21,6 +21,7 @@ from server.core.auth import AuthManager from server.core.datasets import DatasetManager from server.utils import get_request_filters from server.queue.tasks import process_dataset +from server.connectors.registry import get_connector_metadata app = Flask(__name__) @@ -112,9 +113,8 @@ def get_user_datasets(): return jsonify(dataset_manager.get_user_datasets(current_user)), 200 @app.route("/datasets/sources", methods=["GET"]) -@jwt_required() def get_dataset_sources(): - return jsonify({""}) + return jsonify(get_connector_metadata()) @app.route("/datasets/scrape", methods=["POST"]) @jwt_required() diff --git a/server/connectors/boards_api.py b/server/connectors/boards_api.py index e714048..9109e71 100644 --- a/server/connectors/boards_api.py +++ b/server/connectors/boards_api.py @@ -16,10 +16,11 @@ HEADERS = { } class BoardsAPI(BaseConnector): + source_name: str = 
"boards.ie" + display_name: str = "Boards.ie" + def __init__(self): self.url = "https://www.boards.ie" - self.source_name = "boards.ie" - self.display_name = "Boards.ie" def get_new_posts_by_search(self, search: str, diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py index 13e5e7b..2107ded 100644 --- a/server/connectors/reddit_api.py +++ b/server/connectors/reddit_api.py @@ -10,9 +10,11 @@ from server.connectors.base import BaseConnector logger = logging.getLogger(__name__) class RedditAPI(BaseConnector): + source_name = "reddit" + display_name = "Reddit" + def __init__(self): self.url = "https://www.reddit.com/" - self.source_name = "Reddit" # Public Methods # def get_new_posts_by_search(self, diff --git a/server/connectors/registry.py b/server/connectors/registry.py index 0883476..dfc1fda 100644 --- a/server/connectors/registry.py +++ b/server/connectors/registry.py @@ -1,14 +1,14 @@ import pkgutil import importlib -import connectors -from connectors.base import BaseConnector +import server.connectors +from server.connectors.base import BaseConnector def _discover_connectors() -> list[type[BaseConnector]]: """Walk the connectors package and collect all BaseConnector subclasses.""" - for _, module_name, _ in pkgutil.iter_modules(connectors.__path__): + for _, module_name, _ in pkgutil.iter_modules(server.connectors.__path__): if module_name in ("base", "registry"): continue - importlib.import_module(f"connectors.{module_name}") + importlib.import_module(f"server.connectors.{module_name}") return [ cls for cls in BaseConnector.__subclasses__() diff --git a/server/connectors/youtube_api.py b/server/connectors/youtube_api.py index 691f53d..323d3f8 100644 --- a/server/connectors/youtube_api.py +++ b/server/connectors/youtube_api.py @@ -13,6 +13,9 @@ load_dotenv() API_KEY = os.getenv("YOUTUBE_API_KEY") class YouTubeAPI(BaseConnector): + source_name = "youtube" + display_name = "YouTube" + def __init__(self): self.youtube = build('youtube', 
'v3', developerKey=API_KEY) @@ -44,7 +47,7 @@ class YouTubeAPI(BaseConnector): author=comment_snippet['authorDisplayName'], timestamp=datetime.datetime.strptime(comment_snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp(), reply_to=None, - source="YouTube" + source=self.source_name ) comments.append(comment) From 0866dda8b3fe23312c9efc9d03998765b880eddd Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:25:05 +0000 Subject: [PATCH 07/36] chore: add util to always split evenly --- server/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/utils.py b/server/utils.py index 815739f..5a783d2 100644 --- a/server/utils.py +++ b/server/utils.py @@ -48,3 +48,7 @@ def get_request_filters() -> dict: filters["data_sources"] = data_sources return filters + +def split_limit(limit: int, n: int) -> list[int]: + base, remainder = divmod(limit, n) + return [base + (1 if i < remainder else 0) for i in range(n)] From 53cb5c2ea52bedcd5c84b45551aa43aa7fbe9e6a Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:36:08 +0000 Subject: [PATCH 08/36] feat(topics): add generalised topic list This is easier and quicker compared to deriving a topics list based on the dataset that has been scraped. While using LLMs to create a personalised topic list based on the query, category or dataset itself would yield better results for most, it is beyond the scope of this project. 
--- server/topics.json | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 server/topics.json diff --git a/server/topics.json b/server/topics.json new file mode 100644 index 0000000..271913a --- /dev/null +++ b/server/topics.json @@ -0,0 +1,67 @@ +{ + "Personal Life": "daily life, life updates, what happened today, personal stories, life events, reflections", + + "Relationships": "dating, relationships, breakups, friendships, family relationships, marriage, relationship advice", + + "Family & Parenting": "parents, parenting, children, raising kids, family dynamics, family stories", + + "Work & Careers": "jobs, workplaces, office life, promotions, quitting jobs, career advice, workplace drama", + + "Education": "school, studying, exams, university, homework, academic pressure, learning experiences", + + "Money & Finance": "saving money, debt, budgeting, cost of living, financial advice, personal finance", + + "Health & Fitness": "exercise, gym, workouts, running, diet, fitness routines, weight loss", + + "Mental Health": "stress, anxiety, depression, burnout, therapy, emotional wellbeing", + + "Food & Cooking": "meals, cooking, recipes, restaurants, snacks, food opinions", + + "Travel": "holidays, trips, tourism, travel experiences, airports, flights, travel tips", + + "Entertainment": "movies, TV shows, streaming services, celebrities, pop culture", + + "Music": "songs, albums, artists, concerts, music opinions", + + "Gaming": "video games, gaming culture, consoles, PC gaming, esports", + + "Sports": "sports matches, teams, players, competitions, sports opinions", + + "Technology": "phones, gadgets, apps, AI, software, tech trends", + + "Internet Culture": "memes, viral trends, online jokes, internet drama, trending topics", + + "Social Media": "platforms, influencers, content creators, algorithms, online communities", + + "News & Current Events": "breaking news, world events, major incidents, public discussions", 
+ + "Politics": "political debates, elections, government policies, ideology", + + "Culture & Society": "social issues, cultural trends, generational debates, societal changes", + + "Identity & Lifestyle": "personal identity, lifestyle choices, values, self-expression", + + "Hobbies & Interests": "art, photography, crafts, collecting, hobbies", + + "Fashion & Beauty": "clothing, style, makeup, skincare, fashion trends", + + "Animals & Pets": "pets, animal videos, pet care, wildlife", + + "Humour": "jokes, funny stories, sarcasm, memes", + + "Opinions & Debates": "hot takes, controversial opinions, arguments, discussions", + + "Advice & Tips": "life advice, tutorials, how-to tips, recommendations", + + "Product Reviews": "reviews, recommendations, experiences with products", + + "Complaints & Rants": "frustrations, complaining, venting about things", + + "Motivation & Inspiration": "motivational quotes, success stories, encouragement", + + "Questions & Curiosity": "asking questions, seeking opinions, curiosity posts", + + "Celebrations & Achievements": "birthdays, milestones, achievements, good news", + + "Random Thoughts": "shower thoughts, observations, random ideas" +} \ No newline at end of file From 17bd4702b22a5bce43a68a9efc1c2c1ed51df8db Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:36:40 +0000 Subject: [PATCH 09/36] fix(connectors): connector detectors returning name of ID alongside connector obj --- server/connectors/registry.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/connectors/registry.py b/server/connectors/registry.py index dfc1fda..47b1d6a 100644 --- a/server/connectors/registry.py +++ b/server/connectors/registry.py @@ -15,11 +15,11 @@ def _discover_connectors() -> list[type[BaseConnector]]: if cls.source_name # guard against abstract intermediaries ] -def get_available_connectors() -> list[type[BaseConnector]]: - return [c for c in _discover_connectors() if c.is_available()] +def 
get_available_connectors() -> dict[str, type[BaseConnector]]: + return {c.source_name: c for c in _discover_connectors() if c.is_available()} def get_connector_metadata() -> list[dict]: return [ - {"id": c.source_name, "label": c.display_name} - for c in get_available_connectors() + {"id": id, "label": obj.display_name} + for id, obj in get_available_connectors().items() ] \ No newline at end of file From 2572664e2694c638b0bbdc4d1b94caeb806740de Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 18:50:53 +0000 Subject: [PATCH 10/36] chore(utils): add env getter that fails if env not found --- server/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/server/utils.py b/server/utils.py index 5a783d2..1a507bb 100644 --- a/server/utils.py +++ b/server/utils.py @@ -1,4 +1,5 @@ import datetime +import os from flask import request def parse_datetime_filter(value): @@ -52,3 +53,9 @@ def get_request_filters() -> dict: def split_limit(limit: int, n: int) -> list[int]: base, remainder = divmod(limit, n) return [base + (1 if i < remainder else 0) for i in range(n)] + +def get_env(name: str) -> str: + value = os.getenv(name) + if not value: + raise RuntimeError(f"Missing required environment variable: {name}") + return value From 6ec47256d038ce7b4a30fea6812977cd8fdcf6dd Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 19:04:33 +0000 Subject: [PATCH 11/36] feat(api): add database scraping endpoints --- server/app.py | 60 ++++++++++++++++++++++++++++----- server/connectors/reddit_api.py | 19 ++++++----- 2 files changed, 61 insertions(+), 18 deletions(-) diff --git a/server/app.py b/server/app.py index e5a8037..d896ac2 100644 --- a/server/app.py +++ b/server/app.py @@ -19,19 +19,18 @@ from server.exceptions import NotAuthorisedException, NonExistentDatasetExceptio from server.db.database import PostgresConnector from server.core.auth import AuthManager from server.core.datasets import DatasetManager -from server.utils import 
get_request_filters +from server.utils import get_request_filters, split_limit, get_env from server.queue.tasks import process_dataset -from server.connectors.registry import get_connector_metadata +from server.connectors.registry import get_available_connectors, get_connector_metadata app = Flask(__name__) # Env Variables load_dotenv() -frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173") -jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this") -jwt_access_token_expires = int( - os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200) -) # Default to 20 minutes +max_fetch_limit = int(get_env("MAX_FETCH_LIMIT")) +frontend_url = get_env("FRONTEND_URL") +jwt_secret_key = get_env("JWT_SECRET_KEY") +jwt_access_token_expires = int(os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)) # Default to 20 minutes # Flask Configuration CORS(app, resources={r"/*": {"origins": frontend_url}}) @@ -45,7 +44,8 @@ db = PostgresConnector() auth_manager = AuthManager(db, bcrypt) dataset_manager = DatasetManager(db) stat_gen = StatGen() - +connectors = get_available_connectors() +default_topic_list = json.load(open("server/topics.json")) @app.route("/register", methods=["POST"]) def register_user(): @@ -122,8 +122,50 @@ def scrape_data(): if "sources" not in request.form: return jsonify({"error": "Data source names are required."}), 400 - sources = request.form.get("sources") + user_id = int(get_jwt_identity()) + sources = request.form.getlist("sources") + limit = int(request.form.get("limit", max_fetch_limit)) + dataset_name = request.form.get("name", "").strip() + search = request.form.get("search") + category = request.form.get("category") + + print(sources) + + if limit > max_fetch_limit: + return jsonify({"error": f"Due to API limitations, we cannot receive more than ${max_fetch_limit} posts"}), 400 + + for source in sources: + if source not in connectors.keys(): + return jsonify({"error": "Source must exist"}), 400 + + limits = split_limit(limit, len(sources)) + 
per_source = dict(zip(sources, limits)) + + try: + posts = [] + for source_name, source_limit in per_source.items(): + connector = connectors[source_name]() + posts.extend(connector.get_new_posts_by_search( + search=search, + category=category, + post_limit=source_limit, + comment_limit=source_limit + )) + + dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, {}) + process_dataset.delay(dataset_id, [p.to_dict() for p in posts], default_topic_list) + + return jsonify( + { + "message": "Dataset queued for processing", + "dataset_id": dataset_id, + "status": "processing", + } + ), 202 + except Exception: + print(traceback.format_exc()) + return jsonify({"error": "An unexpected error occurred"}), 500 @app.route("/datasets/upload", methods=["POST"]) @jwt_required() diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py index 2107ded..444326a 100644 --- a/server/connectors/reddit_api.py +++ b/server/connectors/reddit_api.py @@ -19,32 +19,33 @@ class RedditAPI(BaseConnector): # Public Methods # def get_new_posts_by_search(self, search: str, - subreddit: str, - limit: int + category: str, + post_limit: int, + comment_limit: int ) -> list[Post]: if not search: - return self._get_new_subreddit_posts(subreddit, limit=limit) + return self._get_new_subreddit_posts(category, limit=post_limit) params = { 'q': search, - 'limit': limit, + 'limit': post_limit, 'restrict_sr': 'on', 'sort': 'new' } - logger.info(f"Searching subreddit '{subreddit}' for '{search}' with limit {limit}") - url = f"r/{subreddit}/search.json" + logger.info(f"Searching subreddit '{category}' for '{search}' with limit {post_limit}") + url = f"r/{category}/search.json" posts = [] - while len(posts) < limit: - batch_limit = min(100, limit - len(posts)) + while len(posts) < post_limit: + batch_limit = min(100, post_limit - len(posts)) params['limit'] = batch_limit data = self._fetch_post_overviews(url, params) batch_posts = self._parse_posts(data) - logger.debug(f"Fetched 
{len(batch_posts)} posts from search in subreddit {subreddit}") + logger.debug(f"Fetched {len(batch_posts)} posts from search in subreddit {category}") if not batch_posts: break From 15704a07823f6b1fd5ca40142f66d3b9e4e5acda Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 19:17:08 +0000 Subject: [PATCH 12/36] chore(db): update db schema to include "fetching" status --- server/core/datasets.py | 2 +- server/db/schema.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/server/core/datasets.py b/server/core/datasets.py index 3a62fc9..4690454 100644 --- a/server/core/datasets.py +++ b/server/core/datasets.py @@ -114,7 +114,7 @@ class DatasetManager: self.db.execute_batch(query, values) def set_dataset_status(self, dataset_id: int, status: str, status_message: str | None = None): - if status not in ["processing", "complete", "error"]: + if status not in ["fetching", "processing", "complete", "error"]: raise ValueError("Invalid status") query = """ diff --git a/server/db/schema.sql b/server/db/schema.sql index 051a396..4550633 100644 --- a/server/db/schema.sql +++ b/server/db/schema.sql @@ -23,7 +23,7 @@ CREATE TABLE datasets ( -- Enforce valid states CONSTRAINT datasets_status_check - CHECK (status IN ('processing', 'complete', 'error')) + CHECK (status IN ('fetching', 'processing', 'complete', 'error')) ); CREATE TABLE events ( From a65c4a461c8e10f278e12431985c999af97e4d8a Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 19:17:41 +0000 Subject: [PATCH 13/36] fix(api): flask delegates dataset fetch to celery --- server/app.py | 19 ++++--------------- server/queue/tasks.py | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/server/app.py b/server/app.py index d896ac2..16d6f39 100644 --- a/server/app.py +++ b/server/app.py @@ -20,7 +20,7 @@ from server.db.database import PostgresConnector from server.core.auth import AuthManager from server.core.datasets import 
DatasetManager from server.utils import get_request_filters, split_limit, get_env -from server.queue.tasks import process_dataset +from server.queue.tasks import process_dataset, fetch_and_process_dataset from server.connectors.registry import get_available_connectors, get_connector_metadata app = Flask(__name__) @@ -130,8 +130,6 @@ def scrape_data(): search = request.form.get("search") category = request.form.get("category") - print(sources) - if limit > max_fetch_limit: return jsonify({"error": f"Due to API limitations, we cannot receive more than ${max_fetch_limit} posts"}), 400 @@ -141,20 +139,11 @@ def scrape_data(): limits = split_limit(limit, len(sources)) per_source = dict(zip(sources, limits)) + dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list) + dataset_manager.set_dataset_status(dataset_id, "fetching", f"Data is being fetched from {str(sources)}") try: - posts = [] - for source_name, source_limit in per_source.items(): - connector = connectors[source_name]() - posts.extend(connector.get_new_posts_by_search( - search=search, - category=category, - post_limit=source_limit, - comment_limit=source_limit - )) - - dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, {}) - process_dataset.delay(dataset_id, [p.to_dict() for p in posts], default_topic_list) + fetch_and_process_dataset.delay(dataset_id, per_source, search, category, default_topic_list) return jsonify( { diff --git a/server/queue/tasks.py b/server/queue/tasks.py index a089596..8a71680 100644 --- a/server/queue/tasks.py +++ b/server/queue/tasks.py @@ -4,6 +4,7 @@ from server.queue.celery_app import celery from server.analysis.enrichment import DatasetEnrichment from server.db.database import PostgresConnector from server.core.datasets import DatasetManager +from server.connectors.registry import get_available_connectors @celery.task(bind=True, max_retries=3) def process_dataset(self, dataset_id: int, posts: list, topics: dict): @@ -18,5 +19,31 
@@ def process_dataset(self, dataset_id: int, posts: list, topics: dict): dataset_manager.save_dataset_content(dataset_id, enriched_df) dataset_manager.set_dataset_status(dataset_id, "complete", "NLP Processing Completed Successfully") + except Exception as e: + dataset_manager.set_dataset_status(dataset_id, "error", f"An error occurred: {e}") + +@celery.task(bind=True, max_retries=3) +def fetch_and_process_dataset(self, + dataset_id: int, + per_source: dict[str, int], + search: str, + category: str, + topics: dict): + connectors = get_available_connectors() + db = PostgresConnector() + dataset_manager = DatasetManager(db) + posts = [] + + try: + for source_name, source_limit in per_source.items(): + connector = connectors[source_name]() + posts.extend(connector.get_new_posts_by_search( + search=search, + category=category, + post_limit=source_limit, + comment_limit=source_limit + )) + + process_dataset.delay(dataset_id, [p.to_dict() for p in posts], topics) except Exception as e: dataset_manager.set_dataset_status(dataset_id, "error", f"An error occurred: {e}") \ No newline at end of file From a3dbe04a5715f2d5d0c7a7ef8ff605e29c229e03 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 19:23:48 +0000 Subject: [PATCH 14/36] fix(frontend): option to delete dataset not shown after fail --- frontend/src/pages/Datasets.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend/src/pages/Datasets.tsx b/frontend/src/pages/Datasets.tsx index 4c79cdc..ede2317 100644 --- a/frontend/src/pages/Datasets.tsx +++ b/frontend/src/pages/Datasets.tsx @@ -9,7 +9,7 @@ const API_BASE_URL = import.meta.env.VITE_BACKEND_URL; type DatasetItem = { id: number; name?: string; - status?: "processing" | "complete" | "error" | string; + status?: "processing" | "complete" | "error" | "fetching" | string; status_message?: string | null; completed_at?: string | null; created_at?: string | null; @@ -93,7 +93,7 @@ const DatasetsPage = () => {
    {datasets.map((dataset) => { - const isComplete = dataset.status === "complete"; + const isComplete = dataset.status === "complete" || dataset.status === "error"; const editPath = `/dataset/${dataset.id}/edit`; const targetPath = isComplete ? `/dataset/${dataset.id}/stats` From 7ccc934f7102138b3f413e6577182e12a9f7f7e4 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 22:14:45 +0000 Subject: [PATCH 15/36] build: change celery to debug mode --- docker-compose.dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 96c3430..dc3edb2 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -43,7 +43,7 @@ services: - .env command: > celery -A server.queue.celery_app.celery worker - --loglevel=info + --loglevel=debug --pool=solo depends_on: - postgres From dc330b87b9696a0bfaa78310b6eeb22674b81df3 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 22:17:00 +0000 Subject: [PATCH 16/36] fix(celery): process dataset directly in fetch task Calling the original `process_dataset` function led to issues with JSON serialisation. 
--- server/queue/tasks.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/server/queue/tasks.py b/server/queue/tasks.py index 8a71680..7feaf9a 100644 --- a/server/queue/tasks.py +++ b/server/queue/tasks.py @@ -1,4 +1,5 @@ import pandas as pd +import json from server.queue.celery_app import celery from server.analysis.enrichment import DatasetEnrichment @@ -37,13 +38,20 @@ def fetch_and_process_dataset(self, try: for source_name, source_limit in per_source.items(): connector = connectors[source_name]() - posts.extend(connector.get_new_posts_by_search( + raw_posts = connector.get_new_posts_by_search( search=search, category=category, post_limit=source_limit, comment_limit=source_limit - )) + ) + posts.extend(post.to_dict() for post in raw_posts) - process_dataset.delay(dataset_id, [p.to_dict() for p in posts], topics) + df = pd.DataFrame(posts) + + processor = DatasetEnrichment(df, topics) + enriched_df = processor.enrich() + + dataset_manager.save_dataset_content(dataset_id, enriched_df) + dataset_manager.set_dataset_status(dataset_id, "complete", "NLP Processing Completed Successfully") except Exception as e: dataset_manager.set_dataset_status(dataset_id, "error", f"An error occurred: {e}") \ No newline at end of file From 8fe84a30f6c1cf29c38ea4e4044ffd9d90756e68 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 22:45:07 +0000 Subject: [PATCH 17/36] fix: data leak when opening topics file --- server/app.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/server/app.py b/server/app.py index 16d6f39..3efa6eb 100644 --- a/server/app.py +++ b/server/app.py @@ -37,15 +37,20 @@ CORS(app, resources={r"/*": {"origins": frontend_url}}) app.config["JWT_SECRET_KEY"] = jwt_secret_key app.config["JWT_ACCESS_TOKEN_EXPIRES"] = jwt_access_token_expires +# Security bcrypt = Bcrypt(app) jwt = JWTManager(app) +# Helper Objects db = PostgresConnector() auth_manager = AuthManager(db, bcrypt) dataset_manager = 
DatasetManager(db) stat_gen = StatGen() connectors = get_available_connectors() -default_topic_list = json.load(open("server/topics.json")) + +# Default Files +with open("server/topics.json") as f: + default_topic_list = json.load(f) @app.route("/register", methods=["POST"]) def register_user(): From d520e2af98be4f2c0ab076af3b3126ba1d261f4d Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 22:48:04 +0000 Subject: [PATCH 18/36] fix(auth): missing email and username business rules --- server/core/auth.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/server/core/auth.py b/server/core/auth.py index 625c3c2..34bb93c 100644 --- a/server/core/auth.py +++ b/server/core/auth.py @@ -1,6 +1,10 @@ +import re + from server.db.database import PostgresConnector from flask_bcrypt import Bcrypt +EMAIL_REGEX = re.compile(r"[^@]+@[^@]+\.[^@]+") + class AuthManager: def __init__(self, db: PostgresConnector, bcrypt: Bcrypt): self.db = db @@ -18,6 +22,12 @@ class AuthManager: def register_user(self, username, email, password): hashed_password = self.bcrypt.generate_password_hash(password).decode("utf-8") + if len(username) < 3: + raise ValueError("Username must be longer than 3 characters") + + if not EMAIL_REGEX.match(email): + raise ValueError("Please enter a valid email address") + if self.get_user_by_email(email): raise ValueError("Email already registered") From 2ab74d922ab61b709c42cade087cbe1ff36747eb Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 23:15:33 +0000 Subject: [PATCH 19/36] feat(api): support per-source search, category and limit configuration --- server/app.py | 38 ++++++++++++++++++++------------------ server/queue/tasks.py | 21 +++++++++++++-------- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/server/app.py b/server/app.py index 3efa6eb..550537f 100644 --- a/server/app.py +++ b/server/app.py @@ -124,31 +124,33 @@ def get_dataset_sources(): @app.route("/datasets/scrape", methods=["POST"]) 
@jwt_required() def scrape_data(): - if "sources" not in request.form: - return jsonify({"error": "Data source names are required."}), 400 + data = request.get_json() + + if not data or "sources" not in data: + return jsonify({"error": "Sources must be provided"}), 400 user_id = int(get_jwt_identity()) - sources = request.form.getlist("sources") - limit = int(request.form.get("limit", max_fetch_limit)) + dataset_name = data["name"].strip() + source_configs = data["sources"] - dataset_name = request.form.get("name", "").strip() - search = request.form.get("search") - category = request.form.get("category") + if not isinstance(source_configs, list) or len(source_configs) == 0: + return jsonify({"error": "Sources must be a non-empty list"}), 400 - if limit > max_fetch_limit: - return jsonify({"error": f"Due to API limitations, we cannot receive more than ${max_fetch_limit} posts"}), 400 - - for source in sources: - if source not in connectors.keys(): - return jsonify({"error": "Source must exist"}), 400 - - limits = split_limit(limit, len(sources)) - per_source = dict(zip(sources, limits)) + # Light Validation + for source in source_configs: + if "name" not in source: + return jsonify({"error": "Each source must contain a name"}), 400 + if "limit" in source: + source["limit"] = int(source["limit"]) + dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list) - dataset_manager.set_dataset_status(dataset_id, "fetching", f"Data is being fetched from {str(sources)}") + dataset_manager.set_dataset_status(dataset_id, + "fetching", + f"Data is being fetched from {str(source["name"] + "," for source in source_configs)}" + ) try: - fetch_and_process_dataset.delay(dataset_id, per_source, search, category, default_topic_list) + fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list) return jsonify( { diff --git a/server/queue/tasks.py b/server/queue/tasks.py index 7feaf9a..fd5237f 100644 --- a/server/queue/tasks.py +++ 
b/server/queue/tasks.py @@ -1,5 +1,5 @@ import pandas as pd -import json +import logging from server.queue.celery_app import celery from server.analysis.enrichment import DatasetEnrichment @@ -7,6 +7,8 @@ from server.db.database import PostgresConnector from server.core.datasets import DatasetManager from server.connectors.registry import get_available_connectors +logger = logging.getLogger(__name__) + @celery.task(bind=True, max_retries=3) def process_dataset(self, dataset_id: int, posts: list, topics: dict): db = PostgresConnector() @@ -26,9 +28,7 @@ def process_dataset(self, dataset_id: int, posts: list, topics: dict): @celery.task(bind=True, max_retries=3) def fetch_and_process_dataset(self, dataset_id: int, - per_source: dict[str, int], - search: str, - category: str, + source_info: list[dict], topics: dict): connectors = get_available_connectors() db = PostgresConnector() @@ -36,13 +36,18 @@ def fetch_and_process_dataset(self, posts = [] try: - for source_name, source_limit in per_source.items(): - connector = connectors[source_name]() + for metadata in source_info: + name = metadata["name"] + search = metadata.get("search") + category = metadata.get("category") + limit = metadata.get("limit", 100) + + connector = connectors[name]() raw_posts = connector.get_new_posts_by_search( search=search, category=category, - post_limit=source_limit, - comment_limit=source_limit + post_limit=limit, + comment_limit=limit ) posts.extend(post.to_dict() for post in raw_posts) From 524c9c50a08e38880b4ae985e633b532513915db Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 23:23:22 +0000 Subject: [PATCH 20/36] fix(api): incorrect dataset status update message --- server/app.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/server/app.py b/server/app.py index 550537f..b187fb0 100644 --- a/server/app.py +++ b/server/app.py @@ -144,11 +144,12 @@ def scrape_data(): source["limit"] = int(source["limit"]) dataset_id = 
dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list) - dataset_manager.set_dataset_status(dataset_id, - "fetching", - f"Data is being fetched from {str(source["name"] + "," for source in source_configs)}" - ) - + dataset_manager.set_dataset_status( + dataset_id, + "fetching", + f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}" + ) + try: fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list) From eff416c34eef605b75b6804fdbb6ad13e8a46f86 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 10 Mar 2026 23:36:09 +0000 Subject: [PATCH 21/36] fix(connectors): hardcoded source name in Youtube connector --- server/connectors/youtube_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/connectors/youtube_api.py b/server/connectors/youtube_api.py index 323d3f8..a3047d7 100644 --- a/server/connectors/youtube_api.py +++ b/server/connectors/youtube_api.py @@ -59,7 +59,7 @@ class YouTubeAPI(BaseConnector): timestamp=published_at, url=f"https://www.youtube.com/watch?v={video_id}", title=title, - source="YouTube", + source=self.source_name, comments=comments ) From b2ae1a9f7013755387b3d3b9704992a5181abe94 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Wed, 11 Mar 2026 19:41:34 +0000 Subject: [PATCH 22/36] feat(frontend): add page for scraping endpoint --- frontend/src/App.tsx | 2 + frontend/src/pages/AutoScrape.tsx | 299 ++++++++++++++++++++++++++++ frontend/src/pages/Datasets.tsx | 15 +- frontend/src/utils/documentTitle.ts | 1 + 4 files changed, 314 insertions(+), 3 deletions(-) create mode 100644 frontend/src/pages/AutoScrape.tsx diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index b1e6045..1b10f61 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -5,6 +5,7 @@ import DatasetsPage from "./pages/Datasets"; import DatasetStatusPage from "./pages/DatasetStatus"; import LoginPage from "./pages/Login"; import UploadPage from 
"./pages/Upload"; +import AutoScrapePage from "./pages/AutoScrape"; import StatPage from "./pages/Stats"; import { getDocumentTitle } from "./utils/documentTitle"; import DatasetEditPage from "./pages/DatasetEdit"; @@ -22,6 +23,7 @@ function App() { } /> } /> } /> + } /> } /> } /> } /> diff --git a/frontend/src/pages/AutoScrape.tsx b/frontend/src/pages/AutoScrape.tsx new file mode 100644 index 0000000..7e8e754 --- /dev/null +++ b/frontend/src/pages/AutoScrape.tsx @@ -0,0 +1,299 @@ +import axios from "axios"; +import { useEffect, useState } from "react"; +import { useNavigate } from "react-router-dom"; +import StatsStyling from "../styles/stats_styling"; + +const styles = StatsStyling; +const API_BASE_URL = import.meta.env.VITE_BACKEND_URL; + +type SourceOption = { + id: string; + label: string; +}; + +type SourceConfig = { + sourceName: string; + limit: string; + search: string; + category: string; +}; + +const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({ + sourceName, + limit: "100", + search: "", + category: "", +}); + +const AutoScrapePage = () => { + const navigate = useNavigate(); + const [datasetName, setDatasetName] = useState(""); + const [sourceOptions, setSourceOptions] = useState([]); + const [sourceConfigs, setSourceConfigs] = useState([]); + const [returnMessage, setReturnMessage] = useState(""); + const [isLoadingSources, setIsLoadingSources] = useState(true); + const [isSubmitting, setIsSubmitting] = useState(false); + const [hasError, setHasError] = useState(false); + + useEffect(() => { + axios + .get(`${API_BASE_URL}/datasets/sources`) + .then((response) => { + const options = response.data || []; + setSourceOptions(options); + setSourceConfigs([buildEmptySourceConfig(options[0]?.id || "")]); + }) + .catch((requestError: unknown) => { + setHasError(true); + if (axios.isAxiosError(requestError)) { + setReturnMessage( + `Failed to load available sources: ${String( + requestError.response?.data?.error || requestError.message + )}` + 
); + } else { + setReturnMessage("Failed to load available sources."); + } + }) + .finally(() => { + setIsLoadingSources(false); + }); + }, []); + + const updateSourceConfig = (index: number, field: keyof SourceConfig, value: string) => { + setSourceConfigs((previous) => + previous.map((config, configIndex) => + configIndex === index ? { ...config, [field]: value } : config + ) + ); + }; + + const addSourceConfig = () => { + setSourceConfigs((previous) => [ + ...previous, + buildEmptySourceConfig(sourceOptions[0]?.id || ""), + ]); + }; + + const removeSourceConfig = (index: number) => { + setSourceConfigs((previous) => previous.filter((_, configIndex) => configIndex !== index)); + }; + + const autoScrape = async () => { + const token = localStorage.getItem("access_token"); + if (!token) { + setHasError(true); + setReturnMessage("You must be signed in to auto scrape a dataset."); + return; + } + + const normalizedDatasetName = datasetName.trim(); + if (!normalizedDatasetName) { + setHasError(true); + setReturnMessage("Please add a dataset name before continuing."); + return; + } + + if (sourceConfigs.length === 0) { + setHasError(true); + setReturnMessage("Please add at least one source."); + return; + } + + const normalizedSources = sourceConfigs.map((source) => ({ + name: source.sourceName, + limit: Number(source.limit || 100), + search: source.search.trim() || undefined, + category: source.category.trim() || undefined, + })); + + const invalidSource = normalizedSources.find( + (source) => !source.name || !Number.isFinite(source.limit) || source.limit <= 0 + ); + + if (invalidSource) { + setHasError(true); + setReturnMessage("Every source needs a name and a limit greater than zero."); + return; + } + + try { + setIsSubmitting(true); + setHasError(false); + setReturnMessage(""); + + const response = await axios.post( + `${API_BASE_URL}/datasets/scrape`, + { + name: normalizedDatasetName, + sources: normalizedSources, + }, + { + headers: { + Authorization: `Bearer 
${token}`, + }, + } + ); + + const datasetId = Number(response.data.dataset_id); + + setReturnMessage( + `Auto scrape queued successfully (dataset #${datasetId}). Redirecting to processing status...` + ); + + setTimeout(() => { + navigate(`/dataset/${datasetId}/status`); + }, 400); + } catch (requestError: unknown) { + setHasError(true); + if (axios.isAxiosError(requestError)) { + const message = String( + requestError.response?.data?.error || requestError.message || "Auto scrape failed." + ); + setReturnMessage(`Auto scrape failed: ${message}`); + } else { + setReturnMessage("Auto scrape failed due to an unexpected error."); + } + } finally { + setIsSubmitting(false); + } + }; + + return ( +
    +
    +
    +
    +

    Auto Scrape Dataset

    +

    + Select sources and scrape settings, then queue processing automatically. +

    +
    + +
    + +
    +
    +

    Dataset Name

    +

    Use a clear label so you can identify this run later.

    + setDatasetName(event.target.value)} + /> +
    + +
    +

    Sources

    +

    + Configure source, limit, optional search, and optional category. +

    + + {isLoadingSources &&

    Loading sources...

    } + + {!isLoadingSources && sourceOptions.length === 0 && ( +

    No source connectors are currently available.

    + )} + + {!isLoadingSources && sourceOptions.length > 0 && ( +
    + {sourceConfigs.map((source, index) => ( +
    + + + updateSourceConfig(index, "limit", event.target.value)} + /> + + updateSourceConfig(index, "search", event.target.value)} + /> + + updateSourceConfig(index, "category", event.target.value)} + /> + + {sourceConfigs.length > 1 && ( + + )} +
    + ))} + + +
    + )} +
    +
    + +
    + {returnMessage || + "After queueing, your dataset is fetched and processed in the background automatically."} +
    +
    +
    + ); +}; + +export default AutoScrapePage; diff --git a/frontend/src/pages/Datasets.tsx b/frontend/src/pages/Datasets.tsx index ede2317..daffaf2 100644 --- a/frontend/src/pages/Datasets.tsx +++ b/frontend/src/pages/Datasets.tsx @@ -63,9 +63,18 @@ const DatasetsPage = () => { View and reopen datasets you previously uploaded.

- +
+ + +
{error && ( diff --git a/frontend/src/utils/documentTitle.ts b/frontend/src/utils/documentTitle.ts index 904a6a8..5c7d00d 100644 --- a/frontend/src/utils/documentTitle.ts +++ b/frontend/src/utils/documentTitle.ts @@ -3,6 +3,7 @@ const DEFAULT_TITLE = "Ethnograph View"; const STATIC_TITLES: Record = { "/login": "Sign In", "/upload": "Upload Dataset", + "/auto-scrape": "Auto Scrape Dataset", "/datasets": "My Datasets", }; From 0658713f422fa92f9ccf343543fdfee951c15c17 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Wed, 11 Mar 2026 19:44:38 +0000 Subject: [PATCH 23/36] chore: remove unused dataset creation script --- create_dataset.py | 43 ------------------------------------------- 1 file changed, 43 deletions(-) delete mode 100644 create_dataset.py diff --git a/create_dataset.py b/create_dataset.py deleted file mode 100644 index 791b2bd..0000000 --- a/create_dataset.py +++ /dev/null @@ -1,43 +0,0 @@ -import json -import logging -from connectors.reddit_api import RedditAPI -from connectors.boards_api import BoardsAPI -from connectors.youtube_api import YouTubeAPI - -posts_file = 'posts_test.jsonl' - -reddit_connector = RedditAPI() -boards_connector = BoardsAPI() -youtube_connector = YouTubeAPI() - -logging.basicConfig(level=logging.DEBUG) -logging.getLogger("urllib3").setLevel(logging.WARNING) - -def remove_empty_posts(posts): - return [post for post in posts if post.content.strip() != ""] - -def save_to_jsonl(filename, posts): - with open(filename, 'a', encoding='utf-8') as f: - for post in posts: - # Convert post object to dict if it's a dataclass - data = post.to_dict() - f.write(json.dumps(data) + '\n') - - -def main(): - boards_posts = boards_connector.get_new_category_posts('cork-city', 1200, 1200) - save_to_jsonl(posts_file, boards_posts) - - reddit_posts = reddit_connector.get_new_subreddit_posts('cork', 1200) - reddit_posts = remove_empty_posts(reddit_posts) - save_to_jsonl(posts_file, reddit_posts) - - ireland_posts = 
reddit_connector.search_new_subreddit_posts('cork', 'ireland', 1200) - ireland_posts = remove_empty_posts(ireland_posts) - save_to_jsonl(posts_file, ireland_posts) - - youtube_videos = youtube_connector.fetch_videos('cork city', 1200, 1200) - save_to_jsonl(posts_file, youtube_videos) - -if __name__ == "__main__": - main() \ No newline at end of file From 12cbc240748e41087a4af57e9ea99fbf963f2ce2 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Wed, 11 Mar 2026 19:47:44 +0000 Subject: [PATCH 24/36] chore(utils): remove `split_limit` function --- server/app.py | 2 +- server/utils.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/server/app.py b/server/app.py index b187fb0..460cb77 100644 --- a/server/app.py +++ b/server/app.py @@ -19,7 +19,7 @@ from server.exceptions import NotAuthorisedException, NonExistentDatasetExceptio from server.db.database import PostgresConnector from server.core.auth import AuthManager from server.core.datasets import DatasetManager -from server.utils import get_request_filters, split_limit, get_env +from server.utils import get_request_filters, get_env from server.queue.tasks import process_dataset, fetch_and_process_dataset from server.connectors.registry import get_available_connectors, get_connector_metadata diff --git a/server/utils.py b/server/utils.py index 1a507bb..fb42953 100644 --- a/server/utils.py +++ b/server/utils.py @@ -50,10 +50,6 @@ def get_request_filters() -> dict: return filters -def split_limit(limit: int, n: int) -> list[int]: - base, remainder = divmod(limit, n) - return [base + (1 if i < remainder else 0) for i in range(n)] - def get_env(name: str) -> str: value = os.getenv(name) if not value: From 01d6bd01640cba4126ea262418ea366b5b4262eb Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Wed, 11 Mar 2026 21:16:26 +0000 Subject: [PATCH 25/36] fix(connectors): category / search fields breaking Ideally category and search are fully optional, however some sites break if one or the other is 
not provided. Unfortunately `boards.ie` has a different page type for searches and I'm not bothered to implement a scraper from scratch. In addition, removed comment limit options. --- server/connectors/base.py | 3 +-- server/connectors/boards_api.py | 35 +++++++++++++++---------- server/connectors/reddit_api.py | 45 ++++++++++++++++++--------------- server/queue/tasks.py | 3 +-- 4 files changed, 48 insertions(+), 38 deletions(-) diff --git a/server/connectors/base.py b/server/connectors/base.py index f555769..bad73c5 100644 --- a/server/connectors/base.py +++ b/server/connectors/base.py @@ -17,7 +17,6 @@ class BaseConnector(ABC): def get_new_posts_by_search(self, search: str = None, category: str = None, - post_limit: int = 10, - comment_limit: int = 10 + post_limit: int = 10 ) -> list[Post]: ... \ No newline at end of file diff --git a/server/connectors/boards_api.py b/server/connectors/boards_api.py index 9109e71..600c864 100644 --- a/server/connectors/boards_api.py +++ b/server/connectors/boards_api.py @@ -25,22 +25,29 @@ class BoardsAPI(BaseConnector): def get_new_posts_by_search(self, search: str, category: str, - post_limit: int, - comment_limit: int + post_limit: int ) -> list[Post]: + if search: + raise NotImplementedError("Search not compatible with boards.ie") + + if category: + return self._get_posts(f"{self.url}/categories/{category}", post_limit) + else: + return self._get_posts(f"{self.url}/discussions", post_limit) + + ## Private + def _get_posts(self, url, limit) -> list[Post]: urls = [] current_page = 1 - logger.info(f"Fetching posts from category: {category}") - - while len(urls) < post_limit: - url = f"{self.url}/categories/{category}/p{current_page}" + while len(urls) < limit: + url = f"{self.url}/p{current_page}" html = self._fetch_page(url) soup = BeautifulSoup(html, "html.parser") - logger.debug(f"Processing page {current_page} for category {category}") + logger.debug(f"Processing page {current_page} for link: {url}") for a in
soup.select("a.threadbit-threadlink"): - if len(urls) >= post_limit: + if len(urls) >= limit: break href = a.get("href") @@ -49,14 +56,14 @@ class BoardsAPI(BaseConnector): current_page += 1 - logger.debug(f"Fetched {len(urls)} post URLs from category {category}") + logger.debug(f"Fetched {len(urls)} post URLs") # Fetch post details for each URL and create Post objects posts = [] def fetch_and_parse(post_url): html = self._fetch_page(post_url) - post = self._parse_thread(html, post_url, comment_limit) + post = self._parse_thread(html, post_url) return post with ThreadPoolExecutor(max_workers=30) as executor: @@ -79,7 +86,7 @@ class BoardsAPI(BaseConnector): response.raise_for_status() return response.text - def _parse_thread(self, html: str, post_url: str, comment_limit: int) -> Post: + def _parse_thread(self, html: str, post_url: str) -> Post: soup = BeautifulSoup(html, "html.parser") # Author @@ -108,7 +115,7 @@ class BoardsAPI(BaseConnector): title = title_tag.text.strip() if title_tag else None # Comments - comments = self._parse_comments(post_url, post_num, comment_limit) + comments = self._parse_comments(post_url, post_num) post = Post( id=post_num, @@ -123,11 +130,11 @@ class BoardsAPI(BaseConnector): return post - def _parse_comments(self, url: str, post_id: str, comment_limit: int) -> list[Comment]: + def _parse_comments(self, url: str, post_id: str) -> list[Comment]: comments = [] current_url = url - while current_url and len(comments) < comment_limit: + while current_url: html = self._fetch_page(current_url) page_comments = self._parse_page_comments(html, post_id) comments.extend(page_comments) diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py index 444326a..9042c6c 100644 --- a/server/connectors/reddit_api.py +++ b/server/connectors/reddit_api.py @@ -20,39 +20,44 @@ class RedditAPI(BaseConnector): def get_new_posts_by_search(self, search: str, category: str, - post_limit: int, - comment_limit: int + post_limit: int ) -> 
list[Post]: - if not search: - return self._get_new_subreddit_posts(category, limit=post_limit) + prefix = f"r/{category}/" if category else "" + params = {'limit': post_limit} - params = { - 'q': search, - 'limit': post_limit, - 'restrict_sr': 'on', - 'sort': 'new' - } + if search: + endpoint = f"{prefix}search.json" + params.update({ + 'q': search, + 'sort': 'new', + 'restrict_sr': 'on' if category else 'off' + }) + else: + endpoint = f"{prefix}new.json" - logger.info(f"Searching subreddit '{category}' for '{search}' with limit {post_limit}") - url = f"r/{category}/search.json" posts = [] - + after = None + while len(posts) < post_limit: batch_limit = min(100, post_limit - len(posts)) params['limit'] = batch_limit + if after: + params['after'] = after - data = self._fetch_post_overviews(url, params) - batch_posts = self._parse_posts(data) - - logger.debug(f"Fetched {len(batch_posts)} posts from search in subreddit {category}") - - if not batch_posts: + data = self._fetch_post_overviews(endpoint, params) + + if not data or 'data' not in data or not data['data'].get('children'): break + batch_posts = self._parse_posts(data) posts.extend(batch_posts) - return posts + after = data['data'].get('after') + if not after: + break + + return posts[:post_limit] def _get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]: posts = [] diff --git a/server/queue/tasks.py b/server/queue/tasks.py index fd5237f..95248d1 100644 --- a/server/queue/tasks.py +++ b/server/queue/tasks.py @@ -46,8 +46,7 @@ def fetch_and_process_dataset(self, raw_posts = connector.get_new_posts_by_search( search=search, category=category, - post_limit=limit, - comment_limit=limit + post_limit=limit ) posts.extend(post.to_dict() for post in raw_posts) From c12f1b437109e9606b5888069e277a5a21a0ce2e Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Thu, 12 Mar 2026 09:56:34 +0000 Subject: [PATCH 26/36] chore(connectors): add category and search validation fields --- 
server/connectors/base.py | 3 +++ server/connectors/boards_api.py | 3 +++ server/connectors/reddit_api.py | 6 ++++-- server/connectors/registry.py | 15 ++++++++++----- server/connectors/youtube_api.py | 6 ++++-- 5 files changed, 24 insertions(+), 9 deletions(-) diff --git a/server/connectors/base.py b/server/connectors/base.py index bad73c5..3614c8a 100644 --- a/server/connectors/base.py +++ b/server/connectors/base.py @@ -7,6 +7,9 @@ class BaseConnector(ABC): display_name: str # human-readable: "Reddit", "YouTube" required_env: list[str] = [] # env vars needed to activate + search_enabled: bool + categories_enabled: bool + @classmethod def is_available(cls) -> bool: """Returns True if all required env vars are set.""" diff --git a/server/connectors/boards_api.py b/server/connectors/boards_api.py index 600c864..6cded92 100644 --- a/server/connectors/boards_api.py +++ b/server/connectors/boards_api.py @@ -19,6 +19,9 @@ class BoardsAPI(BaseConnector): source_name: str = "boards.ie" display_name: str = "Boards.ie" + categories_enabled: bool = True + search_enabled: bool = False + def __init__(self): self.url = "https://www.boards.ie" diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py index 9042c6c..dd62119 100644 --- a/server/connectors/reddit_api.py +++ b/server/connectors/reddit_api.py @@ -10,8 +10,10 @@ from server.connectors.base import BaseConnector logger = logging.getLogger(__name__) class RedditAPI(BaseConnector): - source_name = "reddit" - display_name = "Reddit" + source_name: str = "reddit" + display_name: str = "Reddit" + search_enabled: bool = True + categories_enabled: bool = True def __init__(self): self.url = "https://www.reddit.com/" diff --git a/server/connectors/registry.py b/server/connectors/registry.py index 47b1d6a..f2371e6 100644 --- a/server/connectors/registry.py +++ b/server/connectors/registry.py @@ -18,8 +18,13 @@ def _discover_connectors() -> list[type[BaseConnector]]: def get_available_connectors() -> 
dict[str, type[BaseConnector]]: return {c.source_name: c for c in _discover_connectors() if c.is_available()} -def get_connector_metadata() -> list[dict]: - return [ - {"id": id, "label": obj.display_name} - for id, obj in get_available_connectors().items() - ] \ No newline at end of file +def get_connector_metadata() -> dict[str, dict]: + res = {} + for id, obj in get_available_connectors().items(): + res[id] = {"id": id, + "label": obj.display_name, + "search_enabled": obj.search_enabled, + "categories_enabled": obj.categories_enabled + } + + return res \ No newline at end of file diff --git a/server/connectors/youtube_api.py b/server/connectors/youtube_api.py index a3047d7..bcf5fe8 100644 --- a/server/connectors/youtube_api.py +++ b/server/connectors/youtube_api.py @@ -13,8 +13,10 @@ load_dotenv() API_KEY = os.getenv("YOUTUBE_API_KEY") class YouTubeAPI(BaseConnector): - source_name = "youtube" - display_name = "YouTube" + source_name: str = "youtube" + display_name: str = "YouTube" + search_enabled: bool = True + categories_enabled: bool = False def __init__(self): self.youtube = build('youtube', 'v3', developerKey=API_KEY) From 6684780d233ae34e63cc6759ee0ea3d4f28177a7 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Thu, 12 Mar 2026 09:59:07 +0000 Subject: [PATCH 27/36] fix(connectors): add stronger validation to scrape endpoint Strong validation needed, otherwise data goes to Celery and crashes silently. In addition it checks if that specific source supports search or category. 
--- server/app.py | 82 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 57 insertions(+), 25 deletions(-) diff --git a/server/app.py b/server/app.py index 460cb77..b315a60 100644 --- a/server/app.py +++ b/server/app.py @@ -119,50 +119,82 @@ def get_user_datasets(): @app.route("/datasets/sources", methods=["GET"]) def get_dataset_sources(): - return jsonify(get_connector_metadata()) + list_metadata = list(get_connector_metadata().values()) + return jsonify(list_metadata) @app.route("/datasets/scrape", methods=["POST"]) @jwt_required() def scrape_data(): data = request.get_json() + connector_metadata = get_connector_metadata() + # Strong validation needed, otherwise data goes to Celery and crashes silently if not data or "sources" not in data: - return jsonify({"error": "Sources must be provided"}), 400 - - user_id = int(get_jwt_identity()) + return jsonify({"error": "Sources must be provided"}), 400 + + if "name" not in data or not str(data["name"]).strip(): + return jsonify({"error": "Dataset name is required"}), 400 + dataset_name = data["name"].strip() + user_id = int(get_jwt_identity()) + source_configs = data["sources"] if not isinstance(source_configs, list) or len(source_configs) == 0: return jsonify({"error": "Sources must be a non-empty list"}), 400 - # Light Validation for source in source_configs: + if not isinstance(source, dict): + return jsonify({"error": "Each source must be an object"}), 400 + if "name" not in source: return jsonify({"error": "Each source must contain a name"}), 400 - if "limit" in source: - source["limit"] = int(source["limit"]) - - dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list) - dataset_manager.set_dataset_status( - dataset_id, - "fetching", - f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}" - ) - - try: - fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list) - return jsonify( - { - "message": "Dataset 
queued for processing", - "dataset_id": dataset_id, - "status": "processing", - } - ), 202 + if "limit" in source: + try: + source["limit"] = int(source["limit"]) + except (ValueError, TypeError): + return jsonify({"error": "Limit must be an integer"}), 400 + + name = source["name"] + + if name not in connector_metadata: + return jsonify({"error": "Source not supported"}), 400 + + if "search" in source and not connector_metadata[name]["search_enabled"]: + return jsonify({"error": f"Source {name} does not support search"}), 400 + + if "category" in source and not connector_metadata[name]["categories_enabled"]: + return jsonify({"error": f"Source {name} does not support categories"}), 400 + + try: + dataset_id = dataset_manager.save_dataset_info( + user_id, + dataset_name, + default_topic_list + ) + + dataset_manager.set_dataset_status( + dataset_id, + "fetching", + f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}" + ) + + fetch_and_process_dataset.delay( + dataset_id, + source_configs, + default_topic_list + ) except Exception: print(traceback.format_exc()) - return jsonify({"error": "An unexpected error occurred"}), 500 + return jsonify({"error": "Failed to queue dataset processing"}), 500 + + + return jsonify({ + "message": "Dataset queued for processing", + "dataset_id": dataset_id, + "status": "processing" + }), 202 @app.route("/datasets/upload", methods=["POST"]) @jwt_required() From 162a4de64e078f8543b307ba70dc7c633ccbb73f Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Thu, 12 Mar 2026 10:07:28 +0000 Subject: [PATCH 28/36] fix(frontend): detects which sources support category or search --- frontend/src/pages/AutoScrape.tsx | 61 +++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/frontend/src/pages/AutoScrape.tsx b/frontend/src/pages/AutoScrape.tsx index 7e8e754..9e9d336 100644 --- a/frontend/src/pages/AutoScrape.tsx +++ b/frontend/src/pages/AutoScrape.tsx @@ -9,6 +9,10 @@ 
const API_BASE_URL = import.meta.env.VITE_BACKEND_URL; type SourceOption = { id: string; label: string; + search_enabled?: boolean; + categories_enabled?: boolean; + searchEnabled?: boolean; + categoriesEnabled?: boolean; }; type SourceConfig = { @@ -25,6 +29,12 @@ const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({ category: "", }); +const supportsSearch = (source?: SourceOption): boolean => + Boolean(source?.search_enabled ?? source?.searchEnabled); + +const supportsCategories = (source?: SourceOption): boolean => + Boolean(source?.categories_enabled ?? source?.categoriesEnabled); + const AutoScrapePage = () => { const navigate = useNavigate(); const [datasetName, setDatasetName] = useState(""); @@ -63,11 +73,18 @@ const AutoScrapePage = () => { const updateSourceConfig = (index: number, field: keyof SourceConfig, value: string) => { setSourceConfigs((previous) => previous.map((config, configIndex) => - configIndex === index ? { ...config, [field]: value } : config + configIndex === index + ? field === "sourceName" + ? { ...config, sourceName: value, search: "", category: "" } + : { ...config, [field]: value } + : config ) ); }; + const getSourceOption = (sourceName: string) => + sourceOptions.find((option) => option.id === sourceName); + const addSourceConfig = () => { setSourceConfigs((previous) => [ ...previous, @@ -100,12 +117,18 @@ const AutoScrapePage = () => { return; } - const normalizedSources = sourceConfigs.map((source) => ({ - name: source.sourceName, - limit: Number(source.limit || 100), - search: source.search.trim() || undefined, - category: source.category.trim() || undefined, - })); + const normalizedSources = sourceConfigs.map((source) => { + const sourceOption = getSourceOption(source.sourceName); + + return { + name: source.sourceName, + limit: Number(source.limit || 100), + search: supportsSearch(sourceOption) ? source.search.trim() || undefined : undefined, + category: supportsCategories(sourceOption) + ? 
source.category.trim() || undefined + : undefined, + }; + }); const invalidSource = normalizedSources.find( (source) => !source.name || !Number.isFinite(source.limit) || source.limit <= 0 @@ -212,7 +235,12 @@ const AutoScrapePage = () => { {!isLoadingSources && sourceOptions.length > 0 && (
- {sourceConfigs.map((source, index) => ( + {sourceConfigs.map((source, index) => { + const sourceOption = getSourceOption(source.sourceName); + const searchEnabled = supportsSearch(sourceOption); + const categoriesEnabled = supportsCategories(sourceOption); + + return (
{ updateSourceConfig(index, "search", event.target.value)} /> updateSourceConfig(index, "category", event.target.value)} /> @@ -271,7 +309,8 @@ const AutoScrapePage = () => { )}
- ))} + ); + })}