From cc799f736875d7e8326423b78c4b650c8ebee359 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite <dylanseandefaoite@gmail.com>
Date: Mon, 9 Mar 2026 21:28:44 +0000
Subject: [PATCH] feat(connectors): add base connector and registry for
 detection

The idea is a plugin-style system: a new connector extends the `BaseConnector` class and implements its abstract `get_new_posts_by_search` method.

Connector subclasses are discovered automatically by the registry, which is intended to back the new Flask endpoints that list the available sources.

This keeps the system open-ended: new data scrapers and API consumers can be added by placing a module in the `connectors` package, without changing the registry.
---
 server/app.py                   | 14 ++++++++++++++
 server/connectors/base.py       | 23 +++++++++++++++++++++++
 server/connectors/boards_api.py | 13 ++++++++++---
 server/connectors/registry.py   | 25 +++++++++++++++++++++++++
 server/core/datasets.py         |  2 +-
 5 files changed, 73 insertions(+), 4 deletions(-)
 create mode 100644 server/connectors/base.py
 create mode 100644 server/connectors/registry.py

diff --git a/server/app.py b/server/app.py
index eb27e70..23c095f 100644
--- a/server/app.py
+++ b/server/app.py
@@ -111,6 +111,20 @@ def get_user_datasets():
     current_user = int(get_jwt_identity())
     return jsonify(dataset_manager.get_user_datasets(current_user)), 200
 
+@app.route("/datasets/sources", methods=["GET"])
+@jwt_required()
+def get_dataset_sources():
+    return jsonify({""})
+
+@app.route("/datasets/scrape", methods=["POST"])
+@jwt_required()
+def scrape_data():
+    if "sources" not in request.form:
+        return jsonify({"error": "Data source names are required."}), 400
+    
+    sources = request.form.get("sources")
+
+
 @app.route("/datasets/upload", methods=["POST"])
 @jwt_required()
 def upload_data():
diff --git a/server/connectors/base.py b/server/connectors/base.py
new file mode 100644
index 0000000..f555769
--- /dev/null
+++ b/server/connectors/base.py
@@ -0,0 +1,23 @@
+from abc import ABC, abstractmethod
+from dto.post import Post
+
+class BaseConnector(ABC):
+    # Each subclass declares these at the class level
+    source_name: str       # machine-readable: "reddit", "youtube"
+    display_name: str      # human-readable: "Reddit", "YouTube"
+    required_env: list[str] = []  # env vars needed to activate
+
+    @classmethod
+    def is_available(cls) -> bool:
+        """Returns True if all required env vars are set."""
+        import os
+        return all(os.getenv(var) for var in cls.required_env)
+
+    @abstractmethod
+    def get_new_posts_by_search(self, 
+                                search: str = None, 
+                                category: str = None, 
+                                post_limit: int = 10, 
+                                comment_limit: int = 10
+                                ) -> list[Post]:
+        ...
\ No newline at end of file
diff --git a/server/connectors/boards_api.py b/server/connectors/boards_api.py
index 1b63aa9..e714048 100644
--- a/server/connectors/boards_api.py
+++ b/server/connectors/boards_api.py
@@ -7,6 +7,7 @@ from dto.post import Post
 from dto.comment import Comment
 from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from server.connectors.base import BaseConnector
 
 logger = logging.getLogger(__name__)
 
@@ -14,12 +15,18 @@ HEADERS = {
     "User-Agent": "Mozilla/5.0 (compatible; ForumScraper/1.0)"
 }
 
-class BoardsAPI:
+class BoardsAPI(BaseConnector):
     def __init__(self):
         self.url = "https://www.boards.ie"
-        self.source_name = "Boards.ie"
+        self.source_name = "boards.ie"
+        self.display_name = "Boards.ie"
 
-    def get_new_category_posts(self, category: str, post_limit: int, comment_limit: int)  -> list[Post]:
+    def get_new_posts_by_search(self, 
+                                search: str,
+                                category: str, 
+                                post_limit: int, 
+                                comment_limit: int
+                                )  -> list[Post]:
         urls = []
         current_page = 1
 
diff --git a/server/connectors/registry.py b/server/connectors/registry.py
new file mode 100644
index 0000000..0883476
--- /dev/null
+++ b/server/connectors/registry.py
@@ -0,0 +1,25 @@
+import pkgutil
+import importlib
+import connectors
+from connectors.base import BaseConnector
+
+def _discover_connectors() -> list[type[BaseConnector]]:
+    """Walk the connectors package and collect all BaseConnector subclasses."""
+    for _, module_name, _ in pkgutil.iter_modules(connectors.__path__):
+        if module_name in ("base", "registry"):
+            continue
+        importlib.import_module(f"connectors.{module_name}")
+
+    return [
+        cls for cls in BaseConnector.__subclasses__()
+        if cls.source_name  # guard against abstract intermediaries
+    ]
+
+def get_available_connectors() -> list[type[BaseConnector]]:
+    return [c for c in _discover_connectors() if c.is_available()]
+
+def get_connector_metadata() -> list[dict]:
+    return [
+        {"id": c.source_name, "label": c.display_name}
+        for c in get_available_connectors()
+    ]
\ No newline at end of file
diff --git a/server/core/datasets.py b/server/core/datasets.py
index 5886cfc..3a62fc9 100644
--- a/server/core/datasets.py
+++ b/server/core/datasets.py
@@ -1,7 +1,7 @@
 import pandas as pd
 from server.db.database import PostgresConnector
 from psycopg2.extras import Json
-from server.exceptions import NotAuthorisedException, NonExistentDatasetException
+from server.exceptions import NonExistentDatasetException
 
 class DatasetManager:
     def __init__(self, db: PostgresConnector):