From cc799f736875d7e8326423b78c4b650c8ebee359 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Mon, 9 Mar 2026 21:28:44 +0000
Subject: [PATCH] feat(connectors): add base connector and registry for detection

Idea is to have a "plugin-type" system, where new connectors can extend the
`BaseConnector` class and implement the fetch posts method.

These are automatically detected by the registry, and automatically used in
new Flask endpoints that give a list of possible sources.

Allows for an open-ended system where new data scrapers / API consumers can
be added dynamically.
---
Review notes (ignored by `git am`):
- Line structure and indentation were reconstructed; verify context lines
  against the target tree before applying.
- Blob ids on the `index` lines predate the review changes and are stale.
- `/datasets/sources` imports the registry as `server.connectors.registry`;
  confirm that matches the import root app.py already uses.
- `/datasets/scrape` answers 501 until the connector dispatch is written.

 server/app.py                   | 20 ++++++++++++++++++++
 server/connectors/base.py       | 27 +++++++++++++++++++++++++++
 server/connectors/boards_api.py | 15 ++++++++++++---
 server/connectors/registry.py   | 43 +++++++++++++++++++++++++++++++++++++++++++
 server/core/datasets.py         |  2 +-
 5 files changed, 103 insertions(+), 4 deletions(-)
 create mode 100644 server/connectors/base.py
 create mode 100644 server/connectors/registry.py

diff --git a/server/app.py b/server/app.py
index eb27e70..23c095f 100644
--- a/server/app.py
+++ b/server/app.py
@@ -111,6 +111,26 @@ def get_user_datasets():
     current_user = int(get_jwt_identity())
     return jsonify(dataset_manager.get_user_datasets(current_user)), 200
 
+@app.route("/datasets/sources", methods=["GET"])
+@jwt_required()
+def get_dataset_sources():
+    """Return the id/label pairs of every connector that is currently usable."""
+    # Imported here because the module-level import block is outside this hunk.
+    from server.connectors.registry import get_connector_metadata
+
+    return jsonify(get_connector_metadata()), 200
+
+@app.route("/datasets/scrape", methods=["POST"])
+@jwt_required()
+def scrape_data():
+    """Validate a scrape request; the scraping itself is not wired up yet."""
+    if "sources" not in request.form:
+        return jsonify({"error": "Data source names are required."}), 400
+
+    # TODO: dispatch request.form["sources"] to the matching connectors.
+    return jsonify({"error": "Scraping is not implemented yet."}), 501
+
+
 @app.route("/datasets/upload", methods=["POST"])
 @jwt_required()
 def upload_data():
diff --git a/server/connectors/base.py b/server/connectors/base.py
new file mode 100644
index 0000000..f555769
--- /dev/null
+++ b/server/connectors/base.py
@@ -0,0 +1,27 @@
+import os
+from abc import ABC, abstractmethod
+
+from dto.post import Post
+
+class BaseConnector(ABC):
+    """Interface every data-source connector implements to be discoverable."""
+
+    # Each subclass declares these at the class level
+    source_name: str  # machine-readable: "reddit", "youtube"
+    display_name: str  # human-readable: "Reddit", "YouTube"
+    required_env: list[str] = []  # env vars needed to activate; override, don't mutate
+
+    @classmethod
+    def is_available(cls) -> bool:
+        """Returns True if all required env vars are set."""
+        return all(os.getenv(var) for var in cls.required_env)
+
+    @abstractmethod
+    def get_new_posts_by_search(self,
+        search: str = None,
+        category: str = None,
+        post_limit: int = 10,
+        comment_limit: int = 10
+    ) -> list[Post]:
+        """Fetch posts from the source; concrete connectors must implement this."""
+        ...
diff --git a/server/connectors/boards_api.py b/server/connectors/boards_api.py
index 1b63aa9..e714048 100644
--- a/server/connectors/boards_api.py
+++ b/server/connectors/boards_api.py
@@ -7,6 +7,7 @@ from dto.post import Post
 from dto.comment import Comment
 from bs4 import BeautifulSoup
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from .base import BaseConnector
 
 logger = logging.getLogger(__name__)
 
@@ -14,12 +15,20 @@ HEADERS = {
     "User-Agent": "Mozilla/5.0 (compatible; ForumScraper/1.0)"
 }
 
-class BoardsAPI:
+class BoardsAPI(BaseConnector):
+    # Class-level so the registry can read them without instantiating.
+    source_name = "boards.ie"
+    display_name = "Boards.ie"
+
     def __init__(self):
         self.url = "https://www.boards.ie"
-        self.source_name = "Boards.ie"
 
-    def get_new_category_posts(self, category: str, post_limit: int, comment_limit: int) -> list[Post]:
+    def get_new_posts_by_search(self,
+        search: str,
+        category: str,
+        post_limit: int,
+        comment_limit: int
+    ) -> list[Post]:
         urls = []
         current_page = 1
 
diff --git a/server/connectors/registry.py b/server/connectors/registry.py
new file mode 100644
index 0000000..0883476
--- /dev/null
+++ b/server/connectors/registry.py
@@ -0,0 +1,43 @@
+import importlib
+import pkgutil
+
+from .base import BaseConnector
+
+# Modules in this package that never define connectors.
+_NON_CONNECTOR_MODULES = ("base", "registry")
+
+def _all_subclasses(cls: type) -> list[type]:
+    """Return every direct and indirect subclass of cls."""
+    found = []
+    for sub in cls.__subclasses__():
+        found.append(sub)
+        found.extend(_all_subclasses(sub))
+    return found
+
+def _discover_connectors() -> list[type[BaseConnector]]:
+    """Walk the connectors package and collect all BaseConnector subclasses."""
+    # Import siblings through this module's own package name so they share the
+    # BaseConnector object imported above, whichever import root is in use.
+    package = importlib.import_module(__package__)
+    for _, module_name, _ in pkgutil.iter_modules(package.__path__):
+        if module_name in _NON_CONNECTOR_MODULES:
+            continue
+        importlib.import_module(f"{__package__}.{module_name}")
+
+    return [
+        cls for cls in _all_subclasses(BaseConnector)
+        # Abstract intermediaries declare no class-level source_name; getattr
+        # keeps the missing attribute from raising AttributeError.
+        if getattr(cls, "source_name", None)
+    ]
+
+def get_available_connectors() -> list[type[BaseConnector]]:
+    """Return the discovered connectors whose required env vars are all set."""
+    return [c for c in _discover_connectors() if c.is_available()]
+
+def get_connector_metadata() -> list[dict]:
+    """Return an {"id", "label"} dict for each available connector."""
+    return [
+        {"id": c.source_name, "label": getattr(c, "display_name", c.source_name)}
+        for c in get_available_connectors()
+    ]
diff --git a/server/core/datasets.py b/server/core/datasets.py
index 5886cfc..3a62fc9 100644
--- a/server/core/datasets.py
+++ b/server/core/datasets.py
@@ -1,7 +1,7 @@
 import pandas as pd
 from server.db.database import PostgresConnector
 from psycopg2.extras import Json
-from server.exceptions import NotAuthorisedException, NonExistentDatasetException
+from server.exceptions import NonExistentDatasetException
 
 class DatasetManager:
     def __init__(self, db: PostgresConnector):