feat(connectors): add base connector and registry for detection
The idea is to have a plugin-style system in which new connectors extend the `BaseConnector` class and implement its post-fetching method (`get_new_posts_by_search`). The registry detects these connectors automatically, and new Flask endpoints use it to return the list of possible sources. This allows for an open-ended system where new data scrapers / API consumers can be added dynamically.
This commit is contained in:
@@ -111,6 +111,20 @@ def get_user_datasets():
|
||||
current_user = int(get_jwt_identity())
|
||||
return jsonify(dataset_manager.get_user_datasets(current_user)), 200
|
||||
|
||||
@app.route("/datasets/sources", methods=["GET"])
@jwt_required()
def get_dataset_sources():
    """List the data sources that can currently be scraped.

    Returns:
        A JSON array of ``{"id": ..., "label": ...}`` objects, one per
        available connector, with HTTP status 200.
    """
    # The previous body passed a set literal ({""}) to jsonify, which is not
    # JSON-serializable; return the registry metadata instead.
    # NOTE(review): import path follows the ``server.*`` style used by the
    # other ``server`` imports in this project -- confirm it matches how the
    # connectors package is importable at runtime.
    from server.connectors.registry import get_connector_metadata

    return jsonify(get_connector_metadata()), 200
|
||||
|
||||
@app.route("/datasets/scrape", methods=["POST"])
@jwt_required()
def scrape_data():
    """Start a scrape for the data sources named in the submitted form.

    Expects a ``sources`` form field.

    Returns:
        400 with an error payload when ``sources`` is missing or empty.
        501 otherwise, because the scraping step itself is not written yet.
    """
    sources = request.form.get("sources")
    # Treat an empty value the same as a missing field.
    if not sources:
        return jsonify({"error": "Data source names are required."}), 400

    # TODO: look up the requested connectors and run them. Until then, answer
    # explicitly: a view that falls off the end returns None, which Flask
    # rejects as an invalid response.
    return jsonify({"error": "Scraping is not implemented yet."}), 501
|
||||
|
||||
|
||||
@app.route("/datasets/upload", methods=["POST"])
|
||||
@jwt_required()
|
||||
def upload_data():
|
||||
|
||||
23
server/connectors/base.py
Normal file
23
server/connectors/base.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from dto.post import Post
|
||||
|
||||
class BaseConnector(ABC):
    """Abstract base class that every data-source connector extends.

    Subclasses declare their metadata as class-level attributes and
    implement :meth:`get_new_posts_by_search`.
    """

    # Each subclass declares these at the class level. The empty-string
    # defaults mean ``cls.source_name`` / ``cls.display_name`` can be read on
    # any subclass without raising AttributeError when it was not declared.
    source_name: str = ""  # machine-readable: "reddit", "youtube"
    display_name: str = ""  # human-readable: "Reddit", "YouTube"
    # This default list is shared by all subclasses: override it with a new
    # list in the subclass rather than mutating it in place.
    required_env: list[str] = []  # env vars needed to activate

    @classmethod
    def is_available(cls) -> bool:
        """Return True if every variable in ``required_env`` is set.

        A variable that is set to an empty string counts as not set.
        """
        import os

        return all(os.getenv(var) for var in cls.required_env)

    # Annotations below are quoted so they are not evaluated when the class
    # body runs.
    @abstractmethod
    def get_new_posts_by_search(
        self,
        search: "str | None" = None,
        category: "str | None" = None,
        post_limit: int = 10,
        comment_limit: int = 10,
    ) -> "list[Post]":
        """Fetch posts from this connector's source.

        Args:
            search: Optional search term (presumably a free-text query;
                exact semantics are left to each implementation).
            category: Optional category (presumably a filter; exact
                semantics are left to each implementation).
            post_limit: Post limit, 10 by default.
            comment_limit: Comment limit, 10 by default.

        Returns:
            A list of ``Post`` objects.
        """
        ...
|
||||
@@ -7,6 +7,7 @@ from dto.post import Post
|
||||
from dto.comment import Comment
|
||||
from bs4 import BeautifulSoup
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from server.connectors.base import BaseConnector
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -14,12 +15,18 @@ HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (compatible; ForumScraper/1.0)"
|
||||
}
|
||||
|
||||
class BoardsAPI:
|
||||
class BoardsAPI(BaseConnector):
|
||||
def __init__(self):
|
||||
self.url = "https://www.boards.ie"
|
||||
self.source_name = "Boards.ie"
|
||||
self.source_name = "boards.ie"
|
||||
self.display_name = "Boards.ie"
|
||||
|
||||
def get_new_category_posts(self, category: str, post_limit: int, comment_limit: int) -> list[Post]:
|
||||
def get_new_posts_by_search(self,
|
||||
search: str,
|
||||
category: str,
|
||||
post_limit: int,
|
||||
comment_limit: int
|
||||
) -> list[Post]:
|
||||
urls = []
|
||||
current_page = 1
|
||||
|
||||
|
||||
25
server/connectors/registry.py
Normal file
25
server/connectors/registry.py
Normal file
@@ -0,0 +1,25 @@
|
||||
import pkgutil
|
||||
import importlib
|
||||
import connectors
|
||||
from connectors.base import BaseConnector
|
||||
|
||||
def _discover_connectors() -> list[type[BaseConnector]]:
    """Walk the connectors package and collect all BaseConnector subclasses.

    Every module in the ``connectors`` package except ``base`` and
    ``registry`` is imported so that its connector classes get defined. The
    subclass tree of ``BaseConnector`` is then walked, and every class with
    a truthy class-level ``source_name`` is returned.

    Returns:
        Connector classes in depth-first, declaration order.
    """
    for _, module_name, _ in pkgutil.iter_modules(connectors.__path__):
        if module_name in ("base", "registry"):
            continue
        importlib.import_module(f"connectors.{module_name}")

    # NOTE(review): classes are only found if they subclass the same
    # ``BaseConnector`` object imported here from ``connectors.base``; a
    # connector importing it as ``server.connectors.base`` may get a
    # different class object -- confirm the import paths agree.
    discovered: list[type[BaseConnector]] = []
    seen: set[type] = set()
    # __subclasses__() lists direct children only, so walk the whole tree;
    # otherwise connectors behind an abstract intermediary are never found.
    stack = list(reversed(BaseConnector.__subclasses__()))
    while stack:
        cls = stack.pop()
        if cls in seen:
            continue
        seen.add(cls)
        stack.extend(reversed(cls.__subclasses__()))
        # getattr guards against abstract intermediaries (and any class that
        # only sets source_name on instances) instead of raising
        # AttributeError.
        if getattr(cls, "source_name", None):
            discovered.append(cls)
    return discovered
|
||||
|
||||
def get_available_connectors() -> list[type[BaseConnector]]:
    """Return the discovered connector classes whose ``is_available()`` is true."""
    usable: list[type[BaseConnector]] = []
    for connector_cls in _discover_connectors():
        if connector_cls.is_available():
            usable.append(connector_cls)
    return usable
|
||||
|
||||
def get_connector_metadata() -> list[dict]:
    """Describe every available connector as a JSON-friendly dict.

    Returns:
        One ``{"id": source_name, "label": display_name}`` dict per
        available connector. When a connector has no (or an empty)
        class-level ``display_name``, ``source_name`` is used as the label.
    """
    metadata = []
    for connector in get_available_connectors():
        # Fall back to source_name rather than raising AttributeError for a
        # connector that never declared display_name.
        label = getattr(connector, "display_name", None) or connector.source_name
        metadata.append({"id": connector.source_name, "label": label})
    return metadata
|
||||
@@ -1,7 +1,7 @@
|
||||
import pandas as pd
|
||||
from server.db.database import PostgresConnector
|
||||
from psycopg2.extras import Json
|
||||
from server.exceptions import NotAuthorisedException, NonExistentDatasetException
|
||||
from server.exceptions import NonExistentDatasetException
|
||||
|
||||
class DatasetManager:
|
||||
def __init__(self, db: PostgresConnector):
|
||||
|
||||
Reference in New Issue
Block a user