Automatic Scraping of dataset options #9

Merged
dylan merged 36 commits from feat/automatic-scraping-datasets into main 2026-03-14 21:58:49 +00:00
5 changed files with 16 additions and 10 deletions
Showing only changes of commit 5ccb2e73cd - Show all commits

View File

@@ -21,6 +21,7 @@ from server.core.auth import AuthManager
from server.core.datasets import DatasetManager
from server.utils import get_request_filters
from server.queue.tasks import process_dataset
from server.connectors.registry import get_connector_metadata
app = Flask(__name__)
@@ -112,9 +113,8 @@ def get_user_datasets():
return jsonify(dataset_manager.get_user_datasets(current_user)), 200
@app.route("/datasets/sources", methods=["GET"])
@jwt_required()
def get_dataset_sources():
return jsonify({""})
return jsonify(get_connector_metadata())
@app.route("/datasets/scrape", methods=["POST"])
@jwt_required()

View File

@@ -16,10 +16,11 @@ HEADERS = {
}
class BoardsAPI(BaseConnector):
source_name: str = "boards.ie"
display_name: str = "Boards.ie"
def __init__(self):
self.url = "https://www.boards.ie"
self.source_name = "boards.ie"
self.display_name = "Boards.ie"
def get_new_posts_by_search(self,
search: str,

View File

@@ -10,9 +10,11 @@ from server.connectors.base import BaseConnector
logger = logging.getLogger(__name__)
class RedditAPI(BaseConnector):
source_name = "reddit"
display_name = "Reddit"
def __init__(self):
self.url = "https://www.reddit.com/"
self.source_name = "Reddit"
# Public Methods #
def get_new_posts_by_search(self,

View File

@@ -1,14 +1,14 @@
import pkgutil
import importlib
import connectors
from connectors.base import BaseConnector
import server.connectors
from server.connectors.base import BaseConnector
def _discover_connectors() -> list[type[BaseConnector]]:
"""Walk the connectors package and collect all BaseConnector subclasses."""
for _, module_name, _ in pkgutil.iter_modules(connectors.__path__):
for _, module_name, _ in pkgutil.iter_modules(server.connectors.__path__):
if module_name in ("base", "registry"):
continue
importlib.import_module(f"connectors.{module_name}")
importlib.import_module(f"server.connectors.{module_name}")
return [
cls for cls in BaseConnector.__subclasses__()

View File

@@ -13,6 +13,9 @@ load_dotenv()
API_KEY = os.getenv("YOUTUBE_API_KEY")
class YouTubeAPI(BaseConnector):
source_name = "youtube"
display_name = "YouTube"
def __init__(self):
self.youtube = build('youtube', 'v3', developerKey=API_KEY)
@@ -44,7 +47,7 @@ class YouTubeAPI(BaseConnector):
author=comment_snippet['authorDisplayName'],
timestamp=datetime.datetime.strptime(comment_snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp(),
reply_to=None,
source="YouTube"
source=self.source_name
)
comments.append(comment)