From 8a423b2a293959040b5e38f10c44f5ca36585b36 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Sat, 14 Mar 2026 16:59:43 +0000 Subject: [PATCH] feat(connectors): implement category validation in scraping process --- server/app.py | 9 +++++++-- server/connectors/base.py | 4 ++++ server/connectors/boards_api.py | 3 +++ server/connectors/reddit_api.py | 11 +++++++++++ server/connectors/youtube_api.py | 3 +++ 5 files changed, 28 insertions(+), 2 deletions(-) diff --git a/server/app.py b/server/app.py index b315a60..9759cd7 100644 --- a/server/app.py +++ b/server/app.py @@ -157,15 +157,20 @@ def scrape_data(): return jsonify({"error": "Limit must be an integer"}), 400 name = source["name"] + category = source.get("category") + search = source.get("search") if name not in connector_metadata: return jsonify({"error": "Source not supported"}), 400 - if "search" in source and not connector_metadata[name]["search_enabled"]: + if search and not connector_metadata[name]["search_enabled"]: return jsonify({"error": f"Source {name} does not support search"}), 400 - if "category" in source and not connector_metadata[name]["categories_enabled"]: + if category and not connector_metadata[name]["categories_enabled"]: return jsonify({"error": f"Source {name} does not support categories"}), 400 + + if category and not connectors[name]().category_exists(category): + return jsonify({"error": f"Category does not exist for {name}"}), 400 try: dataset_id = dataset_manager.save_dataset_info( diff --git a/server/connectors/base.py b/server/connectors/base.py index 3614c8a..48163b5 100644 --- a/server/connectors/base.py +++ b/server/connectors/base.py @@ -22,4 +22,8 @@ class BaseConnector(ABC): category: str = None, post_limit: int = 10 ) -> list[Post]: + ... + + @abstractmethod + def category_exists(self, category: str) -> bool: ...
\ No newline at end of file diff --git a/server/connectors/boards_api.py b/server/connectors/boards_api.py index 96309b3..e51bdaf 100644 --- a/server/connectors/boards_api.py +++ b/server/connectors/boards_api.py @@ -37,6 +37,9 @@ class BoardsAPI(BaseConnector): return self._get_posts(f"{self.base_url}/categories/{category}", post_limit) else: return self._get_posts(f"{self.base_url}/discussions", post_limit) + + def category_exists(self, category): + return True ## Private def _get_posts(self, url, limit) -> list[Post]: diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py index dd62119..7955fca 100644 --- a/server/connectors/reddit_api.py +++ b/server/connectors/reddit_api.py @@ -94,6 +94,17 @@ class RedditAPI(BaseConnector): data = self._fetch_post_overviews(f"user/{username}/about.json", {}) return self._parse_user(data) + def category_exists(self, category: str) -> bool: + try: + data = self._fetch_post_overviews(f"r/{category}/about.json", {}) + return ( + data is not None + and 'data' in data + and data['data'].get('id') is not None + ) + except Exception: + return False + ## Private Methods ## def _parse_posts(self, data) -> list[Post]: posts = [] diff --git a/server/connectors/youtube_api.py b/server/connectors/youtube_api.py index bcf5fe8..c71d59c 100644 --- a/server/connectors/youtube_api.py +++ b/server/connectors/youtube_api.py @@ -68,6 +68,9 @@ class YouTubeAPI(BaseConnector): posts.append(post) return posts + + def category_exists(self, category): + return True def search_videos(self, query, limit): request = self.youtube.search().list(