Automatic Scraping of dataset options #9

Merged
dylan merged 36 commits from feat/automatic-scraping-datasets into main 2026-03-14 21:58:49 +00:00
5 changed files with 28 additions and 2 deletions
Showing only changes of commit 8a423b2a29 - Show all commits

View File

@@ -157,16 +157,21 @@ def scrape_data():
return jsonify({"error": "Limit must be an integer"}), 400 return jsonify({"error": "Limit must be an integer"}), 400
name = source["name"] name = source["name"]
category = source.get("category")
search = source.get("search")
if name not in connector_metadata: if name not in connector_metadata:
return jsonify({"error": "Source not supported"}), 400 return jsonify({"error": "Source not supported"}), 400
if "search" in source and not connector_metadata[name]["search_enabled"]: if search and not connector_metadata[name]["search_enabled"]:
return jsonify({"error": f"Source {name} does not support search"}), 400 return jsonify({"error": f"Source {name} does not support search"}), 400
if "category" in source and not connector_metadata[name]["categories_enabled"]: if category and not connector_metadata[name]["categories_enabled"]:
return jsonify({"error": f"Source {name} does not support categories"}), 400 return jsonify({"error": f"Source {name} does not support categories"}), 400
if category and not connectors[name]().category_exists(category):
return jsonify({"error": f"Category does not exist for {name}"}), 400
try: try:
dataset_id = dataset_manager.save_dataset_info( dataset_id = dataset_manager.save_dataset_info(
user_id, user_id,

View File

@@ -23,3 +23,7 @@ class BaseConnector(ABC):
post_limit: int = 10 post_limit: int = 10
) -> list[Post]: ) -> list[Post]:
... ...
@abstractmethod
def category_exists(self, category: str) -> bool:
    """Return True if *category* names a valid category on this source.

    Connectors that do not partition content into categories may simply
    return True for every input (see the Boards and YouTube connectors).
    """
    ...

View File

@@ -38,6 +38,9 @@ class BoardsAPI(BaseConnector):
else: else:
return self._get_posts(f"{self.base_url}/discussions", post_limit) return self._get_posts(f"{self.base_url}/discussions", post_limit)
def category_exists(self, category: str) -> bool:
    # The boards connector performs no category validation: every requested
    # category is accepted unconditionally.
    # NOTE(review): permissive stub — confirm the boards API really has no
    # category concept to validate against.
    return True
## Private ## Private
def _get_posts(self, url, limit) -> list[Post]: def _get_posts(self, url, limit) -> list[Post]:
urls = [] urls = []

View File

@@ -94,6 +94,17 @@ class RedditAPI(BaseConnector):
data = self._fetch_post_overviews(f"user/{username}/about.json", {}) data = self._fetch_post_overviews(f"user/{username}/about.json", {})
return self._parse_user(data) return self._parse_user(data)
def category_exists(self, category: str) -> bool:
    """Return True when the subreddit named *category* exists on Reddit.

    Probes ``r/<category>/about.json``; a real subreddit's payload carries
    an id under its ``data`` key. Any fetch/parse failure is treated as
    "does not exist" (best-effort check).
    """
    endpoint = f"r/{category}/about.json"
    try:
        payload = self._fetch_post_overviews(endpoint, {})
        # Guard-clause style: bail out on each way the payload can be invalid.
        if payload is None:
            return False
        if 'data' not in payload:
            return False
        return payload['data'].get('id') is not None
    except Exception:
        # Network/parse errors count as a missing subreddit, never a crash.
        return False
## Private Methods ## ## Private Methods ##
def _parse_posts(self, data) -> list[Post]: def _parse_posts(self, data) -> list[Post]:
posts = [] posts = []

View File

@@ -69,6 +69,9 @@ class YouTubeAPI(BaseConnector):
return posts return posts
def category_exists(self, category: str) -> bool:
    # The YouTube connector accepts any category without validation.
    # NOTE(review): permissive stub — the YouTube Data API does expose
    # videoCategories; confirm skipping validation here is intentional.
    return True
def search_videos(self, query, limit): def search_videos(self, query, limit):
request = self.youtube.search().list( request = self.youtube.search().list(
q=query, q=query,