Automatic Scraping of dataset options #9

Merged
dylan merged 36 commits from feat/automatic-scraping-datasets into main 2026-03-14 21:58:49 +00:00
5 changed files with 28 additions and 2 deletions
Showing only changes of commit 8a423b2a29 - Show all commits

View File

@@ -157,16 +157,21 @@ def scrape_data():
return jsonify({"error": "Limit must be an integer"}), 400
name = source["name"]
category = source.get("category")
search = source.get("search")
if name not in connector_metadata:
return jsonify({"error": "Source not supported"}), 400
if "search" in source and not connector_metadata[name]["search_enabled"]:
if category and not connector_metadata[name]["search_enabled"]:
return jsonify({"error": f"Source {name} does not support search"}), 400
if "category" in source and not connector_metadata[name]["categories_enabled"]:
if category and not connector_metadata[name]["categories_enabled"]:
return jsonify({"error": f"Source {name} does not support categories"}), 400
if category and not connectors[name]().category_exists(category):
return jsonify({"error": f"Category does not exist for {name}"}), 400
try:
dataset_id = dataset_manager.save_dataset_info(
user_id,

View File

@@ -23,3 +23,7 @@ class BaseConnector(ABC):
post_limit: int = 10
) -> list[Post]:
...
@abstractmethod
def category_exists(self, category: str) -> bool:
    """Report whether ``category`` is a usable category for this connector.

    Abstract hook: this base declaration carries no logic of its own, so
    every concrete connector has to provide an implementation.
    """

View File

@@ -38,6 +38,9 @@ class BoardsAPI(BaseConnector):
else:
return self._get_posts(f"{self.base_url}/discussions", post_limit)
def category_exists(self, category: str) -> bool:
    """Accept every category name.

    No lookup is performed here: the method returns ``True`` regardless of
    the value of ``category``.

    Args:
        category: Category name to check (unused).

    Returns:
        Always ``True``.
    """
    return True
## Private
def _get_posts(self, url, limit) -> list[Post]:
urls = []

View File

@@ -94,6 +94,17 @@ class RedditAPI(BaseConnector):
data = self._fetch_post_overviews(f"user/{username}/about.json", {})
return self._parse_user(data)
def category_exists(self, category: str) -> bool:
    """Check whether ``category`` names an existing subreddit.

    Requests ``r/<category>/about.json`` and treats the subreddit as
    existing when the response carries a non-null ``data.id``.

    Args:
        category: Subreddit name, without the ``r/`` prefix.

    Returns:
        ``True`` if the lookup returns a payload with an id. ``False`` for a
        blank or non-string name, a payload without an id, or any error
        raised while fetching.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    # A blank name can never match a subreddit; skip the request entirely.
    if not isinstance(category, str) or not category.strip():
        return False

    # The name comes from the request body. Percent-encode it so characters
    # such as "/", "?" or "#" cannot rewrite the path or add a query string.
    safe_name = quote(category, safe="")
    endpoint = f"r/{safe_name}/about.json"

    try:
        payload = self._fetch_post_overviews(endpoint, {})
    except Exception:
        # Deliberate best-effort: a failed lookup is reported as "not found".
        # NOTE(review): this also hides transient network errors - confirm
        # that is acceptable for callers that surface a 400 to the user.
        return False

    # Validate the payload shape explicitly instead of relying on the
    # exception handler to absorb TypeError/AttributeError.
    if not isinstance(payload, dict):
        return False
    about = payload.get("data")
    return isinstance(about, dict) and about.get("id") is not None
## Private Methods ##
def _parse_posts(self, data) -> list[Post]:
posts = []

View File

@@ -69,6 +69,9 @@ class YouTubeAPI(BaseConnector):
return posts
def category_exists(self, category: str) -> bool:
    """Treat any category name as valid.

    The method does not inspect ``category`` or perform a lookup; it
    unconditionally reports success.

    Args:
        category: Category name to check (unused).

    Returns:
        Always ``True``.
    """
    return True
def search_videos(self, query, limit):
request = self.youtube.search().list(
q=query,