Automatic Scraping of dataset options #9
@@ -157,15 +157,20 @@ def scrape_data():
|
||||
return jsonify({"error": "Limit must be an integer"}), 400
|
||||
|
||||
name = source["name"]
|
||||
category = source.get("category")
|
||||
search = source.get("search")
|
||||
|
||||
if name not in connector_metadata:
|
||||
return jsonify({"error": "Source not supported"}), 400
|
||||
|
||||
if "search" in source and not connector_metadata[name]["search_enabled"]:
|
||||
if category and not connector_metadata[name]["search_enabled"]:
|
||||
return jsonify({"error": f"Source {name} does not support search"}), 400
|
||||
|
||||
if "category" in source and not connector_metadata[name]["categories_enabled"]:
|
||||
if category and not connector_metadata[name]["categories_enabled"]:
|
||||
return jsonify({"error": f"Source {name} does not support categories"}), 400
|
||||
|
||||
if category and not connectors[name]().category_exists(category):
|
||||
return jsonify({"error": f"Category does not exist for {name}"}), 400
|
||||
|
||||
try:
|
||||
dataset_id = dataset_manager.save_dataset_info(
|
||||
|
||||
@@ -22,4 +22,8 @@ class BaseConnector(ABC):
|
||||
category: str = None,
|
||||
post_limit: int = 10
|
||||
) -> list[Post]:
|
||||
...
|
||||
|
||||
@abstractmethod
def category_exists(self, category: str) -> bool:
    """Report whether ``category`` exists on this connector's source.

    Abstract hook: the base implementation has no logic of its own and
    each concrete connector supplies the actual check.

    Args:
        category: Name of the category to look up.

    Returns:
        ``True`` if the connector considers the category valid,
        ``False`` otherwise (as implemented by the subclass).
    """
|
||||
@@ -37,6 +37,9 @@ class BoardsAPI(BaseConnector):
|
||||
return self._get_posts(f"{self.base_url}/categories/{category}", post_limit)
|
||||
else:
|
||||
return self._get_posts(f"{self.base_url}/discussions", post_limit)
|
||||
|
||||
def category_exists(self, category):
    """Accept every category name without performing a lookup.

    Args:
        category: Category name to check; it is not inspected.

    Returns:
        Always ``True``.
    """
    # No remote validation happens here: any value is reported as existing.
    return True
|
||||
|
||||
## Private
|
||||
def _get_posts(self, url, limit) -> list[Post]:
|
||||
|
||||
@@ -94,6 +94,17 @@ class RedditAPI(BaseConnector):
|
||||
data = self._fetch_post_overviews(f"user/{username}/about.json", {})
|
||||
return self._parse_user(data)
|
||||
|
||||
def category_exists(self, category: str) -> bool:
    """Check whether ``r/<category>/about.json`` describes a real entry.

    The lookup goes through ``self._fetch_post_overviews``; the category
    counts as existing only when the response has a ``data`` mapping whose
    ``id`` value is not ``None``.

    Args:
        category: Name placed into the ``r/<category>/about.json`` path
            (presumably a subreddit name -- confirm against callers).

    Returns:
        ``True`` when an ``id`` is present in the response's ``data``;
        ``False`` when the response is missing, lacks ``data``/``id``, or
        the lookup raises any exception.
    """
    try:
        about = self._fetch_post_overviews(f"r/{category}/about.json", {})
        # Guard clause: no payload, or a payload without a 'data' section.
        if about is None or 'data' not in about:
            return False
        return about['data'].get('id') is not None
    except Exception:
        # Best-effort check: any failure is reported as "does not exist".
        return False
|
||||
|
||||
## Private Methods ##
|
||||
def _parse_posts(self, data) -> list[Post]:
|
||||
posts = []
|
||||
|
||||
@@ -68,6 +68,9 @@ class YouTubeAPI(BaseConnector):
|
||||
posts.append(post)
|
||||
|
||||
return posts
|
||||
|
||||
def category_exists(self, category):
    """Treat every category as existing; no lookup is performed.

    Args:
        category: Category name to check; it is not inspected.

    Returns:
        Always ``True``.
    """
    # Unconditional: the argument is never validated against any source.
    return True
|
||||
|
||||
def search_videos(self, query, limit):
|
||||
request = self.youtube.search().list(
|
||||
|
||||
Reference in New Issue
Block a user