Automatic Scraping of dataset options #9
@@ -157,16 +157,21 @@ def scrape_data():
|
|||||||
return jsonify({"error": "Limit must be an integer"}), 400
|
return jsonify({"error": "Limit must be an integer"}), 400
|
||||||
|
|
||||||
name = source["name"]
|
name = source["name"]
|
||||||
|
category = source.get("category")
|
||||||
|
search = source.get("search")
|
||||||
|
|
||||||
if name not in connector_metadata:
|
if name not in connector_metadata:
|
||||||
return jsonify({"error": "Source not supported"}), 400
|
return jsonify({"error": "Source not supported"}), 400
|
||||||
|
|
||||||
if "search" in source and not connector_metadata[name]["search_enabled"]:
|
if category and not connector_metadata[name]["search_enabled"]:
|
||||||
return jsonify({"error": f"Source {name} does not support search"}), 400
|
return jsonify({"error": f"Source {name} does not support search"}), 400
|
||||||
|
|
||||||
if "category" in source and not connector_metadata[name]["categories_enabled"]:
|
if category and not connector_metadata[name]["categories_enabled"]:
|
||||||
return jsonify({"error": f"Source {name} does not support categories"}), 400
|
return jsonify({"error": f"Source {name} does not support categories"}), 400
|
||||||
|
|
||||||
|
if category and not connectors[name]().category_exists(category):
|
||||||
|
return jsonify({"error": f"Category does not exist for {name}"}), 400
|
||||||
|
|
||||||
try:
|
try:
|
||||||
dataset_id = dataset_manager.save_dataset_info(
|
dataset_id = dataset_manager.save_dataset_info(
|
||||||
user_id,
|
user_id,
|
||||||
|
|||||||
@@ -23,3 +23,7 @@ class BaseConnector(ABC):
|
|||||||
post_limit: int = 10
|
post_limit: int = 10
|
||||||
) -> list[Post]:
|
) -> list[Post]:
|
||||||
...
|
...
|
||||||
|
|
||||||
|
@abstractmethod
def category_exists(self, category: str) -> bool:
    """Return whether ``category`` exists on this connector's source.

    Abstract hook: the base class supplies no behaviour, so every concrete
    connector must provide its own implementation.

    Args:
        category: Name of the category to look up.

    Returns:
        True if the category exists, False otherwise.
    """
    ...
|
||||||
@@ -38,6 +38,9 @@ class BoardsAPI(BaseConnector):
|
|||||||
else:
|
else:
|
||||||
return self._get_posts(f"{self.base_url}/discussions", post_limit)
|
return self._get_posts(f"{self.base_url}/discussions", post_limit)
|
||||||
|
|
||||||
|
def category_exists(self, category: str) -> bool:
    """Report every category as existing.

    No lookup is performed: ``category`` is ignored and the result is
    always True.
    NOTE(review): presumably a placeholder until a real check is written
    for this connector -- confirm.

    Args:
        category: Name of the category to look up (unused).

    Returns:
        Always True.
    """
    return True
|
||||||
|
|
||||||
## Private
|
## Private
|
||||||
def _get_posts(self, url, limit) -> list[Post]:
|
def _get_posts(self, url, limit) -> list[Post]:
|
||||||
urls = []
|
urls = []
|
||||||
|
|||||||
@@ -94,6 +94,17 @@ class RedditAPI(BaseConnector):
|
|||||||
data = self._fetch_post_overviews(f"user/{username}/about.json", {})
|
data = self._fetch_post_overviews(f"user/{username}/about.json", {})
|
||||||
return self._parse_user(data)
|
return self._parse_user(data)
|
||||||
|
|
||||||
|
def category_exists(self, category: str) -> bool:
    """Return whether ``category`` names a subreddit that can be looked up.

    Requests ``r/<category>/about.json`` through
    ``self._fetch_post_overviews`` and reports True only when the response
    is a mapping whose ``data`` mapping carries a non-None ``id``.

    Args:
        category: Subreddit name without the ``r/`` prefix.

    Returns:
        True if the lookup returned an ``id``. False for a blank or
        non-string name, for a response without ``data``/``id``, or when
        the lookup raises.
    """
    # Function-scope import so the block does not depend on the file header.
    from urllib.parse import quote

    # A blank name would request "r//about.json"; reject it without a call.
    if not isinstance(category, str) or not category.strip():
        return False

    # Escape the name so it cannot add path segments or a query string.
    path = f"r/{quote(category, safe='')}/about.json"

    try:
        data = self._fetch_post_overviews(path, {})
    except Exception:
        # Best-effort check: a failed lookup is reported as "does not exist".
        return False

    if not isinstance(data, dict):
        return False
    info = data.get("data")
    return isinstance(info, dict) and info.get("id") is not None
|
||||||
|
|
||||||
## Private Methods ##
|
## Private Methods ##
|
||||||
def _parse_posts(self, data) -> list[Post]:
|
def _parse_posts(self, data) -> list[Post]:
|
||||||
posts = []
|
posts = []
|
||||||
|
|||||||
@@ -69,6 +69,9 @@ class YouTubeAPI(BaseConnector):
|
|||||||
|
|
||||||
return posts
|
return posts
|
||||||
|
|
||||||
|
def category_exists(self, category: str) -> bool:
    """Report every category as existing.

    No lookup is performed: ``category`` is ignored and the result is
    always True.
    NOTE(review): presumably a placeholder until a real check is written
    for this connector -- confirm.

    Args:
        category: Name of the category to look up (unused).

    Returns:
        Always True.
    """
    return True
|
||||||
|
|
||||||
def search_videos(self, query, limit):
|
def search_videos(self, query, limit):
|
||||||
request = self.youtube.search().list(
|
request = self.youtube.search().list(
|
||||||
q=query,
|
q=query,
|
||||||
|
|||||||
Reference in New Issue
Block a user