Automatic Scraping of dataset options #9

Merged
dylan merged 36 commits from feat/automatic-scraping-datasets into main 2026-03-14 21:58:49 +00:00
2 changed files with 58 additions and 45 deletions
Showing only changes of commit 2a8d7c7972 - Show all commits

View File

@@ -15,7 +15,15 @@ class RedditAPI(BaseConnector):
self.source_name = "Reddit"
# Public Methods #
def search_new_subreddit_posts(self, search: str, subreddit: str, limit: int) -> list[Post]:
def get_new_posts_by_search(self,
search: str,
subreddit: str,
limit: int
) -> list[Post]:
if not search:
return self._get_new_subreddit_posts(subreddit, limit=limit)
params = {
'q': search,
'limit': limit,
@@ -43,7 +51,7 @@ class RedditAPI(BaseConnector):
return posts
def get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]:
def _get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]:
posts = []
after = None
url = f"r/{subreddit}/new.json"

View File

@@ -16,33 +16,13 @@ class YouTubeAPI(BaseConnector):
def __init__(self):
    """Build a YouTube Data API v3 client using the module-level API_KEY.

    # NOTE(review): `build` and `API_KEY` come from imports outside this
    # view — presumably googleapiclient.discovery.build and a config
    # constant; confirm against the file header.
    """
    self.youtube = build('youtube', 'v3', developerKey=API_KEY)
def search_videos(self, query, limit):
    """Search YouTube for videos matching *query*.

    Issues a ``search.list`` call restricted to videos and returns the
    raw ``items`` list from the response (empty list when the response
    carries no ``items`` key).
    """
    search_request = self.youtube.search().list(
        part='snippet',
        q=query,
        type='video',
        maxResults=limit,
    )
    return search_request.execute().get('items', [])
def get_video_comments(self, video_id, limit):
    """Fetch up to *limit* top-level comment threads for *video_id*.

    Returns the raw ``items`` list from the ``commentThreads.list``
    response. On ``HttpError`` the failure is reported to stdout and an
    empty list is returned instead of raising.
    """
    thread_request = self.youtube.commentThreads().list(
        part='snippet',
        videoId=video_id,
        maxResults=limit,
        textFormat='plainText'
    )
    try:
        response = thread_request.execute()
    except HttpError as e:
        # Best-effort: one broken video must not abort the whole run.
        print(f"Error fetching comments for video {video_id}: {e}")
        return []
    else:
        return response.get('items', [])
def fetch_videos(self, query, video_limit, comment_limit) -> list[Post]:
videos = self.search_videos(query, video_limit)
def get_new_posts_by_search(self,
search: str,
category: str,
post_limit: int,
comment_limit: int
) -> list[Post]:
videos = self.search_videos(search, post_limit)
posts = []
for video in videos:
@@ -83,3 +63,28 @@ class YouTubeAPI(BaseConnector):
posts.append(post)
return posts
def search_videos(self, query, limit):
    """Search YouTube for videos matching *query*.

    Parameters
    ----------
    query : str
        Free-text search term (API ``q`` parameter).
    limit : int
        Maximum number of results (API ``maxResults``).

    Returns
    -------
    list
        Raw ``items`` from the API response; an empty list when the
        response has no items or when the request fails with
        ``HttpError`` — mirroring the best-effort error handling
        already used by ``get_video_comments``.
    """
    request = self.youtube.search().list(
        q=query,
        part='snippet',
        type='video',
        maxResults=limit
    )
    try:
        response = request.execute()
    except HttpError as e:
        # Consistency fix: get_video_comments degrades gracefully on
        # HttpError, but a failed search previously aborted the whole
        # fetch. Log and return no results instead.
        print(f"Error searching videos for query {query}: {e}")
        return []
    return response.get('items', [])
def get_video_comments(self, video_id, limit):
    """Return up to *limit* top-level comment threads for *video_id*.

    Wraps ``commentThreads.list``; on ``HttpError`` the error is
    printed and an empty list is returned, so a single broken video
    cannot abort a larger fetch.
    """
    comment_request = self.youtube.commentThreads().list(
        textFormat='plainText',
        part='snippet',
        videoId=video_id,
        maxResults=limit,
    )
    items = []
    try:
        items = comment_request.execute().get('items', [])
    except HttpError as e:
        print(f"Error fetching comments for video {video_id}: {e}")
    return items