diff --git a/connectors/youtube_api.py b/connectors/youtube_api.py index 61132a3..5dcf73d 100644 --- a/connectors/youtube_api.py +++ b/connectors/youtube_api.py @@ -40,9 +40,10 @@ class YouTubeAPI: return [] return response.get('items', []) - def fetch_and_parse_videos(self, query, video_limit, comment_limit): + def fetch_video_and_comments(self, query, video_limit, comment_limit) -> tuple[list[Post], list[Comment]]: videos = self.search_videos(query, video_limit) posts = [] + comments = [] for video in videos: video_id = video['id']['videoId'] @@ -62,7 +63,6 @@ class YouTubeAPI: source="YouTube" ) - post.comments = [] comments_data = self.get_video_comments(video_id, comment_limit) for comment_thread in comments_data: comment_snippet = comment_thread['snippet']['topLevelComment']['snippet'] @@ -75,8 +75,8 @@ class YouTubeAPI: reply_to=None, source="YouTube" ) - post.comments.append(comment) + comments.append(comment) posts.append(post) - return posts \ No newline at end of file + return posts, comments \ No newline at end of file diff --git a/create_dataset.py b/create_dataset.py index 8bb985f..d4baa1e 100644 --- a/create_dataset.py +++ b/create_dataset.py @@ -30,20 +30,23 @@ def save_to_jsonl(filename, posts): def main(): - boards_posts, boards_comments = boards_connector.get_new_category_posts('cork-city', limit=5) + boards_posts, boards_comments = boards_connector.get_new_category_posts('cork-city', limit=400) save_to_jsonl(posts_file, boards_posts) save_to_jsonl(comments_file, boards_comments) - #reddit_posts = reddit_connector.get_new_subreddit_posts('cork', limit=350) - #reddit_posts = remove_empty_posts(reddit_posts) - #save_to_jsonl(data_file, reddit_posts) - - #ireland_posts = reddit_connector.search_subreddit('cork', 'ireland', limit=350, timeframe='year') - #ireland_posts = remove_empty_posts(ireland_posts) - #save_to_jsonl(data_file, ireland_posts) + reddit_posts, reddit_comments = reddit_connector.get_new_subreddit_posts('cork', limit=400) + reddit_posts = remove_empty_posts(reddit_posts) + save_to_jsonl(posts_file, reddit_posts) + save_to_jsonl(comments_file, reddit_comments) - #youtube_videos = youtube_connector.fetch_and_parse_videos('cork city', 100, 100) - #save_to_jsonl(data_file, youtube_videos) + ireland_posts, ireland_comments = reddit_connector.search_new_subreddit_posts('cork', 'ireland', limit=10) + ireland_posts = remove_empty_posts(ireland_posts) + save_to_jsonl(posts_file, ireland_posts) + save_to_jsonl(comments_file, ireland_comments) + + youtube_videos, youtube_comments = youtube_connector.fetch_video_and_comments('cork city', 100, 100) + save_to_jsonl(posts_file, youtube_videos) + save_to_jsonl(comments_file, youtube_comments) if __name__ == "__main__": main() \ No newline at end of file