"""Build the combined posts dataset from the Boards and Reddit connectors."""

import json
import logging

from connectors.reddit_api import RedditAPI
from connectors.boards_api import BoardsAPI

# NOTE(review): the file name still says "reddit" although the dump now also
# holds Boards posts; the path is left unchanged because other code may read it.
data_file = 'data/reddit_posts.json'

reddit_connector = RedditAPI()
boards_connector = BoardsAPI()

# Root logger at DEBUG, with the urllib3 logger raised to WARNING.
logging.basicConfig(level=logging.DEBUG)
logging.getLogger("urllib3").setLevel(logging.WARNING)

logger = logging.getLogger(__name__)


def remove_empty_posts(posts):
    """Return the posts whose ``content`` is not blank.

    A post is dropped when its ``content`` is ``None``, the empty string, or
    whitespace only. The input iterable is not modified; a new list is
    returned.
    """
    # The truthiness check runs first so a missing (None) content is treated
    # as empty instead of raising AttributeError on ``.strip()``.
    return [post for post in posts if post.content and post.content.strip()]


def main():
    """Fetch new Boards and Reddit posts and dump them to ``data_file`` as JSON.

    Calls the Boards connector for the 'cork-city' category and the Reddit
    connector for the 'cork' subreddit (``limit=500`` each), filters the
    Reddit posts through ``remove_empty_posts``, and writes the ``__dict__``
    of every post, Boards posts first, as an indented JSON array.
    """
    boards_posts = boards_connector.get_new_category_posts('cork-city', limit=500)

    reddit_posts = reddit_connector.get_new_subreddit_posts('cork', limit=500)
    reddit_posts = remove_empty_posts(reddit_posts)

    # NOTE(review): Boards posts are not passed through remove_empty_posts;
    # left as-is because their shape is not visible here — confirm.
    posts = boards_posts + reddit_posts
    logger.info(
        "Fetched %d Boards posts and %d Reddit posts",
        len(boards_posts),
        len(reddit_posts),
    )

    # Explicit encoding so the output does not depend on the platform default.
    with open(data_file, 'w', encoding='utf-8') as f:
        json.dump([post.__dict__ for post in posts], f, indent=4)