diff --git a/fetch_data.py b/create_dataset.py
similarity index 68%
rename from fetch_data.py
rename to create_dataset.py
index 46b8f5e..6e74b31 100644
--- a/fetch_data.py
+++ b/create_dataset.py
@@ -1,11 +1,16 @@
 import json
 from connectors.reddit_connector import RedditConnector
 
-data_file = 'reddit_posts.json'
+data_file = 'data/reddit_posts.json'
 reddit_connector = RedditConnector()
 
+def remove_empty_posts(posts):
+    return [post for post in posts if post.content.strip() != ""]
+
 def main():
     posts = reddit_connector.get_new_subreddit_posts('cork', limit=1000)
+    posts = remove_empty_posts(posts)
+
     print(f"Fetched {len(posts)} posts from r/cork")
 
     with open(data_file, 'w') as f: