From 193ff4397598e8553724b2ecf82e337c563a5c5e Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Sat, 17 Jan 2026 22:14:15 +0000
Subject: [PATCH] Refactor dataset creation to use post_to_dict for improved data structure and limit API calls to 400

---
 create_dataset.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/create_dataset.py b/create_dataset.py
index baa532f..412973e 100644
--- a/create_dataset.py
+++ b/create_dataset.py
@@ -13,19 +13,27 @@ logging.getLogger("urllib3").setLevel(logging.WARNING)
 def remove_empty_posts(posts):
     return [post for post in posts if post.content.strip() != ""]
 
+def post_to_dict(post):
+    d = post.__dict__.copy()
+
+    if post.comments:
+        d["comments"] = [c.__dict__ for c in post.comments]
+    return d
+
+
 def main():
-    boards_posts = boards_connector.get_new_category_posts('cork-city', limit=500)
+    boards_posts = boards_connector.get_new_category_posts('cork-city', limit=400)
 
-    reddit_posts = reddit_connector.get_new_subreddit_posts('cork', limit=500)
+    reddit_posts = reddit_connector.get_new_subreddit_posts('cork', limit=400)
     reddit_posts = remove_empty_posts(reddit_posts)
 
-    ireland_posts = reddit_connector.search_subreddit('cork', 'ireland', limit=500, timeframe='year')
+    ireland_posts = reddit_connector.search_subreddit('cork', 'ireland', limit=400, timeframe='year')
     ireland_posts = remove_empty_posts(ireland_posts)
 
     posts = boards_posts + reddit_posts + ireland_posts
 
     with open(data_file, 'w') as f:
-        json.dump([post.__dict__ for post in posts], f, indent=4)
+        json.dump([post_to_dict(post) for post in posts], f, indent=4)
 
 if __name__ == "__main__":
     main()
\ No newline at end of file