From 9c66ec8b8263c9df1505809e5b77b6cb722bcfc0 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Mon, 19 Jan 2026 20:22:47 +0000
Subject: [PATCH] Save to jsonl file after every fetch

Reduces errors and lost data
---
 create_dataset.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/create_dataset.py b/create_dataset.py
index 412973e..59a3753 100644
--- a/create_dataset.py
+++ b/create_dataset.py
@@ -20,20 +20,25 @@ def post_to_dict(post):
         d["comments"] = [c.__dict__ for c in post.comments]
     return d
 
+def save_to_jsonl(filename, posts):
+    with open(filename, 'a', encoding='utf-8') as f:
+        for post in posts:
+            # Convert post object to dict if it's a dataclass
+            data = post_to_dict(post)
+            f.write(json.dumps(data) + '\n')
+
 
 def main():
-    boards_posts = boards_connector.get_new_category_posts('cork-city', limit=400)
+    boards_posts = boards_connector.get_new_category_posts('cork-city', limit=350)
+    save_to_jsonl(data_file, boards_posts)
 
-    reddit_posts = reddit_connector.get_new_subreddit_posts('cork', limit=400)
+    reddit_posts = reddit_connector.get_new_subreddit_posts('cork', limit=350)
     reddit_posts = remove_empty_posts(reddit_posts)
-
-    ireland_posts = reddit_connector.search_subreddit('cork', 'ireland', limit=400, timeframe='year')
+    save_to_jsonl(data_file, reddit_posts)
+
+    ireland_posts = reddit_connector.search_subreddit('cork', 'ireland', limit=350, timeframe='year')
     ireland_posts = remove_empty_posts(ireland_posts)
-
-    posts = boards_posts + reddit_posts + ireland_posts
-
-    with open(data_file, 'w') as f:
-        json.dump([post_to_dict(post) for post in posts], f, indent=4)
+    save_to_jsonl(data_file, ireland_posts)
 
 if __name__ == "__main__":
     main()
\ No newline at end of file