From 610bab67d50737c557005d3e035c22a2df38f8e3 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Sat, 17 Jan 2026 14:43:56 +0000 Subject: [PATCH] Add boards.ie to dataset creation & add logging config --- create_dataset.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/create_dataset.py b/create_dataset.py index 2ec54a7..c6bbaed 100644 --- a/create_dataset.py +++ b/create_dataset.py @@ -1,16 +1,25 @@ import json +import logging from connectors.reddit_api import RedditAPI +from connectors.boards_api import BoardsAPI data_file = 'data/reddit_posts.json' reddit_connector = RedditAPI() +boards_connector = BoardsAPI() + +logging.basicConfig(level=logging.DEBUG) +logging.getLogger("urllib3").setLevel(logging.WARNING) + def remove_empty_posts(posts): return [post for post in posts if post.content.strip() != ""] def main(): - posts = reddit_connector.get_new_subreddit_posts('cork', limit=1000) - posts = remove_empty_posts(posts) + boards_posts = boards_connector.get_new_category_posts('cork-city', limit=500) + + reddit_posts = reddit_connector.get_new_subreddit_posts('cork', limit=500) + reddit_posts = remove_empty_posts(reddit_posts) - print(f"Fetched {len(posts)} posts from r/cork") + posts = boards_posts + reddit_posts with open(data_file, 'w') as f: json.dump([post.__dict__ for post in posts], f, indent=4)