Refactor dataset creation to serialize posts with post_to_dict (including nested comments) and lower the fetch limit passed to each connector call from 500 to 400
This commit is contained in:
@@ -13,19 +13,27 @@ logging.getLogger("urllib3").setLevel(logging.WARNING)
|
|||||||
def remove_empty_posts(posts):
    """Return the posts whose ``content`` contains non-whitespace text.

    Args:
        posts: Iterable of objects exposing a ``content`` attribute.

    Returns:
        A new list, in the original order, holding only the posts whose
        ``content`` is non-empty after stripping whitespace. Posts whose
        ``content`` is ``None`` are treated as empty and dropped.
    """
    # ``content or ""`` keeps a missing body (None) from raising on .strip().
    return [post for post in posts if (post.content or "").strip()]
|
||||||
|
|
||||||
|
def post_to_dict(post):
    """Convert a post object into a plain dict suitable for ``json.dump``.

    The post's instance attributes are shallow-copied into a new dict. When
    the post has a non-empty ``comments`` attribute, that entry is replaced
    by a list holding a copy of each comment's attribute dict.

    Args:
        post: Object with instance attributes. It may carry a ``comments``
            attribute that is an iterable of objects with instance
            attributes; a post without ``comments`` is accepted.

    Returns:
        A new dict. Neither the post nor its comment objects are modified,
        and the returned comment dicts do not alias the comments' own
        ``__dict__``.
    """
    d = vars(post).copy()

    # getattr with a default tolerates post types that never set ``comments``.
    comments = getattr(post, "comments", None)
    if comments:
        # Copy each comment dict so later edits to the result cannot leak
        # back into the live comment objects.
        d["comments"] = [vars(c).copy() for c in comments]

    return d
|
||||||
|
|
||||||
|
|
||||||
def main(limit=400):
    """Fetch Cork-related posts from the connectors and dump them as JSON.

    Posts are gathered from three connector calls, concatenated in fetch
    order, converted with ``post_to_dict`` and written to ``data_file``
    (overwriting it).

    Args:
        limit: Value forwarded as the ``limit`` argument of every connector
            call. Defaults to 400, the value that was previously hard-coded
            at each call site.
    """
    # NOTE(review): boards posts are not passed through remove_empty_posts,
    # unlike the two Reddit result sets - confirm this is intentional.
    boards_posts = boards_connector.get_new_category_posts('cork-city', limit=limit)

    reddit_posts = remove_empty_posts(
        reddit_connector.get_new_subreddit_posts('cork', limit=limit)
    )

    ireland_posts = remove_empty_posts(
        reddit_connector.search_subreddit('cork', 'ireland', limit=limit, timeframe='year')
    )

    posts = boards_posts + reddit_posts + ireland_posts

    # Explicit encoding keeps the output independent of the platform locale.
    with open(data_file, 'w', encoding='utf-8') as f:
        json.dump([post_to_dict(post) for post in posts], f, indent=4)


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user