From 152264bda9e6e7af418b15c8c47552606c46bae9 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Thu, 22 Jan 2026 15:53:47 +0000 Subject: [PATCH] separate comment and post data structures This allows for a flat data structure, beneficial to data analysis --- connectors/boards_api.py | 20 +++++++++++++------- connectors/reddit_api.py | 1 - create_dataset.py | 32 ++++++++++++++++---------------- dto/post.py | 3 +-- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/connectors/boards_api.py b/connectors/boards_api.py index 4a605c4..675ca1e 100644 --- a/connectors/boards_api.py +++ b/connectors/boards_api.py @@ -19,7 +19,7 @@ class BoardsAPI: self.url = "https://www.boards.ie" self.source_name = "Boards.ie" - def get_new_category_posts(self, category: str, limit: int = 100) -> list[Post]: + def get_new_category_posts(self, category: str, limit: int = 100) -> tuple[list[Post], list[Comment]]: urls = [] current_page = 1 @@ -45,10 +45,13 @@ class BoardsAPI: # Fetch post details for each URL and create Post objects posts = [] + comments = [] def fetch_and_parse(post_url): html = self._fetch_page(post_url) - return self._parse_thread(html, post_url) + post = self._parse_thread(html, post_url) + comments = self._parse_comments(post_url, post.id, comment_limit=500) + return (post, comments) with ThreadPoolExecutor(max_workers=30) as executor: futures = {executor.submit(fetch_and_parse, url): url for url in urls} @@ -56,9 +59,14 @@ class BoardsAPI: for i, future in enumerate(as_completed(futures)): post_url = futures[future] logger.debug(f"Fetching Post {i + 1} / {len(urls)} details from URL: {post_url}") - posts.append(future.result()) + try: + post, post_comments = future.result() + posts.append(post) + comments.extend(post_comments) + except Exception as e: + logger.error(f"Error fetching post from {post_url}: {e}") - return posts + return posts, comments def _fetch_page(self, url: str) -> str: @@ -104,10 +112,8 @@ class BoardsAPI: source=self.source_name ) - 
post.comments = self._parse_comments(post_url, post.id) - return post - + def _parse_comments(self, url: str, post_id: str, comment_limit: int = 500) -> list[Comment]: comments = [] current_url = url diff --git a/connectors/reddit_api.py b/connectors/reddit_api.py index 8a3dd0b..a713205 100644 --- a/connectors/reddit_api.py +++ b/connectors/reddit_api.py @@ -93,7 +93,6 @@ class RedditAPI: source=self.source_name) post.subreddit = post_data['subreddit'] post.upvotes = post_data['ups'] - post.comments = self._get_post_comments(post.id) posts.append(post) return posts diff --git a/create_dataset.py b/create_dataset.py index e7f371f..8bb985f 100644 --- a/create_dataset.py +++ b/create_dataset.py @@ -4,7 +4,9 @@ from connectors.reddit_api import RedditAPI from connectors.boards_api import BoardsAPI from connectors.youtube_api import YouTubeAPI -data_file = 'data/posts.jsonl' +posts_file = 'data/posts.jsonl' +comments_file = 'data/comments.jsonl' + reddit_connector = RedditAPI() boards_connector = BoardsAPI() youtube_connector = YouTubeAPI() @@ -17,9 +19,6 @@ def remove_empty_posts(posts): def post_to_dict(post): d = post.__dict__.copy() - - if post.comments: - d["comments"] = [c.__dict__ for c in post.comments] return d def save_to_jsonl(filename, posts): @@ -31,19 +30,20 @@ def save_to_jsonl(filename, posts): def main(): - boards_posts = boards_connector.get_new_category_posts('cork-city', limit=350) - save_to_jsonl(data_file, boards_posts) - - reddit_posts = reddit_connector.get_new_subreddit_posts('cork', limit=350) - reddit_posts = remove_empty_posts(reddit_posts) - save_to_jsonl(data_file, reddit_posts) - - ireland_posts = reddit_connector.search_subreddit('cork', 'ireland', limit=350, timeframe='year') - ireland_posts = remove_empty_posts(ireland_posts) - save_to_jsonl(data_file, ireland_posts) + boards_posts, boards_comments = boards_connector.get_new_category_posts('cork-city', limit=5) + save_to_jsonl(posts_file, boards_posts) + save_to_jsonl(comments_file, 
boards_comments) - youtube_videos = youtube_connector.fetch_and_parse_videos('cork city', 100, 100) - save_to_jsonl(data_file, youtube_videos) + #reddit_posts = reddit_connector.get_new_subreddit_posts('cork', limit=350) + #reddit_posts = remove_empty_posts(reddit_posts) + #save_to_jsonl(data_file, reddit_posts) + + #ireland_posts = reddit_connector.search_subreddit('cork', 'ireland', limit=350, timeframe='year') + #ireland_posts = remove_empty_posts(ireland_posts) + #save_to_jsonl(data_file, ireland_posts) + + #youtube_videos = youtube_connector.fetch_and_parse_videos('cork city', 100, 100) + #save_to_jsonl(data_file, youtube_videos) if __name__ == "__main__": main() \ No newline at end of file diff --git a/dto/post.py b/dto/post.py index 90bf1c6..167aa0b 100644 --- a/dto/post.py +++ b/dto/post.py @@ -17,5 +17,4 @@ class Post: # Optionals self.subreddit = None - self.upvotes = None - self.comments = None \ No newline at end of file + self.upvotes = None \ No newline at end of file