separate comment and post data structures
This allows for a flat data structure, which is beneficial for data analysis
This commit is contained in:
@@ -19,7 +19,7 @@ class BoardsAPI:
|
|||||||
self.url = "https://www.boards.ie"
|
self.url = "https://www.boards.ie"
|
||||||
self.source_name = "Boards.ie"
|
self.source_name = "Boards.ie"
|
||||||
|
|
||||||
def get_new_category_posts(self, category: str, limit: int = 100) -> list[Post]:
|
def get_new_category_posts(self, category: str, limit: int = 100) -> tuple[list[Post], list[Comment]]:
|
||||||
urls = []
|
urls = []
|
||||||
current_page = 1
|
current_page = 1
|
||||||
|
|
||||||
@@ -45,10 +45,13 @@ class BoardsAPI:
|
|||||||
|
|
||||||
# Fetch post details for each URL and create Post objects
|
# Fetch post details for each URL and create Post objects
|
||||||
posts = []
|
posts = []
|
||||||
|
comments = []
|
||||||
|
|
||||||
def fetch_and_parse(post_url):
|
def fetch_and_parse(post_url):
|
||||||
html = self._fetch_page(post_url)
|
html = self._fetch_page(post_url)
|
||||||
return self._parse_thread(html, post_url)
|
post = self._parse_thread(html, post_url)
|
||||||
|
comments = self._parse_comments(post_url, post.id, comment_limit=500)
|
||||||
|
return (post, comments)
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=30) as executor:
|
with ThreadPoolExecutor(max_workers=30) as executor:
|
||||||
futures = {executor.submit(fetch_and_parse, url): url for url in urls}
|
futures = {executor.submit(fetch_and_parse, url): url for url in urls}
|
||||||
@@ -56,9 +59,14 @@ class BoardsAPI:
|
|||||||
for i, future in enumerate(as_completed(futures)):
|
for i, future in enumerate(as_completed(futures)):
|
||||||
post_url = futures[future]
|
post_url = futures[future]
|
||||||
logger.debug(f"Fetching Post {i + 1} / {len(urls)} details from URL: {post_url}")
|
logger.debug(f"Fetching Post {i + 1} / {len(urls)} details from URL: {post_url}")
|
||||||
posts.append(future.result())
|
try:
|
||||||
|
post, post_comments = future.result()
|
||||||
|
posts.append(post)
|
||||||
|
comments.extend(post_comments)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error fetching post from {post_url}: {e}")
|
||||||
|
|
||||||
return posts
|
return posts, comments
|
||||||
|
|
||||||
|
|
||||||
def _fetch_page(self, url: str) -> str:
|
def _fetch_page(self, url: str) -> str:
|
||||||
@@ -104,10 +112,8 @@ class BoardsAPI:
|
|||||||
source=self.source_name
|
source=self.source_name
|
||||||
)
|
)
|
||||||
|
|
||||||
post.comments = self._parse_comments(post_url, post.id)
|
|
||||||
|
|
||||||
return post
|
return post
|
||||||
|
|
||||||
def _parse_comments(self, url: str, post_id: str, comment_limit: int = 500) -> list[Comment]:
|
def _parse_comments(self, url: str, post_id: str, comment_limit: int = 500) -> list[Comment]:
|
||||||
comments = []
|
comments = []
|
||||||
current_url = url
|
current_url = url
|
||||||
|
|||||||
@@ -93,7 +93,6 @@ class RedditAPI:
|
|||||||
source=self.source_name)
|
source=self.source_name)
|
||||||
post.subreddit = post_data['subreddit']
|
post.subreddit = post_data['subreddit']
|
||||||
post.upvotes = post_data['ups']
|
post.upvotes = post_data['ups']
|
||||||
post.comments = self._get_post_comments(post.id)
|
|
||||||
|
|
||||||
posts.append(post)
|
posts.append(post)
|
||||||
return posts
|
return posts
|
||||||
|
|||||||
@@ -4,7 +4,9 @@ from connectors.reddit_api import RedditAPI
|
|||||||
from connectors.boards_api import BoardsAPI
|
from connectors.boards_api import BoardsAPI
|
||||||
from connectors.youtube_api import YouTubeAPI
|
from connectors.youtube_api import YouTubeAPI
|
||||||
|
|
||||||
data_file = 'data/posts.jsonl'
|
posts_file = 'data/posts.jsonl'
|
||||||
|
comments_file = 'data/comments.jsonl'
|
||||||
|
|
||||||
reddit_connector = RedditAPI()
|
reddit_connector = RedditAPI()
|
||||||
boards_connector = BoardsAPI()
|
boards_connector = BoardsAPI()
|
||||||
youtube_connector = YouTubeAPI()
|
youtube_connector = YouTubeAPI()
|
||||||
@@ -17,9 +19,6 @@ def remove_empty_posts(posts):
|
|||||||
|
|
||||||
def post_to_dict(post):
|
def post_to_dict(post):
|
||||||
d = post.__dict__.copy()
|
d = post.__dict__.copy()
|
||||||
|
|
||||||
if post.comments:
|
|
||||||
d["comments"] = [c.__dict__ for c in post.comments]
|
|
||||||
return d
|
return d
|
||||||
|
|
||||||
def save_to_jsonl(filename, posts):
|
def save_to_jsonl(filename, posts):
|
||||||
@@ -31,19 +30,20 @@ def save_to_jsonl(filename, posts):
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
boards_posts = boards_connector.get_new_category_posts('cork-city', limit=350)
|
boards_posts, boards_comments = boards_connector.get_new_category_posts('cork-city', limit=5)
|
||||||
save_to_jsonl(data_file, boards_posts)
|
save_to_jsonl(posts_file, boards_posts)
|
||||||
|
save_to_jsonl(comments_file, boards_comments)
|
||||||
reddit_posts = reddit_connector.get_new_subreddit_posts('cork', limit=350)
|
|
||||||
reddit_posts = remove_empty_posts(reddit_posts)
|
|
||||||
save_to_jsonl(data_file, reddit_posts)
|
|
||||||
|
|
||||||
ireland_posts = reddit_connector.search_subreddit('cork', 'ireland', limit=350, timeframe='year')
|
|
||||||
ireland_posts = remove_empty_posts(ireland_posts)
|
|
||||||
save_to_jsonl(data_file, ireland_posts)
|
|
||||||
|
|
||||||
youtube_videos = youtube_connector.fetch_and_parse_videos('cork city', 100, 100)
|
#reddit_posts = reddit_connector.get_new_subreddit_posts('cork', limit=350)
|
||||||
save_to_jsonl(data_file, youtube_videos)
|
#reddit_posts = remove_empty_posts(reddit_posts)
|
||||||
|
#save_to_jsonl(data_file, reddit_posts)
|
||||||
|
|
||||||
|
#ireland_posts = reddit_connector.search_subreddit('cork', 'ireland', limit=350, timeframe='year')
|
||||||
|
#ireland_posts = remove_empty_posts(ireland_posts)
|
||||||
|
#save_to_jsonl(data_file, ireland_posts)
|
||||||
|
|
||||||
|
#youtube_videos = youtube_connector.fetch_and_parse_videos('cork city', 100, 100)
|
||||||
|
#save_to_jsonl(data_file, youtube_videos)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
@@ -17,5 +17,4 @@ class Post:
|
|||||||
|
|
||||||
# Optionals
|
# Optionals
|
||||||
self.subreddit = None
|
self.subreddit = None
|
||||||
self.upvotes = None
|
self.upvotes = None
|
||||||
self.comments = None
|
|
||||||
Reference in New Issue
Block a user