diff --git a/connectors/boards_api.py b/connectors/boards_api.py
index 675ca1e..1b63aa9 100644
--- a/connectors/boards_api.py
+++ b/connectors/boards_api.py
@@ -19,20 +19,20 @@ class BoardsAPI:
         self.url = "https://www.boards.ie"
         self.source_name = "Boards.ie"
 
-    def get_new_category_posts(self, category: str, limit: int = 100) -> tuple[list[Post], list[Comment]]:
+    def get_new_category_posts(self, category: str, post_limit: int, comment_limit: int) -> list[Post]:
         urls = []
         current_page = 1
 
         logger.info(f"Fetching posts from category: {category}")
 
-        while len(urls) < limit:
+        while len(urls) < post_limit:
             url = f"{self.url}/categories/{category}/p{current_page}"
             html = self._fetch_page(url)
             soup = BeautifulSoup(html, "html.parser")
             logger.debug(f"Processing page {current_page} for category {category}")
 
             for a in soup.select("a.threadbit-threadlink"):
-                if len(urls) >= limit:
+                if len(urls) >= post_limit:
                     break
 
                 href = a.get("href")
@@ -45,13 +45,11 @@ class BoardsAPI:
         # Fetch post details for each URL and create Post objects
         posts = []
-        comments = []
 
         def fetch_and_parse(post_url):
             html = self._fetch_page(post_url)
-            post = self._parse_thread(html, post_url)
-            comments = self._parse_comments(post_url, post.id, comment_limit=500)
-            return (post, comments)
+            post = self._parse_thread(html, post_url, comment_limit)
+            return post
 
         with ThreadPoolExecutor(max_workers=30) as executor:
             futures = {executor.submit(fetch_and_parse, url): url for url in urls}
@@ -60,13 +58,12 @@ class BoardsAPI:
                 post_url = futures[future]
                 logger.debug(f"Fetching Post {i + 1} / {len(urls)} details from URL: {post_url}")
                 try:
-                    post, post_comments = future.result()
+                    post = future.result()
                     posts.append(post)
-                    comments.extend(post_comments)
                 except Exception as e:
                     logger.error(f"Error fetching post from {post_url}: {e}")
 
-        return posts, comments
+        return posts
 
     def _fetch_page(self, url: str) -> str:
@@ -74,7 +71,7 @@ class BoardsAPI:
         response.raise_for_status()
         return response.text
 
-    def _parse_thread(self, html: str, post_url: str) -> Post:
+    def _parse_thread(self, html: str, post_url: str, comment_limit: int) -> Post:
         soup = BeautifulSoup(html, "html.parser")
 
         # Author
@@ -102,6 +99,9 @@ class BoardsAPI:
         title_tag = soup.select_one(".PageTitle h1")
         title = title_tag.text.strip() if title_tag else None
 
+        # Comments
+        comments = self._parse_comments(post_url, post_num, comment_limit)
+
         post = Post(
             id=post_num,
             author=author,
@@ -109,12 +109,13 @@ class BoardsAPI:
             content=content,
             url=post_url,
             timestamp=timestamp,
-            source=self.source_name
+            source=self.source_name,
+            comments=comments
         )
 
         return post
 
-    def _parse_comments(self, url: str, post_id: str, comment_limit: int = 500) -> list[Comment]:
+    def _parse_comments(self, url: str, post_id: str, comment_limit: int) -> list[Comment]:
         comments = []
         current_url = url
 
diff --git a/connectors/reddit_api.py b/connectors/reddit_api.py
index 810635c..0ec6100 100644
--- a/connectors/reddit_api.py
+++ b/connectors/reddit_api.py
@@ -14,7 +14,7 @@ class RedditAPI:
         self.source_name = "Reddit"
 
     # Public Methods #
-    def search_new_subreddit_posts(self, search: str, subreddit: str, limit: int = 10) -> tuple[list[Post], list[Comment]]:
+    def search_new_subreddit_posts(self, search: str, subreddit: str, limit: int) -> list[Post]:
         params = {
             'q': search,
             'limit': limit,
@@ -25,27 +25,25 @@ class RedditAPI:
         logger.info(f"Searching subreddit '{subreddit}' for '{search}' with limit {limit}")
         url = f"r/{subreddit}/search.json"
         posts = []
-        comments = []
 
         while len(posts) < limit:
            batch_limit = min(100, limit - len(posts))
            params['limit'] = batch_limit
 
-            data = self._fetch_data(url, params)
-            batch_posts, batch_comments = self._parse_posts(data)
+            data = self._fetch_post_overviews(url, params)
+            batch_posts = self._parse_posts(data)
 
-            logger.debug(f"Fetched {len(batch_posts)} posts and {len(batch_comments)} comments from search in subreddit {subreddit}")
+            logger.debug(f"Fetched {len(batch_posts)} posts from search in subreddit {subreddit}")
 
            if not batch_posts:
                break
 
            posts.extend(batch_posts)
-            comments.extend(batch_comments)
 
-        return posts, comments
+
+        return posts
 
-    def get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> tuple[list[Post], list[Comment]]:
+    def get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]:
         posts = []
-        comments = []
         after = None
         url = f"r/{subreddit}/new.json"
@@ -58,30 +56,28 @@ class RedditAPI:
                 'after': after
             }
 
-            data = self._fetch_data(url, params)
-            batch_posts, batch_comments = self._parse_posts(data)
+            data = self._fetch_post_overviews(url, params)
+            batch_posts = self._parse_posts(data)
 
-            logger.debug(f"Fetched {len(batch_posts)} new posts and {len(batch_comments)} comments from subreddit {subreddit}")
+            logger.debug(f"Fetched {len(batch_posts)} new posts from subreddit {subreddit}")
 
            if not batch_posts:
                break
 
            posts.extend(batch_posts)
-            comments.extend(batch_comments)
 
            after = data['data'].get('after')
            if not after:
                break
 
-        return posts, comments
+        return posts
 
     def get_user(self, username: str) -> User:
-        data = self._fetch_data(f"user/{username}/about.json", {})
+        data = self._fetch_post_overviews(f"user/{username}/about.json", {})
         return self._parse_user(data)
 
     ## Private Methods ##
-    def _parse_posts(self, data) -> tuple[list[Post], list[Comment]]:
+    def _parse_posts(self, data) -> list[Post]:
         posts = []
-        comments = []
         total_num_posts = len(data['data']['children'])
         current_index = 0
@@ -98,19 +94,19 @@ class RedditAPI:
                 content=post_data.get('selftext', ''),
                 url=post_data['url'],
                 timestamp=post_data['created_utc'],
-                source=self.source_name)
+                source=self.source_name,
+                comments=self._get_post_comments(post_data['id']))
             post.subreddit = post_data['subreddit']
             post.upvotes = post_data['ups']
 
             posts.append(post)
-            comments.extend(self._get_post_comments(post.id))
 
-        return posts, comments
+        return posts
 
     def _get_post_comments(self, post_id: str) -> list[Comment]:
         comments: list[Comment] = []
         url = f"comments/{post_id}.json"
-        data = self._fetch_data(url, {})
+        data = self._fetch_post_overviews(url, {})
 
         if len(data) < 2:
             return comments
@@ -151,7 +147,7 @@ class RedditAPI:
         user.karma = user_data['total_karma']
         return user
 
-    def _fetch_data(self, endpoint: str, params: dict) -> dict:
+    def _fetch_post_overviews(self, endpoint: str, params: dict) -> dict:
         url = f"{self.url}{endpoint}"
         max_retries = 15
         backoff = 1  # seconds
diff --git a/connectors/youtube_api.py b/connectors/youtube_api.py
index 2fbe1bb..d0e00a3 100644
--- a/connectors/youtube_api.py
+++ b/connectors/youtube_api.py
@@ -40,10 +40,9 @@ class YouTubeAPI:
             return []
         return response.get('items', [])
 
-    def fetch_video_and_comments(self, query, video_limit, comment_limit) -> tuple[list[Post], list[Comment]]:
+    def fetch_videos(self, query, video_limit, comment_limit) -> list[Post]:
         videos = self.search_videos(query, video_limit)
         posts = []
-        comments = []
 
         for video in videos:
             video_id = video['id']['videoId']
@@ -53,16 +52,7 @@ class YouTubeAPI:
             published_at = datetime.datetime.strptime(snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp()
             channel_title = snippet['channelTitle']
 
-            post = Post(
-                id=video_id,
-                content=f"{title}\n\n{description}",
-                author=channel_title,
-                timestamp=published_at,
-                url=f"https://www.youtube.com/watch?v={video_id}",
-                title=title,
-                source="YouTube"
-            )
-
+            comments = []
             comments_data = self.get_video_comments(video_id, comment_limit)
             for comment_thread in comments_data:
                 comment_snippet = comment_thread['snippet']['topLevelComment']['snippet']
@@ -77,6 +67,18 @@ class YouTubeAPI:
                 )
                 comments.append(comment)
+
+            post = Post(
+                id=video_id,
+                content=f"{title}\n\n{description}",
+                author=channel_title,
+                timestamp=published_at,
+                url=f"https://www.youtube.com/watch?v={video_id}",
+                title=title,
+                source="YouTube",
+                comments=comments
+            )
+
             posts.append(post)
 
-        return posts, comments
\ No newline at end of file
+        return posts
\ No newline at end of file
diff --git a/create_dataset.py b/create_dataset.py
index 5ae0186..dd9ae25 100644
--- a/create_dataset.py
+++ b/create_dataset.py
@@ -4,8 +4,7 @@ from connectors.reddit_api import RedditAPI
 from connectors.boards_api import BoardsAPI
 from connectors.youtube_api import YouTubeAPI
 
-posts_file = 'posts.jsonl'
-comments_file = 'comments.jsonl'
+posts_file = 'posts_test.jsonl'
 
 reddit_connector = RedditAPI()
 boards_connector = BoardsAPI()
@@ -17,36 +16,28 @@ logging.getLogger("urllib3").setLevel(logging.WARNING)
 def remove_empty_posts(posts):
     return [post for post in posts if post.content.strip() != ""]
 
-def post_to_dict(post):
-    d = post.__dict__.copy()
-    return d
-
 def save_to_jsonl(filename, posts):
     with open(filename, 'a', encoding='utf-8') as f:
         for post in posts:
             # Convert post object to dict if it's a dataclass
-            data = post_to_dict(post)
+            data = post.to_dict()
             f.write(json.dumps(data) + '\n')
 
 def main():
-    boards_posts, boards_comments = boards_connector.get_new_category_posts('cork-city', limit=400)
+    boards_posts = boards_connector.get_new_category_posts('cork-city', 10, 10)
     save_to_jsonl(posts_file, boards_posts)
-    save_to_jsonl(comments_file, boards_comments)
 
-    reddit_posts, reddit_comments = reddit_connector.get_new_subreddit_posts('cork', limit=400)
+    reddit_posts = reddit_connector.get_new_subreddit_posts('cork', 10)
     reddit_posts = remove_empty_posts(reddit_posts)
     save_to_jsonl(posts_file, reddit_posts)
-    save_to_jsonl(comments_file, reddit_comments)
 
-    ireland_posts, ireland_comments = reddit_connector.search_new_subreddit_posts('cork', 'ireland', limit=10)
+    ireland_posts = reddit_connector.search_new_subreddit_posts('cork', 'ireland', 10)
     ireland_posts = remove_empty_posts(ireland_posts)
     save_to_jsonl(posts_file, ireland_posts)
-    save_to_jsonl(comments_file, ireland_comments)
 
-    youtube_videos, youtube_comments = youtube_connector.fetch_video_and_comments('cork city', 100, 100)
+    youtube_videos = youtube_connector.fetch_videos('cork city', 10, 10)
     save_to_jsonl(posts_file, youtube_videos)
-    save_to_jsonl(comments_file, youtube_comments)
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/dto/comment.py b/dto/comment.py
index 7b1cd10..a574d8d 100644
--- a/dto/comment.py
+++ b/dto/comment.py
@@ -14,3 +14,14 @@ class Comment:
         self.timestamp = timestamp
         self.reply_to = reply_to
         self.source = source
+
+    def to_dict(self):
+        return {
+            "id": self.id,
+            "post_id": self.post_id,
+            "author": self.author,
+            "content": self.content,
+            "timestamp": self.timestamp,
+            "reply_to": self.reply_to,
+            "source": self.source,
+        }
diff --git a/dto/post.py b/dto/post.py
index 167aa0b..819be43 100644
--- a/dto/post.py
+++ b/dto/post.py
@@ -1,3 +1,5 @@
+from dto.comment import Comment
+
 class Post:
     def __init__(self,
                  id: str,
@@ -6,7 +8,8 @@ class Post:
                  content: str,
                  url: str,
                  timestamp: float,
-                 source: str):
+                 source: str,
+                 comments: list[Comment]):
         self.id = id
         self.author = author
         self.title = title
@@ -14,7 +17,25 @@ class Post:
         self.url = url
         self.timestamp = timestamp
         self.source = source
+        self.comments = comments
 
         # Optionals
         self.subreddit = None
-        self.upvotes = None
\ No newline at end of file
+        self.upvotes = None
+
+    def to_dict(self):
+        return {
+            "id": self.id,
+            "author": self.author,
+            "title": self.title,
+            "content": self.content,
+            "url": self.url,
+            "timestamp": self.timestamp,
+            "source": self.source,
+            "comments": [
+                c.to_dict() if hasattr(c, "to_dict") else c
+                for c in self.comments
+            ],
+            "subreddit": self.subreddit,
+            "upvotes": self.upvotes,
+        }
\ No newline at end of file
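
A minimal usage sketch of the reworked DTOs (not part of the patch): it builds one Post with a nested Comment and serializes it the way save_to_jsonl() now does. Post's constructor is taken verbatim from the diff above; Comment's keyword signature and all field values are assumptions inferred from the assignments and to_dict() shown in dto/comment.py.

    import json

    from dto.comment import Comment
    from dto.post import Post

    # Assumed keyword signature, mirroring the fields in Comment.to_dict().
    comment = Comment(
        id="c1",
        post_id="p1",
        author="alice",
        content="example reply",
        timestamp=1700000000.0,
        reply_to=None,
        source="Reddit",
    )

    # Posts now receive their comments at construction time...
    post = Post(
        id="p1",
        author="bob",
        title="example thread",
        content="example body",
        url="https://example.com/p1",
        timestamp=1700000000.0,
        source="Reddit",
        comments=[comment],
    )

    # ...and to_dict() nests them, so each JSONL row is one post with its
    # comments inline instead of a row in a separate comments file.
    print(json.dumps(post.to_dict(), indent=2))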