diff --git a/server/connectors/base.py b/server/connectors/base.py
index f555769..bad73c5 100644
--- a/server/connectors/base.py
+++ b/server/connectors/base.py
@@ -17,7 +17,6 @@ class BaseConnector(ABC):
     def get_new_posts_by_search(self,
                                 search: str = None,
                                 category: str = None,
-                                post_limit: int = 10,
-                                comment_limit: int = 10
+                                post_limit: int = 10
                                 ) -> list[Post]:
         ...
\ No newline at end of file
diff --git a/server/connectors/boards_api.py b/server/connectors/boards_api.py
index 9109e71..600c864 100644
--- a/server/connectors/boards_api.py
+++ b/server/connectors/boards_api.py
@@ -25,22 +25,29 @@ class BoardsAPI(BaseConnector):
 
     def get_new_posts_by_search(self,
                                 search: str,
                                 category: str,
-                                post_limit: int,
-                                comment_limit: int
+                                post_limit: int
                                 ) -> list[Post]:
+        if search:
+            raise NotImplementedError("Search not compatible with boards.ie")
+
+        if category:
+            return self._get_posts(f"{self.url}/categories/{category}", post_limit)
+        else:
+            return self._get_posts(f"{self.url}/discussions", post_limit)
+
+    ## Private
+    def _get_posts(self, url: str, limit: int) -> list[Post]:
         urls = []
         current_page = 1
-        logger.info(f"Fetching posts from category: {category}")
-
-        while len(urls) < post_limit:
-            url = f"{self.url}/categories/{category}/p{current_page}"
-            html = self._fetch_page(url)
+        while len(urls) < limit:
+            page_url = f"{url}/p{current_page}"
+            html = self._fetch_page(page_url)
             soup = BeautifulSoup(html, "html.parser")
-            logger.debug(f"Processing page {current_page} for category {category}")
+            logger.debug(f"Processing page {current_page} for link: {page_url}")
 
             for a in soup.select("a.threadbit-threadlink"):
-                if len(urls) >= post_limit:
+                if len(urls) >= limit:
                     break
 
                 href = a.get("href")
@@ -49,14 +56,14 @@ class BoardsAPI(BaseConnector):
 
             current_page += 1
 
-        logger.debug(f"Fetched {len(urls)} post URLs from category {category}")
+        logger.debug(f"Fetched {len(urls)} post URLs")
 
         # Fetch post details for each URL and create Post objects
         posts = []
 
         def fetch_and_parse(post_url):
             html = self._fetch_page(post_url)
-            post = self._parse_thread(html, post_url, comment_limit)
+            post = self._parse_thread(html, post_url)
             return post
 
         with ThreadPoolExecutor(max_workers=30) as executor:
@@ -79,7 +86,7 @@ class BoardsAPI(BaseConnector):
         response.raise_for_status()
         return response.text
 
-    def _parse_thread(self, html: str, post_url: str, comment_limit: int) -> Post:
+    def _parse_thread(self, html: str, post_url: str) -> Post:
         soup = BeautifulSoup(html, "html.parser")
 
         # Author
@@ -108,7 +115,7 @@ class BoardsAPI(BaseConnector):
         title = title_tag.text.strip() if title_tag else None
 
         # Comments
-        comments = self._parse_comments(post_url, post_num, comment_limit)
+        comments = self._parse_comments(post_url, post_num)
 
         post = Post(
             id=post_num,
@@ -123,11 +130,11 @@ class BoardsAPI(BaseConnector):
 
         return post
 
-    def _parse_comments(self, url: str, post_id: str, comment_limit: int) -> list[Comment]:
+    def _parse_comments(self, url: str, post_id: str) -> list[Comment]:
         comments = []
         current_url = url
 
-        while current_url and len(comments) < comment_limit:
+        while current_url:
             html = self._fetch_page(current_url)
             page_comments = self._parse_page_comments(html, post_id)
             comments.extend(page_comments)
diff --git a/server/connectors/reddit_api.py b/server/connectors/reddit_api.py
index 444326a..9042c6c 100644
--- a/server/connectors/reddit_api.py
+++ b/server/connectors/reddit_api.py
@@ -20,39 +20,44 @@ class RedditAPI(BaseConnector):
 
     def get_new_posts_by_search(self,
                                 search: str,
                                 category: str,
-                                post_limit: int,
-                                comment_limit: int
+                                post_limit: int
                                 ) -> list[Post]:
-        if not search:
-            return self._get_new_subreddit_posts(category, limit=post_limit)
+        prefix = f"r/{category}/" if category else ""
+        params = {'limit': post_limit}
 
-        params = {
-            'q': search,
-            'limit': post_limit,
-            'restrict_sr': 'on',
-            'sort': 'new'
-        }
+        if search:
+            endpoint = f"{prefix}search.json"
+            params.update({
+                'q': search,
+                'sort': 'new',
+                'restrict_sr': 'on' if category else 'off'
+            })
+        else:
+            endpoint = f"{prefix}new.json"
 
-        logger.info(f"Searching subreddit '{category}' for '{search}' with limit {post_limit}")
-        url = f"r/{category}/search.json"
-
         posts = []
+        after = None
 
         while len(posts) < post_limit:
             batch_limit = min(100, post_limit - len(posts))
             params['limit'] = batch_limit
+            if after:
+                params['after'] = after
 
-            data = self._fetch_post_overviews(url, params)
-            batch_posts = self._parse_posts(data)
-
-            logger.debug(f"Fetched {len(batch_posts)} posts from search in subreddit {category}")
-
-            if not batch_posts:
+            data = self._fetch_post_overviews(endpoint, params)
+
+            if not data or 'data' not in data or not data['data'].get('children'):
                 break
 
+            batch_posts = self._parse_posts(data)
             posts.extend(batch_posts)
-            return posts
+
+            after = data['data'].get('after')
+            if not after:
+                break
+
+        return posts[:post_limit]
 
     def _get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]:
         posts = []
diff --git a/server/queue/tasks.py b/server/queue/tasks.py
index fd5237f..95248d1 100644
--- a/server/queue/tasks.py
+++ b/server/queue/tasks.py
@@ -46,8 +46,7 @@ def fetch_and_process_dataset(self,
         raw_posts = connector.get_new_posts_by_search(
             search=search,
             category=category,
-            post_limit=limit,
-            comment_limit=limit
+            post_limit=limit
         )
 
         posts.extend(post.to_dict() for post in raw_posts)
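
Usage sketch: the snippet below shows how a caller drives the narrowed connector interface after this change, mirroring the call site in server/queue/tasks.py. The no-argument constructors and the example category values are assumptions (the patch never shows how connectors are instantiated); only the get_new_posts_by_search signature and behaviour come from the diff above.

    # Hypothetical usage sketch: constructor calls and category slugs
    # are assumed; the get_new_posts_by_search signature and the
    # pagination/error behaviour come from the patch.
    from server.connectors.boards_api import BoardsAPI
    from server.connectors.reddit_api import RedditAPI

    reddit = RedditAPI()  # assumed no-arg constructor

    # Search path: requests r/python/search.json with sort=new and
    # restrict_sr=on, following Reddit's 'after' cursor in batches of
    # at most 100 until post_limit posts are collected.
    matches = reddit.get_new_posts_by_search(search="asyncio",
                                             category="python",
                                             post_limit=250)

    # Listing path: with no search term the connector pages through
    # r/python/new.json instead.
    newest = reddit.get_new_posts_by_search(search=None,
                                            category="python",
                                            post_limit=50)

    boards = BoardsAPI()  # assumed no-arg constructor

    # The boards.ie connector now supports listings only: a search term
    # raises NotImplementedError, and comments are always fetched in
    # full now that comment_limit is gone from the interface.
    threads = boards.get_new_posts_by_search(search=None,
                                             category="motoring",  # example slug
                                             post_limit=20)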