fix(connectors): category / search fields breaking
Ideally, category and search would be fully optional; however, some sites break if one or the other is not provided. Unfortunately, `boards.ie` uses a different page type for searches, and I'm not bothered to implement a scraper from scratch for it. In addition, removed the comment limit options.
This commit is contained in:
@@ -17,7 +17,6 @@ class BaseConnector(ABC):
|
|||||||
def get_new_posts_by_search(self,
|
def get_new_posts_by_search(self,
|
||||||
search: str = None,
|
search: str = None,
|
||||||
category: str = None,
|
category: str = None,
|
||||||
post_limit: int = 10,
|
post_limit: int = 10
|
||||||
comment_limit: int = 10
|
|
||||||
) -> list[Post]:
|
) -> list[Post]:
|
||||||
...
|
...
|
||||||
@@ -25,22 +25,29 @@ class BoardsAPI(BaseConnector):
|
|||||||
def get_new_posts_by_search(self,
|
def get_new_posts_by_search(self,
|
||||||
search: str,
|
search: str,
|
||||||
category: str,
|
category: str,
|
||||||
post_limit: int,
|
post_limit: int
|
||||||
comment_limit: int
|
|
||||||
) -> list[Post]:
|
) -> list[Post]:
|
||||||
|
if search:
|
||||||
|
raise NotImplementedError("Search not compatible with boards.ie")
|
||||||
|
|
||||||
|
if category:
|
||||||
|
return self._get_posts(f"{self.url}/categories/{category}", post_limit)
|
||||||
|
else:
|
||||||
|
return self._get_posts(f"{self.url}/discussions", post_limit)
|
||||||
|
|
||||||
|
## Private
|
||||||
|
def _get_posts(self, url, limit) -> list[Post]:
|
||||||
urls = []
|
urls = []
|
||||||
current_page = 1
|
current_page = 1
|
||||||
|
|
||||||
logger.info(f"Fetching posts from category: {category}")
|
while len(urls) < limit:
|
||||||
|
url = f"{self.url}/p{current_page}"
|
||||||
while len(urls) < post_limit:
|
|
||||||
url = f"{self.url}/categories/{category}/p{current_page}"
|
|
||||||
html = self._fetch_page(url)
|
html = self._fetch_page(url)
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
|
||||||
logger.debug(f"Processing page {current_page} for category {category}")
|
logger.debug(f"Processing page {current_page} for link: {url}")
|
||||||
for a in soup.select("a.threadbit-threadlink"):
|
for a in soup.select("a.threadbit-threadlink"):
|
||||||
if len(urls) >= post_limit:
|
if len(urls) >= limit:
|
||||||
break
|
break
|
||||||
|
|
||||||
href = a.get("href")
|
href = a.get("href")
|
||||||
@@ -49,14 +56,14 @@ class BoardsAPI(BaseConnector):
|
|||||||
|
|
||||||
current_page += 1
|
current_page += 1
|
||||||
|
|
||||||
logger.debug(f"Fetched {len(urls)} post URLs from category {category}")
|
logger.debug(f"Fetched {len(urls)} post URLs")
|
||||||
|
|
||||||
# Fetch post details for each URL and create Post objects
|
# Fetch post details for each URL and create Post objects
|
||||||
posts = []
|
posts = []
|
||||||
|
|
||||||
def fetch_and_parse(post_url):
|
def fetch_and_parse(post_url):
|
||||||
html = self._fetch_page(post_url)
|
html = self._fetch_page(post_url)
|
||||||
post = self._parse_thread(html, post_url, comment_limit)
|
post = self._parse_thread(html, post_url)
|
||||||
return post
|
return post
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=30) as executor:
|
with ThreadPoolExecutor(max_workers=30) as executor:
|
||||||
@@ -79,7 +86,7 @@ class BoardsAPI(BaseConnector):
|
|||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
return response.text
|
return response.text
|
||||||
|
|
||||||
def _parse_thread(self, html: str, post_url: str, comment_limit: int) -> Post:
|
def _parse_thread(self, html: str, post_url: str) -> Post:
|
||||||
soup = BeautifulSoup(html, "html.parser")
|
soup = BeautifulSoup(html, "html.parser")
|
||||||
|
|
||||||
# Author
|
# Author
|
||||||
@@ -108,7 +115,7 @@ class BoardsAPI(BaseConnector):
|
|||||||
title = title_tag.text.strip() if title_tag else None
|
title = title_tag.text.strip() if title_tag else None
|
||||||
|
|
||||||
# Comments
|
# Comments
|
||||||
comments = self._parse_comments(post_url, post_num, comment_limit)
|
comments = self._parse_comments(post_url, post_num)
|
||||||
|
|
||||||
post = Post(
|
post = Post(
|
||||||
id=post_num,
|
id=post_num,
|
||||||
@@ -123,11 +130,11 @@ class BoardsAPI(BaseConnector):
|
|||||||
|
|
||||||
return post
|
return post
|
||||||
|
|
||||||
def _parse_comments(self, url: str, post_id: str, comment_limit: int) -> list[Comment]:
|
def _parse_comments(self, url: str, post_id: str) -> list[Comment]:
|
||||||
comments = []
|
comments = []
|
||||||
current_url = url
|
current_url = url
|
||||||
|
|
||||||
while current_url and len(comments) < comment_limit:
|
while current_url:
|
||||||
html = self._fetch_page(current_url)
|
html = self._fetch_page(current_url)
|
||||||
page_comments = self._parse_page_comments(html, post_id)
|
page_comments = self._parse_page_comments(html, post_id)
|
||||||
comments.extend(page_comments)
|
comments.extend(page_comments)
|
||||||
|
|||||||
@@ -20,39 +20,44 @@ class RedditAPI(BaseConnector):
|
|||||||
def get_new_posts_by_search(self,
|
def get_new_posts_by_search(self,
|
||||||
search: str,
|
search: str,
|
||||||
category: str,
|
category: str,
|
||||||
post_limit: int,
|
post_limit: int
|
||||||
comment_limit: int
|
|
||||||
) -> list[Post]:
|
) -> list[Post]:
|
||||||
|
|
||||||
if not search:
|
prefix = f"r/{category}/" if category else ""
|
||||||
return self._get_new_subreddit_posts(category, limit=post_limit)
|
params = {'limit': post_limit}
|
||||||
|
|
||||||
params = {
|
if search:
|
||||||
'q': search,
|
endpoint = f"{prefix}search.json"
|
||||||
'limit': post_limit,
|
params.update({
|
||||||
'restrict_sr': 'on',
|
'q': search,
|
||||||
'sort': 'new'
|
'sort': 'new',
|
||||||
}
|
'restrict_sr': 'on' if category else 'off'
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
endpoint = f"{prefix}new.json"
|
||||||
|
|
||||||
logger.info(f"Searching subreddit '{category}' for '{search}' with limit {post_limit}")
|
|
||||||
url = f"r/{category}/search.json"
|
|
||||||
posts = []
|
posts = []
|
||||||
|
after = None
|
||||||
|
|
||||||
while len(posts) < post_limit:
|
while len(posts) < post_limit:
|
||||||
batch_limit = min(100, post_limit - len(posts))
|
batch_limit = min(100, post_limit - len(posts))
|
||||||
params['limit'] = batch_limit
|
params['limit'] = batch_limit
|
||||||
|
if after:
|
||||||
|
params['after'] = after
|
||||||
|
|
||||||
data = self._fetch_post_overviews(url, params)
|
data = self._fetch_post_overviews(endpoint, params)
|
||||||
batch_posts = self._parse_posts(data)
|
|
||||||
|
|
||||||
logger.debug(f"Fetched {len(batch_posts)} posts from search in subreddit {category}")
|
if not data or 'data' not in data or not data['data'].get('children'):
|
||||||
|
|
||||||
if not batch_posts:
|
|
||||||
break
|
break
|
||||||
|
|
||||||
|
batch_posts = self._parse_posts(data)
|
||||||
posts.extend(batch_posts)
|
posts.extend(batch_posts)
|
||||||
|
|
||||||
return posts
|
after = data['data'].get('after')
|
||||||
|
if not after:
|
||||||
|
break
|
||||||
|
|
||||||
|
return posts[:post_limit]
|
||||||
|
|
||||||
def _get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]:
|
def _get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]:
|
||||||
posts = []
|
posts = []
|
||||||
|
|||||||
@@ -46,8 +46,7 @@ def fetch_and_process_dataset(self,
|
|||||||
raw_posts = connector.get_new_posts_by_search(
|
raw_posts = connector.get_new_posts_by_search(
|
||||||
search=search,
|
search=search,
|
||||||
category=category,
|
category=category,
|
||||||
post_limit=limit,
|
post_limit=limit
|
||||||
comment_limit=limit
|
|
||||||
)
|
)
|
||||||
posts.extend(post.to_dict() for post in raw_posts)
|
posts.extend(post.to_dict() for post in raw_posts)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user