2026-03-14 21:58:49 +00:00
4 changed files with 48 additions and 38 deletions
--- a/server/connectors/base.py
+++ b/server/connectors/base.py
@@ -17,7 +17,6 @@ class BaseConnector(ABC):
    def get_new_posts_by_search(self, 
                                search: str = None, 
                                category: str = None, 
-                                post_limit: int = 10, 
-                                comment_limit: int = 10
+                                post_limit: int = 10
                                ) -> list[Post]:
        ...
--- a/server/connectors/boards_api.py
+++ b/server/connectors/boards_api.py
@@ -25,22 +25,29 @@ class BoardsAPI(BaseConnector):
    def get_new_posts_by_search(self, 
                                search: str,
                                category: str, 
-                                post_limit: int, 
-                                comment_limit: int
+                                post_limit: int
                                )  -> list[Post]:
+        if search:
+            raise NotImplementedError("Search not compatible with boards.ie")
+        
+        if category:
+            return self._get_posts(f"{self.url}/categories/{category}", post_limit)
+        else:
+            return self._get_posts(f"{self.url}/discussions", post_limit)
+    
+    ## Private
+    def _get_posts(self, base_url, limit) -> list[Post]:  # base_url: listing root to paginate; limit: max thread URLs to collect
        urls = []
        current_page = 1

-        logger.info(f"Fetching posts from category: {category}")
-
-        while len(urls) < post_limit:
-            url = f"{self.url}/categories/{category}/p{current_page}"
+        while len(urls) < limit:
+            url = f"{base_url}/p{current_page}"  # page under the requested listing; self.url alone would drop the category path
            html = self._fetch_page(url)
            soup = BeautifulSoup(html, "html.parser")

-            logger.debug(f"Processing page {current_page} for category {category}")
+            logger.debug(f"Processing page {current_page} for link: {url}")
            for a in soup.select("a.threadbit-threadlink"):
-                if len(urls) >= post_limit:
+                if len(urls) >= limit:
                    break

                href = a.get("href")
@@ -49,14 +56,14 @@ class BoardsAPI(BaseConnector):
            
            current_page += 1

-        logger.debug(f"Fetched {len(urls)} post URLs from category {category}")
+        logger.debug(f"Fetched {len(urls)} post URLs")

        # Fetch post details for each URL and create Post objects
        posts = []

        def fetch_and_parse(post_url):
            html = self._fetch_page(post_url)
-            post = self._parse_thread(html, post_url, comment_limit)
+            post = self._parse_thread(html, post_url)
            return post

        with ThreadPoolExecutor(max_workers=30) as executor:
@@ -79,7 +86,7 @@ class BoardsAPI(BaseConnector):
        response.raise_for_status()
        return response.text

-    def _parse_thread(self, html: str, post_url: str, comment_limit: int) -> Post:
+    def _parse_thread(self, html: str, post_url: str) -> Post:
        soup = BeautifulSoup(html, "html.parser")
        
        # Author
@@ -108,7 +115,7 @@ class BoardsAPI(BaseConnector):
        title = title_tag.text.strip() if title_tag else None

        # Comments
-        comments = self._parse_comments(post_url, post_num, comment_limit)
+        comments = self._parse_comments(post_url, post_num)

        post = Post(
            id=post_num,
@@ -123,11 +130,11 @@ class BoardsAPI(BaseConnector):

        return post

-    def _parse_comments(self, url: str, post_id: str, comment_limit: int) -> list[Comment]:
+    def _parse_comments(self, url: str, post_id: str) -> list[Comment]:
        comments = []
        current_url = url

-        while current_url and len(comments) < comment_limit:
+        while current_url:
            html = self._fetch_page(current_url)
            page_comments = self._parse_page_comments(html, post_id)
            comments.extend(page_comments)
--- a/server/connectors/reddit_api.py
+++ b/server/connectors/reddit_api.py
@@ -20,39 +20,44 @@ class RedditAPI(BaseConnector):
    def get_new_posts_by_search(self, 
                                search: str, 
                                category: str, 
-                                post_limit: int,
-                                comment_limit: int
+                                post_limit: int
                                ) -> list[Post]:
        
-        if not search:
-            return self._get_new_subreddit_posts(category, limit=post_limit)
+        prefix = f"r/{category}/" if category else ""
+        params = {'limit': post_limit}

-        params = {
-            'q': search,
-            'limit': post_limit,
-            'restrict_sr': 'on',
-            'sort': 'new'
-        }
+        if search:
+            endpoint = f"{prefix}search.json"
+            params.update({
+                'q': search,
+                'sort': 'new',
+                'restrict_sr': 'on' if category else 'off' 
+            })
+        else:
+            endpoint = f"{prefix}new.json"

-        logger.info(f"Searching subreddit '{category}' for '{search}' with limit {post_limit}")
-        url = f"r/{category}/search.json"
        posts = []
-        
+        after = None
+
        while len(posts) < post_limit:
            batch_limit = min(100, post_limit - len(posts))
            params['limit'] = batch_limit
+            if after:
+                params['after'] = after

-            data = self._fetch_post_overviews(url, params)
-            batch_posts = self._parse_posts(data)
-
-            logger.debug(f"Fetched {len(batch_posts)} posts from search in subreddit {category}")
-
-            if not batch_posts:
+            data = self._fetch_post_overviews(endpoint, params)
+            
+            if not data or 'data' not in data or not data['data'].get('children'):
                break

+            batch_posts = self._parse_posts(data)
            posts.extend(batch_posts)

-        return posts
+            after = data['data'].get('after')
+            if not after:
+                break
+
+        return posts[:post_limit]
    
    def _get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]:
        posts = []
--- a/server/queue/tasks.py
+++ b/server/queue/tasks.py
@@ -46,8 +46,7 @@ def fetch_and_process_dataset(self,
            raw_posts = connector.get_new_posts_by_search(
                search=search,
                category=category,
-                post_limit=limit,
-                comment_limit=limit
+                post_limit=limit
            )
            posts.extend(post.to_dict() for post in raw_posts)