style: run python linter & prettifier on backend code

2026-03-25 19:34:43 +00:00
parent aae10c4d9d
commit 376773a0cc
17 changed files with 408 additions and 315 deletions

View File

@@ -1,10 +1,11 @@
from abc import ABC, abstractmethod
from dto.post import Post
class BaseConnector(ABC):
# Each subclass declares these at the class level
-source_name: str # machine-readable: "reddit", "youtube"
-display_name: str # human-readable: "Reddit", "YouTube"
+source_name: str  # machine-readable: "reddit", "youtube"
+display_name: str  # human-readable: "Reddit", "YouTube"
required_env: list[str] = [] # env vars needed to activate
search_enabled: bool
@@ -14,16 +15,13 @@ class BaseConnector(ABC):
def is_available(cls) -> bool:
"""Returns True if all required env vars are set."""
import os
return all(os.getenv(var) for var in cls.required_env)
@abstractmethod
-def get_new_posts_by_search(self,
-search: str = None,
-category: str = None,
-post_limit: int = 10
-) -> list[Post]:
-...
+def get_new_posts_by_search(
+self, search: str = None, category: str = None, post_limit: int = 10
+) -> list[Post]: ...
@abstractmethod
-def category_exists(self, category: str) -> bool:
-...
+def category_exists(self, category: str) -> bool: ...

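For context on the contract this file defines, here is a minimal, self-contained sketch of the connector ABC plus a dummy subclass showing how is_available() gates activation on env vars. The Post stub and DummyConnector are illustrative stand-ins, not part of this repo.

import os
from abc import ABC, abstractmethod


class Post:  # stand-in for dto.post.Post; the real fields may differ
    def __init__(self, id: str, title: str):
        self.id, self.title = id, title


class BaseConnector(ABC):
    source_name: str = ""
    display_name: str = ""
    required_env: list[str] = []

    @classmethod
    def is_available(cls) -> bool:
        """True only when every env var in required_env is set and non-empty."""
        return all(os.getenv(var) for var in cls.required_env)

    @abstractmethod
    def get_new_posts_by_search(
        self, search: str = None, category: str = None, post_limit: int = 10
    ) -> list[Post]: ...

    @abstractmethod
    def category_exists(self, category: str) -> bool: ...


class DummyConnector(BaseConnector):  # hypothetical, for demonstration only
    source_name = "dummy"
    display_name = "Dummy"
    required_env = ["DUMMY_TOKEN"]

    def get_new_posts_by_search(self, search=None, category=None, post_limit=10):
        return [Post(id="1", title=search or "latest")][:post_limit]

    def category_exists(self, category: str) -> bool:
        return False


print(DummyConnector.is_available())  # False until DUMMY_TOKEN is set
os.environ["DUMMY_TOKEN"] = "x"
print(DummyConnector.is_available())  # True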
View File

@@ -11,9 +11,8 @@ from server.connectors.base import BaseConnector
logger = logging.getLogger(__name__)
-HEADERS = {
-"User-Agent": "Mozilla/5.0 (compatible; ForumScraper/1.0)"
-}
+HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; ForumScraper/1.0)"}
class BoardsAPI(BaseConnector):
source_name: str = "boards.ie"
@@ -25,19 +24,17 @@ class BoardsAPI(BaseConnector):
def __init__(self):
self.base_url = "https://www.boards.ie"
-def get_new_posts_by_search(self,
-search: str,
-category: str,
-post_limit: int
-) -> list[Post]:
+def get_new_posts_by_search(
+self, search: str, category: str, post_limit: int
+) -> list[Post]:
if search:
raise NotImplementedError("Search not compatible with boards.ie")
if category:
return self._get_posts(f"{self.base_url}/categories/{category}", post_limit)
else:
return self._get_posts(f"{self.base_url}/discussions", post_limit)
def category_exists(self, category: str) -> bool:
if not category:
return False
@@ -59,7 +56,7 @@ class BoardsAPI(BaseConnector):
except requests.RequestException as e:
logger.error(f"Error checking category '{category}': {e}")
return False
## Private
def _get_posts(self, url, limit) -> list[Post]:
urls = []
@@ -78,7 +75,7 @@ class BoardsAPI(BaseConnector):
href = a.get("href")
if href:
urls.append(href)
current_page += 1
logger.debug(f"Fetched {len(urls)} post URLs")
@@ -96,7 +93,9 @@ class BoardsAPI(BaseConnector):
for i, future in enumerate(as_completed(futures)):
post_url = futures[future]
logger.debug(f"Fetching Post {i + 1} / {len(urls)} details from URL: {post_url}")
logger.debug(
f"Fetching Post {i + 1} / {len(urls)} details from URL: {post_url}"
)
try:
post = future.result()
posts.append(post)
@@ -105,7 +104,6 @@ class BoardsAPI(BaseConnector):
return posts
def _fetch_page(self, url: str) -> str:
response = requests.get(url, headers=HEADERS)
response.raise_for_status()
@@ -113,7 +111,7 @@ class BoardsAPI(BaseConnector):
def _parse_thread(self, html: str, post_url: str) -> Post:
soup = BeautifulSoup(html, "html.parser")
# Author
author_tag = soup.select_one(".userinfo-username-title")
author = author_tag.text.strip() if author_tag else None
@@ -122,10 +120,16 @@ class BoardsAPI(BaseConnector):
timestamp_tag = soup.select_one(".postbit-header")
timestamp = None
if timestamp_tag:
-match = re.search(r"\d{2}-\d{2}-\d{4}\s+\d{2}:\d{2}[AP]M", timestamp_tag.get_text())
+match = re.search(
+r"\d{2}-\d{2}-\d{4}\s+\d{2}:\d{2}[AP]M", timestamp_tag.get_text()
+)
timestamp = match.group(0) if match else None
# convert to unix epoch
-timestamp = datetime.datetime.strptime(timestamp, "%d-%m-%Y %I:%M%p").timestamp() if timestamp else None
+timestamp = (
+datetime.datetime.strptime(timestamp, "%d-%m-%Y %I:%M%p").timestamp()
+if timestamp
+else None
+)
# Post ID
post_num = re.search(r"discussion/(\d+)", post_url)
@@ -133,7 +137,9 @@ class BoardsAPI(BaseConnector):
# Content
content_tag = soup.select_one(".Message.userContent")
-content = content_tag.get_text(separator="\n", strip=True) if content_tag else None
+content = (
+content_tag.get_text(separator="\n", strip=True) if content_tag else None
+)
# Title
title_tag = soup.select_one(".PageTitle h1")
@@ -150,7 +156,7 @@ class BoardsAPI(BaseConnector):
url=post_url,
timestamp=timestamp,
source=self.source_name,
-comments=comments
+comments=comments,
)
return post
@@ -168,9 +174,9 @@ class BoardsAPI(BaseConnector):
soup = BeautifulSoup(html, "html.parser")
next_link = soup.find("a", class_="Next")
-if next_link and next_link.get('href'):
-href = next_link.get('href')
-current_url = href if href.startswith('http') else url + href
+if next_link and next_link.get("href"):
+href = next_link.get("href")
+current_url = href if href.startswith("http") else url + href
else:
current_url = None
@@ -186,21 +192,29 @@ class BoardsAPI(BaseConnector):
comment_id = tag.get("id")
# Author
-user_elem = tag.find('span', class_='userinfo-username-title')
+user_elem = tag.find("span", class_="userinfo-username-title")
username = user_elem.get_text(strip=True) if user_elem else None
# Timestamp
-date_elem = tag.find('span', class_='DateCreated')
+date_elem = tag.find("span", class_="DateCreated")
timestamp = date_elem.get_text(strip=True) if date_elem else None
-timestamp = datetime.datetime.strptime(timestamp, "%d-%m-%Y %I:%M%p").timestamp() if timestamp else None
+timestamp = (
+datetime.datetime.strptime(timestamp, "%d-%m-%Y %I:%M%p").timestamp()
+if timestamp
+else None
+)
# Content
-message_div = tag.find('div', class_='Message userContent')
+message_div = tag.find("div", class_="Message userContent")
if message_div.blockquote:
message_div.blockquote.decompose()
-content = message_div.get_text(separator="\n", strip=True) if message_div else None
+content = (
+message_div.get_text(separator="\n", strip=True)
+if message_div
+else None
+)
comment = Comment(
id=comment_id,
@@ -209,10 +223,8 @@ class BoardsAPI(BaseConnector):
content=content,
timestamp=timestamp,
reply_to=None,
-source=self.source_name
+source=self.source_name,
)
comments.append(comment)
return comments

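One detail worth seeing in isolation is the timestamp handling that the formatter re-wrapped above: a regex pulls a DD-MM-YYYY HH:MMAM/PM string out of the post header, then strptime converts it to a unix epoch. A standalone sketch of that conversion, with the header text made up for illustration:

import datetime
import re

# Made-up header text; the real .postbit-header markup may differ.
header_text = "Registered User 25-03-2026 07:15PM"

match = re.search(r"\d{2}-\d{2}-\d{4}\s+\d{2}:\d{2}[AP]M", header_text)
timestamp = match.group(0) if match else None

# Same conversion as the connector: formatted string -> unix epoch (or None).
epoch = (
    datetime.datetime.strptime(timestamp, "%d-%m-%Y %I:%M%p").timestamp()
    if timestamp
    else None
)
print(epoch)  # epoch seconds; exact value depends on the local timezone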
View File

@@ -9,6 +9,7 @@ from server.connectors.base import BaseConnector
logger = logging.getLogger(__name__)
class RedditAPI(BaseConnector):
source_name: str = "reddit"
display_name: str = "Reddit"
@@ -19,22 +20,18 @@ class RedditAPI(BaseConnector):
self.url = "https://www.reddit.com/"
# Public Methods #
-def get_new_posts_by_search(self,
-search: str,
-category: str,
-post_limit: int
-) -> list[Post]:
+def get_new_posts_by_search(
+self, search: str, category: str, post_limit: int
+) -> list[Post]:
prefix = f"r/{category}/" if category else ""
-params = {'limit': post_limit}
+params = {"limit": post_limit}
if search:
endpoint = f"{prefix}search.json"
-params.update({
-'q': search,
-'sort': 'new',
-'restrict_sr': 'on' if category else 'off'
-})
+params.update(
+{"q": search, "sort": "new", "restrict_sr": "on" if category else "off"}
+)
else:
endpoint = f"{prefix}new.json"
@@ -43,24 +40,24 @@ class RedditAPI(BaseConnector):
while len(posts) < post_limit:
batch_limit = min(100, post_limit - len(posts))
-params['limit'] = batch_limit
+params["limit"] = batch_limit
if after:
-params['after'] = after
+params["after"] = after
data = self._fetch_post_overviews(endpoint, params)
-if not data or 'data' not in data or not data['data'].get('children'):
+if not data or "data" not in data or not data["data"].get("children"):
break
batch_posts = self._parse_posts(data)
posts.extend(batch_posts)
-after = data['data'].get('after')
+after = data["data"].get("after")
if not after:
break
return posts[:post_limit]
def _get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]:
posts = []
after = None
@@ -70,37 +67,36 @@ class RedditAPI(BaseConnector):
while len(posts) < limit:
batch_limit = min(100, limit - len(posts))
-params = {
-'limit': batch_limit,
-'after': after
-}
+params = {"limit": batch_limit, "after": after}
data = self._fetch_post_overviews(url, params)
batch_posts = self._parse_posts(data)
logger.debug(f"Fetched {len(batch_posts)} new posts from subreddit {subreddit}")
logger.debug(
f"Fetched {len(batch_posts)} new posts from subreddit {subreddit}"
)
if not batch_posts:
break
posts.extend(batch_posts)
-after = data['data'].get('after')
+after = data["data"].get("after")
if not after:
break
return posts
def get_user(self, username: str) -> User:
data = self._fetch_post_overviews(f"user/{username}/about.json", {})
return self._parse_user(data)
def category_exists(self, category: str) -> bool:
try:
data = self._fetch_post_overviews(f"r/{category}/about.json", {})
return (
data is not None
-and 'data' in data
-and data['data'].get('id') is not None
+and "data" in data
+and data["data"].get("id") is not None
)
except Exception:
return False
@@ -109,25 +105,26 @@ class RedditAPI(BaseConnector):
def _parse_posts(self, data) -> list[Post]:
posts = []
-total_num_posts = len(data['data']['children'])
+total_num_posts = len(data["data"]["children"])
current_index = 0
-for item in data['data']['children']:
+for item in data["data"]["children"]:
current_index += 1
logger.debug(f"Parsing post {current_index} of {total_num_posts}")
-post_data = item['data']
+post_data = item["data"]
post = Post(
-id=post_data['id'],
-author=post_data['author'],
-title=post_data['title'],
-content=post_data.get('selftext', ''),
-url=post_data['url'],
-timestamp=post_data['created_utc'],
+id=post_data["id"],
+author=post_data["author"],
+title=post_data["title"],
+content=post_data.get("selftext", ""),
+url=post_data["url"],
+timestamp=post_data["created_utc"],
source=self.source_name,
-comments=self._get_post_comments(post_data['id']))
-post.subreddit = post_data['subreddit']
-post.upvotes = post_data['ups']
+comments=self._get_post_comments(post_data["id"]),
+)
+post.subreddit = post_data["subreddit"]
+post.upvotes = post_data["ups"]
posts.append(post)
return posts
@@ -140,56 +137,62 @@ class RedditAPI(BaseConnector):
if len(data) < 2:
return comments
-comment_data = data[1]['data']['children']
+comment_data = data[1]["data"]["children"]
def _parse_comment_tree(items, parent_id=None):
for item in items:
-if item['kind'] != 't1':
+if item["kind"] != "t1":
continue
-comment_info = item['data']
+comment_info = item["data"]
comment = Comment(
-id=comment_info['id'],
+id=comment_info["id"],
post_id=post_id,
-author=comment_info['author'],
-content=comment_info.get('body', ''),
-timestamp=comment_info['created_utc'],
-reply_to=parent_id or comment_info.get('parent_id', None),
-source=self.source_name
+author=comment_info["author"],
+content=comment_info.get("body", ""),
+timestamp=comment_info["created_utc"],
+reply_to=parent_id or comment_info.get("parent_id", None),
+source=self.source_name,
)
comments.append(comment)
# Process replies recursively
-replies = comment_info.get('replies')
+replies = comment_info.get("replies")
if replies and isinstance(replies, dict):
-reply_items = replies.get('data', {}).get('children', [])
+reply_items = replies.get("data", {}).get("children", [])
_parse_comment_tree(reply_items, parent_id=comment.id)
_parse_comment_tree(comment_data)
return comments
def _parse_user(self, data) -> User:
-user_data = data['data']
-user = User(
-username=user_data['name'],
-created_utc=user_data['created_utc'])
-user.karma = user_data['total_karma']
+user_data = data["data"]
+user = User(username=user_data["name"], created_utc=user_data["created_utc"])
+user.karma = user_data["total_karma"]
return user
def _fetch_post_overviews(self, endpoint: str, params: dict) -> dict:
url = f"{self.url}{endpoint}"
max_retries = 15
-backoff = 1 # seconds
+backoff = 1  # seconds
for attempt in range(max_retries):
try:
-response = requests.get(url, headers={'User-agent': 'python:ethnography-college-project:0.1 (by /u/ThisBirchWood)'}, params=params)
+response = requests.get(
+url,
+headers={
+"User-agent": "python:ethnography-college-project:0.1 (by /u/ThisBirchWood)"
+},
+params=params,
+)
if response.status_code == 429:
wait_time = response.headers.get("Retry-After", backoff)
logger.warning(f"Rate limited by Reddit API. Retrying in {wait_time} seconds...")
logger.warning(
f"Rate limited by Reddit API. Retrying in {wait_time} seconds..."
)
time.sleep(wait_time)
backoff *= 2
@@ -205,4 +208,4 @@ class RedditAPI(BaseConnector):
return response.json()
except requests.RequestException as e:
print(f"Error fetching data from Reddit API: {e}")
-return {}
\ No newline at end of file
+return {}

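The retry loop reformatted above is the heart of _fetch_post_overviews: retry on HTTP 429, honouring Retry-After with exponential backoff as the fallback. A self-contained sketch of that pattern follows; the retry count and user agent are placeholders, and note the sketch casts Retry-After to a number before sleeping, which the committed code leaves as a string.

import time

import requests

MAX_RETRIES = 5  # the connector uses 15


def fetch_json(url: str, params: dict) -> dict:
    """Fetch JSON with retry-on-429 and exponential backoff."""
    backoff = 1  # seconds
    for _ in range(MAX_RETRIES):
        try:
            response = requests.get(
                url, headers={"User-agent": "sketch/0.1"}, params=params
            )
            if response.status_code == 429:
                # Retry-After arrives as a string header; cast it before sleeping.
                wait_time = float(response.headers.get("Retry-After", backoff))
                time.sleep(wait_time)
                backoff *= 2
                continue
            response.raise_for_status()
            return response.json()
        except requests.RequestException:
            return {}
    return {}


# fetch_json("https://www.reddit.com/r/python/new.json", {"limit": 5})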
View File

@@ -3,6 +3,7 @@ import importlib
import server.connectors
from server.connectors.base import BaseConnector
def _discover_connectors() -> list[type[BaseConnector]]:
"""Walk the connectors package and collect all BaseConnector subclasses."""
for _, module_name, _ in pkgutil.iter_modules(server.connectors.__path__):
@@ -11,20 +12,24 @@ def _discover_connectors() -> list[type[BaseConnector]]:
importlib.import_module(f"server.connectors.{module_name}")
return [
-cls for cls in BaseConnector.__subclasses__()
+cls
+for cls in BaseConnector.__subclasses__()
if cls.source_name # guard against abstract intermediaries
]
def get_available_connectors() -> dict[str, type[BaseConnector]]:
return {c.source_name: c for c in _discover_connectors() if c.is_available()}
def get_connector_metadata() -> dict[str, dict]:
res = {}
for id, obj in get_available_connectors().items():
res[id] = {"id": id,
"label": obj.display_name,
"search_enabled": obj.search_enabled,
"categories_enabled": obj.categories_enabled
}
res[id] = {
"id": id,
"label": obj.display_name,
"search_enabled": obj.search_enabled,
"categories_enabled": obj.categories_enabled,
}
-return res
\ No newline at end of file
+return res

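The discovery code above walks server.connectors, imports each module, and then reads BaseConnector.__subclasses__(). A compressed sketch of the same pattern, with an inline subclass standing in for the package walk; all names here are illustrative.

class BaseConnector:  # trimmed stand-in for server.connectors.base
    source_name = ""
    display_name = ""
    search_enabled = False
    categories_enabled = False

    @classmethod
    def is_available(cls) -> bool:
        return True


class FakeConnector(BaseConnector):  # hypothetical connector for the demo
    source_name = "fake"
    display_name = "Fake"
    search_enabled = True


def get_connector_metadata() -> dict[str, dict]:
    # Same shape the registry returns: one metadata dict per available connector.
    connectors = {
        c.source_name: c
        for c in BaseConnector.__subclasses__()
        if c.source_name and c.is_available()
    }
    return {
        id: {
            "id": id,
            "label": obj.display_name,
            "search_enabled": obj.search_enabled,
            "categories_enabled": obj.categories_enabled,
        }
        for id, obj in connectors.items()
    }


print(get_connector_metadata())
# {'fake': {'id': 'fake', 'label': 'Fake', 'search_enabled': True, 'categories_enabled': False}}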
View File

@@ -12,6 +12,7 @@ load_dotenv()
API_KEY = os.getenv("YOUTUBE_API_KEY")
class YouTubeAPI(BaseConnector):
source_name: str = "youtube"
display_name: str = "YouTube"
@@ -19,73 +20,72 @@ class YouTubeAPI(BaseConnector):
categories_enabled: bool = False
def __init__(self):
-self.youtube = build('youtube', 'v3', developerKey=API_KEY)
+self.youtube = build("youtube", "v3", developerKey=API_KEY)
-def get_new_posts_by_search(self,
-search: str,
-category: str,
-post_limit: int
-) -> list[Post]:
-videos = self._search_videos(search, post_limit)
-posts = []
+def get_new_posts_by_search(
+self, search: str, category: str, post_limit: int
+) -> list[Post]:
+videos = self._search_videos(search, post_limit)
+posts = []
-for video in videos:
-video_id = video['id']['videoId']
-snippet = video['snippet']
-title = snippet['title']
-description = snippet['description']
-published_at = datetime.datetime.strptime(snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp()
-channel_title = snippet['channelTitle']
+for video in videos:
+video_id = video["id"]["videoId"]
+snippet = video["snippet"]
+title = snippet["title"]
+description = snippet["description"]
+published_at = datetime.datetime.strptime(
+snippet["publishedAt"], "%Y-%m-%dT%H:%M:%SZ"
+).timestamp()
+channel_title = snippet["channelTitle"]
-comments = []
-comments_data = self._get_video_comments(video_id)
-for comment_thread in comments_data:
-comment_snippet = comment_thread['snippet']['topLevelComment']['snippet']
-comment = Comment(
-id=comment_thread['id'],
-post_id=video_id,
-content=comment_snippet['textDisplay'],
-author=comment_snippet['authorDisplayName'],
-timestamp=datetime.datetime.strptime(comment_snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp(),
-reply_to=None,
-source=self.source_name
-)
-comments.append(comment)
-post = Post(
-id=video_id,
-content=f"{title}\n\n{description}",
-author=channel_title,
-timestamp=published_at,
-url=f"https://www.youtube.com/watch?v={video_id}",
-title=title,
-source=self.source_name,
-comments=comments
-)
-posts.append(post)
-return posts
+comments = []
+comments_data = self._get_video_comments(video_id)
+for comment_thread in comments_data:
+comment_snippet = comment_thread["snippet"]["topLevelComment"][
+"snippet"
+]
+comment = Comment(
+id=comment_thread["id"],
+post_id=video_id,
+content=comment_snippet["textDisplay"],
+author=comment_snippet["authorDisplayName"],
+timestamp=datetime.datetime.strptime(
+comment_snippet["publishedAt"], "%Y-%m-%dT%H:%M:%SZ"
+).timestamp(),
+reply_to=None,
+source=self.source_name,
+)
+comments.append(comment)
+post = Post(
+id=video_id,
+content=f"{title}\n\n{description}",
+author=channel_title,
+timestamp=published_at,
+url=f"https://www.youtube.com/watch?v={video_id}",
+title=title,
+source=self.source_name,
+comments=comments,
+)
+posts.append(post)
+return posts
def category_exists(self, category):
return True
def _search_videos(self, query, limit):
request = self.youtube.search().list(
-q=query,
-part='snippet',
-type='video',
-maxResults=limit
+q=query, part="snippet", type="video", maxResults=limit
)
response = request.execute()
-return response.get('items', [])
+return response.get("items", [])
def _get_video_comments(self, video_id):
request = self.youtube.commentThreads().list(
-part='snippet',
-videoId=video_id,
-textFormat='plainText'
+part="snippet", videoId=video_id, textFormat="plainText"
)
try:
@@ -93,4 +93,4 @@ class YouTubeAPI(BaseConnector):
except HttpError as e:
print(f"Error fetching comments for video {video_id}: {e}")
return []
-return response.get('items', [])
+return response.get("items", [])