Merge remote-tracking branch 'origin/main' into feat/corpus-explorer

This commit is contained in:
2026-04-10 13:19:17 +01:00
14 changed files with 881 additions and 38 deletions

View File

@@ -67,6 +67,12 @@ class CulturalAnalysis:
def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
s = df[self.content_col].fillna("").astype(str)
emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
emotion_cols = [
c
for c in df.columns
if c.startswith("emotion_") and c not in emotion_exclusions
]
hedge_pattern = re.compile(
r"\b(maybe|perhaps|possibly|probably|likely|seems|seem|i think|i feel|i guess|kind of|sort of|somewhat)\b"
@@ -88,7 +94,7 @@ class CulturalAnalysis:
0, 1
)
return {
result = {
"hedge_total": int(hedge_counts.sum()),
"certainty_total": int(certainty_counts.sum()),
"deontic_total": int(deontic_counts.sum()),
@@ -107,6 +113,32 @@ class CulturalAnalysis:
),
}
if emotion_cols:
emo = df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
result["hedge_emotion_avg"] = (
emo.loc[hedge_counts > 0].mean()
if (hedge_counts > 0).any()
else pd.Series(0.0, index=emotion_cols)
).to_dict()
result["certainty_emotion_avg"] = (
emo.loc[certainty_counts > 0].mean()
if (certainty_counts > 0).any()
else pd.Series(0.0, index=emotion_cols)
).to_dict()
result["deontic_emotion_avg"] = (
emo.loc[deontic_counts > 0].mean()
if (deontic_counts > 0).any()
else pd.Series(0.0, index=emotion_cols)
).to_dict()
result["permission_emotion_avg"] = (
emo.loc[perm_counts > 0].mean()
if (perm_counts > 0).any()
else pd.Series(0.0, index=emotion_cols)
).to_dict()
return result
def get_avg_emotions_per_entity(
self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10
) -> dict[str, Any]:

View File

@@ -71,6 +71,7 @@ class UserAnalysis:
per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
dominant_topic_by_author = {}
avg_emotions_by_author = {}
if emotion_cols:
@@ -80,6 +81,31 @@ class UserAnalysis:
for author, row in avg_emotions.iterrows()
}
if "topic" in df.columns:
topic_df = df[
df["topic"].notna()
& (df["topic"] != "")
& (df["topic"] != "Misc")
]
if not topic_df.empty:
topic_counts = (
topic_df.groupby(["author", "topic"])
.size()
.reset_index(name="count")
.sort_values(
["author", "count", "topic"],
ascending=[True, False, True],
)
.drop_duplicates(subset=["author"])
)
dominant_topic_by_author = {
row["author"]: {
"topic": row["topic"],
"count": int(row["count"]),
}
for _, row in topic_counts.iterrows()
}
# ensure columns always exist
for col in ("post", "comment"):
if col not in per_user.columns:
@@ -109,6 +135,7 @@ class UserAnalysis:
"comment_post_ratio": float(row.get("comment_post_ratio", 0)),
"comment_share": float(row.get("comment_share", 0)),
"avg_emotions": avg_emotions_by_author.get(author, {}),
"dominant_topic": dominant_topic_by_author.get(author),
"vocab": vocab_by_author.get(
author,
{

View File

@@ -11,8 +11,7 @@ from server.connectors.base import BaseConnector
logger = logging.getLogger(__name__)
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; ForumFetcher/1.0)"}
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; Digital-Ethnography-Aid/1.0)"}
class BoardsAPI(BaseConnector):
source_name: str = "boards.ie"
@@ -88,7 +87,7 @@ class BoardsAPI(BaseConnector):
post = self._parse_thread(html, post_url)
return post
with ThreadPoolExecutor(max_workers=30) as executor:
with ThreadPoolExecutor(max_workers=10) as executor:
futures = {executor.submit(fetch_and_parse, url): url for url in urls}
for i, future in enumerate(as_completed(futures)):

View File

@@ -1,6 +1,10 @@
import requests
import logging
import time
import os
from dotenv import load_dotenv
from requests.auth import HTTPBasicAuth
from dto.post import Post
from dto.user import User
@@ -9,6 +13,8 @@ from server.connectors.base import BaseConnector
logger = logging.getLogger(__name__)
CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET")
class RedditAPI(BaseConnector):
source_name: str = "reddit"
@@ -18,6 +24,8 @@ class RedditAPI(BaseConnector):
def __init__(self):
self.url = "https://www.reddit.com/"
self.token = None
self.token_expiry = 0
# Public Methods #
def get_new_posts_by_search(
@@ -171,9 +179,44 @@ class RedditAPI(BaseConnector):
user = User(username=user_data["name"], created_utc=user_data["created_utc"])
user.karma = user_data["total_karma"]
return user
def _get_token(self):
    """Return a valid OAuth2 bearer token for the Reddit API.

    Reuses the token cached on the instance while it is still valid;
    otherwise requests a fresh one via the client-credentials grant and
    caches it together with its expiry time.

    Returns:
        str: an access token usable in an ``Authorization: Bearer`` header.

    Raises:
        RuntimeError: if the Reddit API credentials are not configured.
        requests.HTTPError: if the token endpoint returns an error status.
    """
    # Fast path: cached token that has not yet expired.
    if self.token and time.time() < self.token_expiry:
        return self.token

    # Fail early with a clear message; without this, missing env vars
    # surface only as an opaque 401 from the token endpoint.
    if not CLIENT_ID or not CLIENT_SECRET:
        raise RuntimeError(
            "REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET must be set"
        )

    logger.info("Fetching new Reddit access token...")
    response = requests.post(
        "https://www.reddit.com/api/v1/access_token",
        auth=HTTPBasicAuth(CLIENT_ID, CLIENT_SECRET),
        data={"grant_type": "client_credentials"},
        headers={
            "User-Agent": "python:ethnography-college-project:0.1 (by /u/ThisBirchWood)"
        },
    )
    response.raise_for_status()
    token_json = response.json()
    self.token = token_json["access_token"]
    # Renew 60 s before the server-side expiry so a request started just
    # before the deadline never carries a stale token.
    self.token_expiry = time.time() + token_json["expires_in"] - 60
    # Lazy %-formatting: the message is only built if INFO is enabled.
    logger.info(
        "Obtained new Reddit access token (expires in %ss)",
        token_json["expires_in"],
    )
    return self.token
def _fetch_post_overviews(self, endpoint: str, params: dict) -> dict:
url = f"{self.url}{endpoint}"
url = f"https://oauth.reddit.com/{endpoint.lstrip('/')}"
max_retries = 15
backoff = 1 # seconds
@@ -182,13 +225,14 @@ class RedditAPI(BaseConnector):
response = requests.get(
url,
headers={
"User-agent": "python:ethnography-college-project:0.1 (by /u/ThisBirchWood)"
"User-agent": "python:ethnography-college-project:0.1 (by /u/ThisBirchWood)",
"Authorization": f"Bearer {self._get_token()}",
},
params=params,
)
if response.status_code == 429:
wait_time = response.headers.get("Retry-After", backoff)
wait_time = response.headers.get("X-Ratelimit-Reset", backoff)
logger.warning(
f"Rate limited by Reddit API. Retrying in {wait_time} seconds..."