Merge remote-tracking branch 'origin/main' into feat/corpus-explorer

2026-04-10 13:19:17 +01:00
parent 37d08c63b8 99afe82464
commit 4dd2721e98
14 changed files with 881 additions and 38 deletions
--- a/server/analysis/cultural.py
+++ b/server/analysis/cultural.py
@@ -67,6 +67,12 @@ class CulturalAnalysis:

    def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
        s = df[self.content_col].fillna("").astype(str)
+        emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
+        emotion_cols = [
+            c
+            for c in df.columns
+            if c.startswith("emotion_") and c not in emotion_exclusions
+        ]

        hedge_pattern = re.compile(
            r"\b(maybe|perhaps|possibly|probably|likely|seems|seem|i think|i feel|i guess|kind of|sort of|somewhat)\b"
@@ -88,7 +94,7 @@ class CulturalAnalysis:
            0, 1
        )

-        return {
+        result = {
            "hedge_total": int(hedge_counts.sum()),
            "certainty_total": int(certainty_counts.sum()),
            "deontic_total": int(deontic_counts.sum()),
@@ -107,6 +113,32 @@ class CulturalAnalysis:
            ),
        }

+        if emotion_cols:
+            emo = df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
+
+            result["hedge_emotion_avg"] = (
+                emo.loc[hedge_counts > 0].mean()
+                if (hedge_counts > 0).any()
+                else pd.Series(0.0, index=emotion_cols)
+            ).to_dict()
+            result["certainty_emotion_avg"] = (
+                emo.loc[certainty_counts > 0].mean()
+                if (certainty_counts > 0).any()
+                else pd.Series(0.0, index=emotion_cols)
+            ).to_dict()
+            result["deontic_emotion_avg"] = (
+                emo.loc[deontic_counts > 0].mean()
+                if (deontic_counts > 0).any()
+                else pd.Series(0.0, index=emotion_cols)
+            ).to_dict()
+            result["permission_emotion_avg"] = (
+                emo.loc[perm_counts > 0].mean()
+                if (perm_counts > 0).any()
+                else pd.Series(0.0, index=emotion_cols)
+            ).to_dict()
+
+        return result
+
    def get_avg_emotions_per_entity(
        self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10
    ) -> dict[str, Any]:
--- a/server/analysis/user.py
+++ b/server/analysis/user.py
@@ -71,6 +71,7 @@ class UserAnalysis:
        per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)

        emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
+        dominant_topic_by_author = {}

        avg_emotions_by_author = {}
        if emotion_cols:
@@ -80,6 +81,31 @@ class UserAnalysis:
                for author, row in avg_emotions.iterrows()
            }

+        if "topic" in df.columns:
+            topic_df = df[
+                df["topic"].notna()
+                & (df["topic"] != "")
+                & (df["topic"] != "Misc")
+            ]
+            if not topic_df.empty:
+                topic_counts = (
+                    topic_df.groupby(["author", "topic"])
+                    .size()
+                    .reset_index(name="count")
+                    .sort_values(
+                        ["author", "count", "topic"],
+                        ascending=[True, False, True],
+                    )
+                    .drop_duplicates(subset=["author"])
+                )
+                dominant_topic_by_author = {
+                    row["author"]: {
+                        "topic": row["topic"],
+                        "count": int(row["count"]),
+                    }
+                    for _, row in topic_counts.iterrows()
+                }
+
        # ensure columns always exist
        for col in ("post", "comment"):
            if col not in per_user.columns:
@@ -109,6 +135,7 @@ class UserAnalysis:
                    "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
                    "comment_share": float(row.get("comment_share", 0)),
                    "avg_emotions": avg_emotions_by_author.get(author, {}),
+                    "dominant_topic": dominant_topic_by_author.get(author),
                    "vocab": vocab_by_author.get(
                        author,
                        {