refactor(stats): move user stats out of interactional into users

2026-03-17 12:23:03 +00:00
parent 8a13444b16
commit 9093059d05
3 changed files with 65 additions and 82 deletions
--- a/server/analysis/user.py
+++ b/server/analysis/user.py
@@ -1,20 +1,70 @@
 import pandas as pd

-from server.analysis.interactional import InteractionAnalysis
-

 class UserAnalysis:
-    def __init__(self, interaction_analysis: InteractionAnalysis):
-        self.interaction_analysis = interaction_analysis
-
    def top_users(self, df: pd.DataFrame) -> list:
-        return self.interaction_analysis.top_users(df)
+        counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)

-    def users(self, df: pd.DataFrame) -> dict | list:
-        return self.interaction_analysis.per_user_analysis(df)
+        top_users = [
+            {"author": author, "source": source, "count": int(count)}
+            for (author, source), count in counts.items()
+        ]

-    def user(self, df: pd.DataFrame) -> dict:
-        return {
-            "top_users": self.top_users(df),
-            "users": self.users(df),
-        }
+        return top_users
+
+    def per_user_analysis(self, df: pd.DataFrame) -> dict:
+        per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
+
+        emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
+
+        avg_emotions_by_author = {}
+        if emotion_cols:
+            avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
+            avg_emotions_by_author = {
+                author: {emotion: float(score) for emotion, score in row.items()}
+                for author, row in avg_emotions.iterrows()
+            }
+
+        # ensure columns always exist
+        for col in ("post", "comment"):
+            if col not in per_user.columns:
+                per_user[col] = 0
+
+        per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(
+            0, 1
+        )
+        per_user["comment_share"] = per_user["comment"] / (
+            per_user["post"] + per_user["comment"]
+        ).replace(0, 1)
+        per_user = per_user.sort_values("comment_post_ratio", ascending=True)
+        per_user_records = per_user.reset_index().to_dict(orient="records")
+
+        vocab_rows = self._vocab_richness_per_user(df)
+        vocab_by_author = {row["author"]: row for row in vocab_rows}
+
+        # merge vocab richness + per_user information
+        merged_users = []
+        for row in per_user_records:
+            author = row["author"]
+            merged_users.append(
+                {
+                    "author": author,
+                    "post": int(row.get("post", 0)),
+                    "comment": int(row.get("comment", 0)),
+                    "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
+                    "comment_share": float(row.get("comment_share", 0)),
+                    "avg_emotions": avg_emotions_by_author.get(author, {}),
+                    "vocab": vocab_by_author.get(
+                        author,
+                        {
+                            "vocab_richness": 0,
+                            "avg_words_per_event": 0,
+                            "top_words": [],
+                        },
+                    ),
+                }
+            )
+
+        merged_users.sort(key=lambda u: u["comment_post_ratio"])
+
+        return merged_users