diff --git a/server/analysis/interactional.py b/server/analysis/interactional.py index 4ed4950..6fd5b3f 100644 --- a/server/analysis/interactional.py +++ b/server/analysis/interactional.py @@ -3,6 +3,7 @@ import re from collections import Counter + class InteractionAnalysis: def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): self.df = df @@ -12,7 +13,9 @@ class InteractionAnalysis: tokens = re.findall(r"\b[a-z]{3,}\b", text) return [t for t in tokens if t not in self.word_exclusions] - def _vocab_richness_per_user(self, min_words: int = 20, top_most_used_words: int = 100) -> list: + def _vocab_richness_per_user( + self, min_words: int = 20, top_most_used_words: int = 100 + ) -> list: df = self.df.copy() df["content"] = df["content"].fillna("").astype(str).str.lower() df["tokens"] = df["content"].apply(self._tokenize) @@ -39,15 +42,17 @@ class InteractionAnalysis: for w, c in counts.most_common(top_most_used_words) ] - rows.append({ - "author": author, - "events": int(events), - "total_words": int(total_words), - "unique_words": int(unique_words), - "vocab_richness": round(vocab_richness, 3), - "avg_words_per_event": round(avg_words, 2), - "top_words": top_words - }) + rows.append( + { + "author": author, + "events": int(events), + "total_words": int(total_words), + "unique_words": int(unique_words), + "vocab_richness": round(vocab_richness, 3), + "avg_words_per_event": round(avg_words, 2), + "top_words": top_words, + } + ) rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True) @@ -55,9 +60,7 @@ class InteractionAnalysis: def top_users(self) -> list: counts = ( - self.df.groupby(["author", "source"]) - .size() - .sort_values(ascending=False) + self.df.groupby(["author", "source"]).size().sort_values(ascending=False) ) top_users = [ @@ -66,21 +69,31 @@ class InteractionAnalysis: ] return top_users - + def per_user_analysis(self) -> dict: - per_user = ( - self.df.groupby(["author", "type"]) - .size() - .unstack(fill_value=0) - ) + per_user = self.df.groupby(["author", "type"]).size().unstack(fill_value=0) + + emotion_cols = [col for col in self.df.columns if col.startswith("emotion_")] + + avg_emotions_by_author = {} + if emotion_cols: + avg_emotions = self.df.groupby("author")[emotion_cols].mean().fillna(0.0) + avg_emotions_by_author = { + author: {emotion: float(score) for emotion, score in row.items()} + for author, row in avg_emotions.iterrows() + } # ensure columns always exist for col in ("post", "comment"): if col not in per_user.columns: per_user[col] = 0 - per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(0, 1) - per_user["comment_share"] = per_user["comment"] / (per_user["post"] + per_user["comment"]).replace(0, 1) + per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace( + 0, 1 + ) + per_user["comment_share"] = per_user["comment"] / ( + per_user["post"] + per_user["comment"] + ).replace(0, 1) per_user = per_user.sort_values("comment_post_ratio", ascending=True) per_user_records = per_user.reset_index().to_dict(orient="records") @@ -91,19 +104,22 @@ class InteractionAnalysis: merged_users = [] for row in per_user_records: author = row["author"] - merged_users.append({ - "author": author, - "post": int(row.get("post", 0)), - "comment": int(row.get("comment", 0)), - "comment_post_ratio": float(row.get("comment_post_ratio", 0)), - "comment_share": float(row.get("comment_share", 0)), - "vocab": vocab_by_author.get(author) - }) + merged_users.append( + { + "author": author, + "post": int(row.get("post", 0)), + "comment": int(row.get("comment", 0)), + "comment_post_ratio": float(row.get("comment_post_ratio", 0)), + "comment_share": float(row.get("comment_share", 0)), + "avg_emotions": avg_emotions_by_author.get(author, {}), + "vocab": vocab_by_author.get(author, {"vocab_richness": 0, "avg_words_per_event": 0, "top_words": []}), + } + ) merged_users.sort(key=lambda u: u["comment_post_ratio"]) return merged_users - + def interaction_graph(self): interactions = {a: {} for a in self.df["author"].dropna().unique()} @@ -124,7 +140,7 @@ class InteractionAnalysis: interactions[a][b] = interactions[a].get(b, 0) + 1 return interactions - + def average_thread_depth(self): depths = [] id_to_reply = self.df.set_index("id")["reply_to"].to_dict() @@ -144,14 +160,15 @@ class InteractionAnalysis: if not depths: return 0 - + return round(sum(depths) / len(depths), 2) - + def average_thread_length_by_emotion(self): emotion_exclusions = {"emotion_neutral", "emotion_surprise"} emotion_cols = [ - c for c in self.df.columns + c + for c in self.df.columns if c.startswith("emotion_") and c not in emotion_exclusions ] @@ -174,14 +191,18 @@ class InteractionAnalysis: reply_to = id_to_reply.get(current) - if reply_to is None or (isinstance(reply_to, float) and pd.isna(reply_to)) or reply_to == "": + if ( + reply_to is None + or (isinstance(reply_to, float) and pd.isna(reply_to)) + or reply_to == "" + ): break length += 1 current = reply_to if current in length_cache: - length += (length_cache[current] - 1) + length += length_cache[current] - 1 break length_cache[start_id] = length @@ -205,4 +226,4 @@ class InteractionAnalysis: return { emotion: round(sum(lengths) / len(lengths), 2) for emotion, lengths in emotion_to_lengths.items() - } \ No newline at end of file + }