feat(api): add average emotion per user into per user analysis

This commit is contained in:
2026-02-24 18:57:00 +00:00
parent 6695d3d272
commit fb99d4ae15

View File

@@ -3,6 +3,7 @@ import re
from collections import Counter from collections import Counter
class InteractionAnalysis: class InteractionAnalysis:
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
self.df = df self.df = df
@@ -12,7 +13,9 @@ class InteractionAnalysis:
tokens = re.findall(r"\b[a-z]{3,}\b", text) tokens = re.findall(r"\b[a-z]{3,}\b", text)
return [t for t in tokens if t not in self.word_exclusions] return [t for t in tokens if t not in self.word_exclusions]
def _vocab_richness_per_user(self, min_words: int = 20, top_most_used_words: int = 100) -> list: def _vocab_richness_per_user(
self, min_words: int = 20, top_most_used_words: int = 100
) -> list:
df = self.df.copy() df = self.df.copy()
df["content"] = df["content"].fillna("").astype(str).str.lower() df["content"] = df["content"].fillna("").astype(str).str.lower()
df["tokens"] = df["content"].apply(self._tokenize) df["tokens"] = df["content"].apply(self._tokenize)
@@ -39,15 +42,17 @@ class InteractionAnalysis:
for w, c in counts.most_common(top_most_used_words) for w, c in counts.most_common(top_most_used_words)
] ]
rows.append({ rows.append(
{
"author": author, "author": author,
"events": int(events), "events": int(events),
"total_words": int(total_words), "total_words": int(total_words),
"unique_words": int(unique_words), "unique_words": int(unique_words),
"vocab_richness": round(vocab_richness, 3), "vocab_richness": round(vocab_richness, 3),
"avg_words_per_event": round(avg_words, 2), "avg_words_per_event": round(avg_words, 2),
"top_words": top_words "top_words": top_words,
}) }
)
rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True) rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True)
@@ -55,9 +60,7 @@ class InteractionAnalysis:
def top_users(self) -> list: def top_users(self) -> list:
counts = ( counts = (
self.df.groupby(["author", "source"]) self.df.groupby(["author", "source"]).size().sort_values(ascending=False)
.size()
.sort_values(ascending=False)
) )
top_users = [ top_users = [
@@ -68,19 +71,29 @@ class InteractionAnalysis:
return top_users return top_users
def per_user_analysis(self) -> dict: def per_user_analysis(self) -> dict:
per_user = ( per_user = self.df.groupby(["author", "type"]).size().unstack(fill_value=0)
self.df.groupby(["author", "type"])
.size() emotion_cols = [col for col in self.df.columns if col.startswith("emotion_")]
.unstack(fill_value=0)
) avg_emotions_by_author = {}
if emotion_cols:
avg_emotions = self.df.groupby("author")[emotion_cols].mean().fillna(0.0)
avg_emotions_by_author = {
author: {emotion: float(score) for emotion, score in row.items()}
for author, row in avg_emotions.iterrows()
}
# ensure columns always exist # ensure columns always exist
for col in ("post", "comment"): for col in ("post", "comment"):
if col not in per_user.columns: if col not in per_user.columns:
per_user[col] = 0 per_user[col] = 0
per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(0, 1) per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(
per_user["comment_share"] = per_user["comment"] / (per_user["post"] + per_user["comment"]).replace(0, 1) 0, 1
)
per_user["comment_share"] = per_user["comment"] / (
per_user["post"] + per_user["comment"]
).replace(0, 1)
per_user = per_user.sort_values("comment_post_ratio", ascending=True) per_user = per_user.sort_values("comment_post_ratio", ascending=True)
per_user_records = per_user.reset_index().to_dict(orient="records") per_user_records = per_user.reset_index().to_dict(orient="records")
@@ -91,14 +104,17 @@ class InteractionAnalysis:
merged_users = [] merged_users = []
for row in per_user_records: for row in per_user_records:
author = row["author"] author = row["author"]
merged_users.append({ merged_users.append(
{
"author": author, "author": author,
"post": int(row.get("post", 0)), "post": int(row.get("post", 0)),
"comment": int(row.get("comment", 0)), "comment": int(row.get("comment", 0)),
"comment_post_ratio": float(row.get("comment_post_ratio", 0)), "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
"comment_share": float(row.get("comment_share", 0)), "comment_share": float(row.get("comment_share", 0)),
"vocab": vocab_by_author.get(author) "avg_emotions": avg_emotions_by_author.get(author, {}),
}) "vocab": vocab_by_author.get(author, {"vocab_richness": 0, "avg_words_per_event": 0, "top_words": []}),
}
)
merged_users.sort(key=lambda u: u["comment_post_ratio"]) merged_users.sort(key=lambda u: u["comment_post_ratio"])
@@ -151,7 +167,8 @@ class InteractionAnalysis:
emotion_exclusions = {"emotion_neutral", "emotion_surprise"} emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
emotion_cols = [ emotion_cols = [
c for c in self.df.columns c
for c in self.df.columns
if c.startswith("emotion_") and c not in emotion_exclusions if c.startswith("emotion_") and c not in emotion_exclusions
] ]
@@ -174,14 +191,18 @@ class InteractionAnalysis:
reply_to = id_to_reply.get(current) reply_to = id_to_reply.get(current)
if reply_to is None or (isinstance(reply_to, float) and pd.isna(reply_to)) or reply_to == "": if (
reply_to is None
or (isinstance(reply_to, float) and pd.isna(reply_to))
or reply_to == ""
):
break break
length += 1 length += 1
current = reply_to current = reply_to
if current in length_cache: if current in length_cache:
length += (length_cache[current] - 1) length += length_cache[current] - 1
break break
length_cache[start_id] = length length_cache[start_id] = length