Reviewer note: as first written, this patch moved per_user_analysis into
UserAnalysis while its helper _vocab_richness_per_user stayed on
InteractionAnalysis, so the call would raise AttributeError. UserAnalysis
therefore keeps an optional InteractionAnalysis collaborator and StatGen keeps
passing it. The post-image blob ids on the stat_gen.py and user.py "index"
lines predate this correction and are stale.

diff --git a/server/analysis/interactional.py b/server/analysis/interactional.py
index 864980d..8220747 100644
--- a/server/analysis/interactional.py
+++ b/server/analysis/interactional.py
@@ -57,73 +57,6 @@ class InteractionAnalysis:
 
         return rows
 
-    def top_users(self, df: pd.DataFrame) -> list:
-        counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)
-
-        top_users = [
-            {"author": author, "source": source, "count": int(count)}
-            for (author, source), count in counts.items()
-        ]
-
-        return top_users
-
-    def per_user_analysis(self, df: pd.DataFrame) -> dict:
-        per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
-
-        emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
-
-        avg_emotions_by_author = {}
-        if emotion_cols:
-            avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
-            avg_emotions_by_author = {
-                author: {emotion: float(score) for emotion, score in row.items()}
-                for author, row in avg_emotions.iterrows()
-            }
-
-        # ensure columns always exist
-        for col in ("post", "comment"):
-            if col not in per_user.columns:
-                per_user[col] = 0
-
-        per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(
-            0, 1
-        )
-        per_user["comment_share"] = per_user["comment"] / (
-            per_user["post"] + per_user["comment"]
-        ).replace(0, 1)
-        per_user = per_user.sort_values("comment_post_ratio", ascending=True)
-        per_user_records = per_user.reset_index().to_dict(orient="records")
-
-        vocab_rows = self._vocab_richness_per_user(df)
-        vocab_by_author = {row["author"]: row for row in vocab_rows}
-
-        # merge vocab richness + per_user information
-        merged_users = []
-        for row in per_user_records:
-            author = row["author"]
-            merged_users.append(
-                {
-                    "author": author,
-                    "post": int(row.get("post", 0)),
-                    "comment": int(row.get("comment", 0)),
-                    "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
-                    "comment_share": float(row.get("comment_share", 0)),
-                    "avg_emotions": avg_emotions_by_author.get(author, {}),
-                    "vocab": vocab_by_author.get(
-                        author,
-                        {
-                            "vocab_richness": 0,
-                            "avg_words_per_event": 0,
-                            "top_words": [],
-                        },
-                    ),
-                }
-            )
-
-        merged_users.sort(key=lambda u: u["comment_post_ratio"])
-
-        return merged_users
-
     def interaction_graph(self, df: pd.DataFrame):
         interactions = {a: {} for a in df["author"].dropna().unique()}
 
diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py
index 0d1ffc9..bec7eeb 100644
--- a/server/analysis/stat_gen.py
+++ b/server/analysis/stat_gen.py
@@ -111,7 +111,7 @@ class StatGen:
 
         return {
             "top_users": self.user_analysis.top_users(filtered_df),
-            "users": self.user_analysis.users(filtered_df)
+            "users": self.user_analysis.per_user_analysis(filtered_df)
         }
 
     def interactional(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
diff --git a/server/analysis/user.py b/server/analysis/user.py
index 57ddc1e..d5e9917 100644
--- a/server/analysis/user.py
+++ b/server/analysis/user.py
@@ -1,20 +1,92 @@
 import pandas as pd
 
 from server.analysis.interactional import InteractionAnalysis
 
 
 class UserAnalysis:
-    def __init__(self, interaction_analysis: InteractionAnalysis):
+    """Per-author statistics derived from an events DataFrame."""
+
+    def __init__(self, interaction_analysis: InteractionAnalysis | None = None):
+        # per_user_analysis still needs
+        # InteractionAnalysis._vocab_richness_per_user, which was not moved
+        # into this class, so the collaborator is kept (now optional).
         self.interaction_analysis = interaction_analysis
 
     def top_users(self, df: pd.DataFrame) -> list:
-        return self.interaction_analysis.top_users(df)
+        """Return {author, source, count} dicts, largest event count first."""
+        counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)
+
+        top_users = [
+            {"author": author, "source": source, "count": int(count)}
+            for (author, source), count in counts.items()
+        ]
+
+        return top_users
 
-    def users(self, df: pd.DataFrame) -> dict | list:
-        return self.interaction_analysis.per_user_analysis(df)
-
-    def user(self, df: pd.DataFrame) -> dict:
-        return {
-            "top_users": self.top_users(df),
-            "users": self.users(df),
-        }
+    def per_user_analysis(self, df: pd.DataFrame) -> list:
+        """Return one summary dict per author, sorted by comment/post ratio.
+
+        Each dict holds the author's post and comment counts,
+        comment_post_ratio, comment_share, the mean of every ``emotion_*``
+        column (avg_emotions) and the author's vocab-richness row (vocab).
+        """
+        per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
+
+        emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
+
+        avg_emotions_by_author = {}
+        if emotion_cols:
+            avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
+            avg_emotions_by_author = {
+                author: {emotion: float(score) for emotion, score in row.items()}
+                for author, row in avg_emotions.iterrows()
+            }
+
+        # ensure columns always exist
+        for col in ("post", "comment"):
+            if col not in per_user.columns:
+                per_user[col] = 0
+
+        # replace(0, 1) swaps a zero denominator for 1 to avoid dividing by zero
+        per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(
+            0, 1
+        )
+        per_user["comment_share"] = per_user["comment"] / (
+            per_user["post"] + per_user["comment"]
+        ).replace(0, 1)
+        per_user = per_user.sort_values("comment_post_ratio", ascending=True)
+        per_user_records = per_user.reset_index().to_dict(orient="records")
+
+        # Without an InteractionAnalysis every author falls back to the zeroed
+        # vocab defaults below rather than raising AttributeError.
+        vocab_rows = []
+        if self.interaction_analysis is not None:
+            vocab_rows = self.interaction_analysis._vocab_richness_per_user(df)
+        vocab_by_author = {row["author"]: row for row in vocab_rows}
+
+        # merge vocab richness + per_user information
+        merged_users = []
+        for row in per_user_records:
+            author = row["author"]
+            merged_users.append(
+                {
+                    "author": author,
+                    "post": int(row.get("post", 0)),
+                    "comment": int(row.get("comment", 0)),
+                    "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
+                    "comment_share": float(row.get("comment_share", 0)),
+                    "avg_emotions": avg_emotions_by_author.get(author, {}),
+                    "vocab": vocab_by_author.get(
+                        author,
+                        {
+                            "vocab_richness": 0,
+                            "avg_words_per_event": 0,
+                            "top_words": [],
+                        },
+                    ),
+                }
+            )
+
+        merged_users.sort(key=lambda u: u["comment_post_ratio"])
+
+        return merged_users