From 09a4f9036f67b22a278ff0556a22a44d0093500e Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 16 Mar 2026 16:43:24 +0000 Subject: [PATCH] refactor(stats): add summary and user stat classes for consistency --- server/analysis/stat_gen.py | 96 +++++++++++++------------------------ server/analysis/summary.py | 64 +++++++++++++++++++++++++ server/analysis/user.py | 20 ++++++++ 3 files changed, 118 insertions(+), 62 deletions(-) create mode 100644 server/analysis/summary.py create mode 100644 server/analysis/user.py diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py index a9e9289..0d1ffc9 100644 --- a/server/analysis/stat_gen.py +++ b/server/analysis/stat_gen.py @@ -6,7 +6,9 @@ from server.analysis.cultural import CulturalAnalysis from server.analysis.emotional import EmotionalAnalysis from server.analysis.interactional import InteractionAnalysis from server.analysis.linguistic import LinguisticAnalysis +from server.analysis.summary import SummaryAnalysis from server.analysis.temporal import TemporalAnalysis +from server.analysis.user import UserAnalysis DOMAIN_STOPWORDS = { "www", @@ -36,12 +38,11 @@ class StatGen: self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS) self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS) self.cultural_analysis = CulturalAnalysis() + self.summary_analysis = SummaryAnalysis() + self.user_analysis = UserAnalysis(self.interaction_analysis) ## Private Methods - def _prepare_filtered_df(self, - df: pd.DataFrame, - filters: dict | None = None - ) -> pd.DataFrame: + def _prepare_filtered_df(self, df: pd.DataFrame, filters: dict | None = None) -> pd.DataFrame: filters = filters or {} filtered_df = df.copy() @@ -51,10 +52,9 @@ class StatGen: data_source_filter = filters.get("data_sources", None) if search_query: - mask = ( - filtered_df["content"].str.contains(search_query, case=False, na=False) - | filtered_df["author"].str.contains(search_query, case=False, na=False) - ) + mask = filtered_df["content"].str.contains( + search_query, case=False, na=False + ) | filtered_df["author"].str.contains(search_query, case=False, na=False) # Only include title if the column exists if "title" in filtered_df.columns: @@ -76,10 +76,10 @@ class StatGen: return filtered_df ## Public Methods - def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]: return self._prepare_filtered_df(df, filters).to_dict(orient="records") - def get_time_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) return { @@ -87,40 +87,43 @@ class StatGen: "weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df), } - def get_content_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def linguistic(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) return { "word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df), "common_two_phrases": self.linguistic_analysis.ngrams(filtered_df), "common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3), - "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic( - filtered_df - ) } - def get_user_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) return { - "top_users": self.interaction_analysis.top_users(filtered_df), - "users": self.interaction_analysis.per_user_analysis(filtered_df), + "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(filtered_df), + "overall_emotion_average": self.emotional_analysis.overall_emotion_average(filtered_df), + "dominant_emotion_distribution": self.emotional_analysis.dominant_emotion_distribution(filtered_df), + "emotion_by_source": self.emotional_analysis.emotion_by_source(filtered_df) + } + + def user(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + filtered_df = self._prepare_filtered_df(df, filters) + + return { + "top_users": self.user_analysis.top_users(filtered_df), + "users": self.user_analysis.users(filtered_df) + } + + def interactional(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + filtered_df = self._prepare_filtered_df(df, filters) + + return { + "average_thread_depth": self.interaction_analysis.average_thread_depth(filtered_df), + "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(filtered_df), "interaction_graph": self.interaction_analysis.interaction_graph(filtered_df) } - def get_interactional_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: - filtered_df = self._prepare_filtered_df(df, filters) - - return { - "average_thread_depth": self.interaction_analysis.average_thread_depth( - filtered_df - ), - "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion( - filtered_df - ), - } - - def get_cultural_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def cultural(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) return { @@ -136,35 +139,4 @@ class StatGen: def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) - total_posts = (filtered_df["type"] == "post").sum() - total_comments = (filtered_df["type"] == "comment").sum() - events_per_user = filtered_df.groupby("author").size() - - if filtered_df.empty: - return { - "total_events": 0, - "total_posts": 0, - "total_comments": 0, - "unique_users": 0, - "comments_per_post": 0, - "lurker_ratio": 0, - "time_range": { - "start": None, - "end": None, - }, - "sources": [], - } - - return { - "total_events": int(len(filtered_df)), - "total_posts": int(total_posts), - "total_comments": int(total_comments), - "unique_users": int(events_per_user.count()), - "comments_per_post": round(total_comments / max(total_posts, 1), 2), - "lurker_ratio": round((events_per_user == 1).mean(), 2), - "time_range": { - "start": int(filtered_df["dt"].min().timestamp()), - "end": int(filtered_df["dt"].max().timestamp()), - }, - "sources": filtered_df["source"].dropna().unique().tolist(), - } + return self.summary_analysis.summary(filtered_df) diff --git a/server/analysis/summary.py b/server/analysis/summary.py new file mode 100644 index 0000000..14cc8ca --- /dev/null +++ b/server/analysis/summary.py @@ -0,0 +1,64 @@ +import pandas as pd + + +class SummaryAnalysis: + def total_events(self, df: pd.DataFrame) -> int: + return int(len(df)) + + def total_posts(self, df: pd.DataFrame) -> int: + return int(len(df[df["type"] == "post"])) + + def total_comments(self, df: pd.DataFrame) -> int: + return int(len(df[df["type"] == "comment"])) + + def unique_users(self, df: pd.DataFrame) -> int: + return int(len(df["author"].dropna().unique())) + + def comments_per_post(self, total_comments: int, total_posts: int) -> float: + return round(total_comments / max(total_posts, 1), 2) + + def lurker_ratio(self, df: pd.DataFrame) -> float: + events_per_user = df.groupby("author").size() + return round((events_per_user == 1).mean(), 2) + + def time_range(self, df: pd.DataFrame) -> dict: + return { + "start": int(df["dt"].min().timestamp()), + "end": int(df["dt"].max().timestamp()), + } + + def sources(self, df: pd.DataFrame) -> list: + return df["source"].dropna().unique().tolist() + + def empty_summary(self) -> dict: + return { + "total_events": 0, + "total_posts": 0, + "total_comments": 0, + "unique_users": 0, + "comments_per_post": 0, + "lurker_ratio": 0, + "time_range": { + "start": None, + "end": None, + }, + "sources": [], + } + + def summary(self, df: pd.DataFrame) -> dict: + if df.empty: + return self.empty_summary() + + total_posts = self.total_posts(df) + total_comments = self.total_comments(df) + + return { + "total_events": self.total_events(df), + "total_posts": total_posts, + "total_comments": total_comments, + "unique_users": self.unique_users(df), + "comments_per_post": self.comments_per_post(total_comments, total_posts), + "lurker_ratio": self.lurker_ratio(df), + "time_range": self.time_range(df), + "sources": self.sources(df), + } diff --git a/server/analysis/user.py b/server/analysis/user.py new file mode 100644 index 0000000..57ddc1e --- /dev/null +++ b/server/analysis/user.py @@ -0,0 +1,20 @@ +import pandas as pd + +from server.analysis.interactional import InteractionAnalysis + + +class UserAnalysis: + def __init__(self, interaction_analysis: InteractionAnalysis): + self.interaction_analysis = interaction_analysis + + def top_users(self, df: pd.DataFrame) -> list: + return self.interaction_analysis.top_users(df) + + def users(self, df: pd.DataFrame) -> dict | list: + return self.interaction_analysis.per_user_analysis(df) + + def user(self, df: pd.DataFrame) -> dict: + return { + "top_users": self.top_users(df), + "users": self.users(df), + }