diff --git a/server/analysis/interactional.py b/server/analysis/interactional.py index 8220747..53d97dc 100644 --- a/server/analysis/interactional.py +++ b/server/analysis/interactional.py @@ -1,9 +1,6 @@ import pandas as pd import re -from collections import Counter - - class InteractionAnalysis: def __init__(self, word_exclusions: set[str]): self.word_exclusions = word_exclusions @@ -12,51 +9,6 @@ class InteractionAnalysis: tokens = re.findall(r"\b[a-z]{3,}\b", text) return [t for t in tokens if t not in self.word_exclusions] - def _vocab_richness_per_user( - self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100 - ) -> list: - df = df.copy() - df["content"] = df["content"].fillna("").astype(str).str.lower() - df["tokens"] = df["content"].apply(self._tokenize) - - rows = [] - for author, group in df.groupby("author"): - all_tokens = [t for tokens in group["tokens"] for t in tokens] - - total_words = len(all_tokens) - unique_words = len(set(all_tokens)) - events = len(group) - - # Min amount of words for a user, any less than this might give weird results - if total_words < min_words: - continue - - # 100% = they never reused a word (excluding stop words) - vocab_richness = unique_words / total_words - avg_words = total_words / max(events, 1) - - counts = Counter(all_tokens) - top_words = [ - {"word": w, "count": int(c)} - for w, c in counts.most_common(top_most_used_words) - ] - - rows.append( - { - "author": author, - "events": int(events), - "total_words": int(total_words), - "unique_words": int(unique_words), - "vocab_richness": round(vocab_richness, 3), - "avg_words_per_event": round(avg_words, 2), - "top_words": top_words, - } - ) - - rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True) - - return rows - def interaction_graph(self, df: pd.DataFrame): interactions = {a: {} for a in df["author"].dropna().unique()} diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py index 8435340..c2f09ed 100644 --- a/server/analysis/stat_gen.py +++ b/server/analysis/stat_gen.py @@ -39,7 +39,7 @@ class StatGen: self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS) self.cultural_analysis = CulturalAnalysis() self.summary_analysis = SummaryAnalysis() - self.user_analysis = UserAnalysis() + self.user_analysis = UserAnalysis(EXCLUDE_WORDS) ## Private Methods def _prepare_filtered_df(self, df: pd.DataFrame, filters: dict | None = None) -> pd.DataFrame: diff --git a/server/analysis/user.py b/server/analysis/user.py index d5e9917..fc8e618 100644 --- a/server/analysis/user.py +++ b/server/analysis/user.py @@ -1,7 +1,61 @@ import pandas as pd +import re +from collections import Counter class UserAnalysis: + def __init__(self, word_exclusions: set[str]): + self.word_exclusions = word_exclusions + + def _tokenize(self, text: str): + tokens = re.findall(r"\b[a-z]{3,}\b", text) + return [t for t in tokens if t not in self.word_exclusions] + + def _vocab_richness_per_user( + self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100 + ) -> list: + df = df.copy() + df["content"] = df["content"].fillna("").astype(str).str.lower() + df["tokens"] = df["content"].apply(self._tokenize) + + rows = [] + for author, group in df.groupby("author"): + all_tokens = [t for tokens in group["tokens"] for t in tokens] + + total_words = len(all_tokens) + unique_words = len(set(all_tokens)) + events = len(group) + + # Min amount of words for a user, any less than this might give weird results + if total_words < min_words: + continue + + # 100% = they never reused a word (excluding stop words) + vocab_richness = unique_words / total_words + avg_words = total_words / max(events, 1) + + counts = Counter(all_tokens) + top_words = [ + {"word": w, "count": int(c)} + for w, c in counts.most_common(top_most_used_words) + ] + + rows.append( + { + "author": author, + "events": int(events), + "total_words": int(total_words), + "unique_words": int(unique_words), + "vocab_richness": round(vocab_richness, 3), + "avg_words_per_event": round(avg_words, 2), + "top_words": top_words, + } + ) + + rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True) + + return rows + def top_users(self, df: pd.DataFrame) -> list: counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)