diff --git a/server/stat_gen.py b/server/stat_gen.py
index 99c307d..fe06a62 100644
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -38,6 +38,45 @@ class StatGen:
         df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
         df["hour"] = df["dt"].dt.hour
         df["weekday"] = df["dt"].dt.day_name()
+
+    def _tokenize(self, text: str) -> list[str]:
+        # Assumes `re` is imported and EXCLUDE_WORDS (stop-word set) exists at module level
+        tokens = re.findall(r"\b[a-z]{3,}\b", text)
+        return [t for t in tokens if t not in EXCLUDE_WORDS]
+
+    def _vocab_richness_per_user(self, min_words: int = 20) -> dict:
+        df = self.df.copy()
+        df["content"] = df["content"].fillna("").astype(str).str.lower()
+        df["tokens"] = df["content"].apply(self._tokenize)
+
+        rows = []
+        for author, group in df.groupby("author"):
+            all_tokens = [t for tokens in group["tokens"] for t in tokens]
+
+            total_words = len(all_tokens)
+            unique_words = len(set(all_tokens))
+            events = len(group)
+
+            # Skip users below the minimum word count; tiny samples skew the ratio
+            if total_words < min_words:
+                continue
+
+            # Type-token ratio: 1.0 means the user never reused a word (stop words excluded)
+            vocab_richness = unique_words / total_words
+            avg_words = total_words / max(events, 1)
+
+            rows.append({
+                "author": author,
+                "events": int(events),
+                "total_words": int(total_words),
+                "unique_words": int(unique_words),
+                "vocab_richness": round(vocab_richness, 3),
+                "avg_words_per_event": round(avg_words, 2),
+            })
+
+        rows.sort(key=lambda r: r["vocab_richness"], reverse=True)
+
+        return {"vocab_richness": rows}
 
     ## Public
     def time_analysis(self) -> pd.DataFrame:
@@ -147,7 +186,8 @@ class StatGen:
                 {"author": author, "source": source, "count": int(count)}
                 for (author, source), count in counts.items()
             ],
-            "users": per_user.reset_index().to_dict(orient="records")
+            "users": per_user.reset_index().to_dict(orient="records"),
+            "vocab_per_user": self._vocab_richness_per_user()
         }
 
     def search(self, search_query: str) -> dict:
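
For reviewers: a minimal standalone sketch of the type-token-ratio computation that `_vocab_richness_per_user` performs. The `EXCLUDE_WORDS` set, the `tokenize` helper, and the toy frame below are illustrative stand-ins, not part of this change; the diff itself references `EXCLUDE_WORDS` as a module-level name whose contents we don't see here.

```python
# Illustrative reproduction of the new metric on a toy frame.
# EXCLUDE_WORDS here is a stand-in for the module-level set the diff relies on.
import re

import pandas as pd

EXCLUDE_WORDS = {"the", "and", "for"}

def tokenize(text: str) -> list[str]:
    # Same rule as _tokenize: lowercase words of 3+ letters, minus stop words
    return [t for t in re.findall(r"\b[a-z]{3,}\b", text.lower()) if t not in EXCLUDE_WORDS]

df = pd.DataFrame({
    "author": ["alice", "alice", "bob"],
    "content": ["The quick brown fox", "fox jumps again", "hello hello world"],
})

for author, group in df.groupby("author"):
    tokens = [tok for text in group["content"] for tok in tokenize(text)]
    # alice reuses "fox" (5 unique / 6 total = 0.833); bob repeats "hello" (2/3 = 0.667)
    print(author, round(len(set(tokens)) / len(tokens), 3))
```

Note that with the method's default `min_words=20`, samples as tiny as these would be filtered out entirely; the toy data is only meant to show the shape of the calculation.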