From 8b25b7bd09d239eabb0936f0b515f5c90cf6521d Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 2 Feb 2026 15:14:56 +0000 Subject: [PATCH] feat: top most used words per user in user analysis endpoint --- server/stat_gen.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/server/stat_gen.py b/server/stat_gen.py index ef0e37b..9e3caea 100644 --- a/server/stat_gen.py +++ b/server/stat_gen.py @@ -43,7 +43,7 @@ class StatGen: tokens = re.findall(r"\b[a-z]{3,}\b", text) return [t for t in tokens if t not in EXCLUDE_WORDS] - def _vocab_richness_per_user(self, min_words: int = 20) -> dict: + def _vocab_richness_per_user(self, min_words: int = 20, top_most_used_words: int = 100) -> list: df = self.df.copy() df["content"] = df["content"].fillna("").astype(str).str.lower() df["tokens"] = df["content"].apply(self._tokenize) @@ -64,6 +64,12 @@ class StatGen: vocab_richness = unique_words / total_words avg_words = total_words / max(events, 1) + counts = Counter(all_tokens) + top_words = [ + {"word": w, "count": int(c)} + for w, c in counts.most_common(top_most_used_words) + ] + rows.append({ "author": author, "events": int(events), @@ -71,11 +77,12 @@ class StatGen: "unique_words": int(unique_words), "vocab_richness": round(vocab_richness, 3), "avg_words_per_event": round(avg_words, 2), + "top_words": top_words }) rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True) - return {"vocab_richness": rows} + return rows ## Public def time_analysis(self) -> pd.DataFrame: