From 2fa1dff4b70e42031191fed1284cf7b2f3958879 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Tue, 17 Mar 2026 13:27:39 +0000
Subject: [PATCH] feat(stat): add lexical diversity stat

---
Review note: rows without any matching word no longer count towards
total_tokens, and an empty corpus now reports total_tokens == 0.
The post-image blob ids on the "index" lines below were not regenerated.

 server/analysis/linguistic.py | 27 +++++++++++++++++++++++++++
 server/analysis/stat_gen.py   |  1 +
 2 files changed, 28 insertions(+)

diff --git a/server/analysis/linguistic.py b/server/analysis/linguistic.py
index dc91faf..7546bbf 100644
--- a/server/analysis/linguistic.py
+++ b/server/analysis/linguistic.py
@@ -61,3 +61,30 @@ class LinguisticAnalysis:
             .head(limit)
             .to_dict(orient="records")
         )
+
+    def lexical_diversity(self, df: pd.DataFrame) -> dict:
+        """Return token counts and the type-token ratio of ``df["content"]``.
+
+        Tokens are lower-cased runs of two or more ASCII letters; tokens
+        found in ``self.word_exclusions`` are ignored.
+
+        Returns a dict with ``total_tokens``, ``unique_tokens`` and ``ttr``
+        (unique / total, rounded to 4 places; 0.0 when there are no tokens).
+        """
+        tokens = (
+            df["content"].fillna("").astype(str).str.lower()
+            .str.findall(r"\b[a-z]{2,}\b")
+            .explode()
+            # explode() turns an empty match list into NaN; drop those so
+            # rows without any token do not inflate the total.
+            .dropna()
+        )
+        tokens = tokens[~tokens.isin(self.word_exclusions)]
+        total = len(tokens)
+        unique = int(tokens.nunique())
+
+        return {
+            "total_tokens": total,
+            "unique_tokens": unique,
+            "ttr": round(unique / total, 4) if total else 0.0,
+        }
diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py
index 36ef9a9..8435340 100644
--- a/server/analysis/stat_gen.py
+++ b/server/analysis/stat_gen.py
@@ -94,6 +94,7 @@ class StatGen:
             "word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df),
             "common_two_phrases": self.linguistic_analysis.ngrams(filtered_df),
             "common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3),
+            "lexical_diversity": self.linguistic_analysis.lexical_diversity(filtered_df),
         }
 
     def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict: