feat(stat): add lexical diversity stat

This commit is contained in:
2026-03-17 13:27:39 +00:00
parent 31fb275ee3
commit 2fa1dff4b7
2 changed files with 17 additions and 0 deletions

View File

@@ -61,3 +61,19 @@ class LinguisticAnalysis:
.head(limit) .head(limit)
.to_dict(orient="records") .to_dict(orient="records")
) )
def lexical_diversity(self, df: pd.DataFrame) -> dict:
    """Compute type-token statistics over the ``content`` column of *df*.

    Each row's content is lower-cased and split into tokens matching
    ``\\b[a-z]{2,}\\b`` (runs of two or more ASCII letters). Tokens found
    in ``self.word_exclusions`` are discarded before counting.

    Args:
        df: Frame with a ``content`` column; null entries are treated as
            empty strings.

    Returns:
        A dict with ``total_tokens`` (number of kept tokens),
        ``unique_tokens`` (number of distinct kept tokens) and ``ttr``
        (unique / total, rounded to 4 places; ``0.0`` when no tokens
        remain).
    """
    tokens = (
        df["content"].fillna("").astype(str).str.lower()
        .str.findall(r"\b[a-z]{2,}\b")
        .explode()
        # explode() emits NaN for rows whose token list is empty; those
        # placeholders are not tokens and must not be counted in the total.
        .dropna()
    )
    tokens = tokens[~tokens.isin(self.word_exclusions)]

    total = len(tokens)
    unique = int(tokens.nunique())
    # Guard the division explicitly instead of clamping the reported total.
    ttr = round(unique / total, 4) if total else 0.0
    return {
        "total_tokens": total,
        "unique_tokens": unique,
        "ttr": ttr,
    }

View File

@@ -94,6 +94,7 @@ class StatGen:
"word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df), "word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df),
"common_two_phrases": self.linguistic_analysis.ngrams(filtered_df), "common_two_phrases": self.linguistic_analysis.ngrams(filtered_df),
"common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3), "common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3),
"lexical_diversity": self.linguistic_analysis.lexical_diversity(filtered_df)
} }
def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict: