refactor: extract interaction and linguistic analysis into dedicated classes

2026-02-17 18:00:16 +00:00
parent 83010aee55
commit d27ba3fca4
3 changed files with 173 additions and 143 deletions
--- a/server/analysis/linguistic.py
+++ b/server/analysis/linguistic.py
@@ -0,0 +1,37 @@
+import pandas as pd
+import re
+
+from collections import Counter
+
+class LinguisticAnalysis:
+    """Word-frequency statistics over the ``content`` column of a DataFrame."""
+
+    # Runs of 3+ ASCII letters; input is lowercased before matching.
+    _WORD_RE = re.compile(r"\b[a-z]{3,}\b")
+
+    def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
+        self.df = df
+        self.word_exclusions = word_exclusions
+
+    def word_frequencies(self, limit: int = 100) -> list[dict]:
+        """Return up to ``limit`` ``{"word", "count"}`` records, most frequent first.
+
+        Null rows and words in ``word_exclusions`` are skipped. Words with equal
+        counts keep first-seen order, so the output is deterministic.
+        """
+        texts = self.df["content"].dropna().astype(str).str.lower()
+        # Count row by row so the full token list is never held in memory.
+        counts = Counter()
+        for text in texts:
+            counts.update(
+                w for w in self._WORD_RE.findall(text)
+                if w not in self.word_exclusions
+            )
+
+        ranked = (
+            pd.DataFrame(counts.items(), columns=["word", "count"])
+            # Stable sort: the default quicksort may reorder tied counts.
+            .sort_values("count", ascending=False, kind="stable")
+            .head(limit)
+        )
+        return ranked.to_dict(orient="records")