refactor: update analysis classes to accept DataFrame as a parameter instead of an instance variable

Commit: 07ab7529a9 (parent: d20790ed4b)
Date: 2026-03-01 16:25:39 +00:00
7 changed files with 403 additions and 389 deletions

View File

@@ -4,9 +4,9 @@ import re
from collections import Counter
from itertools import islice
class LinguisticAnalysis:
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
self.df = df
def __init__(self, word_exclusions: set[str]):
self.word_exclusions = word_exclusions
def _tokenize(self, text: str):
@@ -14,29 +14,20 @@ class LinguisticAnalysis:
return [t for t in tokens if t not in self.word_exclusions]
def _clean_text(self, text: str) -> str:
text = re.sub(r"http\S+", "", text) # remove URLs
text = re.sub(r"http\S+", "", text) # remove URLs
text = re.sub(r"www\S+", "", text)
text = re.sub(r"&\w+;", "", text) # remove HTML entities
text = re.sub(r"\bamp\b", "", text) # remove stray amp
text = re.sub(r"&\w+;", "", text) # remove HTML entities
text = re.sub(r"\bamp\b", "", text) # remove stray amp
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
return text
def word_frequencies(self, limit: int = 100) -> dict:
texts = (
self.df["content"]
.dropna()
.astype(str)
.str.lower()
)
def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
texts = df["content"].dropna().astype(str).str.lower()
words = []
for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text)
words.extend(
w for w in tokens
if w not in self.word_exclusions
)
words.extend(w for w in tokens if w not in self.word_exclusions)
counts = Counter(words)
@@ -48,16 +39,16 @@ class LinguisticAnalysis:
)
return word_frequencies.to_dict(orient="records")
def ngrams(self, n=2, limit=100):
texts = self.df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
def ngrams(self, df: pd.DataFrame, n=2, limit=100):
texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
all_ngrams = []
for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text)
# stop word removal causes strange behaviors in ngrams
#tokens = [w for w in tokens if w not in self.word_exclusions]
# tokens = [w for w in tokens if w not in self.word_exclusions]
ngrams = zip(*(islice(tokens, i, None) for i in range(n)))
all_ngrams.extend([" ".join(ng) for ng in ngrams])
@@ -69,4 +60,4 @@ class LinguisticAnalysis:
.sort_values("count", ascending=False)
.head(limit)
.to_dict(orient="records")
)
)