refactor: update analysis classes to accept DataFrame as a parameter instead of an instance variable

Commit: 07ab7529a9 (parent: d20790ed4b)
Date: 2026-03-01 16:25:39 +00:00
7 changed files with 403 additions and 389 deletions

View File

@@ -4,9 +4,9 @@ import re
from collections import Counter
from itertools import islice
class LinguisticAnalysis:
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
self.df = df
def __init__(self, word_exclusions: set[str]):
self.word_exclusions = word_exclusions
def _tokenize(self, text: str):
@@ -14,29 +14,20 @@ class LinguisticAnalysis:
return [t for t in tokens if t not in self.word_exclusions]
def _clean_text(self, text: str) -> str:
text = re.sub(r"http\S+", "", text) # remove URLs
text = re.sub(r"http\S+", "", text) # remove URLs
text = re.sub(r"www\S+", "", text)
text = re.sub(r"&\w+;", "", text) # remove HTML entities
text = re.sub(r"\bamp\b", "", text) # remove stray amp
text = re.sub(r"&\w+;", "", text) # remove HTML entities
text = re.sub(r"\bamp\b", "", text) # remove stray amp
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
return text
def word_frequencies(self, limit: int = 100) -> dict:
texts = (
self.df["content"]
.dropna()
.astype(str)
.str.lower()
)
def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
texts = df["content"].dropna().astype(str).str.lower()
words = []
for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text)
words.extend(
w for w in tokens
if w not in self.word_exclusions
)
words.extend(w for w in tokens if w not in self.word_exclusions)
counts = Counter(words)
@@ -48,16 +39,16 @@ class LinguisticAnalysis:
)
return word_frequencies.to_dict(orient="records")
def ngrams(self, n=2, limit=100):
texts = self.df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
def ngrams(self, df: pd.DataFrame, n=2, limit=100):
texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
all_ngrams = []
for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text)
# stop word removal causes strange behaviors in ngrams
#tokens = [w for w in tokens if w not in self.word_exclusions]
# tokens = [w for w in tokens if w not in self.word_exclusions]
ngrams = zip(*(islice(tokens, i, None) for i in range(n)))
all_ngrams.extend([" ".join(ng) for ng in ngrams])
@@ -69,4 +60,4 @@ class LinguisticAnalysis:
.sort_values("count", ascending=False)
.head(limit)
.to_dict(orient="records")
)
)