refactor: extract interaction and linguistic analysis into dedicated classes
This commit is contained in:
37
server/analysis/linguistic.py
Normal file
37
server/analysis/linguistic.py
Normal file
@@ -0,0 +1,37 @@
|
||||
import pandas as pd
|
||||
import re
|
||||
|
||||
from collections import Counter
|
||||
|
||||
class LinguisticAnalysis:
    """Word-level statistics over the ``content`` column of a DataFrame."""

    # A token is a run of three or more ASCII lowercase letters delimited by
    # word boundaries. Compiled once here so the per-row loop in
    # ``word_frequencies`` does not repeat the pattern lookup.
    _TOKEN_PATTERN = re.compile(r"\b[a-z]{3,}\b")

    def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
        """Store the frame to analyse and the words to leave out of counts.

        Args:
            df: Frame whose ``content`` column is read by
                ``word_frequencies``. It is stored as given, not copied.
            word_exclusions: Words to skip when counting. Tokens are
                lowercased before the comparison, so entries should be
                lowercase to have any effect.
        """
        self.df = df
        self.word_exclusions = word_exclusions

    def word_frequencies(self, limit: int = 100) -> list[dict]:
        """Return the most frequent words found in the ``content`` column.

        Missing values are dropped, the remaining values are converted to
        strings and lowercased, and every run of three or more ASCII letters
        is counted unless it appears in ``word_exclusions``.

        Args:
            limit: Maximum number of words to return. ``0`` yields an empty
                list.

        Returns:
            A list of ``{"word": str, "count": int}`` records ordered by
            descending count. Words with equal counts keep the order in
            which they were first encountered.

        Raises:
            KeyError: If the frame has no ``content`` column.
        """
        texts = self.df["content"].dropna().astype(str).str.lower()

        find_tokens = self._TOKEN_PATTERN.findall
        excluded = self.word_exclusions

        # Accumulate counts row by row instead of materialising one large
        # list holding every token in the corpus.
        counts = Counter()
        for text in texts:
            counts.update(
                token for token in find_tokens(text) if token not in excluded
            )

        # most_common() ranks ties deterministically (first-seen order) and
        # yields plain Python ints, so no DataFrame round trip is needed.
        return [
            {"word": word, "count": count}
            for word, count in counts.most_common(limit)
        ]
|
||||
Reference in New Issue
Block a user