121 lines
3.6 KiB
Python
121 lines
3.6 KiB
Python
import re
|
|
from collections import Counter
|
|
from dataclasses import dataclass
|
|
|
|
import pandas as pd
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class NGramConfig:
|
|
min_token_length: int = 3
|
|
min_count: int = 2
|
|
max_results: int = 100
|
|
|
|
|
|
class LinguisticAnalysis:
    """Word-frequency, n-gram, and lexical-diversity statistics computed from
    the ``content`` column of a pandas DataFrame."""

    def __init__(self, word_exclusions: set[str]):
        # Stop words: dropped from word counts and disallowed inside n-grams.
        self.word_exclusions = word_exclusions
        # Default tuning knobs (token length, n-gram min count, result cap).
        self.ngram_config = NGramConfig()
|
|
|
|
def _tokenize(self, text: str, *, include_exclusions: bool = False) -> list[str]:
|
|
pattern = rf"\b[a-z]{{{self.ngram_config.min_token_length},}}\b"
|
|
tokens = re.findall(pattern, text)
|
|
|
|
if include_exclusions:
|
|
return tokens
|
|
|
|
return [token for token in tokens if token not in self.word_exclusions]
|
|
|
|
def _clean_text(self, text: str) -> str:
|
|
text = re.sub(r"http\S+", "", text) # remove URLs
|
|
text = re.sub(r"www\S+", "", text)
|
|
text = re.sub(r"&\w+;", "", text) # remove HTML entities
|
|
text = re.sub(r"\bamp\b", "", text) # remove stray amp
|
|
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
|
|
return text
|
|
|
|
def _content_texts(self, df: pd.DataFrame) -> pd.Series:
|
|
return df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
|
|
|
|
def _valid_ngram(self, tokens: tuple[str, ...]) -> bool:
|
|
if any(token in self.word_exclusions for token in tokens):
|
|
return False
|
|
|
|
if len(set(tokens)) == 1:
|
|
return False
|
|
|
|
return True
|
|
|
|
def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
|
|
texts = self._content_texts(df)
|
|
|
|
words = []
|
|
for text in texts:
|
|
words.extend(self._tokenize(text))
|
|
|
|
counts = Counter(words)
|
|
|
|
word_frequencies = (
|
|
pd.DataFrame(counts.items(), columns=["word", "count"])
|
|
.sort_values("count", ascending=False)
|
|
.head(limit)
|
|
.reset_index(drop=True)
|
|
)
|
|
|
|
return word_frequencies.to_dict(orient="records")
|
|
|
|
def ngrams(self, df: pd.DataFrame, n: int = 2, limit: int | None = None) -> list[dict]:
|
|
if n < 2:
|
|
raise ValueError("n must be at least 2")
|
|
|
|
texts = self._content_texts(df)
|
|
all_ngrams = []
|
|
result_limit = limit or self.ngram_config.max_results
|
|
|
|
for text in texts:
|
|
tokens = self._tokenize(text, include_exclusions=True)
|
|
|
|
if len(tokens) < n:
|
|
continue
|
|
|
|
for index in range(len(tokens) - n + 1):
|
|
ngram_tokens = tuple(tokens[index : index + n])
|
|
if self._valid_ngram(ngram_tokens):
|
|
all_ngrams.append(" ".join(ngram_tokens))
|
|
|
|
counts = Counter(all_ngrams)
|
|
filtered_counts = [
|
|
(ngram, count)
|
|
for ngram, count in counts.items()
|
|
if count >= self.ngram_config.min_count
|
|
]
|
|
|
|
if not filtered_counts:
|
|
return []
|
|
|
|
return (
|
|
pd.DataFrame(filtered_counts, columns=["ngram", "count"])
|
|
.sort_values(["count", "ngram"], ascending=[False, True])
|
|
.head(result_limit)
|
|
.to_dict(orient="records")
|
|
)
|
|
|
|
def lexical_diversity(self, df: pd.DataFrame) -> dict:
|
|
tokens = (
|
|
df["content"]
|
|
.fillna("")
|
|
.astype(str)
|
|
.str.lower()
|
|
.str.findall(r"\b[a-z]{2,}\b")
|
|
.explode()
|
|
)
|
|
tokens = tokens[~tokens.isin(self.word_exclusions)]
|
|
total = max(len(tokens), 1)
|
|
unique = int(tokens.nunique())
|
|
|
|
return {
|
|
"total_tokens": total,
|
|
"unique_tokens": unique,
|
|
"ttr": round(unique / total, 4),
|
|
}
|