Files
crosspost/server/analysis/linguistic.py

121 lines
3.6 KiB
Python

import re
from collections import Counter
from dataclasses import dataclass
import pandas as pd
@dataclass(frozen=True)
class NGramConfig:
    """Immutable tuning knobs for tokenising and n-gram extraction."""

    # Shortest run of letters that is kept as a token.
    min_token_length: int = 3
    # An n-gram must occur at least this many times to be reported.
    min_count: int = 2
    # Default cap on the number of n-grams returned.
    max_results: int = 100
class LinguisticAnalysis:
def __init__(self, word_exclusions: set[str]):
self.word_exclusions = word_exclusions
self.ngram_config = NGramConfig()
def _tokenize(self, text: str, *, include_exclusions: bool = False) -> list[str]:
pattern = rf"\b[a-z]{{{self.ngram_config.min_token_length},}}\b"
tokens = re.findall(pattern, text)
if include_exclusions:
return tokens
return [token for token in tokens if token not in self.word_exclusions]
def _clean_text(self, text: str) -> str:
text = re.sub(r"http\S+", "", text) # remove URLs
text = re.sub(r"www\S+", "", text)
text = re.sub(r"&\w+;", "", text) # remove HTML entities
text = re.sub(r"\bamp\b", "", text) # remove stray amp
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
return text
def _content_texts(self, df: pd.DataFrame) -> pd.Series:
return df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
def _valid_ngram(self, tokens: tuple[str, ...]) -> bool:
if any(token in self.word_exclusions for token in tokens):
return False
if len(set(tokens)) == 1:
return False
return True
def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
texts = self._content_texts(df)
words = []
for text in texts:
words.extend(self._tokenize(text))
counts = Counter(words)
word_frequencies = (
pd.DataFrame(counts.items(), columns=["word", "count"])
.sort_values("count", ascending=False)
.head(limit)
.reset_index(drop=True)
)
return word_frequencies.to_dict(orient="records")
def ngrams(self, df: pd.DataFrame, n: int = 2, limit: int | None = None) -> list[dict]:
if n < 2:
raise ValueError("n must be at least 2")
texts = self._content_texts(df)
all_ngrams = []
result_limit = limit or self.ngram_config.max_results
for text in texts:
tokens = self._tokenize(text, include_exclusions=True)
if len(tokens) < n:
continue
for index in range(len(tokens) - n + 1):
ngram_tokens = tuple(tokens[index : index + n])
if self._valid_ngram(ngram_tokens):
all_ngrams.append(" ".join(ngram_tokens))
counts = Counter(all_ngrams)
filtered_counts = [
(ngram, count)
for ngram, count in counts.items()
if count >= self.ngram_config.min_count
]
if not filtered_counts:
return []
return (
pd.DataFrame(filtered_counts, columns=["ngram", "count"])
.sort_values(["count", "ngram"], ascending=[False, True])
.head(result_limit)
.to_dict(orient="records")
)
def lexical_diversity(self, df: pd.DataFrame) -> dict:
tokens = (
df["content"]
.fillna("")
.astype(str)
.str.lower()
.str.findall(r"\b[a-z]{2,}\b")
.explode()
)
tokens = tokens[~tokens.isin(self.word_exclusions)]
total = max(len(tokens), 1)
unique = int(tokens.nunique())
return {
"total_tokens": total,
"unique_tokens": unique,
"ttr": round(unique / total, 4),
}