# File-viewer header residue from the paste: Python source, 113 lines, 3.5 KiB.
import re
from collections import Counter
from itertools import islice

import pandas as pd


class LinguisticAnalysis:
    """Word-level statistics computed over the ``content`` column of a DataFrame.

    The frame is read, never modified. Every public method lowercases the
    text before tokenizing, so all reported words and n-grams are lowercase.
    """

    # Lowercase ASCII words of three or more letters (input must already be
    # lowercased, otherwise capitalised words are split or dropped).
    _WORD_RE = re.compile(r"\b[a-z]{3,}\b")

    # Group pronouns such as "we" and "us" are only two letters long, so
    # identity counting needs a shorter minimum token length.
    _SHORT_WORD_RE = re.compile(r"\b[a-z]{2,}\b")

    # Noise stripped by ``_clean_text``, applied in this order. Matching is
    # case-insensitive so "HTTP://..." and "IMG.JPG" are removed as well.
    _NOISE_PATTERNS = (
        re.compile(r"http\S+", re.IGNORECASE),  # URLs
        re.compile(r"www\S+", re.IGNORECASE),  # scheme-less URLs
        re.compile(r"&\w+;"),  # HTML entities
        re.compile(r"\bamp\b", re.IGNORECASE),  # stray "amp" left by "&amp;"
        re.compile(r"\S+\.(jpg|jpeg|png|webp|gif)", re.IGNORECASE),  # image names
    )

    _IN_GROUP_WORDS = frozenset({"we", "us", "our", "ourselves"})
    _OUT_GROUP_WORDS = frozenset({"they", "them", "their", "themselves"})

    def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
        """Store the data and the stop-word set.

        Args:
            df: Frame with a ``content`` column holding the post text.
            word_exclusions: Lowercase words to leave out of word counts.
        """
        self.df = df
        self.word_exclusions = word_exclusions

    def _tokenize(self, text: str) -> list[str]:
        """Return the 3+ letter words of already-lowercased *text*, minus exclusions."""
        return [
            token
            for token in self._WORD_RE.findall(text)
            if token not in self.word_exclusions
        ]

    def _clean_text(self, text: str) -> str:
        """Return *text* with URLs, HTML entities, stray "amp" and image names removed.

        Only the matched spans are deleted; surrounding whitespace is kept.
        """
        for pattern in self._NOISE_PATTERNS:
            text = pattern.sub("", text)
        return text

    def word_frequencies(self, limit: int = 100) -> list[dict]:
        """Return the *limit* most frequent words across all posts.

        Missing ``content`` values are skipped. Words in ``word_exclusions``
        are not counted.

        Returns:
            Records of the form ``{"word": str, "count": int}``, most frequent
            first. Words with equal counts keep first-seen order.
        """
        counts: Counter = Counter()
        for text in self.df["content"].dropna().astype(str).str.lower():
            counts.update(self._tokenize(text))
        return [
            {"word": word, "count": count}
            for word, count in counts.most_common(limit)
        ]

    def ngrams(self, n: int = 2, limit: int = 100) -> list[dict]:
        """Return the *limit* most frequent *n*-word sequences across all posts.

        Text is lowercased and passed through ``_clean_text`` first. N-grams
        never span two posts. An *n* below 1 yields an empty list.

        Returns:
            Records of the form ``{"ngram": str, "count": int}``, most frequent
            first, with the words of each n-gram joined by single spaces.
        """
        if n < 1:
            return []

        counts: Counter = Counter()
        for text in self.df["content"].dropna().astype(str).str.lower():
            # Stop words are deliberately kept: dropping them would glue
            # together words that were never adjacent in the post.
            tokens = self._WORD_RE.findall(self._clean_text(text))
            # n staggered views of the token list, zipped into sliding windows.
            windows = zip(*(islice(tokens, start, None) for start in range(n)))
            counts.update(" ".join(window) for window in windows)

        return [
            {"ngram": gram, "count": count}
            for gram, count in counts.most_common(limit)
        ]

    def identity_markers(self) -> dict:
        """Count in-group ("we", "us", ...) and out-group ("they", ...) pronouns.

        Missing ``content`` values count as empty text. Any ``emotion_*``
        columns in the frame are not used.

        Returns:
            A dict with ``in_group_usage`` and ``out_group_usage`` (raw counts)
            and ``in_group_ratio`` and ``out_group_ratio`` (count divided by
            the number of 2+ letter tokens, rounded to 5 places; 0.0 when
            there are no tokens).
        """
        in_count = 0
        out_count = 0
        total = 0

        # Iterate over the text values themselves; iterating a DataFrame
        # directly yields its column labels, not its rows.
        for text in self.df["content"].fillna("").astype(str).str.lower():
            tokens = self._SHORT_WORD_RE.findall(text)
            total += len(tokens)
            in_count += sum(token in self._IN_GROUP_WORDS for token in tokens)
            out_count += sum(token in self._OUT_GROUP_WORDS for token in tokens)

        denominator = max(total, 1)  # avoid dividing by zero on empty input
        return {
            "in_group_usage": in_count,
            "out_group_usage": out_count,
            "in_group_ratio": round(in_count / denominator, 5),
            "out_group_ratio": round(out_count / denominator, 5),
        }