feat(api): add stance markers & avg emotion per entity
This commit is contained in:
@@ -1,40 +1,154 @@
|
||||
import pandas as pd
|
||||
import re
|
||||
|
||||
from collections import Counter
|
||||
from typing import Any
|
||||
|
||||
|
||||
class CulturalAnalysis:
    """Lexicon-based identity, stance and entity-emotion statistics for posts.

    All methods read ``self.df``. Post text is taken from ``content_col``.
    Emotion scores are taken from every column whose name starts with
    ``emotion_``. ``get_avg_emotions_per_entity`` additionally reads an
    ``entities`` column whose cells are lists of dicts carrying a ``"text"``
    key; cells of any other shape are ignored.
    """

    # A token is a run of two or more ASCII letters; text is lower-cased
    # before this pattern is applied.
    _TOKEN_RE = re.compile(r"\b[a-z]{2,}\b")

    _IN_GROUP_WORDS = frozenset({"we", "us", "our", "ourselves"})
    _OUT_GROUP_WORDS = frozenset({"they", "them", "their", "themselves"})

    # Emotion columns left out of the in-group/out-group emotion averages.
    _IDENTITY_EMOTION_EXCLUSIONS = frozenset({"emotion_neutral", "emotion_surprise"})

    # Stance lexicons. Insertion order fixes the key order of the result dict.
    _STANCE_LEXICONS = {
        "hedge": (
            "maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem",
            "i think", "i feel", "i guess", "kind of", "sort of", "somewhat",
        ),
        "certainty": (
            "definitely", "certainly", "clearly", "obviously", "undeniably",
            "always", "never",
        ),
        "deontic": (
            "must", "should", "need", "needs", "have to", "has to", "ought",
            "required", "require",
        ),
        "permission": ("can", "allowed", "okay", "ok", "permitted"),
    }

    def __init__(self, df: pd.DataFrame, content_col: str = "content", topic_col: str = "topic"):
        """Store the DataFrame and the names of its text and topic columns.

        Args:
            df: One row per post. It is kept by reference, not copied.
            content_col: Name of the column holding the post text.
            topic_col: Name of the topic column (stored; not read by the
                methods of this class).
        """
        self.df = df
        self.content_col = content_col
        self.topic_col = topic_col

    def _lowercased_content(self) -> pd.Series:
        """Return the content column as lower-case strings (missing -> "")."""
        return self.df[self.content_col].fillna("").astype(str).str.lower()

    @staticmethod
    def _emotion_columns(df: pd.DataFrame, exclude=frozenset()) -> list:
        """Return the ``emotion_*`` column names of *df* not listed in *exclude*."""
        return [
            c for c in df.columns
            if isinstance(c, str) and c.startswith("emotion_") and c not in exclude
        ]

    @staticmethod
    def _numeric_emotions(df: pd.DataFrame, cols: list) -> pd.DataFrame:
        """Return ``df[cols]`` coerced to numbers, with unparsable/missing -> 0.0."""
        return df[cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)

    @staticmethod
    def _phrase_pattern(phrases) -> "re.Pattern":
        """Compile one whole-word alternation matching any phrase in *phrases*.

        Longer phrases are tried first so a phrase is never shadowed by a
        shorter alternative that happens to be its prefix.
        """
        ordered = sorted(phrases, key=len, reverse=True)
        return re.compile(r"\b(?:" + "|".join(re.escape(p) for p in ordered) + r")\b")

    def get_identity_markers(self) -> dict[str, Any]:
        """Count in-group ("we") vs out-group ("they") pronoun usage.

        Returns:
            A dict with:
            - ``in_group_usage`` / ``out_group_usage``: total pronoun hits.
            - ``in_group_ratio`` / ``out_group_ratio``: hits divided by the
              total token count (rounded to 5 places; 0 tokens counts as 1).
            - ``in_group_posts`` / ``out_group_posts`` / ``tie_posts``: number
              of posts where in-group hits exceed, trail, or equal out-group
              hits (posts with no hits at all are ties).
            - ``in_group_emotion_avg`` / ``out_group_emotion_avg``: mean of
              each non-excluded ``emotion_*`` column over the in-group /
              out-group posts (all zeros when the group has no posts). Only
              present when such columns exist.
        """
        df = self.df
        text = self._lowercased_content()
        tokens_per_row = text.map(self._TOKEN_RE.findall)

        in_words = self._IN_GROUP_WORDS
        out_words = self._OUT_GROUP_WORDS
        in_hits = tokens_per_row.map(
            lambda toks: sum(1 for tok in toks if tok in in_words)
        ).astype(int)
        out_hits = tokens_per_row.map(
            lambda toks: sum(1 for tok in toks if tok in out_words)
        ).astype(int)

        total_tokens = int(tokens_per_row.map(len).sum())
        denominator = max(total_tokens, 1)  # avoid 0/0 on empty input
        in_count = int(in_hits.sum())
        out_count = int(out_hits.sum())

        in_mask = in_hits > out_hits
        out_mask = out_hits > in_hits
        tie_mask = ~(in_mask | out_mask)

        result = {
            "in_group_usage": in_count,
            "out_group_usage": out_count,
            "in_group_ratio": round(in_count / denominator, 5),
            "out_group_ratio": round(out_count / denominator, 5),
            "in_group_posts": int(in_mask.sum()),
            "out_group_posts": int(out_mask.sum()),
            "tie_posts": int(tie_mask.sum()),
        }

        emotion_cols = self._emotion_columns(df, self._IDENTITY_EMOTION_EXCLUSIONS)
        if emotion_cols:
            emo = self._numeric_emotions(df, emotion_cols)
            zeros = pd.Series(0.0, index=emotion_cols)
            # Index with plain boolean arrays so selection is positional and
            # does not depend on index-label alignment.
            in_avg = emo.loc[in_mask.to_numpy()].mean() if in_mask.any() else zeros
            out_avg = emo.loc[out_mask.to_numpy()].mean() if out_mask.any() else zeros
            result["in_group_emotion_avg"] = in_avg.to_dict()
            result["out_group_emotion_avg"] = out_avg.to_dict()

        return result

    def get_stance_markers(self) -> dict[str, Any]:
        """Count hedging, certainty, deontic and permission markers.

        Matching is case-insensitive: the text is lower-cased before both the
        phrase search and the token count, because the lexicons and the token
        pattern are lowercase-only.

        Returns:
            A dict with ``<name>_total`` (whole-word phrase occurrences over
            all posts) followed by ``<name>_per_1k_tokens`` (occurrences per
            1000 tokens, rounded to 3 places) for each of ``hedge``,
            ``certainty``, ``deontic`` and ``permission``. With no tokens at
            all the rates are ``0.0``.
        """
        text = self._lowercased_content()

        token_re = self._TOKEN_RE
        total_tokens = int(text.map(lambda t: len(token_re.findall(t))).sum())
        # Guard the aggregate, not each row: padding every empty post to one
        # token would deflate the rates.
        denominator = max(total_tokens, 1)

        totals = {}
        for name, phrases in self._STANCE_LEXICONS.items():
            pattern = self._phrase_pattern(phrases)
            counts = text.map(lambda t, pat=pattern: len(pat.findall(t)))
            totals[name] = int(counts.sum())

        result: dict[str, Any] = {f"{name}_total": total for name, total in totals.items()}
        for name, total in totals.items():
            result[f"{name}_per_1k_tokens"] = round(1000 * total / denominator, 3)
        return result

    def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]:
        """Average the emotion columns over the posts mentioning each top entity.

        Entity text is stripped of surrounding whitespace, and texts shorter
        than 3 characters are dropped, both when ranking entities and when
        deciding which posts mention them.

        Args:
            top_n: Number of entities to consider, ranked by total mentions.
            min_posts: Minimum number of distinct posts mentioning an entity
                for it to appear in the result.

        Returns:
            ``{"entity_emotion_avg": {entity: {"post_count": int,
            "emotion_avg": {emotion_col: mean}}}}``. The inner dict is empty
            when the DataFrame has no ``entities`` column.
        """
        df = self.df
        if "entities" not in df.columns:
            return {"entity_emotion_avg": {}}

        mention_counts: Counter = Counter()
        rows_by_entity: dict[str, list] = {}

        # Single pass: tally mentions and remember which rows (by position)
        # mention each entity, instead of rescanning the frame per entity.
        for position, ents in enumerate(df["entities"]):
            if not isinstance(ents, list):
                continue
            seen_in_post = set()
            for ent in ents:
                if not isinstance(ent, dict):
                    continue
                text = ent.get("text")
                if not isinstance(text, str):
                    continue
                text = text.strip()
                if len(text) < 3:  # filter short junk
                    continue
                mention_counts[text] += 1
                if text not in seen_in_post:
                    seen_in_post.add(text)
                    rows_by_entity.setdefault(text, []).append(position)

        emotion_cols = self._emotion_columns(df)
        emo = self._numeric_emotions(df, emotion_cols) if emotion_cols else None

        entity_emotion_avg = {}
        for entity_text, _ in mention_counts.most_common(top_n):
            positions = rows_by_entity[entity_text]
            post_count = len(positions)
            if post_count < min_posts:
                continue
            emo_means = emo.iloc[positions].mean().to_dict() if emo is not None else {}
            entity_emotion_avg[entity_text] = {
                "post_count": post_count,
                "emotion_avg": emo_means,
            }

        return {"entity_emotion_avg": entity_emotion_avg}
|
||||
@@ -105,7 +105,9 @@ class StatGen:
|
||||
# in-group and out-group linguistic markers
|
||||
def get_cultural_analysis(self) -> dict:
    """Collect the cultural-analysis statistics into one dict.

    Returns:
        A dict with the results of ``self.cultural_analysis``'s
        ``get_identity_markers()``, ``get_stance_markers()`` and
        ``get_avg_emotions_per_entity()`` under the keys
        ``"identity_markers"``, ``"stance_markers"`` and
        ``"entity_salience"`` respectively.
    """
    analysis = self.cultural_analysis
    return {
        "identity_markers": analysis.get_identity_markers(),
        "stance_markers": analysis.get_stance_markers(),
        "entity_salience": analysis.get_avg_emotions_per_entity(),
    }
|
||||
|
||||
def summary(self) -> dict:
|
||||
|
||||
Reference in New Issue
Block a user