# Reconstructed from a whitespace-collapsed git patch:
#   commit:  e82ac8d73b9ff36f5525d67ffda0e5d9f22b903d
#   from:    Dylan De Faoite, Tue, 24 Feb 2026 15:12:17 +0000
#   subject: feat(api): add stance markers & avg emotion per entity
#
# This is the post-patch content of server/analysis/cultural.py.
#
# The same patch also changes StatGen.get_cultural_analysis in
# server/stat_gen.py so that, next to "identity_markers", it returns:
#   "stance_markers":  self.cultural_analysis.get_stance_markers()
#   "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity()
import re
from collections import Counter, defaultdict
from typing import Any

import pandas as pd

# A token is a run of two or more ASCII letters; callers lower-case first.
_TOKEN_RE = re.compile(r"\b[a-z]{2,}\b")


def _compile_phrases(phrases: frozenset[str]) -> re.Pattern[str]:
    """Compile *phrases* into one whole-word alternation pattern."""
    # Longest first so a multi-word phrase is tried before any shorter option.
    ordered = sorted(phrases, key=len, reverse=True)
    body = "|".join(re.escape(p) for p in ordered)
    return re.compile(r"\b(?:" + body + r")\b")


class CulturalAnalysis:
    """Linguistic marker statistics over a DataFrame of posts.

    The frame must contain the text column named by ``content_col``.
    Columns whose name starts with ``emotion_`` are treated as numeric
    emotion scores, and an optional ``entities`` column is expected to hold
    lists of dicts carrying a ``"text"`` key.
    """

    IN_GROUP_WORDS = frozenset({"we", "us", "our", "ourselves"})
    OUT_GROUP_WORDS = frozenset({"they", "them", "their", "themselves"})
    # Emotion columns ignored by get_identity_markers only.
    EMOTION_EXCLUSIONS = frozenset({"emotion_neutral", "emotion_surprise"})

    # Stance categories, in the order their keys appear in the result.
    STANCE_PATTERNS = {
        "hedge": _compile_phrases(frozenset({
            "maybe", "perhaps", "possibly", "probably", "likely", "seems",
            "seem", "i think", "i feel", "i guess", "kind of", "sort of",
            "somewhat",
        })),
        "certainty": _compile_phrases(frozenset({
            "definitely", "certainly", "clearly", "obviously", "undeniably",
            "always", "never",
        })),
        "deontic": _compile_phrases(frozenset({
            "must", "should", "need", "needs", "have to", "has to", "ought",
            "required", "require",
        })),
        "permission": _compile_phrases(frozenset({
            "can", "allowed", "okay", "ok", "permitted",
        })),
    }

    def __init__(
        self,
        df: pd.DataFrame,
        content_col: str = "content",
        topic_col: str = "topic",
    ):
        """Store the frame and the names of its text and topic columns.

        ``topic_col`` is kept for callers but is not read by any method in
        this class.
        """
        self.df = df
        self.content_col = content_col
        self.topic_col = topic_col

    def _lowered_content(self) -> pd.Series:
        """Return the content column as lower-cased strings (missing -> "")."""
        return self.df[self.content_col].fillna("").astype(str).str.lower()

    def get_identity_markers(self) -> dict[str, Any]:
        """Count in-group ("we") and out-group ("they") pronoun usage.

        Returns a dict with:
          * ``in_group_usage`` / ``out_group_usage``: total pronoun hits.
          * ``in_group_ratio`` / ``out_group_ratio``: hits divided by the
            total token count, rounded to 5 places.
          * ``in_group_posts`` / ``out_group_posts``: posts where one side
            strictly outnumbers the other.
          * ``tie_posts``: every other post, including posts that contain
            no identity pronoun at all.
          * ``in_group_emotion_avg`` / ``out_group_emotion_avg``: mean of
            each emotion column over the matching posts (all zeros when no
            post matches). Present only if the frame has emotion columns
            other than ``emotion_neutral`` / ``emotion_surprise``.
        """
        df = self.df
        tokens_per_row = self._lowered_content().map(_TOKEN_RE.findall)
        total_tokens = int(tokens_per_row.map(len).sum())

        in_words = self.IN_GROUP_WORDS
        out_words = self.OUT_GROUP_WORDS
        in_hits = tokens_per_row.map(
            lambda toks: sum(t in in_words for t in toks)
        ).astype(int)
        out_hits = tokens_per_row.map(
            lambda toks: sum(t in out_words for t in toks)
        ).astype(int)

        in_count = int(in_hits.sum())
        out_count = int(out_hits.sum())

        in_mask = in_hits > out_hits
        out_mask = out_hits > in_hits
        tie_mask = ~(in_mask | out_mask)

        # max(..., 1) keeps the ratios at 0.0 when there is no text at all.
        denom = max(total_tokens, 1)
        result: dict[str, Any] = {
            "in_group_usage": in_count,
            "out_group_usage": out_count,
            "in_group_ratio": round(in_count / denom, 5),
            "out_group_ratio": round(out_count / denom, 5),
            "in_group_posts": int(in_mask.sum()),
            "out_group_posts": int(out_mask.sum()),
            "tie_posts": int(tie_mask.sum()),
        }

        emotion_cols = [
            c for c in df.columns
            if c.startswith("emotion_") and c not in self.EMOTION_EXCLUSIONS
        ]
        if emotion_cols:
            # Non-numeric or missing scores count as 0.0.
            emo = df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
            zeros = pd.Series(0.0, index=emotion_cols)
            in_avg = emo.loc[in_mask].mean() if in_mask.any() else zeros
            out_avg = emo.loc[out_mask].mean() if out_mask.any() else zeros
            result["in_group_emotion_avg"] = in_avg.to_dict()
            result["out_group_emotion_avg"] = out_avg.to_dict()

        return result

    def get_stance_markers(self) -> dict[str, Any]:
        """Count hedging, certainty, deontic and permission markers.

        Matching is case-insensitive (the text is lower-cased first) and
        whole-word. Returns ``<category>_total`` (raw hit count) and
        ``<category>_per_1k_tokens`` (hits per 1000 tokens, rounded to 3
        places) for each of ``hedge``, ``certainty``, ``deontic`` and
        ``permission``. Rates are 0.0 when the frame has no tokens.
        """
        text = self._lowered_content()

        total_tokens = int(text.map(lambda t: len(_TOKEN_RE.findall(t))).sum())
        # Guard the corpus total, not each row, so empty posts do not
        # inflate the denominator.
        denom = max(total_tokens, 1)

        totals = {
            name: int(text.map(lambda t, p=pattern: len(p.findall(t))).sum())
            for name, pattern in self.STANCE_PATTERNS.items()
        }

        result: dict[str, Any] = {
            f"{name}_total": count for name, count in totals.items()
        }
        for name, count in totals.items():
            result[f"{name}_per_1k_tokens"] = round(1000 * count / denom, 3)
        return result

    def get_avg_emotions_per_entity(
        self, top_n: int = 25, min_posts: int = 10
    ) -> dict[str, Any]:
        """Average every ``emotion_*`` column over posts mentioning an entity.

        Entities are read from the ``entities`` column (lists of dicts with
        a ``"text"`` key). Entity text is stripped of surrounding
        whitespace, and names shorter than 3 characters are ignored. The
        ``top_n`` entities by mention count are considered; an entity is
        reported only if it appears in at least ``min_posts`` posts.

        Returns ``{"entity_emotion_avg": {entity: {"post_count": int,
        "emotion_avg": {column: mean}}}}``. The inner mapping is empty when
        the frame has no ``entities`` column.
        """
        df = self.df
        if "entities" not in df.columns:
            return {"entity_emotion_avg": {}}

        emotion_cols = [c for c in df.columns if c.startswith("emotion_")]

        # One pass: total mentions per entity, plus the row positions of the
        # posts that mention it (each post recorded once per entity).
        mention_counts: Counter = Counter()
        post_positions: dict[str, list[int]] = defaultdict(list)
        for pos, ents in enumerate(df["entities"]):
            if not isinstance(ents, list):
                continue
            seen_in_post: set[str] = set()
            for ent in ents:
                if not isinstance(ent, dict):
                    continue
                text = ent.get("text")
                if not isinstance(text, str):
                    continue
                text = text.strip()
                if len(text) < 3:  # filter short junk
                    continue
                mention_counts[text] += 1
                if text not in seen_in_post:
                    seen_in_post.add(text)
                    post_positions[text].append(pos)

        # Convert the emotion columns once; bad or missing scores become 0.0.
        emo = None
        if emotion_cols:
            emo = df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)

        entity_emotion_avg: dict[str, Any] = {}
        for entity_text, _ in mention_counts.most_common(top_n):
            positions = post_positions[entity_text]
            post_count = len(positions)
            if post_count < min_posts:
                continue
            emo_means = emo.iloc[positions].mean().to_dict() if emo is not None else {}
            entity_emotion_avg[entity_text] = {
                "post_count": post_count,
                "emotion_avg": emo_means,
            }

        return {"entity_emotion_avg": entity_emotion_avg}