Files
crosspost/server/analysis/cultural.py

180 lines
6.4 KiB
Python

import pandas as pd
import re
from typing import Any
class CulturalAnalysis:
    """Lexical marker statistics computed over a DataFrame of posts.

    Every method takes a DataFrame and returns a plain ``dict``. Text is read
    from ``content_col``; per-post emotion scores are read from every column
    whose name starts with ``"emotion_"``.

    All text matching is case-insensitive: content is lower-cased before the
    (lower-case) patterns are applied. A "token" is a run of two or more
    ASCII letters, so single letters, digits and non-ASCII words are not
    counted towards token totals.
    """

    # Emotion columns left out of the identity/stance averages.
    _EMOTION_EXCLUSIONS = frozenset({"emotion_neutral", "emotion_surprise"})

    # Patterns are kept as plain strings because ``Series.str.count`` is
    # documented to take a regex *string*.
    _TOKEN_PATTERN = r"\b[a-z]{2,}\b"
    _IN_GROUP_PATTERN = r"\b(we|us|our|ourselves)\b"
    _OUT_GROUP_PATTERN = r"\b(they|them|their|themselves)\b"

    # Marker family -> pattern. Insertion order fixes the key order of the
    # dict returned by ``get_stance_markers``.
    _STANCE_PATTERNS = {
        "hedge": (
            r"\b(maybe|perhaps|possibly|probably|likely|seems|seem|i think|i feel|i guess|kind of|sort of|somewhat)\b"
        ),
        "certainty": r"\b(definitely|certainly|clearly|obviously|undeniably|always|never)\b",
        "deontic": r"\b(must|should|need|needs|have to|has to|ought|required|require)\b",
        "permission": r"\b(can|allowed|okay|ok|permitted)\b",
    }

    def __init__(self, content_col: str = "content", topic_col: str = "topic"):
        """Remember which columns hold the post text and the topic label.

        Args:
            content_col: Name of the column containing the post text.
            topic_col: Name of the topic column. It is stored on the
                instance but not read by any method of this class.
        """
        self.content_col = content_col
        self.topic_col = topic_col

    def _normalized_text(self, df: pd.DataFrame) -> pd.Series:
        """Return the content column as lower-cased strings (missing -> "")."""
        return df[self.content_col].fillna("").astype(str).str.lower()

    @classmethod
    def _emotion_columns(
        cls, df: pd.DataFrame, *, skip_excluded: bool = True
    ) -> list[str]:
        """List the ``emotion_*`` columns, optionally dropping the exclusions."""
        return [
            col
            for col in df.columns
            if col.startswith("emotion_")
            and not (skip_excluded and col in cls._EMOTION_EXCLUSIONS)
        ]

    @staticmethod
    def _masked_emotion_mean(emo: pd.DataFrame, mask: pd.Series) -> dict[str, Any]:
        """Mean of each emotion column over rows where ``mask`` is true.

        Returns 0.0 for every column when no row is selected, so callers
        never see NaN for an empty group.
        """
        if mask.any():
            return emo.loc[mask].mean().to_dict()
        return dict.fromkeys(emo.columns, 0.0)

    @staticmethod
    def _entity_text(entity: Any) -> "str | None":
        """Extract a usable entity label from one ``ner_entities`` element.

        Returns the stripped ``"text"`` value when the element is a dict
        whose text is a string of at least three characters *after*
        stripping; otherwise ``None``.
        """
        if not isinstance(entity, dict):
            return None
        text = entity.get("text")
        if not isinstance(text, str):
            return None
        text = text.strip()
        return text if len(text) >= 3 else None

    def get_identity_markers(self, original_df: pd.DataFrame) -> dict[str, Any]:
        """Count in-group ("we/us/our") versus out-group ("they/them") pronouns.

        Args:
            original_df: Posts to analyse. It is only read, never modified.

        Returns:
            A dict with:
              * ``in_group_usage`` / ``out_group_usage``: total pronoun hits.
              * ``in_group_ratio`` / ``out_group_ratio``: hits divided by the
                total token count (floored at 1), rounded to 5 places.
              * ``in_group_posts`` / ``out_group_posts`` / ``tie_posts``:
                number of posts where one family strictly dominates, or
                neither does.
              * ``in_group_emotion_avg`` / ``out_group_emotion_avg``: mean
                emotion scores over the dominated posts; only present when
                the frame has non-excluded ``emotion_*`` columns.
        """
        text = self._normalized_text(original_df)
        emotion_cols = self._emotion_columns(original_df)

        in_hits = text.str.count(self._IN_GROUP_PATTERN)
        out_hits = text.str.count(self._OUT_GROUP_PATTERN)
        # Floor at 1 so an empty or token-less corpus yields ratios of 0.0.
        total_tokens = max(int(text.str.count(self._TOKEN_PATTERN).sum()), 1)

        in_count = int(in_hits.sum())
        out_count = int(out_hits.sum())

        in_mask = in_hits > out_hits
        out_mask = out_hits > in_hits
        tie_mask = ~(in_mask | out_mask)

        result: dict[str, Any] = {
            "in_group_usage": in_count,
            "out_group_usage": out_count,
            "in_group_ratio": round(in_count / total_tokens, 5),
            "out_group_ratio": round(out_count / total_tokens, 5),
            "in_group_posts": int(in_mask.sum()),
            "out_group_posts": int(out_mask.sum()),
            "tie_posts": int(tie_mask.sum()),
        }

        if emotion_cols:
            # Non-numeric or missing scores are treated as 0.0.
            emo = (
                original_df[emotion_cols]
                .apply(pd.to_numeric, errors="coerce")
                .fillna(0.0)
            )
            result["in_group_emotion_avg"] = self._masked_emotion_mean(emo, in_mask)
            result["out_group_emotion_avg"] = self._masked_emotion_mean(emo, out_mask)

        return result

    def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
        """Count hedging, certainty, deontic and permission markers.

        Matching is case-insensitive, so "Maybe" and "I think" count the
        same as their lower-case forms.

        Args:
            df: Posts to analyse.

        Returns:
            A dict with, for each family in (hedge, certainty, deontic,
            permission):
              * ``<family>_total``: total marker hits.
              * ``<family>_per_1k_tokens``: hits per 1000 tokens, rounded to
                3 places; 0.0 when the corpus has no tokens.
              * ``<family>_emotion_avg``: mean emotion scores over posts
                containing at least one marker of that family; only present
                when the frame has non-excluded ``emotion_*`` columns.
        """
        text = self._normalized_text(df)
        emotion_cols = self._emotion_columns(df)

        counts = {
            family: text.str.count(pattern)
            for family, pattern in self._STANCE_PATTERNS.items()
        }
        totals = {family: int(hits.sum()) for family, hits in counts.items()}
        # One floor on the corpus total: empty posts add no tokens, and an
        # empty frame gives rates of 0.0 instead of NaN.
        total_tokens = max(int(text.str.count(self._TOKEN_PATTERN).sum()), 1)

        result: dict[str, Any] = {
            f"{family}_total": total for family, total in totals.items()
        }
        for family, total in totals.items():
            result[f"{family}_per_1k_tokens"] = round(1000 * total / total_tokens, 3)

        if emotion_cols:
            # Non-numeric or missing scores are treated as 0.0.
            emo = df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
            for family, hits in counts.items():
                result[f"{family}_emotion_avg"] = self._masked_emotion_mean(
                    emo, hits > 0
                )

        return result

    def get_avg_emotions_per_entity(
        self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10
    ) -> dict[str, Any]:
        """Average emotion scores of the posts mentioning each frequent entity.

        Each element of the ``ner_entities`` column is expected to be a
        list of dicts with a ``"text"`` key; anything else is ignored. An
        entity repeated within one post counts once for that post, so
        ``post_count`` and ``min_posts`` refer to posts, not mentions.

        Unlike the other methods, all ``emotion_*`` columns are used here,
        including ``emotion_neutral`` and ``emotion_surprise``.

        Args:
            df: Posts to analyse.
            top_n: Only the ``top_n`` most frequently mentioned entities are
                considered.
            min_posts: Entities mentioned in fewer posts than this are
                dropped.

        Returns:
            ``{"entity_emotion_avg": {entity: {"post_count": int,
            "emotion_avg": {column: mean}}}}``. The inner mapping is empty
            when the frame has no ``ner_entities`` column or no entity
            qualifies.
        """
        if "ner_entities" not in df.columns:
            return {"entity_emotion_avg": {}}

        emotion_cols = self._emotion_columns(df, skip_excluded=False)

        # Positional index: after explode() the index value identifies the
        # source post even if the caller's index has duplicate labels.
        posts = df[["ner_entities", *emotion_cols]].reset_index(drop=True)
        for col in emotion_cols:
            posts[col] = pd.to_numeric(posts[col], errors="coerce")

        entity_df = posts.explode("ner_entities")
        entity_df["_post_id"] = entity_df.index
        entity_df["entity_text"] = [
            self._entity_text(entity) for entity in entity_df["ner_entities"]
        ]
        entity_df = entity_df.dropna(subset=["entity_text"])
        # Count each post at most once per entity.
        entity_df = entity_df.drop_duplicates(subset=["_post_id", "entity_text"])

        entity_counts = entity_df["entity_text"].value_counts().head(top_n)
        by_entity = entity_df.groupby("entity_text")

        entity_emotion_avg: dict[str, Any] = {}
        for entity_text, count in entity_counts.items():
            if count < min_posts:
                continue
            entity_emotion_avg[entity_text] = {
                "post_count": int(count),
                "emotion_avg": (
                    by_entity.get_group(entity_text)[emotion_cols].mean().to_dict()
                ),
            }

        return {"entity_emotion_avg": entity_emotion_avg}