From ccba6a52620bcc2e3bae2b723566076775b01daa Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Tue, 24 Feb 2026 14:25:53 +0000
Subject: [PATCH] feat(api): add cultural analysis endpoint with identity markers

---
# Review notes (free text after the three-dash line; not part of the
# commit message and skipped when the diff is applied).
#
# What this patch does
#   * Adds server/analysis/cultural.py with class CulturalAnalysis and
#     its method get_identity_markers().
#   * Removes LinguisticAnalysis.identity_markers() from
#     server/analysis/linguistic.py.
#   * StatGen builds a CulturalAnalysis from self.df and
#     get_cultural_analysis() now calls
#     self.cultural_analysis.get_identity_markers().
#   * Moves the "interaction_graph" key from get_user_analysis() to
#     get_interactional_analysis().
#
# How get_identity_markers() computes its result
#   * "content" is null-filled with "", cast to str and lower-cased.
#   * Tokens are the matches of r"\b[a-z]{2,}\b", so one-letter words
#     are not part of the token total used for the ratios.
#   * in/out-group usage counts tokens found in the two pronoun sets;
#     each ratio is count / max(total_tokens, 1), rounded to 5 places.
#   * When the frame has "emotion_*" columns other than emotion_neutral
#     and emotion_surprise, the result also carries
#     "in_group_emotion_sums" and "out_group_emotion_sums": column sums
#     over rows where in_hits > out_hits, and where out_hits > in_hits.
#     Rows with equal hit counts (including zero) feed neither sum.
#
# Why the old method is dropped rather than moved verbatim
#   * It looped with "for post in df:" and then indexed
#     post["content"]; iterating a DataFrame yields column labels, not
#     rows, so that loop could not have processed posts.
#   * It contained a leftover print(emotions) and never filled the
#     in_emotions / out_emotions dicts it created.
#
# NOTE(review)
#   * Payload shape changes: "interaction_graph" leaves the
#     user-analysis dict and "identity_markers" may gain two keys.
#     Confirm that consumers of these dicts are updated.
#   * The subject says "endpoint", but only the analysis class and the
#     StatGen wiring are touched here; no route file is in the
#     diffstat. Confirm the route already exists or lands separately.
#   * get_identity_markers() starts with self.df.copy(), yet the method
#     only reads from that frame; the copy looks avoidable.
#   * server/analysis/cultural.py is created without a trailing
#     newline (see the "No newline at end of file" marker).
#
 server/analysis/cultural.py   | 40 ++++++++++++++++++++++++++++++++
 server/analysis/linguistic.py | 43 +----------------------------------
 server/stat_gen.py            | 11 +++++----
 3 files changed, 47 insertions(+), 47 deletions(-)
 create mode 100644 server/analysis/cultural.py

diff --git a/server/analysis/cultural.py b/server/analysis/cultural.py
new file mode 100644
index 0000000..fb8bb9d
--- /dev/null
+++ b/server/analysis/cultural.py
@@ -0,0 +1,40 @@
+import pandas as pd
+import re
+
+class CulturalAnalysis:
+    def __init__(self, df: pd.DataFrame):
+        self.df = df
+
+    def get_identity_markers(self):
+        df = self.df.copy()
+        s = df["content"].fillna("").astype(str).str.lower()
+
+        in_group_words = {"we", "us", "our", "ourselves"}
+        out_group_words = {"they", "them", "their", "themselves"}
+
+        emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
+        emotion_cols = [c for c in df.columns if c.startswith("emotion_") and c not in emotion_exclusions]
+
+        # token counts per row
+        tokens_per_row = s.apply(lambda txt: re.findall(r"\b[a-z]{2,}\b", txt))
+
+        total_tokens = int(tokens_per_row.map(len).sum())
+        in_hits = tokens_per_row.map(lambda toks: sum(t in in_group_words for t in toks))
+        out_hits = tokens_per_row.map(lambda toks: sum(t in out_group_words for t in toks))
+
+        in_count = int(in_hits.sum())
+        out_count = int(out_hits.sum())
+
+        result = {
+            "in_group_usage": in_count,
+            "out_group_usage": out_count,
+            "in_group_ratio": round(in_count / max(total_tokens, 1), 5),
+            "out_group_ratio": round(out_count / max(total_tokens, 1), 5),
+        }
+
+        if emotion_cols:
+            emo = df[emotion_cols].fillna(0).astype(float)
+            result["in_group_emotion_sums"] = emo[in_hits > out_hits].sum().to_dict()
+            result["out_group_emotion_sums"] = emo[out_hits > in_hits].sum().to_dict()
+
+        return result
\ No newline at end of file
diff --git a/server/analysis/linguistic.py b/server/analysis/linguistic.py
index 8cfcc7f..5718edc 100644
--- a/server/analysis/linguistic.py
+++ b/server/analysis/linguistic.py
@@ -69,45 +69,4 @@ class LinguisticAnalysis:
             .sort_values("count", ascending=False)
             .head(limit)
             .to_dict(orient="records")
-        )
-
-    def identity_markers(self):
-        df = self.df.copy()
-        df["content"] = df["content"].fillna("").astype(str).str.lower()
-
-        in_group_words = {"we", "us", "our", "ourselves"}
-        out_group_words = {"they", "them", "their", "themselves"}
-
-        emotion_exclusions = [
-            "emotion_neutral",
-            "emotion_surprise"
-        ]
-
-        emotion_cols = [
-            col for col in self.df.columns
-            if col.startswith("emotion_") and col not in emotion_exclusions
-        ]
-        in_count = 0
-        out_count = 0
-        in_emotions = {e: 0 for e in emotion_cols}
-        out_emotions = {e: 0 for e in emotion_cols}
-        total = 0
-
-        for post in df:
-            text = post["content"]
-            tokens = re.findall(r"\b[a-z]{2,}\b", text)
-            total += len(tokens)
-            in_count += sum(t in in_group_words for t in tokens)
-            out_count += sum(t in out_group_words for t in tokens)
-
-            emotions = post[emotion_cols]
-            print(emotions)
-
-
-
-        return {
-            "in_group_usage": in_count,
-            "out_group_usage": out_count,
-            "in_group_ratio": round(in_count / max(total, 1), 5),
-            "out_group_ratio": round(out_count / max(total, 1), 5),
-        }
\ No newline at end of file
+        )
\ No newline at end of file
diff --git a/server/stat_gen.py b/server/stat_gen.py
index 6ac7159..8cd9e70 100644
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -8,6 +8,7 @@ from server.analysis.temporal import TemporalAnalysis
 from server.analysis.emotional import EmotionalAnalysis
 from server.analysis.interactional import InteractionAnalysis
 from server.analysis.linguistic import LinguisticAnalysis
+from server.analysis.cultural import CulturalAnalysis
 
 DOMAIN_STOPWORDS = {
     "www", "https", "http",
@@ -46,6 +47,7 @@ class StatGen:
         self.emotional_analysis = EmotionalAnalysis(self.df)
         self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
         self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
+        self.cultural_analysis = CulturalAnalysis(self.df)
 
         self.original_df = self.df.copy(deep=True)
 
@@ -87,24 +89,23 @@ class StatGen:
     def get_user_analysis(self) -> dict:
         return {
             "top_users": self.interaction_analysis.top_users(),
-            "users": self.interaction_analysis.per_user_analysis(),
-            "interaction_graph": self.interaction_analysis.interaction_graph()
+            "users": self.interaction_analysis.per_user_analysis()
         }
 
     # average / max thread depth
     # high engagment threads based on volume
-
     def get_interactional_analysis(self) -> dict:
         return {
             "average_thread_depth": self.interaction_analysis.average_thread_depth(),
-            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion()
+            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(),
+            "interaction_graph": self.interaction_analysis.interaction_graph()
         }
 
     # detect community jargon
     # in-group and out-group linguistic markers
     def get_cultural_analysis(self) -> dict:
         return {
-            "identity_markers": self.linguistic_analysis.identity_markers()
+            "identity_markers": self.cultural_analysis.get_identity_markers()
         }
 
     def summary(self) -> dict: