feat(api): add cultural analysis endpoint with identity markers

This commit is contained in:
2026-02-24 14:25:53 +00:00
parent 257eb80de7
commit ccba6a5262
3 changed files with 47 additions and 47 deletions

View File

@@ -0,0 +1,40 @@
import pandas as pd
import re
class CulturalAnalysis:
    """Compute in-group / out-group pronoun statistics over a table of posts.

    The wrapped DataFrame must provide a ``content`` column with the post
    text. Columns whose label is a string starting with ``emotion_`` (except
    ``emotion_neutral`` and ``emotion_surprise``) are summed as per-row
    emotion scores.
    """

    _IN_GROUP_WORDS = frozenset({"we", "us", "our", "ourselves"})
    _OUT_GROUP_WORDS = frozenset({"they", "them", "their", "themselves"})
    _EMOTION_EXCLUSIONS = frozenset({"emotion_neutral", "emotion_surprise"})
    # Compiled once: lowercase alphabetic tokens of at least two letters.
    _TOKEN_RE = re.compile(r"\b[a-z]{2,}\b")

    def __init__(self, df: pd.DataFrame):
        self.df = df

    def get_identity_markers(self) -> dict:
        """Count in-group and out-group pronouns across all posts.

        Returns:
            A dict with ``in_group_usage`` / ``out_group_usage`` (total
            pronoun hits) and ``in_group_ratio`` / ``out_group_ratio`` (hits
            divided by the total token count, rounded to 5 decimals; 0.0 when
            there are no tokens). When emotion columns exist, it also holds
            ``in_group_emotion_sums`` and ``out_group_emotion_sums``: the
            per-column emotion totals over rows where in-group hits strictly
            exceed out-group hits, and vice versa. Rows with equal hit counts
            contribute to neither.

        Raises:
            KeyError: if the DataFrame has no ``content`` column.
        """
        # Nothing below mutates the frame, so no defensive copy is needed.
        df = self.df
        texts = df["content"].fillna("").astype(str).str.lower()

        # Column labels are not guaranteed to be strings; skip those that are
        # not instead of failing on ``startswith``.
        emotion_cols = [
            col
            for col in df.columns
            if isinstance(col, str)
            and col.startswith("emotion_")
            and col not in self._EMOTION_EXCLUSIONS
        ]

        in_words = self._IN_GROUP_WORDS
        out_words = self._OUT_GROUP_WORDS

        # Per-row token lists, then per-row hit counts for each pronoun set.
        tokens_per_row = texts.map(self._TOKEN_RE.findall)
        total_tokens = int(tokens_per_row.map(len).sum())
        in_hits = tokens_per_row.map(
            lambda toks: sum(tok in in_words for tok in toks)
        )
        out_hits = tokens_per_row.map(
            lambda toks: sum(tok in out_words for tok in toks)
        )
        in_count = int(in_hits.sum())
        out_count = int(out_hits.sum())

        # max(..., 1) guards the division when no tokens were found.
        denominator = max(total_tokens, 1)
        result = {
            "in_group_usage": in_count,
            "out_group_usage": out_count,
            "in_group_ratio": round(in_count / denominator, 5),
            "out_group_ratio": round(out_count / denominator, 5),
        }

        if emotion_cols:
            emo = df[emotion_cols].fillna(0).astype(float)
            in_sums = emo[in_hits > out_hits].sum()
            out_sums = emo[out_hits > in_hits].sum()
            # Plain floats, matching the built-in ints used for the counts.
            result["in_group_emotion_sums"] = {
                col: float(val) for col, val in in_sums.items()
            }
            result["out_group_emotion_sums"] = {
                col: float(val) for col, val in out_sums.items()
            }
        return result

View File

@@ -69,45 +69,4 @@ class LinguisticAnalysis:
.sort_values("count", ascending=False) .sort_values("count", ascending=False)
.head(limit) .head(limit)
.to_dict(orient="records") .to_dict(orient="records")
) )
def identity_markers(self):
    """Count in-group and out-group pronouns across all posts.

    Reads the ``content`` column of ``self.df``, lowercases it (missing
    values become empty strings) and tokenizes it into alphabetic words of
    at least two letters.

    Returns:
        A dict with ``in_group_usage`` / ``out_group_usage`` (total pronoun
        hits) and ``in_group_ratio`` / ``out_group_ratio`` (hits divided by
        the total token count, rounded to 5 decimals; 0.0 when there are no
        tokens).
    """
    in_group_words = {"we", "us", "our", "ourselves"}
    out_group_words = {"they", "them", "their", "themselves"}

    texts = self.df["content"].fillna("").astype(str).str.lower()

    in_count = 0
    out_count = 0
    total = 0
    # Iterate the text values themselves: iterating a DataFrame directly
    # yields its column labels, not its rows.
    for text in texts:
        tokens = re.findall(r"\b[a-z]{2,}\b", text)
        total += len(tokens)
        in_count += sum(t in in_group_words for t in tokens)
        out_count += sum(t in out_group_words for t in tokens)

    # max(..., 1) guards the division when no tokens were found.
    return {
        "in_group_usage": in_count,
        "out_group_usage": out_count,
        "in_group_ratio": round(in_count / max(total, 1), 5),
        "out_group_ratio": round(out_count / max(total, 1), 5),
    }

View File

@@ -8,6 +8,7 @@ from server.analysis.temporal import TemporalAnalysis
from server.analysis.emotional import EmotionalAnalysis from server.analysis.emotional import EmotionalAnalysis
from server.analysis.interactional import InteractionAnalysis from server.analysis.interactional import InteractionAnalysis
from server.analysis.linguistic import LinguisticAnalysis from server.analysis.linguistic import LinguisticAnalysis
from server.analysis.cultural import CulturalAnalysis
DOMAIN_STOPWORDS = { DOMAIN_STOPWORDS = {
"www", "https", "http", "www", "https", "http",
@@ -46,6 +47,7 @@ class StatGen:
self.emotional_analysis = EmotionalAnalysis(self.df) self.emotional_analysis = EmotionalAnalysis(self.df)
self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS) self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS) self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
self.cultural_analysis = CulturalAnalysis(self.df)
self.original_df = self.df.copy(deep=True) self.original_df = self.df.copy(deep=True)
@@ -87,24 +89,23 @@ class StatGen:
def get_user_analysis(self) -> dict:
    """Return user-level statistics gathered from ``self.interaction_analysis``.

    Returns:
        A dict with ``top_users`` (result of ``top_users()``) and ``users``
        (result of ``per_user_analysis()``).
    """
    return {
        "top_users": self.interaction_analysis.top_users(),
        "users": self.interaction_analysis.per_user_analysis(),
    }
# average / max thread depth
# high engagement threads based on volume
def get_interactional_analysis(self) -> dict:
    """Return thread-level statistics gathered from ``self.interaction_analysis``.

    Returns:
        A dict with ``average_thread_depth``,
        ``average_thread_length_by_emotion`` and ``interaction_graph``, each
        holding the result of the same-named method on
        ``self.interaction_analysis``.
    """
    return {
        "average_thread_depth": self.interaction_analysis.average_thread_depth(),
        "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(),
        "interaction_graph": self.interaction_analysis.interaction_graph(),
    }
# detect community jargon
# in-group and out-group linguistic markers
def get_cultural_analysis(self) -> dict:
    """Return cultural statistics gathered from ``self.cultural_analysis``.

    Returns:
        A dict with ``identity_markers``, the result of
        ``self.cultural_analysis.get_identity_markers()``.
    """
    return {
        "identity_markers": self.cultural_analysis.get_identity_markers(),
    }
def summary(self) -> dict: def summary(self) -> dict: