feat(api): add cultural analysis endpoint with identity markers
This commit is contained in:
40
server/analysis/cultural.py
Normal file
40
server/analysis/cultural.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import re
|
||||||
|
|
||||||
|
class CulturalAnalysis:
    """Cultural-identity metrics computed over a DataFrame of posts.

    The frame must provide a ``content`` column holding the post text.
    Columns whose label starts with ``emotion_`` (other than the excluded
    ones listed below) are read as per-row numeric emotion scores.
    """

    # First-person-plural pronouns counted as in-group markers.
    _IN_GROUP_WORDS = frozenset({"we", "us", "our", "ourselves"})
    # Third-person-plural pronouns counted as out-group markers.
    _OUT_GROUP_WORDS = frozenset({"they", "them", "their", "themselves"})
    # Emotion columns left out of the per-group emotion sums.
    _EMOTION_EXCLUSIONS = frozenset({"emotion_neutral", "emotion_surprise"})
    # A token is a run of two or more lowercase ASCII letters.
    _TOKEN_RE = re.compile(r"\b[a-z]{2,}\b")

    def __init__(self, df: pd.DataFrame):
        """Keep a reference to *df*; the frame is only read, never modified."""
        self.df = df

    def get_identity_markers(self) -> dict:
        """Count in-group / out-group pronoun usage across all posts.

        Returns:
            A dict with:

            * ``in_group_usage`` / ``out_group_usage`` -- total pronoun hits.
            * ``in_group_ratio`` / ``out_group_ratio`` -- hits divided by the
              total token count (rounded to 5 decimals; the divisor is clamped
              to 1 so an empty corpus yields ``0.0``).
            * ``in_group_emotion_sums`` / ``out_group_emotion_sums`` -- only
              when emotion columns exist: per-emotion sums over the rows where
              one group's hits strictly outnumber the other's.

        Raises:
            KeyError: if the frame has no ``content`` column.
        """
        # The frame is only read here, so no defensive copy is required.
        df = self.df
        texts = df["content"].fillna("").astype(str).str.lower()

        # Column labels are not guaranteed to be strings (e.g. integer
        # labels), so check the type before calling ``startswith``.
        emotion_cols = [
            col
            for col in df.columns
            if isinstance(col, str)
            and col.startswith("emotion_")
            and col not in self._EMOTION_EXCLUSIONS
        ]

        # Tokenise each row once and derive all three tallies from that pass.
        total_tokens = 0
        in_per_row = []
        out_per_row = []
        for text in texts:
            tokens = self._TOKEN_RE.findall(text)
            total_tokens += len(tokens)
            in_per_row.append(sum(tok in self._IN_GROUP_WORDS for tok in tokens))
            out_per_row.append(sum(tok in self._OUT_GROUP_WORDS for tok in tokens))

        in_count = sum(in_per_row)
        out_count = sum(out_per_row)
        denominator = max(total_tokens, 1)

        result = {
            "in_group_usage": in_count,
            "out_group_usage": out_count,
            "in_group_ratio": round(in_count / denominator, 5),
            "out_group_ratio": round(out_count / denominator, 5),
        }

        if emotion_cols:
            # Re-use the frame's index so the boolean masks line up with rows.
            in_hits = pd.Series(in_per_row, index=df.index, dtype="int64")
            out_hits = pd.Series(out_per_row, index=df.index, dtype="int64")
            emo = df[emotion_cols].fillna(0).astype(float)
            # Rows with equal hit counts (ties) contribute to neither group.
            result["in_group_emotion_sums"] = emo[in_hits > out_hits].sum().to_dict()
            result["out_group_emotion_sums"] = emo[out_hits > in_hits].sum().to_dict()

        return result
|
||||||
@@ -69,45 +69,4 @@ class LinguisticAnalysis:
|
|||||||
.sort_values("count", ascending=False)
|
.sort_values("count", ascending=False)
|
||||||
.head(limit)
|
.head(limit)
|
||||||
.to_dict(orient="records")
|
.to_dict(orient="records")
|
||||||
)
|
)
|
||||||
|
|
||||||
def identity_markers(self):
    """Count in-group / out-group pronoun usage over ``self.df["content"]``.

    Returns:
        A dict with ``in_group_usage`` and ``out_group_usage`` (total hits)
        and ``in_group_ratio`` / ``out_group_ratio`` (hits divided by the
        total token count, rounded to 5 decimals; the divisor is clamped to 1
        so an empty corpus yields ``0.0``).
    """
    in_group_words = {"we", "us", "our", "ourselves"}
    out_group_words = {"they", "them", "their", "themselves"}

    # Normalise the text column without touching the shared frame.
    contents = self.df["content"].fillna("").astype(str).str.lower()

    in_count = 0
    out_count = 0
    total = 0

    # Iterate over the row values: iterating the DataFrame itself yields
    # column labels, which cannot be indexed with "content".
    for text in contents:
        tokens = re.findall(r"\b[a-z]{2,}\b", text)
        total += len(tokens)
        in_count += sum(t in in_group_words for t in tokens)
        out_count += sum(t in out_group_words for t in tokens)

    return {
        "in_group_usage": in_count,
        "out_group_usage": out_count,
        "in_group_ratio": round(in_count / max(total, 1), 5),
        "out_group_ratio": round(out_count / max(total, 1), 5),
    }
|
|
||||||
@@ -8,6 +8,7 @@ from server.analysis.temporal import TemporalAnalysis
|
|||||||
from server.analysis.emotional import EmotionalAnalysis
|
from server.analysis.emotional import EmotionalAnalysis
|
||||||
from server.analysis.interactional import InteractionAnalysis
|
from server.analysis.interactional import InteractionAnalysis
|
||||||
from server.analysis.linguistic import LinguisticAnalysis
|
from server.analysis.linguistic import LinguisticAnalysis
|
||||||
|
from server.analysis.cultural import CulturalAnalysis
|
||||||
|
|
||||||
DOMAIN_STOPWORDS = {
|
DOMAIN_STOPWORDS = {
|
||||||
"www", "https", "http",
|
"www", "https", "http",
|
||||||
@@ -46,6 +47,7 @@ class StatGen:
|
|||||||
self.emotional_analysis = EmotionalAnalysis(self.df)
|
self.emotional_analysis = EmotionalAnalysis(self.df)
|
||||||
self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
|
self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
|
||||||
self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
|
self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
|
||||||
|
self.cultural_analysis = CulturalAnalysis(self.df)
|
||||||
|
|
||||||
self.original_df = self.df.copy(deep=True)
|
self.original_df = self.df.copy(deep=True)
|
||||||
|
|
||||||
@@ -87,24 +89,23 @@ class StatGen:
|
|||||||
def get_user_analysis(self) -> dict:
    """Return user-level statistics from ``self.interaction_analysis``.

    Returns:
        A dict with ``top_users`` (result of ``top_users()``) and ``users``
        (result of ``per_user_analysis()``).
    """
    return {
        "top_users": self.interaction_analysis.top_users(),
        "users": self.interaction_analysis.per_user_analysis(),
    }
|
||||||
|
|
||||||
# average / max thread depth
|
# average / max thread depth
|
||||||
# high engagement threads based on volume
|
# high engagement threads based on volume
|
||||||
|
|
||||||
def get_interactional_analysis(self) -> dict:
    """Return thread-level and graph statistics from ``self.interaction_analysis``.

    Returns:
        A dict with ``average_thread_depth``,
        ``average_thread_length_by_emotion`` and ``interaction_graph``, each
        holding the result of the same-named method on
        ``self.interaction_analysis``.
    """
    return {
        "average_thread_depth": self.interaction_analysis.average_thread_depth(),
        "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(),
        "interaction_graph": self.interaction_analysis.interaction_graph(),
    }
|
||||||
|
|
||||||
# detect community jargon
|
# detect community jargon
|
||||||
# in-group and out-group linguistic markers
|
# in-group and out-group linguistic markers
|
||||||
def get_cultural_analysis(self) -> dict:
    """Return cultural statistics from ``self.cultural_analysis``.

    Returns:
        A dict with ``identity_markers``, the result of
        ``self.cultural_analysis.get_identity_markers()``.
    """
    return {
        "identity_markers": self.cultural_analysis.get_identity_markers(),
    }
|
||||||
|
|
||||||
def summary(self) -> dict:
|
def summary(self) -> dict:
|
||||||
|
|||||||
Reference in New Issue
Block a user