feat(api): add cultural endpoint

2026-02-23 17:14:12 +00:00
parent c11b4bb85b
commit 04b7094036
3 changed files with 81 additions and 1 deletions
--- a/server/analysis/linguistic.py
+++ b/server/analysis/linguistic.py
@@ -9,6 +9,10 @@ class LinguisticAnalysis:
        self.df = df
        self.word_exclusions = word_exclusions

+    def _tokenize(self, text: str):
+        tokens = re.findall(r"\b[a-z]{3,}\b", text)
+        return [t for t in tokens if t not in self.word_exclusions]
+
    def _clean_text(self, text: str) -> str:
        text = re.sub(r"http\S+", "", text)        # remove URLs
        text = re.sub(r"www\S+", "", text)
@@ -65,4 +69,45 @@ class LinguisticAnalysis:
            .sort_values("count", ascending=False)
            .head(limit)
            .to_dict(orient="records")
-        )
+        )
+        
+    def identity_markers(self):
+        df = self.df.copy()
+        df["content"] = df["content"].fillna("").astype(str).str.lower()
+
+        in_group_words = {"we", "us", "our", "ourselves"}
+        out_group_words = {"they", "them", "their", "themselves"}
+
+        emotion_exclusions = [
+            "emotion_neutral",
+            "emotion_surprise"
+        ]
+
+        emotion_cols = [
+            col for col in self.df.columns
+            if col.startswith("emotion_") and col not in emotion_exclusions
+        ]
+        in_count = 0
+        out_count = 0
+        in_emotions = {e: 0 for e in emotion_cols}
+        out_emotions = {e: 0 for e in emotion_cols}
+        total = 0
+
+        for post in df:
+            text = post["content"]
+            tokens = re.findall(r"\b[a-z]{2,}\b", text)
+            total += len(tokens)
+            in_count += sum(t in in_group_words for t in tokens)
+            out_count += sum(t in out_group_words for t in tokens)
+
+            emotions = post[emotion_cols]
+            print(emotions)
+
+            
+
+        return {
+            "in_group_usage": in_count,
+            "out_group_usage": out_count,
+            "in_group_ratio": round(in_count / max(total, 1), 5),
+            "out_group_ratio": round(out_count / max(total, 1), 5),
+        }