Storage of user data and datasets in PostgreSQL #2

Merged
dylan merged 19 commits from feat/database-integration into main 2026-03-01 16:47:25 +00:00
7 changed files with 265 additions and 105 deletions
Showing only changes of commit d3c4d883be

server/analysis/cultural.py (new file, 154 lines added)

@@ -0,0 +1,154 @@
import pandas as pd
import re
from collections import Counter
from typing import Any


class CulturalAnalysis:
    def __init__(self, df: pd.DataFrame, content_col: str = "content", topic_col: str = "topic"):
        self.df = df
        self.content_col = content_col
        self.topic_col = topic_col

    def get_identity_markers(self):
        df = self.df.copy()
        s = df[self.content_col].fillna("").astype(str).str.lower()
        in_group_words = {"we", "us", "our", "ourselves"}
        out_group_words = {"they", "them", "their", "themselves"}
        emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
        emotion_cols = [
            c for c in df.columns
            if c.startswith("emotion_") and c not in emotion_exclusions
        ]
        # Tokenize per row
        tokens_per_row = s.apply(lambda txt: re.findall(r"\b[a-z]{2,}\b", txt))
        total_tokens = int(tokens_per_row.map(len).sum())
        in_hits = tokens_per_row.map(lambda toks: sum(t in in_group_words for t in toks)).astype(int)
        out_hits = tokens_per_row.map(lambda toks: sum(t in out_group_words for t in toks)).astype(int)
        in_count = int(in_hits.sum())
        out_count = int(out_hits.sum())
        in_mask = in_hits > out_hits
        out_mask = out_hits > in_hits
        tie_mask = ~(in_mask | out_mask)
        result = {
            "in_group_usage": in_count,
            "out_group_usage": out_count,
            "in_group_ratio": round(in_count / max(total_tokens, 1), 5),
            "out_group_ratio": round(out_count / max(total_tokens, 1), 5),
            "in_group_posts": int(in_mask.sum()),
            "out_group_posts": int(out_mask.sum()),
            "tie_posts": int(tie_mask.sum()),
        }
        if emotion_cols:
            emo = df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
            in_avg = emo.loc[in_mask].mean() if in_mask.any() else pd.Series(0.0, index=emotion_cols)
            out_avg = emo.loc[out_mask].mean() if out_mask.any() else pd.Series(0.0, index=emotion_cols)
            result["in_group_emotion_avg"] = in_avg.to_dict()
            result["out_group_emotion_avg"] = out_avg.to_dict()
        return result
    def get_stance_markers(self) -> dict[str, Any]:
        # lowercase so the lowercase phrase sets and the [a-z] token
        # pattern below also match capitalized occurrences
        s = self.df[self.content_col].fillna("").astype(str).str.lower()
        hedges = {
            "maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem",
            "i think", "i feel", "i guess", "kind of", "sort of", "somewhat"
        }
        certainty = {
            "definitely", "certainly", "clearly", "obviously", "undeniably", "always", "never"
        }
        deontic = {
            "must", "should", "need", "needs", "have to", "has to", "ought", "required", "require"
        }
        permission = {"can", "allowed", "okay", "ok", "permitted"}

        def count_phrases(text: str, phrases: set[str]) -> int:
            # single-word and multi-word phrases are both matched on word boundaries
            return sum(
                len(re.findall(r"\b" + re.escape(p) + r"\b", text))
                for p in phrases
            )

        hedge_counts = s.apply(lambda t: count_phrases(t, hedges))
        certainty_counts = s.apply(lambda t: count_phrases(t, certainty))
        deontic_counts = s.apply(lambda t: count_phrases(t, deontic))
        perm_counts = s.apply(lambda t: count_phrases(t, permission))
        token_counts = s.apply(lambda t: len(re.findall(r"\b[a-z]{2,}\b", t))).replace(0, 1)
        return {
            "hedge_total": int(hedge_counts.sum()),
            "certainty_total": int(certainty_counts.sum()),
            "deontic_total": int(deontic_counts.sum()),
            "permission_total": int(perm_counts.sum()),
            "hedge_per_1k_tokens": round(1000 * hedge_counts.sum() / token_counts.sum(), 3),
            "certainty_per_1k_tokens": round(1000 * certainty_counts.sum() / token_counts.sum(), 3),
            "deontic_per_1k_tokens": round(1000 * deontic_counts.sum() / token_counts.sum(), 3),
            "permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3),
        }
    def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]:
        if "entities" not in self.df.columns:
            return {"entity_emotion_avg": {}}
        df = self.df
        emotion_cols = [c for c in df.columns if c.startswith("emotion_")]
        entity_counter = Counter()
        for row in df["entities"].dropna():
            if isinstance(row, list):
                for ent in row:
                    if isinstance(ent, dict):
                        text = ent.get("text")
                        if isinstance(text, str):
                            text = text.strip()
                            if len(text) >= 3:  # filter short junk
                                entity_counter[text] += 1
        top_entities = entity_counter.most_common(top_n)
        entity_emotion_avg = {}
        for entity_text, _ in top_entities:
            mask = df["entities"].apply(
                lambda ents: isinstance(ents, list)
                and any(isinstance(e, dict) and e.get("text") == entity_text for e in ents)
            )
            post_count = int(mask.sum())
            if post_count >= min_posts:
                emo_means = (
                    df.loc[mask, emotion_cols]
                    .apply(pd.to_numeric, errors="coerce")
                    .fillna(0.0)
                    .mean()
                    .to_dict()
                )
                entity_emotion_avg[entity_text] = {
                    "post_count": post_count,
                    "emotion_avg": emo_means
                }
        return {
            "entity_emotion_avg": entity_emotion_avg
        }
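A minimal usage sketch of the new class (not part of the diff; the toy DataFrame is invented for illustration, but the column names follow what the NLP pipeline produces):

import pandas as pd
from server.analysis.cultural import CulturalAnalysis

df = pd.DataFrame({
    "content": ["We must protect our community", "Maybe they will ignore them again"],
    "topic": ["community", "politics"],
    "emotion_anger": [0.1, 0.7],
    "emotion_joy": [0.6, 0.05],
    "entities": [[{"text": "community"}], None],
})

ca = CulturalAnalysis(df)
print(ca.get_identity_markers())    # in-/out-group counts, ratios, per-group emotion averages
print(ca.get_stance_markers())      # hedge/certainty/deontic/permission totals and per-1k rates
print(ca.get_avg_emotions_per_entity(min_posts=1))  # default of 10 lowered for this tiny example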

server/analysis/emotional.py

@@ -5,14 +5,9 @@ class EmotionalAnalysis:
         self.df = df

     def avg_emotion_by_topic(self) -> dict:
-        emotion_exclusions = [
-            "emotion_neutral",
-            "emotion_surprise"
-        ]
         emotion_cols = [
             col for col in self.df.columns
-            if col.startswith("emotion_") and col not in emotion_exclusions
+            if col.startswith("emotion_")
         ]
         counts = (
server/analysis/interactional.py

@@ -3,6 +3,7 @@ import re
 from collections import Counter

 class InteractionAnalysis:
     def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
         self.df = df

@@ -12,7 +13,9 @@ class InteractionAnalysis:
         tokens = re.findall(r"\b[a-z]{3,}\b", text)
         return [t for t in tokens if t not in self.word_exclusions]

-    def _vocab_richness_per_user(self, min_words: int = 20, top_most_used_words: int = 100) -> list:
+    def _vocab_richness_per_user(
+        self, min_words: int = 20, top_most_used_words: int = 100
+    ) -> list:
         df = self.df.copy()
         df["content"] = df["content"].fillna("").astype(str).str.lower()
         df["tokens"] = df["content"].apply(self._tokenize)
@@ -39,15 +42,17 @@ class InteractionAnalysis:
                 for w, c in counts.most_common(top_most_used_words)
             ]
-            rows.append({
-                "author": author,
-                "events": int(events),
-                "total_words": int(total_words),
-                "unique_words": int(unique_words),
-                "vocab_richness": round(vocab_richness, 3),
-                "avg_words_per_event": round(avg_words, 2),
-                "top_words": top_words
-            })
+            rows.append(
+                {
+                    "author": author,
+                    "events": int(events),
+                    "total_words": int(total_words),
+                    "unique_words": int(unique_words),
+                    "vocab_richness": round(vocab_richness, 3),
+                    "avg_words_per_event": round(avg_words, 2),
+                    "top_words": top_words,
+                }
+            )
         rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True)
@@ -55,9 +60,7 @@ class InteractionAnalysis:
     def top_users(self) -> list:
         counts = (
-            self.df.groupby(["author", "source"])
-            .size()
-            .sort_values(ascending=False)
+            self.df.groupby(["author", "source"]).size().sort_values(ascending=False)
         )
         top_users = [
@@ -68,19 +71,29 @@ class InteractionAnalysis:
         return top_users

     def per_user_analysis(self) -> dict:
-        per_user = (
-            self.df.groupby(["author", "type"])
-            .size()
-            .unstack(fill_value=0)
-        )
+        per_user = self.df.groupby(["author", "type"]).size().unstack(fill_value=0)
+        emotion_cols = [col for col in self.df.columns if col.startswith("emotion_")]
+        avg_emotions_by_author = {}
+        if emotion_cols:
+            avg_emotions = self.df.groupby("author")[emotion_cols].mean().fillna(0.0)
+            avg_emotions_by_author = {
+                author: {emotion: float(score) for emotion, score in row.items()}
+                for author, row in avg_emotions.iterrows()
+            }
         # ensure columns always exist
         for col in ("post", "comment"):
             if col not in per_user.columns:
                 per_user[col] = 0
-        per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(0, 1)
-        per_user["comment_share"] = per_user["comment"] / (per_user["post"] + per_user["comment"]).replace(0, 1)
+        per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(
+            0, 1
+        )
+        per_user["comment_share"] = per_user["comment"] / (
+            per_user["post"] + per_user["comment"]
+        ).replace(0, 1)
         per_user = per_user.sort_values("comment_post_ratio", ascending=True)
         per_user_records = per_user.reset_index().to_dict(orient="records")
@@ -91,14 +104,17 @@ class InteractionAnalysis:
         merged_users = []
         for row in per_user_records:
             author = row["author"]
-            merged_users.append({
-                "author": author,
-                "post": int(row.get("post", 0)),
-                "comment": int(row.get("comment", 0)),
-                "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
-                "comment_share": float(row.get("comment_share", 0)),
-                "vocab": vocab_by_author.get(author)
-            })
+            merged_users.append(
+                {
+                    "author": author,
+                    "post": int(row.get("post", 0)),
+                    "comment": int(row.get("comment", 0)),
+                    "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
+                    "comment_share": float(row.get("comment_share", 0)),
+                    "avg_emotions": avg_emotions_by_author.get(author, {}),
+                    "vocab": vocab_by_author.get(author, {"vocab_richness": 0, "avg_words_per_event": 0, "top_words": []}),
+                }
+            )
         merged_users.sort(key=lambda u: u["comment_post_ratio"])
@@ -151,7 +167,8 @@ class InteractionAnalysis:
         emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
         emotion_cols = [
-            c for c in self.df.columns
+            c
+            for c in self.df.columns
             if c.startswith("emotion_") and c not in emotion_exclusions
         ]
@@ -174,14 +191,18 @@ class InteractionAnalysis:
             reply_to = id_to_reply.get(current)
-            if reply_to is None or (isinstance(reply_to, float) and pd.isna(reply_to)) or reply_to == "":
+            if (
+                reply_to is None
+                or (isinstance(reply_to, float) and pd.isna(reply_to))
+                or reply_to == ""
+            ):
                 break
             length += 1
             current = reply_to
             if current in length_cache:
-                length += (length_cache[current] - 1)
+                length += length_cache[current] - 1
                 break
         length_cache[start_id] = length

server/analysis/linguistic.py

@@ -70,44 +70,3 @@ class LinguisticAnalysis:
             .head(limit)
             .to_dict(orient="records")
         )
-
-    def identity_markers(self):
-        df = self.df.copy()
-        df["content"] = df["content"].fillna("").astype(str).str.lower()
-        in_group_words = {"we", "us", "our", "ourselves"}
-        out_group_words = {"they", "them", "their", "themselves"}
-        emotion_exclusions = [
-            "emotion_neutral",
-            "emotion_surprise"
-        ]
-        emotion_cols = [
-            col for col in self.df.columns
-            if col.startswith("emotion_") and col not in emotion_exclusions
-        ]
-        in_count = 0
-        out_count = 0
-        in_emotions = {e: 0 for e in emotion_cols}
-        out_emotions = {e: 0 for e in emotion_cols}
-        total = 0
-        for post in df:
-            text = post["content"]
-            tokens = re.findall(r"\b[a-z]{2,}\b", text)
-            total += len(tokens)
-            in_count += sum(t in in_group_words for t in tokens)
-            out_count += sum(t in out_group_words for t in tokens)
-            emotions = post[emotion_cols]
-            print(emotions)
-        return {
-            "in_group_usage": in_count,
-            "out_group_usage": out_count,
-            "in_group_ratio": round(in_count / max(total, 1), 5),
-            "out_group_ratio": round(out_count / max(total, 1), 5),
-        }


@@ -200,6 +200,35 @@ class NLP:
             if column.startswith("emotion_") and column not in emotion_df.columns:
                 self.df[column] = 0.0
+        # drop neutral and surprise columns from df and normalize others to sum to 1
+        drop_cols = ["emotion_neutral", "emotion_surprise"]
+        existing_drop = [c for c in drop_cols if c in self.df.columns]
+        self.df.drop(columns=existing_drop, inplace=True)
+        remaining_emotion_cols = [
+            c for c in self.df.columns
+            if c.startswith("emotion_")
+        ]
+        if remaining_emotion_cols:
+            emotion_matrix = (
+                self.df[remaining_emotion_cols]
+                .apply(pd.to_numeric, errors="coerce")
+                .fillna(0.0)
+            )
+            row_sums = emotion_matrix.sum(axis=1)
+            # Avoid division by zero
+            row_sums = row_sums.replace(0, 1.0)
+            normalized = emotion_matrix.div(row_sums, axis=0)
+            self.df[remaining_emotion_cols] = normalized.values

     def add_topic_col(self, confidence_threshold: float = 0.3) -> None:
         titles = self.df[self.title_col].fillna("").astype(str)
         contents = self.df[self.content_col].fillna("").astype(str)

@@ -276,3 +305,5 @@ class NLP:
         self.df[col_name] = [
             d.get(label, 0) for d in entity_count_dicts
         ]
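As a sanity check on the normalization step added above, a standalone toy pandas example (not part of the diff; column names invented but following the emotion_ prefix convention):

import pandas as pd

emotions = pd.DataFrame({"emotion_anger": [0.2, 0.0], "emotion_joy": [0.6, 0.0]})
row_sums = emotions.sum(axis=1).replace(0, 1.0)  # all-zero rows divide by 1 instead of 0
print(emotions.div(row_sums, axis=0))
# row 0 becomes anger 0.25 / joy 0.75; the all-zero row stays at 0.0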


@@ -215,8 +215,8 @@ def get_interaction_analysis():
         print(traceback.format_exc())
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500

-@app.route('/filter/search', methods=["POST"])
-def search_dataset():
+@app.route('/filter/query', methods=["POST"])
+def filter_query():
     if stat_obj is None:
         return jsonify({"error": "No data uploaded"}), 400

@@ -226,7 +226,7 @@ def search_dataset():
         return jsonify(stat_obj.df.to_dict(orient="records")), 200

     query = data["query"]
-    filtered_df = stat_obj.search(query)
+    filtered_df = stat_obj.filter_by_query(query)
     return jsonify(filtered_df), 200
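A quick sketch of hitting the renamed endpoint (host and port are assumed local defaults, not taken from the diff):

import requests

resp = requests.post(
    "http://localhost:5000/filter/query",  # assumed local Flask dev address
    json={"query": "festival"},
)
print(resp.status_code, len(resp.json()))  # filtered records matching the query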


@@ -8,6 +8,7 @@ from server.analysis.temporal import TemporalAnalysis
 from server.analysis.emotional import EmotionalAnalysis
 from server.analysis.interactional import InteractionAnalysis
 from server.analysis.linguistic import LinguisticAnalysis
+from server.analysis.cultural import CulturalAnalysis

 DOMAIN_STOPWORDS = {
     "www", "https", "http",
@@ -15,8 +16,7 @@ DOMAIN_STOPWORDS = {
     "comment", "comments",
     "discussion", "thread",
     "post", "posts",
-    "would", "could", "should",
-    "like", "get", "one"
+    "would", "get", "one"
 }

 nltk.download('stopwords')
@@ -40,33 +40,32 @@ class StatGen:
         self.df.drop(columns=["post_id"], inplace=True, errors="ignore")
         self.nlp = NLP(self.df, "title", "content", domain_topics)
-        self._add_extra_cols(self.df)
+        self.nlp.add_emotion_cols()
+        self.nlp.add_topic_col()
+        self.nlp.add_ner_cols()
+        self._add_time_cols(self.df)
         self.temporal_analysis = TemporalAnalysis(self.df)
         self.emotional_analysis = EmotionalAnalysis(self.df)
         self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
         self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
+        self.cultural_analysis = CulturalAnalysis(self.df)
         self.original_df = self.df.copy(deep=True)

     ## Private Methods
-    def _add_extra_cols(self, df: pd.DataFrame) -> None:
-        df['timestamp'] = pd.to_numeric(self.df['timestamp'], errors='coerce')
+    def _add_time_cols(self, df: pd.DataFrame) -> None:
+        df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')
         df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date
         df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
         df["hour"] = df["dt"].dt.hour
         df["weekday"] = df["dt"].dt.day_name()
-        self.nlp.add_emotion_cols()
-        self.nlp.add_topic_col()
-        self.nlp.add_ner_cols()

     ## Public
     # topics over time
     # emotions over time
-    def get_time_analysis(self) -> pd.DataFrame:
+    def get_time_analysis(self) -> dict:
         return {
             "events_per_day": self.temporal_analysis.posts_per_day(),
             "weekday_hour_heatmap": self.temporal_analysis.heatmap()
@@ -87,24 +86,25 @@ class StatGen:
     def get_user_analysis(self) -> dict:
         return {
             "top_users": self.interaction_analysis.top_users(),
-            "users": self.interaction_analysis.per_user_analysis(),
-            "interaction_graph": self.interaction_analysis.interaction_graph()
+            "users": self.interaction_analysis.per_user_analysis()
         }

     # average / max thread depth
     # high engagement threads based on volume
     def get_interactional_analysis(self) -> dict:
         return {
             "average_thread_depth": self.interaction_analysis.average_thread_depth(),
-            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion()
+            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(),
+            "interaction_graph": self.interaction_analysis.interaction_graph()
         }

     # detect community jargon
     # in-group and out-group linguistic markers
     def get_cultural_analysis(self) -> dict:
         return {
-            "identity_markers": self.linguistic_analysis.identity_markers()
+            "identity_markers": self.cultural_analysis.get_identity_markers(),
+            "stance_markers": self.cultural_analysis.get_stance_markers(),
+            "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity()
         }

     def summary(self) -> dict:
@@ -127,7 +127,7 @@ class StatGen:
             "sources": self.df["source"].dropna().unique().tolist()
         }

-    def search(self, search_query: str) -> dict:
+    def filter_by_query(self, search_query: str) -> dict:
         self.df = self.df[
             self.df["content"].str.contains(search_query)
         ]
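Downstream, a rough sketch of exercising the new wiring (StatGen's full constructor signature is not shown in this diff, so the arguments here are assumed):

stats = StatGen(df)  # df as loaded from the PostgreSQL-backed store; other constructor args assumed
cultural = stats.get_cultural_analysis()
print(cultural["identity_markers"]["in_group_ratio"])
print(cultural["stance_markers"]["hedge_per_1k_tokens"])
print(list(cultural["entity_salience"]["entity_emotion_avg"])[:5])  # top entities by salience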