Compare commits

8 commits (c11b4bb85b ... main)

| Author | SHA1 | Date |
|---|---|---|
| | ce0aa6bc43 | |
| | e82ac8d73b | |
| | ccba6a5262 | |
| | 257eb80de7 | |
| | 3a23b1f0c8 | |
| | 8c76476cd3 | |
| | 397986dc89 | |
| | 04b7094036 | |
@@ -34,7 +34,7 @@ function ApiToGraphData(apiData: InteractionGraph) {
 }


-const InteractionStats = (props: { data: UserAnalysisResponse }) => {
+const UserStats = (props: { data: UserAnalysisResponse }) => {
   const graphData = ApiToGraphData(props.data.interaction_graph);

   return (
@@ -44,7 +44,7 @@ const InteractionStats = (props: { data: UserAnalysisResponse }) => {
         This graph visualizes interactions between users based on comments and replies.
         Nodes represent users, and edges represent interactions (e.g., comments or replies) between them.
       </p>
-      <div style={{ height: "600px", border: "1px solid #ccc", borderRadius: 8, marginTop: 16 }}>
+      <div>
         <ForceGraph3D
           graphData={graphData}
           nodeAutoColorBy="id"
@@ -58,4 +58,4 @@ const InteractionStats = (props: { data: UserAnalysisResponse }) => {
   );
 }

-export default InteractionStats;
+export default UserStats;
@@ -3,7 +3,7 @@ import axios from "axios";
 import StatsStyling from "../styles/stats_styling";
 import SummaryStats from "../components/SummaryStats";
 import EmotionalStats from "../components/EmotionalStats";
-import InteractionStats from "../components/InteractionStats";
+import InteractionStats from "../components/UserStats";

 import {
   type SummaryResponse,
server/analysis/cultural.py (new file, 154 lines)
@@ -0,0 +1,154 @@
+import pandas as pd
+import re
+
+from collections import Counter
+from typing import Any
+
+
+class CulturalAnalysis:
+    def __init__(self, df: pd.DataFrame, content_col: str = "content", topic_col: str = "topic"):
+        self.df = df
+        self.content_col = content_col
+        self.topic_col = topic_col
+
+    def get_identity_markers(self):
+        df = self.df.copy()
+        s = df[self.content_col].fillna("").astype(str).str.lower()
+
+        in_group_words = {"we", "us", "our", "ourselves"}
+        out_group_words = {"they", "them", "their", "themselves"}
+
+        emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
+        emotion_cols = [
+            c for c in df.columns
+            if c.startswith("emotion_") and c not in emotion_exclusions
+        ]
+
+        # Tokenize per row
+        tokens_per_row = s.apply(lambda txt: re.findall(r"\b[a-z]{2,}\b", txt))
+
+        total_tokens = int(tokens_per_row.map(len).sum())
+        in_hits = tokens_per_row.map(lambda toks: sum(t in in_group_words for t in toks)).astype(int)
+        out_hits = tokens_per_row.map(lambda toks: sum(t in out_group_words for t in toks)).astype(int)
+
+        in_count = int(in_hits.sum())
+        out_count = int(out_hits.sum())
+
+        in_mask = in_hits > out_hits
+        out_mask = out_hits > in_hits
+        tie_mask = ~(in_mask | out_mask)
+
+        result = {
+            "in_group_usage": in_count,
+            "out_group_usage": out_count,
+            "in_group_ratio": round(in_count / max(total_tokens, 1), 5),
+            "out_group_ratio": round(out_count / max(total_tokens, 1), 5),
+            "in_group_posts": int(in_mask.sum()),
+            "out_group_posts": int(out_mask.sum()),
+            "tie_posts": int(tie_mask.sum()),
+        }
+
+        if emotion_cols:
+            emo = df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
+
+            in_avg = emo.loc[in_mask].mean() if in_mask.any() else pd.Series(0.0, index=emotion_cols)
+            out_avg = emo.loc[out_mask].mean() if out_mask.any() else pd.Series(0.0, index=emotion_cols)
+
+            result["in_group_emotion_avg"] = in_avg.to_dict()
+            result["out_group_emotion_avg"] = out_avg.to_dict()
+
+        return result
+
+    def get_stance_markers(self) -> dict[str, Any]:
+        # Lowercase so sentence-initial markers ("Maybe", "Clearly") are counted too.
+        s = self.df[self.content_col].fillna("").astype(str).str.lower()
+
+        hedges = {
+            "maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem",
+            "i think", "i feel", "i guess", "kind of", "sort of", "somewhat"
+        }
+        certainty = {
+            "definitely", "certainly", "clearly", "obviously", "undeniably", "always", "never"
+        }
+
+        deontic = {
+            "must", "should", "need", "needs", "have to", "has to", "ought", "required", "require"
+        }
+
+        permission = {"can", "allowed", "okay", "ok", "permitted"}
+
+        def count_phrases(text: str, phrases: set[str]) -> int:
+            # \b-delimited search handles single words and multi-word phrases alike,
+            # so no separate branch is needed for phrases containing spaces.
+            c = 0
+            for p in phrases:
+                c += len(re.findall(r"\b" + re.escape(p) + r"\b", text))
+            return c
+
+        hedge_counts = s.apply(lambda t: count_phrases(t, hedges))
+        certainty_counts = s.apply(lambda t: count_phrases(t, certainty))
+        deontic_counts = s.apply(lambda t: count_phrases(t, deontic))
+        perm_counts = s.apply(lambda t: count_phrases(t, permission))
+
+        token_counts = s.apply(lambda t: len(re.findall(r"\b[a-z]{2,}\b", t))).replace(0, 1)
+
+        return {
+            "hedge_total": int(hedge_counts.sum()),
+            "certainty_total": int(certainty_counts.sum()),
+            "deontic_total": int(deontic_counts.sum()),
+            "permission_total": int(perm_counts.sum()),
+            "hedge_per_1k_tokens": round(1000 * hedge_counts.sum() / token_counts.sum(), 3),
+            "certainty_per_1k_tokens": round(1000 * certainty_counts.sum() / token_counts.sum(), 3),
+            "deontic_per_1k_tokens": round(1000 * deontic_counts.sum() / token_counts.sum(), 3),
+            "permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3),
+        }
+
+    def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]:
+        if "entities" not in self.df.columns:
+            return {"entity_emotion_avg": {}}
+
+        df = self.df
+        emotion_cols = [c for c in df.columns if c.startswith("emotion_")]
+
+        entity_counter = Counter()
+
+        for row in df["entities"].dropna():
+            if isinstance(row, list):
+                for ent in row:
+                    if isinstance(ent, dict):
+                        text = ent.get("text")
+                        if isinstance(text, str):
+                            text = text.strip()
+                            if len(text) >= 3:  # filter short junk
+                                entity_counter[text] += 1
+
+        top_entities = entity_counter.most_common(top_n)
+
+        entity_emotion_avg = {}
+
+        for entity_text, _ in top_entities:
+            mask = df["entities"].apply(
+                lambda ents: isinstance(ents, list) and
+                any(isinstance(e, dict) and e.get("text") == entity_text for e in ents)
+            )
+
+            post_count = int(mask.sum())
+
+            if post_count >= min_posts:
+                emo_means = (
+                    df.loc[mask, emotion_cols]
+                    .apply(pd.to_numeric, errors="coerce")
+                    .fillna(0.0)
+                    .mean()
+                    .to_dict()
+                )
+
+                entity_emotion_avg[entity_text] = {
+                    "post_count": post_count,
+                    "emotion_avg": emo_means
+                }
+
+        return {
+            "entity_emotion_avg": entity_emotion_avg
+        }
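A minimal usage sketch for the new class (the toy DataFrame and its values are illustrative; the column names and import path follow this diff):

    import pandas as pd
    from server.analysis.cultural import CulturalAnalysis

    df = pd.DataFrame({
        "content": ["We should stick together.", "They never listen to us."],
        "emotion_anger": [0.1, 0.7],
        "emotion_joy": [0.6, 0.1],
    })

    ca = CulturalAnalysis(df)
    print(ca.get_identity_markers())         # in/out-group counts, ratios, per-group emotion averages
    print(ca.get_stance_markers())           # "should" lands in deontic, "never" in certainty
    print(ca.get_avg_emotions_per_entity())  # {"entity_emotion_avg": {}} -- no "entities" column here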
@@ -5,14 +5,9 @@ class EmotionalAnalysis:
         self.df = df

     def avg_emotion_by_topic(self) -> dict:
-        emotion_exclusions = [
-            "emotion_neutral",
-            "emotion_surprise"
-        ]
-
         emotion_cols = [
             col for col in self.df.columns
-            if col.startswith("emotion_") and col not in emotion_exclusions
+            if col.startswith("emotion_")
         ]

         counts = (
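The exclusion list is not simply dropped here: as the NLP hunk further down shows, emotion_neutral and emotion_surprise are now removed from the DataFrame during preprocessing, so every downstream analysis sees them filtered out without repeating the list.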
@@ -124,3 +124,85 @@ class InteractionAnalysis:
                 interactions[a][b] = interactions[a].get(b, 0) + 1

         return interactions
+
+    def average_thread_depth(self):
+        depths = []
+        id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
+        for _, row in self.df.iterrows():
+            depth = 0
+            current_id = row["id"]
+            seen = set()
+
+            while True:
+                # Guard against reply cycles, mirroring thread_length_from below.
+                if current_id in seen:
+                    break
+                seen.add(current_id)
+
+                reply_to = id_to_reply.get(current_id)
+                if pd.isna(reply_to) or reply_to == "":
+                    break
+
+                depth += 1
+                current_id = reply_to
+
+            depths.append(depth)
+
+        if not depths:
+            return 0
+
+        return round(sum(depths) / len(depths), 2)
+
+    def average_thread_length_by_emotion(self):
+        emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
+
+        emotion_cols = [
+            c for c in self.df.columns
+            if c.startswith("emotion_") and c not in emotion_exclusions
+        ]
+
+        id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
+        length_cache = {}
+
+        def thread_length_from(start_id):
+            if start_id in length_cache:
+                return length_cache[start_id]
+
+            seen = set()
+            length = 1
+            current = start_id
+
+            while True:
+                if current in seen:
+                    # infinite loop shouldn't happen, but just in case
+                    break
+                seen.add(current)
+
+                reply_to = id_to_reply.get(current)
+
+                if reply_to is None or (isinstance(reply_to, float) and pd.isna(reply_to)) or reply_to == "":
+                    break
+
+                length += 1
+                current = reply_to
+
+                if current in length_cache:
+                    length += (length_cache[current] - 1)
+                    break
+
+            length_cache[start_id] = length
+            return length
+
+        emotion_to_lengths = {}
+
+        # Fill NaNs in emotion cols to avoid max() issues
+        emo_df = self.df[["id"] + emotion_cols].copy()
+        emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
+
+        for _, row in emo_df.iterrows():
+            msg_id = row["id"]
+            length = thread_length_from(msg_id)
+
+            emotions = {c: row[c] for c in emotion_cols}
+            dominant = max(emotions, key=emotions.get)
+
+            emotion_to_lengths.setdefault(dominant, []).append(length)
+
+        return {
+            emotion: round(sum(lengths) / len(lengths), 2)
+            for emotion, lengths in emotion_to_lengths.items()
+        }
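A quick sanity check for the new thread metrics (toy frame; "id", "reply_to", and the emotion columns follow the code above, while the constructor's second argument is assumed from the StatGen change below, which passes EXCLUDE_WORDS):

    import pandas as pd
    from server.analysis.interactional import InteractionAnalysis

    df = pd.DataFrame({
        "id": ["a", "b", "c"],
        "reply_to": ["", "a", "b"],   # b replies to a, c replies to b; a is a root post
        "emotion_anger": [0.1, 0.9, 0.8],
        "emotion_joy": [0.9, 0.1, 0.2],
    })

    ia = InteractionAnalysis(df, set())
    print(ia.average_thread_depth())              # (0 + 1 + 2) / 3 -> 1.0
    print(ia.average_thread_length_by_emotion())  # joy: [1] -> 1.0; anger: [2, 3] -> 2.5

The length_cache memoization pays off on long chains: once a parent's chain length is known, every descendant's walk stops at the first cached ancestor instead of re-tracing to the root.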
@@ -9,6 +9,10 @@ class LinguisticAnalysis:
         self.df = df
         self.word_exclusions = word_exclusions

+    def _tokenize(self, text: str):
+        tokens = re.findall(r"\b[a-z]{3,}\b", text)
+        return [t for t in tokens if t not in self.word_exclusions]
+
     def _clean_text(self, text: str) -> str:
         text = re.sub(r"http\S+", "", text)  # remove URLs
         text = re.sub(r"www\S+", "", text)
@@ -200,6 +200,35 @@ class NLP:
             if column.startswith("emotion_") and column not in emotion_df.columns:
                 self.df[column] = 0.0

+        # drop neutral and surprise columns from df and normalize others to sum to 1
+        drop_cols = ["emotion_neutral", "emotion_surprise"]
+
+        existing_drop = [c for c in drop_cols if c in self.df.columns]
+        self.df.drop(columns=existing_drop, inplace=True)
+
+        remaining_emotion_cols = [
+            c for c in self.df.columns
+            if c.startswith("emotion_")
+        ]
+
+        if remaining_emotion_cols:
+            emotion_matrix = (
+                self.df[remaining_emotion_cols]
+                .apply(pd.to_numeric, errors="coerce")
+                .fillna(0.0)
+            )
+
+            row_sums = emotion_matrix.sum(axis=1)
+
+            # Avoid division by zero
+            row_sums = row_sums.replace(0, 1.0)
+
+            normalized = emotion_matrix.div(row_sums, axis=0)
+
+            self.df[remaining_emotion_cols] = normalized.values
+
+
     def add_topic_col(self, confidence_threshold: float = 0.3) -> None:
         titles = self.df[self.title_col].fillna("").astype(str)
         contents = self.df[self.content_col].fillna("").astype(str)
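The normalization step in isolation, for reference (toy values; all-zero rows stay at zero because their divisor is swapped to 1.0):

    import pandas as pd

    m = pd.DataFrame({"emotion_anger": [0.2, 0.0], "emotion_joy": [0.6, 0.0]})
    row_sums = m.sum(axis=1).replace(0, 1.0)
    print(m.div(row_sums, axis=0))
    # row 0 -> 0.25 / 0.75 (sums to 1); row 1 stays 0.0 / 0.0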
@@ -276,3 +305,5 @@ class NLP:
         self.df[col_name] = [
             d.get(label, 0) for d in entity_count_dicts
         ]
+
+
@@ -55,7 +55,7 @@ def word_frequencies():
         return jsonify({"error": "No data uploaded"}), 400

     try:
-        return jsonify(stat_obj.content_analysis()), 200
+        return jsonify(stat_obj.get_content_analysis()), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -80,7 +80,7 @@ def get_time_analysis():
         return jsonify({"error": "No data uploaded"}), 400

     try:
-        return jsonify(stat_obj.time_analysis()), 200
+        return jsonify(stat_obj.get_time_analysis()), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -93,7 +93,33 @@ def get_user_analysis():
         return jsonify({"error": "No data uploaded"}), 400

     try:
-        return jsonify(stat_obj.user_analysis()), 200
+        return jsonify(stat_obj.get_user_analysis()), 200
+    except ValueError as e:
+        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
+    except Exception as e:
+        print(traceback.format_exc())
+        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
+
+
+@app.route("/stats/cultural", methods=["GET"])
+def get_cultural_analysis():
+    if stat_obj is None:
+        return jsonify({"error": "No data uploaded"}), 400
+
+    try:
+        return jsonify(stat_obj.get_cultural_analysis()), 200
+    except ValueError as e:
+        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
+    except Exception as e:
+        print(traceback.format_exc())
+        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
+
+
+@app.route("/stats/interaction", methods=["GET"])
+def get_interaction_analysis():
+    if stat_obj is None:
+        return jsonify({"error": "No data uploaded"}), 400
+
+    try:
+        return jsonify(stat_obj.get_interactional_analysis()), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
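Exercising the new endpoints (the host and port are assumptions; the paths come from the routes above):

    import requests

    base = "http://localhost:5000"
    for path in ("/stats/cultural", "/stats/interaction"):
        r = requests.get(base + path)
        print(path, r.status_code)  # 400 with {"error": "No data uploaded"} until a dataset is loaded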
@@ -8,6 +8,7 @@ from server.analysis.temporal import TemporalAnalysis
 from server.analysis.emotional import EmotionalAnalysis
 from server.analysis.interactional import InteractionAnalysis
 from server.analysis.linguistic import LinguisticAnalysis
+from server.analysis.cultural import CulturalAnalysis

 DOMAIN_STOPWORDS = {
     "www", "https", "http",
@@ -46,6 +47,7 @@ class StatGen:
         self.emotional_analysis = EmotionalAnalysis(self.df)
         self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
         self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
+        self.cultural_analysis = CulturalAnalysis(self.df)

         self.original_df = self.df.copy(deep=True)
@@ -62,13 +64,18 @@ class StatGen:
         self.nlp.add_ner_cols()

     ## Public
-    def time_analysis(self) -> pd.DataFrame:
+
+    # topics over time
+    # emotions over time
+    def get_time_analysis(self) -> dict:
         return {
             "events_per_day": self.temporal_analysis.posts_per_day(),
             "weekday_hour_heatmap": self.temporal_analysis.heatmap()
         }

-    def content_analysis(self) -> dict:
+    # average topic duration
+    def get_content_analysis(self) -> dict:
         return {
             "word_frequencies": self.linguistic_analysis.word_frequencies(),
             "common_two_phrases": self.linguistic_analysis.ngrams(),
@@ -77,13 +84,32 @@ class StatGen:
             "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
         }

-    def user_analysis(self) -> dict:
+    # average emotion per user
+    # average chain length
+    def get_user_analysis(self) -> dict:
         return {
             "top_users": self.interaction_analysis.top_users(),
-            "users": self.interaction_analysis.per_user_analysis(),
+            "users": self.interaction_analysis.per_user_analysis()
+        }
+
+    # average / max thread depth
+    # high engagement threads based on volume
+    def get_interactional_analysis(self) -> dict:
+        return {
+            "average_thread_depth": self.interaction_analysis.average_thread_depth(),
+            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(),
             "interaction_graph": self.interaction_analysis.interaction_graph()
         }

+    # detect community jargon
+    # in-group and out-group linguistic markers
+    def get_cultural_analysis(self) -> dict:
+        return {
+            "identity_markers": self.cultural_analysis.get_identity_markers(),
+            "stance_markers": self.cultural_analysis.get_stance_markers(),
+            "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity()
+        }
+
     def summary(self) -> dict:
         total_posts = (self.df["type"] == "post").sum()
         total_comments = (self.df["type"] == "comment").sum()
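After this change, StatGen's public surface is uniformly get_-prefixed. Sketch of the mapping (StatGen construction is outside this diff, so `gen` here is assumed):

    payloads = {
        "time": gen.get_time_analysis(),
        "content": gen.get_content_analysis(),
        "user": gen.get_user_analysis(),
        "interaction": gen.get_interactional_analysis(),
        "cultural": gen.get_cultural_analysis(),
    }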