From ce0aa6bc4301c60a1a84ff1ba7e17a24bf390211 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite <dylanseandefaoite@gmail.com>
Date: Tue, 24 Feb 2026 15:28:30 +0000
Subject: [PATCH] feat(nlp): remove surprise & neutral emotions from NLP
 processing

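The neutral and surprise emotions often dominate because of the writing
style of online users, so they don't give an accurate picture of the
sentiment. They are now dropped during NLP processing and the remaining
emotion scores are re-normalized per row, which makes the separate
exclusion list in EmotionalAnalysis.avg_emotion_by_topic unnecessary.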
---
 server/analysis/emotional.py |  7 +------
 server/analysis/nlp.py       | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/server/analysis/emotional.py b/server/analysis/emotional.py
index c311944..10e897d 100644
--- a/server/analysis/emotional.py
+++ b/server/analysis/emotional.py
@@ -5,14 +5,9 @@ class EmotionalAnalysis:
         self.df = df
 
     def avg_emotion_by_topic(self) -> dict:
-        emotion_exclusions = [
-            "emotion_neutral",
-            "emotion_surprise"
-        ]
-
         emotion_cols = [
             col for col in self.df.columns
-            if col.startswith("emotion_") and col not in emotion_exclusions
+            if col.startswith("emotion_")
         ]
 
         counts = (
diff --git a/server/analysis/nlp.py b/server/analysis/nlp.py
index c3fcf89..4459851 100644
--- a/server/analysis/nlp.py
+++ b/server/analysis/nlp.py
@@ -200,6 +200,35 @@ class NLP:
             if column.startswith("emotion_") and column not in emotion_df.columns:
                 self.df[column] = 0.0
 
+        # Drop the neutral and surprise columns, then rescale each row's remaining emotion scores to sum to 1 (all-zero rows stay zero)
+        drop_cols = ["emotion_neutral", "emotion_surprise"]
+
+        existing_drop = [c for c in drop_cols if c in self.df.columns]
+        self.df.drop(columns=existing_drop, inplace=True)
+
+        remaining_emotion_cols = [
+            c for c in self.df.columns
+            if c.startswith("emotion_")
+        ]
+
+        if remaining_emotion_cols:
+            emotion_matrix = (
+                self.df[remaining_emotion_cols]
+                .apply(pd.to_numeric, errors="coerce")
+                .fillna(0.0)
+            )
+
+            row_sums = emotion_matrix.sum(axis=1)
+
+            # Avoid division by zero
+            row_sums = row_sums.replace(0, 1.0)
+
+            normalized = emotion_matrix.div(row_sums, axis=0)
+
+            self.df[remaining_emotion_cols] = normalized.values
+
+        
+
     def add_topic_col(self, confidence_threshold: float = 0.3) -> None:
         titles = self.df[self.title_col].fillna("").astype(str)
         contents = self.df[self.content_col].fillna("").astype(str)
@@ -276,3 +305,5 @@ class NLP:
             self.df[col_name] = [
                 d.get(label, 0) for d in entity_count_dicts
             ]
+
+