From ce0aa6bc4301c60a1a84ff1ba7e17a24bf390211 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 24 Feb 2026 15:28:30 +0000 Subject: [PATCH] feat(nlp): remove surprise & neutral emotions from NLP processing These emotions often dominate due to the writing style of online users and don't provide an accurate picture of the sentiment. --- server/analysis/emotional.py | 7 +------ server/analysis/nlp.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/server/analysis/emotional.py b/server/analysis/emotional.py index c311944..10e897d 100644 --- a/server/analysis/emotional.py +++ b/server/analysis/emotional.py @@ -5,14 +5,9 @@ class EmotionalAnalysis: self.df = df def avg_emotion_by_topic(self) -> dict: - emotion_exclusions = [ - "emotion_neutral", - "emotion_surprise" - ] - emotion_cols = [ col for col in self.df.columns - if col.startswith("emotion_") and col not in emotion_exclusions + if col.startswith("emotion_") ] counts = ( diff --git a/server/analysis/nlp.py b/server/analysis/nlp.py index c3fcf89..4459851 100644 --- a/server/analysis/nlp.py +++ b/server/analysis/nlp.py @@ -200,6 +200,35 @@ class NLP: if column.startswith("emotion_") and column not in emotion_df.columns: self.df[column] = 0.0 + # drop neutral and surprise columns from df and normalize others to sum to 1 + drop_cols = ["emotion_neutral", "emotion_surprise"] + + existing_drop = [c for c in drop_cols if c in self.df.columns] + self.df.drop(columns=existing_drop, inplace=True) + + remaining_emotion_cols = [ + c for c in self.df.columns + if c.startswith("emotion_") + ] + + if remaining_emotion_cols: + emotion_matrix = ( + self.df[remaining_emotion_cols] + .apply(pd.to_numeric, errors="coerce") + .fillna(0.0) + ) + + row_sums = emotion_matrix.sum(axis=1) + + # Avoid division by zero + row_sums = row_sums.replace(0, 1.0) + + normalized = emotion_matrix.div(row_sums, axis=0) + + self.df[remaining_emotion_cols] = normalized.values + + + def add_topic_col(self, confidence_threshold: float = 0.3) -> None: titles = self.df[self.title_col].fillna("").astype(str) contents = self.df[self.content_col].fillna("").astype(str) @@ -276,3 +305,5 @@ class NLP: self.df[col_name] = [ d.get(label, 0) for d in entity_count_dicts ] + +