feat(nlp): remove surprise & neutral emotions from NLP processing

These two emotions often dominate the results because of the writing style of online users, so they do not give an accurate picture of the sentiment.
2026-02-24 15:28:30 +00:00
parent e82ac8d73b
commit ce0aa6bc43
2 changed files with 32 additions and 6 deletions
--- a/server/analysis/emotional.py
+++ b/server/analysis/emotional.py
@@ -5,14 +5,9 @@ class EmotionalAnalysis:
        self.df = df
    def avg_emotion_by_topic(self) -> dict:
        emotion_exclusions = [
            "emotion_neutral",
            "emotion_surprise"
        ]
        emotion_cols = [
            col for col in self.df.columns
-            if col.startswith("emotion_") and col not in emotion_exclusions
+            if col.startswith("emotion_")
        ]
        counts = (
--- a/server/analysis/nlp.py
+++ b/server/analysis/nlp.py
@@ -200,6 +200,35 @@ class NLP:
            if column.startswith("emotion_") and column not in emotion_df.columns:
                self.df[column] = 0.0
        # Drop the neutral and surprise columns from df, then rescale the remaining
        # emotion scores so each row sums to 1 (rows whose scores sum to 0 stay all zeros).
        drop_cols = ["emotion_neutral", "emotion_surprise"]
        existing_drop = [c for c in drop_cols if c in self.df.columns]
        self.df.drop(columns=existing_drop, inplace=True)
        remaining_emotion_cols = [
            c for c in self.df.columns
            if c.startswith("emotion_")
        ]
        if remaining_emotion_cols:
            emotion_matrix = (
                self.df[remaining_emotion_cols]
                .apply(pd.to_numeric, errors="coerce")
                .fillna(0.0)
            )
            row_sums = emotion_matrix.sum(axis=1)
            # Avoid division by zero: a row whose scores sum to 0 is divided by 1 and stays all zeros
            row_sums = row_sums.replace(0, 1.0)
            normalized = emotion_matrix.div(row_sums, axis=0)
            self.df[remaining_emotion_cols] = normalized.values
    def add_topic_col(self, confidence_threshold: float = 0.3) -> None:
        titles = self.df[self.title_col].fillna("").astype(str)
        contents = self.df[self.content_col].fillna("").astype(str)
@@ -276,3 +305,5 @@ class NLP:
            self.df[col_name] = [
                d.get(label, 0) for d in entity_count_dicts
            ]