feat(nlp): remove surprise & neutral emotions from NLP processing

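These two emotions tend to dominate the scores because of the way people write online, so they do not give an accurate picture of sentiment. The columns are now dropped during NLP processing, and the remaining emotion scores are renormalized so that each row sums to 1.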
This commit is contained in:
2026-02-24 15:28:30 +00:00
parent e82ac8d73b
commit ce0aa6bc43
2 changed files with 32 additions and 6 deletions

View File

@@ -5,14 +5,9 @@ class EmotionalAnalysis:
self.df = df self.df = df
def avg_emotion_by_topic(self) -> dict: def avg_emotion_by_topic(self) -> dict:
emotion_exclusions = [
"emotion_neutral",
"emotion_surprise"
]
emotion_cols = [ emotion_cols = [
col for col in self.df.columns col for col in self.df.columns
if col.startswith("emotion_") and col not in emotion_exclusions if col.startswith("emotion_")
] ]
counts = ( counts = (

View File

@@ -200,6 +200,35 @@ class NLP:
if column.startswith("emotion_") and column not in emotion_df.columns: if column.startswith("emotion_") and column not in emotion_df.columns:
self.df[column] = 0.0 self.df[column] = 0.0
# Drop the neutral and surprise columns, then rescale the remaining emotion scores so each row sums to 1 (rows whose scores are all zero are left as zeros).
drop_cols = ["emotion_neutral", "emotion_surprise"]
existing_drop = [c for c in drop_cols if c in self.df.columns]
self.df.drop(columns=existing_drop, inplace=True)
remaining_emotion_cols = [
c for c in self.df.columns
if c.startswith("emotion_")
]
if remaining_emotion_cols:
emotion_matrix = (
self.df[remaining_emotion_cols]
.apply(pd.to_numeric, errors="coerce")
.fillna(0.0)
)
row_sums = emotion_matrix.sum(axis=1)
# Avoid division by zero
row_sums = row_sums.replace(0, 1.0)
normalized = emotion_matrix.div(row_sums, axis=0)
self.df[remaining_emotion_cols] = normalized.values
def add_topic_col(self, confidence_threshold: float = 0.3) -> None: def add_topic_col(self, confidence_threshold: float = 0.3) -> None:
titles = self.df[self.title_col].fillna("").astype(str) titles = self.df[self.title_col].fillna("").astype(str)
contents = self.df[self.content_col].fillna("").astype(str) contents = self.df[self.content_col].fillna("").astype(str)
@@ -276,3 +305,5 @@ class NLP:
self.df[col_name] = [ self.df[col_name] = [
d.get(label, 0) for d in entity_count_dicts d.get(label, 0) for d in entity_count_dicts
] ]