refactor: nlp processing unified into a class

Also removed surprise emotion from content endpoint
2026-02-08 16:33:27 +00:00
parent f136e7b7c8
commit e7ffb58c3d
2 changed files with 67 additions and 67 deletions
--- a/server/nlp.py
+++ b/server/nlp.py
@@ -5,24 +5,37 @@ from transformers import pipeline
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 class NLP:
    def __init__(self, df: pd.DataFrame, title_col: str, content_col: str, topics: dict):
        self.df = df
        self.title_col = title_col
        self.content_col = content_col
        # Use a device *name*: both SentenceTransformer and transformers.pipeline
        # accept "cuda"/"cpu", whereas the integer 1 means a second GPU, not CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-model = SentenceTransformer("all-mpnet-base-v2", device=0 if torch.cuda.is_available() else 1)
+        # Topic model
        self.topic_model = SentenceTransformer("all-mpnet-base-v2", device=self.device)
-def add_emotion_cols(
+        self.topic_labels = list(topics.keys())
-        df: pd.DataFrame, 
+        self.topic_texts = list(topics.values())
-        content_col: str
+
-    ) -> None:
+        self.topic_embeddings = self.topic_model.encode(
-    emotion_classifier = pipeline(
+            self.topic_texts,
            normalize_embeddings=True,
        )
        # emotion model
        self.emotion_classifier = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=None,
            truncation=True,
-        device=0 if torch.cuda.is_available() else -1
+            device=self.device
        )
-    texts = df[content_col].astype(str).str.slice(0, 512).tolist()
+    def add_emotion_cols(self) -> None:
        texts = self.df[self.content_col].astype(str).str.slice(0, 512).tolist()
-    results = emotion_classifier(
+        results = self.emotion_classifier(
            texts,
            batch_size=64
        )
@@ -30,50 +43,32 @@ def add_emotion_cols(
        labels = [r["label"] for r in results[0]]
        for label in labels:
-        df[f"emotion_{label}"] = [
+            self.df[f"emotion_{label}"] = [
                next(item["score"] for item in row if item["label"] == label)
                for row in results
            ]
-def add_topic_col(
+    def add_topic_col(self, confidence_threshold: float = 0.3) -> None:
-        df: pd.DataFrame,
+        titles = self.df[self.title_col].fillna("").astype(str)
-        title_col: str,
+        contents = self.df[self.content_col].fillna("").astype(str)
        content_col: str,
        domain_topics: dict,
        confidence_threshold: float = 0.3
    ) -> None:
    topic_labels = list(domain_topics.keys())
    topic_texts = list(domain_topics.values())
    topic_embeddings = model.encode(
        topic_texts,
        normalize_embeddings=True,
    )
    titles = df[title_col].fillna("").astype(str)
    contents = df[content_col].fillna("").astype(str)
        texts = [
            f"{title}. {content}" if title else content
            for title, content in zip(titles, contents)
        ]
-    text_embeddings = model.encode(
+        text_embeddings = self.topic_model.encode(
            texts,
            normalize_embeddings=True,
        )
        # Similarity
-    sims = cosine_similarity(text_embeddings, topic_embeddings)
+        sims = cosine_similarity(text_embeddings, self.topic_embeddings)
        # Best match
        best_idx = sims.argmax(axis=1)
        best_score = sims.max(axis=1)
-    df["topic"] = [topic_labels[i] for i in best_idx]
+        self.df["topic"] = [self.topic_labels[i] for i in best_idx]
-    df["topic_confidence"] = best_score
+        self.df["topic_confidence"] = best_score
-
+        self.df.loc[self.df["topic_confidence"] < confidence_threshold, "topic"] = "Misc"
    df.loc[df["topic_confidence"] < confidence_threshold, "topic"] = "Misc"
    return df
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -5,7 +5,7 @@ import datetime
 from nltk.corpus import stopwords
 from collections import Counter
-from server.nlp import add_emotion_cols, add_topic_col
+from server.nlp import NLP
 DOMAIN_STOPWORDS = {
    "www", "https", "http",
@@ -21,7 +21,7 @@ nltk.download('stopwords')
 EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS
 class StatGen:
-    def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: list) -> None:
+    def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: dict) -> None:
        posts_df["type"] = "post"
        posts_df["parent_id"] = None
@@ -30,6 +30,7 @@ class StatGen:
        self.domain_topics = domain_topics
        self.df = pd.concat([posts_df, comments_df])
        self.nlp = NLP(self.df, "title", "content", domain_topics)
        self._add_extra_cols(self.df)
        self.original_df = self.df.copy(deep=True)
@@ -41,8 +42,8 @@ class StatGen:
        df["hour"] = df["dt"].dt.hour
        df["weekday"] = df["dt"].dt.day_name()
-        add_emotion_cols(df, "content")
+        self.nlp.add_emotion_cols()
-        add_topic_col(df, "title", "content", self.domain_topics)
+        self.nlp.add_topic_col()
    def _tokenize(self, text: str):
        tokens = re.findall(r"\b[a-z]{3,}\b", text)
@@ -192,10 +193,14 @@ class StatGen:
            .reset_index(drop=True)
        )
-        # avearge emotion by topic (excluding neutral)
+        emotion_exclusions = [
            "emotion_neutral",
            "emotion_surprise"
        ]
        emotion_cols = [
            col for col in self.df.columns
-            if col.startswith("emotion_") and col != "emotion_neutral"
+            if col.startswith("emotion_") and col not in emotion_exclusions
        ]
        counts = (