refactor: nlp processing unified into a class

Also removed surprise emotion from content endpoint
2026-02-08 16:33:27 +00:00
parent f136e7b7c8
commit e7ffb58c3d
2 changed files with 67 additions and 67 deletions
--- a/server/nlp.py
+++ b/server/nlp.py
@@ -5,24 +5,37 @@ from transformers import pipeline
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity

+class NLP:
+    def __init__(self, df: pd.DataFrame, title_col: str, content_col: str, topics: dict):
+        self.df = df
+        self.title_col = title_col
+        self.content_col = content_col
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"  # string form is valid for both models; an int 1 would select cuda:1, not the CPU
        
-model = SentenceTransformer("all-mpnet-base-v2", device=0 if torch.cuda.is_available() else 1)
+        # Topic model: sentence embeddings used to match texts against the topic descriptions
+        self.topic_model = SentenceTransformer("all-mpnet-base-v2", device=self.device)

-def add_emotion_cols(
-        df: pd.DataFrame, 
-        content_col: str
-    ) -> None:
-    emotion_classifier = pipeline(
+        self.topic_labels = list(topics.keys())
+        self.topic_texts = list(topics.values())
+
+        self.topic_embeddings = self.topic_model.encode(
+            self.topic_texts,
+            normalize_embeddings=True,
+        )
+
+        # Emotion model: text classifier that scores every emotion label (top_k=None)
+        self.emotion_classifier = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=None,
            truncation=True,
-        device=0 if torch.cuda.is_available() else -1
+            device=self.device
        )

-    texts = df[content_col].astype(str).str.slice(0, 512).tolist()
+    def add_emotion_cols(self) -> None:
+        texts = self.df[self.content_col].astype(str).str.slice(0, 512).tolist()

-    results = emotion_classifier(
+        results = self.emotion_classifier(
            texts,
            batch_size=64
        )
@@ -30,50 +43,32 @@ def add_emotion_cols(
        labels = [r["label"] for r in results[0]]

        for label in labels:
-        df[f"emotion_{label}"] = [
+            self.df[f"emotion_{label}"] = [
                next(item["score"] for item in row if item["label"] == label)
                for row in results
            ]

-def add_topic_col(
-        df: pd.DataFrame,
-        title_col: str,
-        content_col: str,
-        domain_topics: dict,
-        confidence_threshold: float = 0.3
-    ) -> None:
-
-    topic_labels = list(domain_topics.keys())
-    topic_texts = list(domain_topics.values())
-
-    topic_embeddings = model.encode(
-        topic_texts,
-        normalize_embeddings=True,
-    )
-
-    titles = df[title_col].fillna("").astype(str)
-    contents = df[content_col].fillna("").astype(str)
+    def add_topic_col(self, confidence_threshold: float = 0.3) -> None:
+        titles = self.df[self.title_col].fillna("").astype(str)
+        contents = self.df[self.content_col].fillna("").astype(str)

        texts = [
            f"{title}. {content}" if title else content
            for title, content in zip(titles, contents)
        ]

-    text_embeddings = model.encode(
+        text_embeddings = self.topic_model.encode(
            texts,
            normalize_embeddings=True,
        )

        # Similarity
-    sims = cosine_similarity(text_embeddings, topic_embeddings)
+        sims = cosine_similarity(text_embeddings, self.topic_embeddings)

        # Best match
        best_idx = sims.argmax(axis=1)
        best_score = sims.max(axis=1)

-    df["topic"] = [topic_labels[i] for i in best_idx]
-    df["topic_confidence"] = best_score
-
-    df.loc[df["topic_confidence"] < confidence_threshold, "topic"] = "Misc"
-
-    return df
+        self.df["topic"] = [self.topic_labels[i] for i in best_idx]
+        self.df["topic_confidence"] = best_score
+        self.df.loc[self.df["topic_confidence"] < confidence_threshold, "topic"] = "Misc"
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -5,7 +5,7 @@ import datetime

 from nltk.corpus import stopwords
 from collections import Counter
-from server.nlp import add_emotion_cols, add_topic_col
+from server.nlp import NLP

 DOMAIN_STOPWORDS = {
    "www", "https", "http",
@@ -21,7 +21,7 @@ nltk.download('stopwords')
 EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS

 class StatGen:
-    def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: list) -> None:
+    def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: dict) -> None:
        posts_df["type"] = "post"
        posts_df["parent_id"] = None

@@ -30,6 +30,7 @@ class StatGen:
        self.domain_topics = domain_topics

        self.df = pd.concat([posts_df, comments_df])
+        self.nlp = NLP(self.df, "title", "content", domain_topics)
        self._add_extra_cols(self.df)

        self.original_df = self.df.copy(deep=True)
@@ -41,8 +42,8 @@ class StatGen:
        df["hour"] = df["dt"].dt.hour
        df["weekday"] = df["dt"].dt.day_name()
        
-        add_emotion_cols(df, "content")
-        add_topic_col(df, "title", "content", self.domain_topics)
+        self.nlp.add_emotion_cols()
+        self.nlp.add_topic_col()

    def _tokenize(self, text: str):
        tokens = re.findall(r"\b[a-z]{3,}\b", text)
@@ -192,10 +193,14 @@ class StatGen:
            .reset_index(drop=True)
        )

-        # avearge emotion by topic (excluding neutral)
+        emotion_exclusions = [
+            "emotion_neutral",
+            "emotion_surprise"
+        ]
+
        emotion_cols = [
            col for col in self.df.columns
-            if col.startswith("emotion_") and col != "emotion_neutral"
+            if col.startswith("emotion_") and col not in emotion_exclusions
        ]

        counts = (