feat: add nlp topic processing

2026-02-05 17:23:25 +00:00
parent d4db7bec24
commit ba1501e493
2 changed files with 21 additions and 3 deletions
--- a/server/nlp_processor.py
+++ b/server/nlp_processor.py
@@ -2,6 +2,9 @@ import torch
 import pandas as pd
 from transformers import pipeline
 from keybert import KeyBERT
 kw_model = KeyBERT(model="all-MiniLM-L6-v2")
 emotion_classifier = pipeline(
    "text-classification",
@@ -26,3 +29,18 @@ def add_emotion_cols(df: pd.Dataframe, content_col: str) -> None:
            next(item["score"] for item in row if item["label"] == label)
            for row in results
        ]
 def add_topic_col(df: pd.DataFrame, content_col: str, top_n: int = 3) -> None:
    """Add a ``topics`` column of extracted key phrases to *df* in place.

    Each value in ``df[content_col]`` is converted to ``str`` and passed to
    the module-level ``kw_model``; the phrases it returns (scores discarded)
    are stored as a list in the new ``"topics"`` column.

    Args:
        df: Frame to annotate. It is mutated; nothing is returned.
        content_col: Name of the column holding the text to analyse.
        top_n: Maximum number of key phrases requested per row.

    Raises:
        KeyError: If ``content_col`` is not a column of ``df``.
    """
    topics = []
    # Read the caller-supplied column rather than a hard-coded "content",
    # so the function works for frames whose text column is named otherwise.
    for text in df[content_col].astype(str):
        keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 3),
            stop_words="english",
            top_n=top_n,
        )
        # extract_keywords yields (phrase, score) pairs; keep the phrase only.
        topics.append([phrase for phrase, _score in keywords])
    df["topics"] = topics
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -5,7 +5,7 @@ import datetime
 from nltk.corpus import stopwords
 from collections import Counter
-from server.nlp_processor import add_emotion_cols
+from server.nlp_processor import add_emotion_cols, add_topic_col
 DOMAIN_STOPWORDS = {
    "www", "https", "http",
@@ -41,7 +41,7 @@ class StatGen:
        df["weekday"] = df["dt"].dt.day_name()
        add_emotion_cols(df, "content")
-
+        add_topic_col(df, "content")
    def _tokenize(self, text: str):
        tokens = re.findall(r"\b[a-z]{3,}\b", text)