feat: add nlp topic processing

This commit is contained in:
2026-02-05 17:23:25 +00:00
parent d4db7bec24
commit ba1501e493
2 changed files with 21 additions and 3 deletions

View File

@@ -2,6 +2,9 @@ import torch
import pandas as pd import pandas as pd
from transformers import pipeline from transformers import pipeline
from keybert import KeyBERT
kw_model = KeyBERT(model="all-MiniLM-L6-v2")
emotion_classifier = pipeline( emotion_classifier = pipeline(
"text-classification", "text-classification",
@@ -26,3 +29,18 @@ def add_emotion_cols(df: pd.Dataframe, content_col: str) -> None:
next(item["score"] for item in row if item["label"] == label) next(item["score"] for item in row if item["label"] == label)
for row in results for row in results
] ]
def add_topic_col(df: pd.DataFrame, content_col: str, top_n: int = 3) -> None:
    """Add a ``topics`` column of keyphrases extracted from ``content_col``.

    Each value in ``df[content_col]`` is cast to ``str`` and passed to the
    module-level ``kw_model`` (KeyBERT). The keyphrase strings it returns
    are stored as a list in the new ``topics`` column; the accompanying
    scores are discarded.

    Args:
        df: DataFrame to annotate. It is modified in place.
        content_col: Name of the column holding the text to analyse.
        top_n: Forwarded to ``kw_model.extract_keywords`` as ``top_n``.

    Returns:
        None. The result is written to ``df["topics"]``.
    """
    topics = []
    # Read from the caller-supplied column; the previous version ignored
    # ``content_col`` and always read the hard-coded "content" column.
    for text in df[content_col].astype(str):
        keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 3),
            stop_words="english",
            top_n=top_n,
        )
        # extract_keywords yields (keyphrase, score) pairs; keep the phrase only.
        topics.append([kw for kw, _ in keywords])

    df["topics"] = topics

View File

@@ -5,7 +5,7 @@ import datetime
from nltk.corpus import stopwords from nltk.corpus import stopwords
from collections import Counter from collections import Counter
from server.nlp_processor import add_emotion_cols from server.nlp_processor import add_emotion_cols, add_topic_col
DOMAIN_STOPWORDS = { DOMAIN_STOPWORDS = {
"www", "https", "http", "www", "https", "http",
@@ -41,7 +41,7 @@ class StatGen:
df["weekday"] = df["dt"].dt.day_name() df["weekday"] = df["dt"].dt.day_name()
add_emotion_cols(df, "content") add_emotion_cols(df, "content")
add_topic_col(df, "content")
def _tokenize(self, text: str): def _tokenize(self, text: str):
tokens = re.findall(r"\b[a-z]{3,}\b", text) tokens = re.findall(r"\b[a-z]{3,}\b", text)