From ba1501e493a4442a1ec1dcfc243ae6ed5d0a73c7 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Thu, 5 Feb 2026 17:23:25 +0000
Subject: [PATCH] feat: add nlp topic processing

---
Review notes (this section is ignored by git am):
- Line breaks were restored from the hunk counts; blank context lines and
  indentation are a best guess, so run `git apply --check` before applying.
- add_topic_col reads df[content_col] instead of a hard-coded "content"
  column, and server/nlp_processor.py now ends with a newline.
- The index hash for server/nlp_processor.py predates these edits.

 server/nlp_processor.py | 30 +++++++++++++++++++++++++++++-
 server/stat_gen.py      |  4 ++--
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/server/nlp_processor.py b/server/nlp_processor.py
index 84c79ae..bbd4f25 100644
--- a/server/nlp_processor.py
+++ b/server/nlp_processor.py
@@ -2,6 +2,9 @@ import torch
 import pandas as pd
 from transformers import pipeline
 
+from keybert import KeyBERT
+
+kw_model = KeyBERT(model="all-MiniLM-L6-v2")
 
 emotion_classifier = pipeline(
     "text-classification",
@@ -25,4 +28,29 @@ def add_emotion_cols(df: pd.Dataframe, content_col: str) -> None:
         df[f"emotion_{label}"] = [
             next(item["score"] for item in row if item["label"] == label)
             for row in results
-        ]
\ No newline at end of file
+        ]
+
+
+def add_topic_col(df: pd.DataFrame, content_col: str, top_n: int = 3) -> None:
+    """Add a ``topics`` column listing the top keyphrases of each row.
+
+    Args:
+        df: Frame to annotate; the ``topics`` column is assigned in place.
+        content_col: Name of the column whose text is analysed.
+        top_n: Number of keyphrases requested from KeyBERT per row.
+    """
+    topics = []
+
+    # Honour content_col rather than assuming a column named "content".
+    for text in df[content_col].astype(str):
+        keywords = kw_model.extract_keywords(
+            text,
+            keyphrase_ngram_range=(1, 3),
+            stop_words="english",
+            top_n=top_n,
+        )
+
+        # Each result is a (phrase, score) pair; only the phrase is kept.
+        topics.append([kw for kw, _ in keywords])
+
+    df["topics"] = topics
diff --git a/server/stat_gen.py b/server/stat_gen.py
index e760b5b..22ec49e 100644
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -5,7 +5,7 @@ import datetime
 from nltk.corpus import stopwords
 from collections import Counter
 
-from server.nlp_processor import add_emotion_cols
+from server.nlp_processor import add_emotion_cols, add_topic_col
 
 DOMAIN_STOPWORDS = {
     "www", "https", "http",
@@ -41,7 +41,7 @@ class StatGen:
         df["weekday"] = df["dt"].dt.day_name()
 
         add_emotion_cols(df, "content")
-
+        add_topic_col(df, "content")
 
     def _tokenize(self, text: str):
         tokens = re.findall(r"\b[a-z]{3,}\b", text)