From ba1501e493a4442a1ec1dcfc243ae6ed5d0a73c7 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite <dylanseandefaoite@gmail.com>
Date: Thu, 5 Feb 2026 17:23:25 +0000
Subject: [PATCH] feat: add nlp topic processing

---
 server/nlp_processor.py | 20 +++++++++++++++++++-
 server/stat_gen.py      |  4 ++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/server/nlp_processor.py b/server/nlp_processor.py
index 84c79ae..bbd4f25 100644
--- a/server/nlp_processor.py
+++ b/server/nlp_processor.py
@@ -2,6 +2,9 @@ import torch
 import pandas as pd
 
 from transformers import pipeline
+from keybert import KeyBERT
+
+kw_model = KeyBERT(model="all-MiniLM-L6-v2")
 
 emotion_classifier = pipeline(
     "text-classification",
@@ -25,4 +28,19 @@ def add_emotion_cols(df: pd.Dataframe, content_col: str) -> None:
         df[f"emotion_{label}"] = [
             next(item["score"] for item in row if item["label"] == label)
             for row in results
-        ]
\ No newline at end of file
+        ]
+
+def add_topic_col(df: pd.DataFrame, content_col: str, top_n: int = 3) -> None:
+    """Add a "topics" column to df in place: up to top_n keyphrases per row of content_col."""
+    topics = []
+    # Read the caller-supplied column; it was hard-coded to "content", ignoring content_col.
+    for text in df[content_col].astype(str):
+        keywords = kw_model.extract_keywords(
+            text,
+            keyphrase_ngram_range=(1, 3),
+            stop_words="english",
+            top_n=top_n
+        )
+        # Each result is a (keyphrase, score) pair; only the phrase is stored.
+        topics.append([kw for kw, _ in keywords])
+    df["topics"] = topics
\ No newline at end of file
diff --git a/server/stat_gen.py b/server/stat_gen.py
index e760b5b..22ec49e 100644
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -5,7 +5,7 @@ import datetime
 
 from nltk.corpus import stopwords
 from collections import Counter
-from server.nlp_processor import add_emotion_cols
+from server.nlp_processor import add_emotion_cols, add_topic_col
 
 DOMAIN_STOPWORDS = {
     "www", "https", "http",
@@ -41,7 +41,7 @@ class StatGen:
         df["weekday"] = df["dt"].dt.day_name()
         
         add_emotion_cols(df, "content")
-
+        add_topic_col(df, "content")
 
     def _tokenize(self, text: str):
         tokens = re.findall(r"\b[a-z]{3,}\b", text)