refactor: move NLP processing out of server file

This commit is contained in:
2026-02-05 17:10:48 +00:00
parent 8e257a92d9
commit d4db7bec24
2 changed files with 31 additions and 26 deletions

28
server/nlp_processor.py Normal file
View File

@@ -0,0 +1,28 @@
import torch
import pandas as pd
from transformers import pipeline
# Module-level HuggingFace pipeline, constructed once at import time so the
# model weights are loaded a single time and shared by every caller.
# NOTE(review): building this at import makes importing the module slow and
# download/GPU dependent — confirm that is acceptable for all importers.
emotion_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    # top_k=None -> return scores for every emotion label, not just the top one.
    top_k=None,
    truncation=True,
    # First CUDA device when available, otherwise CPU (-1).
    device=0 if torch.cuda.is_available() else -1
)
def add_emotion_cols(df: pd.DataFrame, content_col: str, classifier=None) -> None:
    """Add one ``emotion_<label>`` score column to *df* for each emotion label.

    Texts are truncated to 512 characters before classification and scored
    in batches of 64. The frame is mutated in place.

    Args:
        df: Frame to extend in place.
        content_col: Name of the column holding the raw text.
        classifier: Optional callable with the HF-pipeline interface
            ``classifier(texts, batch_size=...) -> list[list[dict]]``.
            Defaults to the module-level ``emotion_classifier`` (kept as a
            parameter so tests can inject a stub).

    Returns:
        None.
    """
    # BUG FIX: annotation previously read ``pd.Dataframe`` — pandas has no such
    # attribute, so importing this module raised AttributeError at def time.
    if classifier is None:
        classifier = emotion_classifier
    # Empty frame: nothing to classify; bail out instead of crashing on
    # ``results[0]`` below.
    if df.empty:
        return
    texts = df[content_col].astype(str).str.slice(0, 512).tolist()
    results = classifier(
        texts,
        batch_size=64
    )
    # Every result row carries scores for the same label set (top_k=None),
    # so the label names can be read off the first row.
    labels = [r["label"] for r in results[0]]
    for label in labels:
        df[f"emotion_{label}"] = [
            next(item["score"] for item in row if item["label"] == label)
            for row in results
        ]

View File

@@ -2,19 +2,10 @@ import pandas as pd
import re
import nltk
import datetime
import torch
from nltk.corpus import stopwords
from collections import Counter
from transformers import pipeline
# Removed by this commit: duplicate of the pipeline now built once in
# server/nlp_processor.py (imported below), so the model is no longer
# loaded by the server module itself.
emotion_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    truncation=True,
    device=0 if torch.cuda.is_available() else -1
)
from server.nlp_processor import add_emotion_cols
DOMAIN_STOPWORDS = {
"www", "https", "http",
@@ -39,7 +30,6 @@ class StatGen:
self.df = pd.concat([posts_df, comments_df])
self._add_extra_cols(self.df)
self._add_emotion_cols(self.df)
self.original_df = self.df.copy(deep=True)
@@ -49,22 +39,9 @@ class StatGen:
df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
df["hour"] = df["dt"].dt.hour
df["weekday"] = df["dt"].dt.day_name()
add_emotion_cols(df, "content")
def _add_emotion_cols(self, df: pd.DataFrame) -> None:
    """Attach one ``emotion_<label>`` score column per classifier label.

    Texts from the ``content`` column are truncated to 512 characters and
    scored in batches of 64 by the module-level ``emotion_classifier``;
    *df* is mutated in place.
    """
    snippets = df["content"].astype(str).str.slice(0, 512).tolist()
    scored = emotion_classifier(snippets, batch_size=64)
    # All result rows share one label set, so read the names off row 0.
    for label in [entry["label"] for entry in scored[0]]:
        df[f"emotion_{label}"] = [
            next(e["score"] for e in row if e["label"] == label)
            for row in scored
        ]
def _tokenize(self, text: str):
tokens = re.findall(r"\b[a-z]{3,}\b", text)