refactor: move NLP processing out of server file

This commit is contained in:
2026-02-05 17:10:48 +00:00
parent 8e257a92d9
commit d4db7bec24
2 changed files with 31 additions and 26 deletions

28
server/nlp_processor.py Normal file
View File

@@ -0,0 +1,28 @@
import torch
import pandas as pd
from transformers import pipeline
# Shared text-classification pipeline, built once when this module is imported
# and reused by add_emotion_cols below.
emotion_classifier = pipeline(
"text-classification",
model="j-hartmann/emotion-english-distilroberta-base",
# top_k=None requests a score for every label rather than only the best one;
# add_emotion_cols relies on each result being a list of label/score items.
top_k=None,
# Let the tokenizer truncate inputs that exceed the model's maximum length.
truncation=True,
# Device index 0 when CUDA is available, otherwise -1 (CPU in the
# transformers pipeline convention).
device=0 if torch.cuda.is_available() else -1
)
def add_emotion_cols(df: pd.DataFrame, content_col: str) -> None:
    """Add one ``emotion_<label>`` score column per classifier label to *df*.

    The text in ``df[content_col]`` is converted to ``str``, cut to its first
    512 characters, and passed to the module-level ``emotion_classifier`` in
    batches of 64. For every label the classifier reports, a column named
    ``emotion_<label>`` holding that label's score for each row is written
    into *df* in place.

    Args:
        df: Frame to annotate; it is mutated in place.
        content_col: Name of the column in *df* that holds the text.

    Returns:
        None. When *df* has no rows the classifier is not called and no
        columns are added.

    Raises:
        KeyError: If ``content_col`` is not a column of *df*, or if a row's
            classifier output lacks one of the labels found in the first row.
    """
    texts = df[content_col].astype(str).str.slice(0, 512).tolist()
    if not texts:
        # Nothing to classify; indexing the first result below would fail.
        return

    results = emotion_classifier(texts, batch_size=64)

    # One label -> score mapping per row, so each column is built with direct
    # lookups instead of re-scanning every row's result list per label.
    row_scores = [
        {item["label"]: item["score"] for item in row}
        for row in results
    ]

    # The first row's labels define the set (and order) of columns to create.
    for label in row_scores[0]:
        df[f"emotion_{label}"] = [scores[label] for scores in row_scores]

View File

@@ -2,19 +2,10 @@ import pandas as pd
import re import re
import nltk import nltk
import datetime import datetime
import torch
from nltk.corpus import stopwords from nltk.corpus import stopwords
from collections import Counter from collections import Counter
from transformers import pipeline from server.nlp_processor import add_emotion_cols
emotion_classifier = pipeline(
"text-classification",
model="j-hartmann/emotion-english-distilroberta-base",
top_k=None,
truncation=True,
device=0 if torch.cuda.is_available() else -1
)
DOMAIN_STOPWORDS = { DOMAIN_STOPWORDS = {
"www", "https", "http", "www", "https", "http",
@@ -39,7 +30,6 @@ class StatGen:
self.df = pd.concat([posts_df, comments_df]) self.df = pd.concat([posts_df, comments_df])
self._add_extra_cols(self.df) self._add_extra_cols(self.df)
self._add_emotion_cols(self.df)
self.original_df = self.df.copy(deep=True) self.original_df = self.df.copy(deep=True)
@@ -49,22 +39,9 @@ class StatGen:
df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True) df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
df["hour"] = df["dt"].dt.hour df["hour"] = df["dt"].dt.hour
df["weekday"] = df["dt"].dt.day_name() df["weekday"] = df["dt"].dt.day_name()
add_emotion_cols(df, "content")
def _add_emotion_cols(self, df: pd.DataFrame) -> None:
texts = df["content"].astype(str).str.slice(0, 512).tolist()
results = emotion_classifier(
texts,
batch_size=64
)
labels = [r["label"] for r in results[0]]
for label in labels:
df[f"emotion_{label}"] = [
next(item["score"] for item in row if item["label"] == label)
for row in results
]
def _tokenize(self, text: str): def _tokenize(self, text: str):
tokens = re.findall(r"\b[a-z]{3,}\b", text) tokens = re.findall(r"\b[a-z]{3,}\b", text)