refactor: move NLP processing out of server file
This commit is contained in:
28
server/nlp_processor.py
Normal file
28
server/nlp_processor.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
import torch
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from transformers import pipeline
|
||||||
|
|
||||||
|
# Pick the inference device once: index 0 when torch reports CUDA is
# available, otherwise -1 (presumably the CPU sentinel of the pipeline API;
# confirm against the transformers documentation).
_inference_device = 0 if torch.cuda.is_available() else -1

# Shared, module-level text-classification pipeline used for emotion scoring.
# Built at import time so every caller reuses the same loaded model.
emotion_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None,
    truncation=True,
    device=_inference_device,
)
|
||||||
|
|
||||||
|
def add_emotion_cols(df: pd.DataFrame, content_col: str) -> None:
    """Add one ``emotion_<label>`` score column per classifier label to *df*.

    The text in ``df[content_col]`` is cast to ``str``, cut to its first 512
    characters and passed to the module-level ``emotion_classifier`` in
    batches of 64. For every label found in the first result, a column named
    ``emotion_<label>`` holding that label's score for each row is written
    into *df* in place.

    Args:
        df: Frame to annotate; it is mutated, nothing is returned.
        content_col: Name of the column holding the text to classify.

    Raises:
        KeyError: If *content_col* is not a column of *df*, or if a row's
            result lacks a label that the first row's result contains.
    """
    texts = df[content_col].astype(str).str.slice(0, 512).tolist()
    if not texts:
        # Nothing to classify: the label list is derived from results[0],
        # which does not exist for an empty frame.
        return

    results = emotion_classifier(texts, batch_size=64)

    # Index each row's scores by label once, instead of rescanning every
    # row's result list for every label.
    scores_by_row = [
        {item["label"]: item["score"] for item in row}
        for row in results
    ]

    # Column order follows the label order of the first result.
    labels = [item["label"] for item in results[0]]
    for label in labels:
        df[f"emotion_{label}"] = [scores[label] for scores in scores_by_row]
|
||||||
@@ -2,19 +2,10 @@ import pandas as pd
|
|||||||
import re
|
import re
|
||||||
import nltk
|
import nltk
|
||||||
import datetime
|
import datetime
|
||||||
import torch
|
|
||||||
|
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from transformers import pipeline
|
from server.nlp_processor import add_emotion_cols
|
||||||
|
|
||||||
emotion_classifier = pipeline(
|
|
||||||
"text-classification",
|
|
||||||
model="j-hartmann/emotion-english-distilroberta-base",
|
|
||||||
top_k=None,
|
|
||||||
truncation=True,
|
|
||||||
device=0 if torch.cuda.is_available() else -1
|
|
||||||
)
|
|
||||||
|
|
||||||
DOMAIN_STOPWORDS = {
|
DOMAIN_STOPWORDS = {
|
||||||
"www", "https", "http",
|
"www", "https", "http",
|
||||||
@@ -39,7 +30,6 @@ class StatGen:
|
|||||||
|
|
||||||
self.df = pd.concat([posts_df, comments_df])
|
self.df = pd.concat([posts_df, comments_df])
|
||||||
self._add_extra_cols(self.df)
|
self._add_extra_cols(self.df)
|
||||||
self._add_emotion_cols(self.df)
|
|
||||||
|
|
||||||
self.original_df = self.df.copy(deep=True)
|
self.original_df = self.df.copy(deep=True)
|
||||||
|
|
||||||
@@ -50,21 +40,8 @@ class StatGen:
|
|||||||
df["hour"] = df["dt"].dt.hour
|
df["hour"] = df["dt"].dt.hour
|
||||||
df["weekday"] = df["dt"].dt.day_name()
|
df["weekday"] = df["dt"].dt.day_name()
|
||||||
|
|
||||||
def _add_emotion_cols(self, df: pd.DataFrame) -> None:
|
add_emotion_cols(df, "content")
|
||||||
texts = df["content"].astype(str).str.slice(0, 512).tolist()
|
|
||||||
|
|
||||||
results = emotion_classifier(
|
|
||||||
texts,
|
|
||||||
batch_size=64
|
|
||||||
)
|
|
||||||
|
|
||||||
labels = [r["label"] for r in results[0]]
|
|
||||||
|
|
||||||
for label in labels:
|
|
||||||
df[f"emotion_{label}"] = [
|
|
||||||
next(item["score"] for item in row if item["label"] == label)
|
|
||||||
for row in results
|
|
||||||
]
|
|
||||||
|
|
||||||
def _tokenize(self, text: str):
|
def _tokenize(self, text: str):
|
||||||
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
||||||
|
|||||||
Reference in New Issue
Block a user