feat: add nlp topic processing
This commit is contained in:
@@ -2,6 +2,9 @@ import torch
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
from transformers import pipeline
|
from transformers import pipeline
|
||||||
|
from keybert import KeyBERT
|
||||||
|
|
||||||
|
# Module-level keyword extractor shared by add_topic_col; the
# "all-MiniLM-L6-v2" sentence-transformers model is loaded once at import time.
kw_model = KeyBERT(model="all-MiniLM-L6-v2")
|
||||||
|
|
||||||
emotion_classifier = pipeline(
|
emotion_classifier = pipeline(
|
||||||
"text-classification",
|
"text-classification",
|
||||||
@@ -25,4 +28,19 @@ def add_emotion_cols(df: pd.Dataframe, content_col: str) -> None:
|
|||||||
df[f"emotion_{label}"] = [
|
df[f"emotion_{label}"] = [
|
||||||
next(item["score"] for item in row if item["label"] == label)
|
next(item["score"] for item in row if item["label"] == label)
|
||||||
for row in results
|
for row in results
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def add_topic_col(df: pd.DataFrame, content_col: str, top_n: int = 3) -> None:
    """Add a "topics" column to *df* with the top keyphrases of each row.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to mutate in place; gains a ``"topics"`` column holding a
        list of keyphrase strings per row.
    content_col : str
        Name of the column containing the text to extract keyphrases from.
    top_n : int, default 3
        Maximum number of keyphrases kept per row.
    """
    topics: list[list[str]] = []
    # Fix: the original iterated df["content"], ignoring the content_col
    # parameter entirely — it only worked because the caller happened to
    # pass "content". Use the parameter so other column names work too.
    for text in df[content_col].astype(str):
        keywords = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(1, 3),
            stop_words="english",
            top_n=top_n,
        )
        # extract_keywords returns (phrase, score) pairs; keep phrases only.
        topics.append([phrase for phrase, _score in keywords])
    df["topics"] = topics
|
||||||
@@ -5,7 +5,7 @@ import datetime
|
|||||||
|
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from server.nlp_processor import add_emotion_cols
|
from server.nlp_processor import add_emotion_cols, add_topic_col
|
||||||
|
|
||||||
DOMAIN_STOPWORDS = {
|
DOMAIN_STOPWORDS = {
|
||||||
"www", "https", "http",
|
"www", "https", "http",
|
||||||
@@ -41,7 +41,7 @@ class StatGen:
|
|||||||
df["weekday"] = df["dt"].dt.day_name()
|
df["weekday"] = df["dt"].dt.day_name()
|
||||||
|
|
||||||
add_emotion_cols(df, "content")
|
add_emotion_cols(df, "content")
|
||||||
|
add_topic_col(df, "content")
|
||||||
|
|
||||||
def _tokenize(self, text: str):
|
def _tokenize(self, text: str):
|
||||||
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
||||||
|
|||||||
Reference in New Issue
Block a user