refactor: NLP processing unified into a class
Also removed surprise emotion from content endpoint
This commit is contained in:
117
server/nlp.py
117
server/nlp.py
@@ -5,75 +5,70 @@ from transformers import pipeline
|
|||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
|
||||||
|
class NLP:
    """Unified NLP helpers (emotion classification + topic assignment) over a DataFrame.

    Mutates the supplied DataFrame in place: ``add_emotion_cols`` adds one
    ``emotion_<label>`` score column per emotion label, and ``add_topic_col``
    adds ``topic`` / ``topic_confidence`` columns.
    """

    def __init__(self, df: pd.DataFrame, title_col: str, content_col: str, topics: dict):
        """
        Args:
            df: posts/comments frame; modified in place by the ``add_*`` methods.
            title_col: name of the title column (may contain NaN).
            content_col: name of the text/content column.
            topics: mapping of topic label -> descriptive text; topics are
                assigned by embedding similarity against these descriptions.
        """
        self.df = df
        self.title_col = title_col
        self.content_col = content_col
        # BUG FIX: the previous fallback `0 if cuda else 1` selects a *second
        # GPU* (index 1) on CPU-only hosts and crashes; the HF pipeline wants
        # -1/"cpu" and SentenceTransformer wants "cpu"/"cuda". A device string
        # is accepted by both, so one attribute serves both models.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Topic model: pre-compute normalized topic embeddings once, so
        # add_topic_col only has to embed the row texts.
        self.topic_model = SentenceTransformer("all-mpnet-base-v2", device=self.device)
        self.topic_labels = list(topics.keys())
        self.topic_texts = list(topics.values())
        self.topic_embeddings = self.topic_model.encode(
            self.topic_texts,
            normalize_embeddings=True,
        )

        # Emotion model: top_k=None returns the score for *every* label on
        # every text, which add_emotion_cols relies on.
        self.emotion_classifier = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=None,
            truncation=True,
            device=self.device,
        )

    def add_emotion_cols(self) -> None:
        """Add one ``emotion_<label>`` score column per emotion label to ``self.df``."""
        # Pre-truncate to 512 chars: the pipeline truncates tokens anyway, but
        # this keeps tokenization cheap on very long posts.
        texts = self.df[self.content_col].astype(str).str.slice(0, 512).tolist()

        results = self.emotion_classifier(
            texts,
            batch_size=64,
        )

        # With top_k=None every row carries the full label set, so the first
        # row's labels enumerate all emotion columns to create.
        labels = [r["label"] for r in results[0]]

        for label in labels:
            self.df[f"emotion_{label}"] = [
                # default 0.0 guards against a row unexpectedly missing a
                # label (next() would otherwise raise StopIteration)
                next((item["score"] for item in row if item["label"] == label), 0.0)
                for row in results
            ]

    def add_topic_col(self, confidence_threshold: float = 0.3) -> None:
        """Assign the best-matching topic (or "Misc") to every row of ``self.df``.

        Args:
            confidence_threshold: rows whose best cosine similarity is below
                this value are labelled "Misc".
        """
        titles = self.df[self.title_col].fillna("").astype(str)
        contents = self.df[self.content_col].fillna("").astype(str)

        # Title provides strong topical signal, so prepend it when present.
        texts = [
            f"{title}. {content}" if title else content
            for title, content in zip(titles, contents)
        ]

        text_embeddings = self.topic_model.encode(
            texts,
            normalize_embeddings=True,
        )

        # Similarity of every row against every topic description.
        sims = cosine_similarity(text_embeddings, self.topic_embeddings)

        # Best match per row.
        best_idx = sims.argmax(axis=1)
        best_score = sims.max(axis=1)

        self.df["topic"] = [self.topic_labels[i] for i in best_idx]
        self.df["topic_confidence"] = best_score
        self.df.loc[self.df["topic_confidence"] < confidence_threshold, "topic"] = "Misc"
@@ -5,7 +5,7 @@ import datetime
|
|||||||
|
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from server.nlp import add_emotion_cols, add_topic_col
|
from server.nlp import NLP
|
||||||
|
|
||||||
DOMAIN_STOPWORDS = {
|
DOMAIN_STOPWORDS = {
|
||||||
"www", "https", "http",
|
"www", "https", "http",
|
||||||
@@ -21,7 +21,7 @@ nltk.download('stopwords')
|
|||||||
EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS
|
EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS
|
||||||
|
|
||||||
class StatGen:
|
class StatGen:
|
||||||
def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: list) -> None:
|
def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: dict) -> None:
|
||||||
posts_df["type"] = "post"
|
posts_df["type"] = "post"
|
||||||
posts_df["parent_id"] = None
|
posts_df["parent_id"] = None
|
||||||
|
|
||||||
@@ -30,6 +30,7 @@ class StatGen:
|
|||||||
self.domain_topics = domain_topics
|
self.domain_topics = domain_topics
|
||||||
|
|
||||||
self.df = pd.concat([posts_df, comments_df])
|
self.df = pd.concat([posts_df, comments_df])
|
||||||
|
self.nlp = NLP(self.df, "title", "content", domain_topics)
|
||||||
self._add_extra_cols(self.df)
|
self._add_extra_cols(self.df)
|
||||||
|
|
||||||
self.original_df = self.df.copy(deep=True)
|
self.original_df = self.df.copy(deep=True)
|
||||||
@@ -41,8 +42,8 @@ class StatGen:
|
|||||||
df["hour"] = df["dt"].dt.hour
|
df["hour"] = df["dt"].dt.hour
|
||||||
df["weekday"] = df["dt"].dt.day_name()
|
df["weekday"] = df["dt"].dt.day_name()
|
||||||
|
|
||||||
add_emotion_cols(df, "content")
|
self.nlp.add_emotion_cols()
|
||||||
add_topic_col(df, "title", "content", self.domain_topics)
|
self.nlp.add_topic_col()
|
||||||
|
|
||||||
def _tokenize(self, text: str):
|
def _tokenize(self, text: str):
|
||||||
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
||||||
@@ -192,10 +193,14 @@ class StatGen:
|
|||||||
.reset_index(drop=True)
|
.reset_index(drop=True)
|
||||||
)
|
)
|
||||||
|
|
||||||
# average emotion by topic (excluding neutral and surprise)
|
emotion_exclusions = [
|
||||||
|
"emotion_neutral",
|
||||||
|
"emotion_surprise"
|
||||||
|
]
|
||||||
|
|
||||||
emotion_cols = [
|
emotion_cols = [
|
||||||
col for col in self.df.columns
|
col for col in self.df.columns
|
||||||
if col.startswith("emotion_") and col != "emotion_neutral"
|
if col.startswith("emotion_") and col not in emotion_exclusions
|
||||||
]
|
]
|
||||||
|
|
||||||
counts = (
|
counts = (
|
||||||
|
|||||||
Reference in New Issue
Block a user