refactor: unify NLP processing into a class

Also removed the surprise emotion from the content endpoint.
This commit is contained in:
2026-02-08 16:33:27 +00:00
parent f136e7b7c8
commit e7ffb58c3d
2 changed files with 67 additions and 67 deletions

View File

@@ -5,24 +5,37 @@ from transformers import pipeline
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import cosine_similarity
class NLP:
def __init__(self, df: pd.DataFrame, title_col: str, content_col: str, topics: dict):
self.df = df
self.title_col = title_col
self.content_col = content_col
self.device = 0 if torch.cuda.is_available() else 1
model = SentenceTransformer("all-mpnet-base-v2", device=0 if torch.cuda.is_available() else 1) # Topic model
self.topic_model = SentenceTransformer("all-mpnet-base-v2", device=self.device)
def add_emotion_cols( self.topic_labels = list(topics.keys())
df: pd.DataFrame, self.topic_texts = list(topics.values())
content_col: str
) -> None: self.topic_embeddings = self.topic_model.encode(
emotion_classifier = pipeline( self.topic_texts,
normalize_embeddings=True,
)
# emotion model
self.emotion_classifier = pipeline(
"text-classification", "text-classification",
model="j-hartmann/emotion-english-distilroberta-base", model="j-hartmann/emotion-english-distilroberta-base",
top_k=None, top_k=None,
truncation=True, truncation=True,
device=0 if torch.cuda.is_available() else -1 device=self.device
) )
texts = df[content_col].astype(str).str.slice(0, 512).tolist() def add_emotion_cols(self) -> None:
texts = self.df[self.content_col].astype(str).str.slice(0, 512).tolist()
results = emotion_classifier( results = self.emotion_classifier(
texts, texts,
batch_size=64 batch_size=64
) )
@@ -30,50 +43,32 @@ def add_emotion_cols(
labels = [r["label"] for r in results[0]] labels = [r["label"] for r in results[0]]
for label in labels: for label in labels:
df[f"emotion_{label}"] = [ self.df[f"emotion_{label}"] = [
next(item["score"] for item in row if item["label"] == label) next(item["score"] for item in row if item["label"] == label)
for row in results for row in results
] ]
def add_topic_col( def add_topic_col(self, confidence_threshold: float = 0.3) -> None:
df: pd.DataFrame, titles = self.df[self.title_col].fillna("").astype(str)
title_col: str, contents = self.df[self.content_col].fillna("").astype(str)
content_col: str,
domain_topics: dict,
confidence_threshold: float = 0.3
) -> None:
topic_labels = list(domain_topics.keys())
topic_texts = list(domain_topics.values())
topic_embeddings = model.encode(
topic_texts,
normalize_embeddings=True,
)
titles = df[title_col].fillna("").astype(str)
contents = df[content_col].fillna("").astype(str)
texts = [ texts = [
f"{title}. {content}" if title else content f"{title}. {content}" if title else content
for title, content in zip(titles, contents) for title, content in zip(titles, contents)
] ]
text_embeddings = model.encode( text_embeddings = self.topic_model.encode(
texts, texts,
normalize_embeddings=True, normalize_embeddings=True,
) )
# Similarity # Similarity
sims = cosine_similarity(text_embeddings, topic_embeddings) sims = cosine_similarity(text_embeddings, self.topic_embeddings)
# Best match # Best match
best_idx = sims.argmax(axis=1) best_idx = sims.argmax(axis=1)
best_score = sims.max(axis=1) best_score = sims.max(axis=1)
df["topic"] = [topic_labels[i] for i in best_idx] self.df["topic"] = [self.topic_labels[i] for i in best_idx]
df["topic_confidence"] = best_score self.df["topic_confidence"] = best_score
self.df.loc[self.df["topic_confidence"] < confidence_threshold, "topic"] = "Misc"
df.loc[df["topic_confidence"] < confidence_threshold, "topic"] = "Misc"
return df

View File

@@ -5,7 +5,7 @@ import datetime
from nltk.corpus import stopwords from nltk.corpus import stopwords
from collections import Counter from collections import Counter
from server.nlp import add_emotion_cols, add_topic_col from server.nlp import NLP
DOMAIN_STOPWORDS = { DOMAIN_STOPWORDS = {
"www", "https", "http", "www", "https", "http",
@@ -21,7 +21,7 @@ nltk.download('stopwords')
EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS
class StatGen: class StatGen:
def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: list) -> None: def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: dict) -> None:
posts_df["type"] = "post" posts_df["type"] = "post"
posts_df["parent_id"] = None posts_df["parent_id"] = None
@@ -30,6 +30,7 @@ class StatGen:
self.domain_topics = domain_topics self.domain_topics = domain_topics
self.df = pd.concat([posts_df, comments_df]) self.df = pd.concat([posts_df, comments_df])
self.nlp = NLP(self.df, "title", "content", domain_topics)
self._add_extra_cols(self.df) self._add_extra_cols(self.df)
self.original_df = self.df.copy(deep=True) self.original_df = self.df.copy(deep=True)
@@ -41,8 +42,8 @@ class StatGen:
df["hour"] = df["dt"].dt.hour df["hour"] = df["dt"].dt.hour
df["weekday"] = df["dt"].dt.day_name() df["weekday"] = df["dt"].dt.day_name()
add_emotion_cols(df, "content") self.nlp.add_emotion_cols()
add_topic_col(df, "title", "content", self.domain_topics) self.nlp.add_topic_col()
def _tokenize(self, text: str): def _tokenize(self, text: str):
tokens = re.findall(r"\b[a-z]{3,}\b", text) tokens = re.findall(r"\b[a-z]{3,}\b", text)
@@ -192,10 +193,14 @@ class StatGen:
.reset_index(drop=True) .reset_index(drop=True)
) )
# average emotion by topic (excluding neutral) emotion_exclusions = [
"emotion_neutral",
"emotion_surprise"
]
emotion_cols = [ emotion_cols = [
col for col in self.df.columns col for col in self.df.columns
if col.startswith("emotion_") and col != "emotion_neutral" if col.startswith("emotion_") and col not in emotion_exclusions
] ]
counts = ( counts = (