diff --git a/server/nlp.py b/server/nlp.py
index 2f010fb..5a0dc60 100644
--- a/server/nlp.py
+++ b/server/nlp.py
@@ -5,75 +5,94 @@ from transformers import pipeline
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 
+class NLP:
+    """Attach emotion scores and a topic label to each row of a DataFrame.
+
+    Both models and the topic embeddings are created once in the constructor
+    and reused by ``add_emotion_cols`` and ``add_topic_col``, which write
+    their results into ``df`` in place.
+    """
+
+    def __init__(self, df: pd.DataFrame, title_col: str, content_col: str, topics: dict):
+        """Load the models and embed the topic descriptions.
+
+        Args:
+            df: Frame that receives the new columns.
+            title_col: Name of the column holding titles.
+            content_col: Name of the column holding body text.
+            topics: Mapping of topic label to the text embedded for it.
+        """
+        self.df = df
+        self.title_col = title_col
+        self.content_col = content_col
+        # Device strings are accepted by both SentenceTransformer and
+        # pipeline(). The integer form breaks the CPU fallback: a positive
+        # index selects a CUDA device, and pipeline() needs -1 for CPU.
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        # Topic model
+        self.topic_model = SentenceTransformer("all-mpnet-base-v2", device=self.device)
 
-model = SentenceTransformer("all-mpnet-base-v2", device=0 if torch.cuda.is_available() else 1)
+        self.topic_labels = list(topics.keys())
+        self.topic_texts = list(topics.values())
 
-def add_emotion_cols(
-        df: pd.DataFrame,
-        content_col: str
-    ) -> None:
-    emotion_classifier = pipeline(
-        "text-classification",
-        model="j-hartmann/emotion-english-distilroberta-base",
-        top_k=None,
-        truncation=True,
-        device=0 if torch.cuda.is_available() else -1
-    )
+        self.topic_embeddings = self.topic_model.encode(
+            self.topic_texts,
+            normalize_embeddings=True,
+        )
 
-    texts = df[content_col].astype(str).str.slice(0, 512).tolist()
+        # emotion model
+        self.emotion_classifier = pipeline(
+            "text-classification",
+            model="j-hartmann/emotion-english-distilroberta-base",
+            top_k=None,
+            truncation=True,
+            device=self.device
+        )
 
-    results = emotion_classifier(
-        texts,
-        batch_size=64
-    )
+    def add_emotion_cols(self) -> None:
+        """Add one ``emotion_<label>`` score column per classifier label."""
+        texts = self.df[self.content_col].astype(str).str.slice(0, 512).tolist()
 
-    labels = [r["label"] for r in results[0]]
+        results = self.emotion_classifier(
+            texts,
+            batch_size=64
+        )
 
-    for label in labels:
-        df[f"emotion_{label}"] = [
-            next(item["score"] for item in row if item["label"] == label)
-            for row in results
+        labels = [r["label"] for r in results[0]]
+
+        for label in labels:
+            self.df[f"emotion_{label}"] = [
+                next(item["score"] for item in row if item["label"] == label)
+                for row in results
+            ]
+
+    def add_topic_col(self, confidence_threshold: float = 0.3) -> None:
+        """Add ``topic`` and ``topic_confidence`` columns.
+
+        Each row gets the label of its most similar topic; rows whose best
+        similarity is below ``confidence_threshold`` are labelled "Misc".
+        """
+        titles = self.df[self.title_col].fillna("").astype(str)
+        contents = self.df[self.content_col].fillna("").astype(str)
+
+        texts = [
+            f"{title}. {content}" if title else content
+            for title, content in zip(titles, contents)
         ]
 
-def add_topic_col(
-        df: pd.DataFrame,
-        title_col: str,
-        content_col: str,
-        domain_topics: dict,
-        confidence_threshold: float = 0.3
-    ) -> None:
+        text_embeddings = self.topic_model.encode(
+            texts,
+            normalize_embeddings=True,
+        )
 
-    topic_labels = list(domain_topics.keys())
-    topic_texts = list(domain_topics.values())
+        # Similarity
+        sims = cosine_similarity(text_embeddings, self.topic_embeddings)
 
-    topic_embeddings = model.encode(
-        topic_texts,
-        normalize_embeddings=True,
-    )
+        # Best match
+        best_idx = sims.argmax(axis=1)
+        best_score = sims.max(axis=1)
 
-    titles = df[title_col].fillna("").astype(str)
-    contents = df[content_col].fillna("").astype(str)
-
-    texts = [
-        f"{title}. {content}" if title else content
-        for title, content in zip(titles, contents)
-    ]
-
-    text_embeddings = model.encode(
-        texts,
-        normalize_embeddings=True,
-    )
-
-    # Similarity
-    sims = cosine_similarity(text_embeddings, topic_embeddings)
-
-    # Best match
-    best_idx = sims.argmax(axis=1)
-    best_score = sims.max(axis=1)
-
-    df["topic"] = [topic_labels[i] for i in best_idx]
-    df["topic_confidence"] = best_score
-
-    df.loc[df["topic_confidence"] < confidence_threshold, "topic"] = "Misc"
-
-    return df
\ No newline at end of file
+        self.df["topic"] = [self.topic_labels[i] for i in best_idx]
+        self.df["topic_confidence"] = best_score
+        self.df.loc[self.df["topic_confidence"] < confidence_threshold, "topic"] = "Misc"
\ No newline at end of file
diff --git a/server/stat_gen.py b/server/stat_gen.py
index b63d377..2ffa5fb 100644
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -5,7 +5,7 @@ import datetime
 from nltk.corpus import stopwords
 from collections import Counter
 
-from server.nlp import add_emotion_cols, add_topic_col
+from server.nlp import NLP
 
 DOMAIN_STOPWORDS = {
     "www", "https", "http",
@@ -21,7 +21,7 @@ nltk.download('stopwords')
 EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS
 
 class StatGen:
-    def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: list) -> None:
+    def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: dict) -> None:
         posts_df["type"] = "post"
         posts_df["parent_id"] = None
 
@@ -30,6 +30,7 @@ class StatGen:
         self.domain_topics = domain_topics
 
         self.df = pd.concat([posts_df, comments_df])
+        self.nlp = NLP(self.df, "title", "content", domain_topics)
         self._add_extra_cols(self.df)
 
         self.original_df = self.df.copy(deep=True)
@@ -41,8 +42,8 @@ class StatGen:
         df["hour"] = df["dt"].dt.hour
         df["weekday"] = df["dt"].dt.day_name()
 
-        add_emotion_cols(df, "content")
-        add_topic_col(df, "title", "content", self.domain_topics)
+        self.nlp.add_emotion_cols()
+        self.nlp.add_topic_col()
 
     def _tokenize(self, text: str):
         tokens = re.findall(r"\b[a-z]{3,}\b", text)
@@ -192,10 +193,14 @@ class StatGen:
             .reset_index(drop=True)
         )
 
-        # avearge emotion by topic (excluding neutral)
+        emotion_exclusions = [
+            "emotion_neutral",
+            "emotion_surprise"
+        ]
+
         emotion_cols = [
             col for col in self.df.columns
-            if col.startswith("emotion_") and col != "emotion_neutral"
+            if col.startswith("emotion_") and col not in emotion_exclusions
         ]
 
         counts = (