crosspost/server/nlp.py

import torch
import pandas as pd

from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class NLP:
    def __init__(self, df: pd.DataFrame, title_col: str, content_col: str, topics: dict):
        self.df = df
        self.title_col = title_col
        self.content_col = content_col
        self.device = 0 if torch.cuda.is_available() else 1

        # Topic model
        self.topic_model = SentenceTransformer("all-mpnet-base-v2", device=self.device)

        self.topic_labels = list(topics.keys())
        self.topic_texts = list(topics.values())

        self.topic_embeddings = self.topic_model.encode(
            self.topic_texts,
            normalize_embeddings=True,
        )

        # emotion model
        self.emotion_classifier = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            top_k=None,
            truncation=True,
            device=self.device
        )

    def add_emotion_cols(self) -> None:
        texts = self.df[self.content_col].astype(str).str.slice(0, 512).tolist()

        results = self.emotion_classifier(
            texts,
            batch_size=64
        )

        labels = [r["label"] for r in results[0]]

        for label in labels:
            self.df[f"emotion_{label}"] = [
                next(item["score"] for item in row if item["label"] == label)
                for row in results
            ]

    def add_topic_col(self, confidence_threshold: float = 0.3) -> None:
        titles = self.df[self.title_col].fillna("").astype(str)
        contents = self.df[self.content_col].fillna("").astype(str)

        texts = [
            f"{title}. {content}" if title else content
            for title, content in zip(titles, contents)
        ]

        text_embeddings = self.topic_model.encode(
            texts,
            normalize_embeddings=True,
        )

        # Similarity
        sims = cosine_similarity(text_embeddings, self.topic_embeddings)

        # Best match
        best_idx = sims.argmax(axis=1)
        best_score = sims.max(axis=1)

        self.df["topic"] = [self.topic_labels[i] for i in best_idx]
        self.df["topic_confidence"] = best_score
        self.df.loc[self.df["topic_confidence"] < confidence_threshold, "topic"] = "Misc"