Files
crosspost/server/nlp.py

67 lines
1.7 KiB
Python

import torch
import pandas as pd
import numpy as np
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Shared sentence-embedding model, loaded once at import time and reused by
# add_topic_col. The device is given as an explicit torch device string: an
# integer is interpreted as a CUDA device index, so the previous CPU fallback
# of `1` pointed at a second GPU instead of the CPU.
model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
)
def add_emotion_cols(
    df: pd.DataFrame,
    content_col: str
) -> None:
    """Add one ``emotion_<label>`` score column per emotion label to *df*.

    The text in ``df[content_col]`` is cast to ``str``, cut to its first 512
    characters, and scored with the
    ``j-hartmann/emotion-english-distilroberta-base`` text-classification
    pipeline. *df* is modified in place; nothing is returned.

    Args:
        df: DataFrame to annotate in place.
        content_col: Name of the column holding the text to classify.

    Raises:
        KeyError: If ``content_col`` is not a column of *df*, or if a row's
            classifier output is missing one of the labels found in the
            first row.
    """
    emotion_classifier = pipeline(
        "text-classification",
        model="j-hartmann/emotion-english-distilroberta-base",
        top_k=None,
        truncation=True,
        device=0 if torch.cuda.is_available() else -1
    )

    texts = df[content_col].astype(str).str.slice(0, 512).tolist()

    if not texts:
        # Nothing to classify, so there is no first result to read the label
        # names from. Take them from the model config instead so that an
        # empty frame still ends up with the same (empty, float) columns.
        for label in emotion_classifier.model.config.id2label.values():
            df[f"emotion_{label}"] = np.array([], dtype=float)
        return

    results = emotion_classifier(
        texts,
        batch_size=64
    )

    # Each row of `results` is a list of {"label": ..., "score": ...} dicts;
    # index every row by label once instead of scanning it per label.
    scores_by_row = [
        {item["label"]: item["score"] for item in row}
        for row in results
    ]

    # Column order follows the label order of the first row's output.
    labels = [item["label"] for item in results[0]]

    for label in labels:
        df[f"emotion_{label}"] = [scores[label] for scores in scores_by_row]
def add_topic_col(
    df: pd.DataFrame,
    content_col: str,
    domain_topics: list[str],
    confidence_threshold: float = 0.15
) -> pd.DataFrame:
    """Label each row of *df* with its most similar topic from *domain_topics*.

    Both the topics and the text in ``df[content_col]`` (cast to ``str``) are
    embedded with the module-level ``model`` using normalized embeddings. For
    every row the topic with the highest cosine similarity is written to a
    ``"topic"`` column and that similarity to ``"topic_confidence"``. Rows
    whose best similarity is below *confidence_threshold* get the topic
    ``"Misc"``.

    Args:
        df: DataFrame to annotate; it is modified in place.
        content_col: Name of the column holding the text to classify.
        domain_topics: Candidate topic strings.
        confidence_threshold: Minimum similarity for a topic to be kept.

    Returns:
        The same *df* object, with ``"topic"`` and ``"topic_confidence"``
        added (returned for convenience; the mutation happens in place).

    Raises:
        ValueError: If *domain_topics* is empty.
        KeyError: If ``content_col`` is not a column of *df*.
    """
    if len(domain_topics) == 0:
        # Without candidates there is nothing to take an argmax over; fail
        # with a clear message instead of an error from deep in numpy/sklearn.
        raise ValueError("domain_topics must contain at least one topic")

    texts = df[content_col].astype(str).tolist()

    if not texts:
        # Empty frame: skip the encoder (its empty output cannot be fed to
        # the similarity computation) but still add both columns.
        df["topic"] = np.array([], dtype=object)
        df["topic_confidence"] = np.array([], dtype=float)
        return df

    topic_embeddings = model.encode(
        domain_topics,
        normalize_embeddings=True,
    )

    text_embeddings = model.encode(
        texts,
        normalize_embeddings=True,
    )

    # Similarity matrix: one row per text, one column per topic.
    sims = cosine_similarity(text_embeddings, topic_embeddings)

    # Best match per text and its similarity score.
    best_idx = sims.argmax(axis=1)
    best_score = sims.max(axis=1)

    df["topic"] = [domain_topics[i] for i in best_idx]
    df["topic_confidence"] = best_score

    # Low-confidence matches are collapsed into a catch-all bucket.
    df.loc[df["topic_confidence"] < confidence_threshold, "topic"] = "Misc"

    return df