67 lines
1.7 KiB
Python
67 lines
1.7 KiB
Python
import torch
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
from transformers import pipeline
|
|
from sentence_transformers import SentenceTransformer
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
|
# Sentence-embedding model, loaded once at import time and used by
# add_topic_col. The device is given by name so that a machine without CUDA
# falls back to the CPU; an integer index would be treated as a CUDA device
# ordinal, which cannot be satisfied when no GPU is present.
model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device="cuda" if torch.cuda.is_available() else "cpu",
)
|
|
|
|
def add_emotion_cols(
    df: pd.DataFrame,
    content_col: str
) -> None:
    """Add one ``emotion_<label>`` score column per emotion label to ``df``.

    The frame is modified in place. Each value of ``content_col`` is cast to
    ``str``, cut to its first 512 characters, and scored with the
    ``j-hartmann/emotion-english-distilroberta-base`` text-classification
    pipeline (``top_k=None``, so a score is returned for every label).

    Args:
        df: Frame to annotate; receives the new columns.
        content_col: Name of the column holding the text to classify.

    Returns:
        None. ``df`` is mutated.

    Raises:
        KeyError: If ``content_col`` is not a column of ``df``.
    """
    emotion_classifier = pipeline(
        "text-classification",
        model="j-hartmann/emotion-english-distilroberta-base",
        top_k=None,
        truncation=True,
        # Pipeline convention: 0 selects the first GPU, -1 selects the CPU.
        device=0 if torch.cuda.is_available() else -1
    )

    texts = df[content_col].astype(str).str.slice(0, 512).tolist()

    if not texts:
        # Nothing to classify, so there is no first result to read labels
        # from. Take the label set from the model config instead so that an
        # empty frame still ends up with the same emotion columns.
        for label in emotion_classifier.model.config.id2label.values():
            df[f"emotion_{label}"] = pd.Series(index=df.index, dtype="float64")
        return

    results = emotion_classifier(
        texts,
        batch_size=64
    )

    # One label -> score mapping per row, built once, so that filling each
    # column is a dict lookup rather than a scan of the row's result list.
    score_maps = [
        {item["label"]: item["score"] for item in row}
        for row in results
    ]

    # Column order follows the label order of the first result row.
    for label in [item["label"] for item in results[0]]:
        df[f"emotion_{label}"] = [scores[label] for scores in score_maps]
|
|
|
|
def add_topic_col(
    df: pd.DataFrame,
    content_col: str,
    domain_topics: list[str],
    confidence_threshold: float = 0.15
) -> pd.DataFrame:
    """Label each row of ``df`` with its closest topic from ``domain_topics``.

    Texts and topics are embedded with the module-level ``model`` and compared
    by cosine similarity. Two columns are written to ``df`` in place:

    * ``topic`` - the best-matching entry of ``domain_topics``, or ``"Misc"``
      when the best similarity is below ``confidence_threshold``.
    * ``topic_confidence`` - the best similarity score for the row.

    Args:
        df: Frame to annotate; receives the new columns.
        content_col: Name of the column holding the text to match.
        domain_topics: Candidate topic strings; must not be empty.
        confidence_threshold: Minimum similarity for a topic to be kept.

    Returns:
        The same ``df`` object that was passed in, after modification.

    Raises:
        ValueError: If ``domain_topics`` is empty.
        KeyError: If ``content_col`` is not a column of ``df``.
    """
    if not domain_topics:
        # Without candidates there is no best match to pick; fail with a clear
        # message instead of an opaque error from the similarity/argmax step.
        raise ValueError("domain_topics must not be empty")

    texts = df[content_col].astype(str).tolist()

    if not texts:
        # Empty frame: add the columns with no rows and skip encoding.
        df["topic"] = pd.Series(index=df.index, dtype="object")
        df["topic_confidence"] = pd.Series(index=df.index, dtype="float64")
        return df

    topic_embeddings = model.encode(
        domain_topics,
        normalize_embeddings=True,
    )
    text_embeddings = model.encode(
        texts,
        normalize_embeddings=True,
    )

    # Similarity: one row per text, one column per topic.
    sims = cosine_similarity(text_embeddings, topic_embeddings)

    # Best match per text.
    best_idx = sims.argmax(axis=1)
    best_score = sims.max(axis=1)

    # Rows whose best score falls below the threshold are bucketed as "Misc".
    df["topic"] = [
        domain_topics[i] if score >= confidence_threshold else "Misc"
        for i, score in zip(best_idx, best_score)
    ]
    df["topic_confidence"] = best_score

    return df