feat: add dataset processor class

This commit is contained in:
2026-03-01 15:01:34 +00:00
parent 2493c6d465
commit 0be9ff4896

View File

@@ -0,0 +1,39 @@
import pandas as pd
from server.analysis.nlp import NLP
class DatasetProcessor:
    """Flatten posts with nested comments into one frame and enrich it.

    On construction the input frame is reshaped so that every post and
    every comment becomes its own row (see ``_explode_comments``), and an
    ``NLP`` helper is created over the flattened frame. ``enrich`` then
    adds time-derived columns and invokes the NLP helper's column builders.
    """

    def __init__(self, df, topics):
        """Flatten ``df`` and set up the NLP helper.

        Args:
            df: DataFrame with at least a ``comments`` column; each cell is
                expected to hold a list of comment dicts (non-dict entries
                are ignored).
            topics: Passed through unchanged to ``NLP``.
        """
        self.df = self._explode_comments(df)
        self.topics = topics
        # "title" / "content" are presumably the text column names NLP
        # reads from self.df -- confirm against the NLP class.
        self.nlp = NLP(self.df, "title", "content", self.topics)

    def _explode_comments(self, df) -> pd.DataFrame:
        """Return posts and their comments stacked as rows of one frame.

        Post rows are ``df`` without the ``comments`` column, tagged
        ``type == "post"`` with ``parent_id`` set to ``None``. Comment rows
        are the dict entries found in ``comments``, flattened with
        ``pd.json_normalize`` and tagged ``type == "comment"``; their
        ``parent_id`` is taken from the comment's own ``post_id`` field
        (``None`` when no comment carries one). The ``post_id`` column is
        dropped from the result. The input frame is not modified.

        The result has a fresh 0..n-1 index so that post rows and comment
        rows never share an index label.
        """
        # drop() returns a new frame, so tagging it does not touch `df`.
        posts_df = df.drop(columns=["comments"])
        posts_df["type"] = "post"
        posts_df["parent_id"] = None

        # One entry per comment; empty lists / missing values explode to
        # NaN and are filtered out by the dict check.
        exploded = df["comments"].explode()
        comment_records = [c for c in exploded if isinstance(c, dict)]
        if not comment_records:
            # Nothing to normalise: avoid concatenating an empty frame.
            return posts_df.reset_index(drop=True)

        # A plain list keeps json_normalize's output index at 0..n-1
        # regardless of how the installed pandas treats Series input.
        comments_df = pd.json_normalize(comment_records)
        comments_df["type"] = "comment"
        comments_df["parent_id"] = comments_df.get("post_id")

        # ignore_index avoids duplicate labels between posts and comments,
        # which would make label-based lookups and aligned assignment
        # ambiguous.
        combined = pd.concat([posts_df, comments_df], ignore_index=True)
        return combined.drop(columns=["post_id"], errors="ignore")

    def enrich(self) -> pd.DataFrame:
        """Add time-derived columns, run the NLP column builders, return the frame.

        ``timestamp`` is coerced to numeric (unparseable values become NaN)
        and interpreted as seconds since the Unix epoch in UTC. Columns
        written: ``timestamp``, ``date``, ``dt``, ``hour``, ``weekday``.
        Afterwards ``add_emotion_cols``, ``add_topic_col`` and
        ``add_ner_cols`` are called on the NLP helper (presumably these add
        columns to the same frame -- confirm against the NLP class).

        Returns:
            ``self.df``, mutated in place.
        """
        frame = self.df
        frame["timestamp"] = pd.to_numeric(frame["timestamp"], errors="coerce")

        # Parse once; the UTC calendar date equals the naive epoch-based date.
        moments = pd.to_datetime(frame["timestamp"], unit="s", utc=True)
        frame["date"] = moments.dt.date
        frame["dt"] = moments
        frame["hour"] = frame["dt"].dt.hour
        frame["weekday"] = frame["dt"].dt.day_name()

        self.nlp.add_emotion_cols()
        self.nlp.add_topic_col()
        self.nlp.add_ner_cols()
        return self.df