refactor: extract temporal analysis into its own class

This commit is contained in:
2026-02-17 17:35:28 +00:00
parent 563212c98e
commit 70b34036db
2 changed files with 76 additions and 66 deletions

View File

@@ -0,0 +1,70 @@
import pandas as pd
class TemporalAnalysis:
    """Time-based analyses over a dataframe of posts and comments.

    Expects columns: ``id``, ``type``, ``reply_to``, ``dt`` (datetime),
    ``date``, ``weekday``, ``hour``, plus per-row ``emotion_*`` score
    columns. (Schema inferred from usage below — confirm against caller.)
    """

    def __init__(self, df: pd.DataFrame):
        # Held by reference, not copied: heatmap() converts the "weekday"
        # column of this shared frame to an ordered Categorical in place.
        self.df = df

    def avg_reply_time_per_emotion(self) -> list:
        """Return mean reply latency (seconds) grouped by dominant emotion.

        Only comment rows with a non-empty ``reply_to`` are considered.
        Replies whose parent id is not present in the frame get a null
        reply time, which the mean/count aggregation ignores.

        Returns:
            List of records ``{"dominant_emotion", "mean", "count"}``,
            one per emotion; empty list when there are no replies.
        """
        df = self.df.copy()
        replies = df[
            (df["type"] == "comment")
            & (df["reply_to"].notna())
            & (df["reply_to"] != "")
        ].copy()  # .copy(): the column assignments below must not hit a slice view
        if replies.empty:
            # Guard: .apply(axis=1) on an empty frame does not yield a column.
            return []
        id_to_time = df.set_index("id")["dt"].to_dict()

        def compute_reply_time(row):
            # Latency from parent event to this reply; None if parent unknown.
            parent_time = id_to_time.get(row["reply_to"])
            if parent_time is None:
                return None
            return (row["dt"] - parent_time).total_seconds()

        replies["reply_time"] = replies.apply(compute_reply_time, axis=1)
        # Neutral/surprise are excluded so a substantive emotion wins the argmax.
        emotion_cols = [
            col for col in df.columns
            if col.startswith("emotion_")
            and col not in ("emotion_neutral", "emotion_surprise")
        ]
        replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1)
        grouped = (
            replies
            .groupby("dominant_emotion")["reply_time"]
            .agg(["mean", "count"])
            .reset_index()
        )
        return grouped.to_dict(orient="records")

    def posts_per_day(self) -> list:
        """Return one ``{"date": ..., "count": ...}`` record per calendar day."""
        per_day = (
            self.df.groupby("date")
            .size()
            .reset_index(name="count")
        )
        return per_day.to_dict(orient="records")

    def heatmap(self) -> list:
        """Return a weekday-by-hour activity matrix as a list of row records.

        Each record maps hour labels ``"0"``..``"23"`` to event counts;
        rows are ordered Monday..Sunday, only observed weekdays included.

        NOTE: side effect — converts ``self.df["weekday"]`` to an ordered
        Categorical in place so rows sort chronologically, not alphabetically.
        NOTE: ``orient="records"`` drops the weekday row labels; callers only
        get the row order, not the weekday names.
        """
        weekday_order = [
            "Monday", "Tuesday", "Wednesday",
            "Thursday", "Friday", "Saturday", "Sunday",
        ]
        self.df["weekday"] = pd.Categorical(
            self.df["weekday"],
            categories=weekday_order,
            ordered=True,
        )
        heatmap = (
            self.df
            .groupby(["weekday", "hour"], observed=True)
            .size()
            .unstack(fill_value=0)
            # Make every hour column present even if never observed.
            .reindex(columns=range(24), fill_value=0)
        )
        # String keys so the records serialize cleanly to JSON.
        heatmap.columns = heatmap.columns.map(str)
        return heatmap.to_dict(orient="records")

View File

@@ -6,6 +6,7 @@ import datetime
from nltk.corpus import stopwords from nltk.corpus import stopwords
from collections import Counter from collections import Counter
from server.nlp import NLP from server.nlp import NLP
from server.analysis.temporal import TemporalAnalysis
DOMAIN_STOPWORDS = { DOMAIN_STOPWORDS = {
"www", "https", "http", "www", "https", "http",
@@ -39,6 +40,8 @@ class StatGen:
self.nlp = NLP(self.df, "title", "content", domain_topics) self.nlp = NLP(self.df, "title", "content", domain_topics)
self._add_extra_cols(self.df) self._add_extra_cols(self.df)
self.temporal_analysis = TemporalAnalysis(self.df)
self.original_df = self.df.copy(deep=True) self.original_df = self.df.copy(deep=True)
## Private Methods ## Private Methods
@@ -117,75 +120,12 @@ class StatGen:
interactions[a][b] = interactions[a].get(b, 0) + 1 interactions[a][b] = interactions[a].get(b, 0) + 1
return interactions return interactions
def _avg_reply_time_per_emotion(self):
df = self.df.copy()
replies = df[
(df["type"] == "comment") &
(df["reply_to"].notna()) &
(df["reply_to"] != "")
]
id_to_time = df.set_index("id")["dt"].to_dict()
def compute_reply_time(row):
reply_id = row["reply_to"]
parent_time = id_to_time.get(reply_id)
if parent_time is None:
return None
return (row["dt"] - parent_time).total_seconds()
replies["reply_time"] = replies.apply(compute_reply_time, axis=1)
emotion_cols = [col for col in df.columns if col.startswith("emotion_") and col not in ("emotion_neutral", "emotion_surprise")]
replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1)
grouped = (
replies
.groupby("dominant_emotion")["reply_time"]
.agg(["mean", "count"])
.reset_index()
)
return grouped.to_dict(orient="records")
## Public ## Public
def time_analysis(self) -> pd.DataFrame: def time_analysis(self) -> pd.DataFrame:
per_day = (
self.df.groupby("date")
.size()
.reset_index(name="count")
)
weekday_order = [
"Monday", "Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday", "Sunday"
]
self.df["weekday"] = pd.Categorical(
self.df["weekday"],
categories=weekday_order,
ordered=True
)
heatmap = (
self.df
.groupby(["weekday", "hour"], observed=True)
.size()
.unstack(fill_value=0)
.reindex(columns=range(24), fill_value=0)
)
heatmap.columns = heatmap.columns.map(str)
burst_index = per_day["count"].std() / max(per_day["count"].mean(), 1)
return { return {
"events_per_day": per_day.to_dict(orient="records"), "events_per_day": self.temporal_analysis.posts_per_day(),
"weekday_hour_heatmap": heatmap.to_dict(orient="records"), "weekday_hour_heatmap": self.temporal_analysis.heatmap()
"burstiness": round(burst_index, 2)
} }
def summary(self) -> dict: def summary(self) -> dict:
@@ -269,7 +209,7 @@ class StatGen:
return { return {
"word_frequencies": word_frequencies.to_dict(orient='records'), "word_frequencies": word_frequencies.to_dict(orient='records'),
"average_emotion_by_topic": avg_emotion_by_topic.to_dict(orient='records'), "average_emotion_by_topic": avg_emotion_by_topic.to_dict(orient='records'),
"reply_time_by_emotion": self._avg_reply_time_per_emotion() "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
} }
def user_analysis(self) -> dict: def user_analysis(self) -> dict: