refactor: update analysis classes to accept DataFrame as parameter instead of instance variable

This commit is contained in:
2026-03-01 16:25:39 +00:00
parent d20790ed4b
commit 07ab7529a9
7 changed files with 403 additions and 389 deletions

View File

@@ -1,170 +1,135 @@
import datetime

import nltk
import pandas as pd
from nltk.corpus import stopwords

from server.analysis.cultural import CulturalAnalysis
from server.analysis.emotional import EmotionalAnalysis
from server.analysis.interactional import InteractionAnalysis
from server.analysis.linguistic import LinguisticAnalysis
from server.analysis.nlp import NLP
from server.analysis.temporal import TemporalAnalysis
# Forum-specific tokens that are frequent in this corpus but carry no
# analytical signal (URL fragments, boards.ie site vocabulary, generic
# discussion words). Filtered out together with NLTK's English stopwords.
DOMAIN_STOPWORDS = {
    "www",
    "https",
    "http",
    "boards",
    "boardsie",
    "comment",
    "comments",
    "discussion",
    "thread",
    "post",
    "posts",
    "would",
    "get",
    "one",
}

# Fetch the stopword corpus if it is not already cached locally.
nltk.download("stopwords")

# Combined exclusion set shared by the linguistic and interaction analysers.
EXCLUDE_WORDS = set(stopwords.words("english")) | DOMAIN_STOPWORDS
class StatGen:
    """Facade that aggregates the individual analysis helpers.

    The class is stateless with respect to the data: every public method
    receives the prepared events DataFrame as a parameter instead of the
    instance holding it (see commit message). The expected frame contains
    one row per event (post or comment) with at least the columns
    ``type`` ("post"/"comment"), ``author``, ``dt`` (timezone-aware
    datetime) and ``source``.
    """

    def __init__(self) -> None:
        # Analysis helpers are constructed once and reused; the word
        # analysers share the module-level exclusion set.
        self.temporal_analysis = TemporalAnalysis()
        self.emotional_analysis = EmotionalAnalysis()
        self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS)
        self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS)
        self.cultural_analysis = CulturalAnalysis()

    def get_time_analysis(self, df: pd.DataFrame) -> dict:
        """Temporal statistics: daily volume and weekday/hour heatmap."""
        return {
            "events_per_day": self.temporal_analysis.posts_per_day(df),
            "weekday_hour_heatmap": self.temporal_analysis.heatmap(df),
        }

    def get_content_analysis(self, df: pd.DataFrame) -> dict:
        """Linguistic/emotional content statistics (word and n-gram
        frequencies, emotion-by-topic averages, reply latency by emotion)."""
        return {
            "word_frequencies": self.linguistic_analysis.word_frequencies(df),
            "common_two_phrases": self.linguistic_analysis.ngrams(df),
            "common_three_phrases": self.linguistic_analysis.ngrams(df, n=3),
            "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(df),
            "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(df),
        }

    def get_user_analysis(self, df: pd.DataFrame) -> dict:
        """Per-user statistics and the user interaction graph."""
        return {
            "top_users": self.interaction_analysis.top_users(df),
            "users": self.interaction_analysis.per_user_analysis(df),
            "interaction_graph": self.interaction_analysis.interaction_graph(df),
        }

    def get_interactional_analysis(self, df: pd.DataFrame) -> dict:
        """Thread-structure statistics (depth, length by emotion)."""
        return {
            "average_thread_depth": self.interaction_analysis.average_thread_depth(df),
            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(df),
        }

    def get_cultural_analysis(self, df: pd.DataFrame) -> dict:
        """Cultural markers: identity/stance markers and entity salience."""
        return {
            "identity_markers": self.cultural_analysis.get_identity_markers(df),
            "stance_markers": self.cultural_analysis.get_stance_markers(df),
            "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(df),
        }

    def summary(self, df: pd.DataFrame) -> dict:
        """High-level corpus summary computed directly from *df*.

        Returns plain JSON-serializable values (numpy scalars are cast
        to ``int``/``float``) so the result can be sent over the API.
        """
        total_posts = (df["type"] == "post").sum()
        total_comments = (df["type"] == "comment").sum()
        events_per_user = df.groupby("author").size()
        return {
            "total_events": int(len(df)),
            "total_posts": int(total_posts),
            "total_comments": int(total_comments),
            "unique_users": int(events_per_user.count()),
            # max(..., 1) guards against division by zero on comment-only data.
            "comments_per_post": round(total_comments / max(total_posts, 1), 2),
            # Share of authors with exactly one event ("lurkers").
            "lurker_ratio": round((events_per_user == 1).mean(), 2),
            "time_range": {
                "start": int(df["dt"].min().timestamp()),
                "end": int(df["dt"].max().timestamp()),
            },
            "sources": df["source"].dropna().unique().tolist(),
        }

    # NOTE(review): the filtering helpers below were disabled when the class
    # became stateless; retained for reference until the parameterised
    # replacements land.
    # def filter_by_query(self, df: pd.DataFrame, search_query: str) -> dict:
    #     filtered_df = df[df["content"].str.contains(search_query, na=False)]
    #     return {
    #         "rows": len(filtered_df),
    #         "data": filtered_df.to_dict(orient="records"),
    #     }

    # def set_time_range(
    #     self,
    #     original_df: pd.DataFrame,
    #     start: datetime.datetime,
    #     end: datetime.datetime,
    # ) -> dict:
    #     df = self._prepare_df(original_df)
    #     filtered_df = df[(df["dt"] >= start) & (df["dt"] <= end)]
    #     return {
    #         "rows": len(filtered_df),
    #         "data": filtered_df.to_dict(orient="records"),
    #     }

    # def filter_data_sources(
    #     self, original_df: pd.DataFrame, data_sources: dict
    # ) -> dict:
    #     df = self._prepare_df(original_df)
    #     enabled_sources = [src for src, enabled in data_sources.items() if enabled]
    #     if not enabled_sources:
    #         raise ValueError("Please choose at least one data source")
    #     filtered_df = df[df["source"].isin(enabled_sources)]
    #     return {
    #         "rows": len(filtered_df),
    #         "data": filtered_df.to_dict(orient="records"),
    #     }

    # def reset_dataset(self, original_df: pd.DataFrame) -> pd.DataFrame:
    #     return self._prepare_df(original_df)