diff --git a/server/analysis/cultural.py b/server/analysis/cultural.py
new file mode 100644
index 0000000..ae55774
--- /dev/null
+++ b/server/analysis/cultural.py
@@ -0,0 +1,154 @@
+import pandas as pd
+import re
+
+from collections import Counter
+from typing import Any
+
+
+class CulturalAnalysis:
+    def __init__(self, df: pd.DataFrame, content_col: str = "content", topic_col: str = "topic"):
+        self.df = df
+        self.content_col = content_col
+        self.topic_col = topic_col
+
+    def get_identity_markers(self) -> dict[str, Any]:
+        df = self.df.copy()
+        s = df[self.content_col].fillna("").astype(str).str.lower()
+
+        in_group_words = {"we", "us", "our", "ourselves"}
+        out_group_words = {"they", "them", "their", "themselves"}
+
+        emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
+        emotion_cols = [
+            c for c in df.columns
+            if c.startswith("emotion_") and c not in emotion_exclusions
+        ]
+
+        # Tokenize per row
+        tokens_per_row = s.apply(lambda txt: re.findall(r"\b[a-z]{2,}\b", txt))
+
+        total_tokens = int(tokens_per_row.map(len).sum())
+        in_hits = tokens_per_row.map(lambda toks: sum(t in in_group_words for t in toks)).astype(int)
+        out_hits = tokens_per_row.map(lambda toks: sum(t in out_group_words for t in toks)).astype(int)
+
+        in_count = int(in_hits.sum())
+        out_count = int(out_hits.sum())
+
+        in_mask = in_hits > out_hits
+        out_mask = out_hits > in_hits
+        tie_mask = ~(in_mask | out_mask)
+
+        result = {
+            "in_group_usage": in_count,
+            "out_group_usage": out_count,
+            "in_group_ratio": round(in_count / max(total_tokens, 1), 5),
+            "out_group_ratio": round(out_count / max(total_tokens, 1), 5),
+
+            "in_group_posts": int(in_mask.sum()),
+            "out_group_posts": int(out_mask.sum()),
+            "tie_posts": int(tie_mask.sum()),
+        }
+
+        if emotion_cols:
+            emo = df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
+
+            in_avg = emo.loc[in_mask].mean() if in_mask.any() else pd.Series(0.0, index=emotion_cols)
+            out_avg = emo.loc[out_mask].mean() if out_mask.any() else pd.Series(0.0, index=emotion_cols)
+
+            result["in_group_emotion_avg"] = in_avg.to_dict()
+            result["out_group_emotion_avg"] = out_avg.to_dict()
+
+        return result
+
+    def get_stance_markers(self) -> dict[str, Any]:
+        # lowercase so the all-lowercase marker sets below actually match
+        s = self.df[self.content_col].fillna("").astype(str).str.lower()
+
+        hedges = {
+            "maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem",
+            "i think", "i feel", "i guess", "kind of", "sort of", "somewhat"
+        }
+        certainty = {
+            "definitely", "certainly", "clearly", "obviously", "undeniably", "always", "never"
+        }
+
+        deontic = {
+            "must", "should", "need", "needs", "have to", "has to", "ought", "required", "require"
+        }
+
+        permission = {"can", "allowed", "okay", "ok", "permitted"}
+
+        def count_phrases(text: str, phrases: set[str]) -> int:
+            # \b-delimited search handles single words and multi-word phrases alike
+            return sum(
+                len(re.findall(r"\b" + re.escape(p) + r"\b", text))
+                for p in phrases
+            )
+
+        hedge_counts = s.apply(lambda t: count_phrases(t, hedges))
+        certainty_counts = s.apply(lambda t: count_phrases(t, certainty))
+        deontic_counts = s.apply(lambda t: count_phrases(t, deontic))
+        perm_counts = s.apply(lambda t: count_phrases(t, permission))
+
+        token_counts = s.apply(lambda t: len(re.findall(r"\b[a-z]{2,}\b", t))).replace(0, 1)
+
+        return {
+            "hedge_total": int(hedge_counts.sum()),
+            "certainty_total": int(certainty_counts.sum()),
+            "deontic_total": int(deontic_counts.sum()),
+            "permission_total": int(perm_counts.sum()),
+            "hedge_per_1k_tokens": round(1000 * hedge_counts.sum() / token_counts.sum(), 3),
+            "certainty_per_1k_tokens": round(1000 * certainty_counts.sum() / token_counts.sum(), 3),
+            "deontic_per_1k_tokens": round(1000 * deontic_counts.sum() / token_counts.sum(), 3),
+            "permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3),
+        }
+
+    def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]:
+        if "entities" not in self.df.columns:
+            return {"entity_emotion_avg": {}}
+
+        df = self.df
+        emotion_cols = [c for c in df.columns if c.startswith("emotion_")]
+
+        entity_counter = Counter()
+
+        for row in df["entities"].dropna():
+            if isinstance(row, list):
+                for ent in row:
+                    if isinstance(ent, dict):
+                        text = ent.get("text")
+                        if isinstance(text, str):
+                            text = text.strip()
+                            if len(text) >= 3:  # filter short junk
+                                entity_counter[text] += 1
+
+        top_entities = entity_counter.most_common(top_n)
+
+        entity_emotion_avg = {}
+
+        for entity_text, _ in top_entities:
+            # strip before comparing so the stripped counter keys still match
+            mask = df["entities"].apply(
+                lambda ents: isinstance(ents, list) and any(
+                    isinstance(e, dict)
+                    and isinstance(e.get("text"), str)
+                    and e["text"].strip() == entity_text
+                    for e in ents
+                )
+            )
+
+            post_count = int(mask.sum())
+
+            if post_count >= min_posts:
+                emo_means = (
+                    df.loc[mask, emotion_cols]
+                    .apply(pd.to_numeric, errors="coerce")
+                    .fillna(0.0)
+                    .mean()
+                    .to_dict()
+                )
+
+                entity_emotion_avg[entity_text] = {
+                    "post_count": post_count,
+                    "emotion_avg": emo_means
+                }
+
+        return {
+            "entity_emotion_avg": entity_emotion_avg
+        }
\ No newline at end of file
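[Reviewer aside — usage sketch, not part of the patch. Assumes a DataFrame with a
"content" column and emotion_* score columns; rows and scores below are invented
for illustration.]

    import pandas as pd

    from server.analysis.cultural import CulturalAnalysis

    df = pd.DataFrame({
        "content": [
            "We should protect our community",
            "They never listen to them",
            "Maybe this is fine, I think",
        ],
        "emotion_joy": [0.7, 0.1, 0.4],
        "emotion_anger": [0.3, 0.9, 0.6],
    })

    ca = CulturalAnalysis(df)
    print(ca.get_identity_markers())  # in/out-group usage, ratios, post counts
    print(ca.get_stance_markers())    # hedge/certainty/deontic/permission rates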
"certainty_per_1k_tokens": round(1000 * certainty_counts.sum() / token_counts.sum(), 3), + "deontic_per_1k_tokens": round(1000 * deontic_counts.sum() / token_counts.sum(), 3), + "permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3), + } + + def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]: + if "entities" not in self.df.columns: + return {"entity_emotion_avg": {}} + + df = self.df + emotion_cols = [c for c in df.columns if c.startswith("emotion_")] + + entity_counter = Counter() + + for row in df["entities"].dropna(): + if isinstance(row, list): + for ent in row: + if isinstance(ent, dict): + text = ent.get("text") + if isinstance(text, str): + text = text.strip() + if len(text) >= 3: # filter short junk + entity_counter[text] += 1 + + top_entities = entity_counter.most_common(top_n) + + entity_emotion_avg = {} + + for entity_text, _ in top_entities: + mask = df["entities"].apply( + lambda ents: isinstance(ents, list) and + any(isinstance(e, dict) and e.get("text") == entity_text for e in ents) + ) + + post_count = int(mask.sum()) + + if post_count >= min_posts: + emo_means = ( + df.loc[mask, emotion_cols] + .apply(pd.to_numeric, errors="coerce") + .fillna(0.0) + .mean() + .to_dict() + ) + + entity_emotion_avg[entity_text] = { + "post_count": post_count, + "emotion_avg": emo_means + } + + return { + "entity_emotion_avg": entity_emotion_avg + } \ No newline at end of file diff --git a/server/analysis/emotional.py b/server/analysis/emotional.py index c311944..10e897d 100644 --- a/server/analysis/emotional.py +++ b/server/analysis/emotional.py @@ -5,14 +5,9 @@ class EmotionalAnalysis: self.df = df def avg_emotion_by_topic(self) -> dict: - emotion_exclusions = [ - "emotion_neutral", - "emotion_surprise" - ] - emotion_cols = [ col for col in self.df.columns - if col.startswith("emotion_") and col not in emotion_exclusions + if col.startswith("emotion_") ] counts = ( diff --git a/server/analysis/interactional.py b/server/analysis/interactional.py index 4ed4950..6fd5b3f 100644 --- a/server/analysis/interactional.py +++ b/server/analysis/interactional.py @@ -3,6 +3,7 @@ import re from collections import Counter + class InteractionAnalysis: def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): self.df = df @@ -12,7 +13,9 @@ class InteractionAnalysis: tokens = re.findall(r"\b[a-z]{3,}\b", text) return [t for t in tokens if t not in self.word_exclusions] - def _vocab_richness_per_user(self, min_words: int = 20, top_most_used_words: int = 100) -> list: + def _vocab_richness_per_user( + self, min_words: int = 20, top_most_used_words: int = 100 + ) -> list: df = self.df.copy() df["content"] = df["content"].fillna("").astype(str).str.lower() df["tokens"] = df["content"].apply(self._tokenize) @@ -39,15 +42,17 @@ class InteractionAnalysis: for w, c in counts.most_common(top_most_used_words) ] - rows.append({ - "author": author, - "events": int(events), - "total_words": int(total_words), - "unique_words": int(unique_words), - "vocab_richness": round(vocab_richness, 3), - "avg_words_per_event": round(avg_words, 2), - "top_words": top_words - }) + rows.append( + { + "author": author, + "events": int(events), + "total_words": int(total_words), + "unique_words": int(unique_words), + "vocab_richness": round(vocab_richness, 3), + "avg_words_per_event": round(avg_words, 2), + "top_words": top_words, + } + ) rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True) @@ -55,9 +60,7 @@ class InteractionAnalysis: def 
diff --git a/server/analysis/interactional.py b/server/analysis/interactional.py
index 4ed4950..6fd5b3f 100644
--- a/server/analysis/interactional.py
+++ b/server/analysis/interactional.py
@@ -3,6 +3,7 @@ import re
 
 from collections import Counter
 
+
 class InteractionAnalysis:
     def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
         self.df = df
@@ -12,7 +13,9 @@ class InteractionAnalysis:
         tokens = re.findall(r"\b[a-z]{3,}\b", text)
         return [t for t in tokens if t not in self.word_exclusions]
 
-    def _vocab_richness_per_user(self, min_words: int = 20, top_most_used_words: int = 100) -> list:
+    def _vocab_richness_per_user(
+        self, min_words: int = 20, top_most_used_words: int = 100
+    ) -> list:
         df = self.df.copy()
         df["content"] = df["content"].fillna("").astype(str).str.lower()
         df["tokens"] = df["content"].apply(self._tokenize)
@@ -39,15 +42,17 @@ class InteractionAnalysis:
                 for w, c in counts.most_common(top_most_used_words)
             ]
 
-            rows.append({
-                "author": author,
-                "events": int(events),
-                "total_words": int(total_words),
-                "unique_words": int(unique_words),
-                "vocab_richness": round(vocab_richness, 3),
-                "avg_words_per_event": round(avg_words, 2),
-                "top_words": top_words
-            })
+            rows.append(
+                {
+                    "author": author,
+                    "events": int(events),
+                    "total_words": int(total_words),
+                    "unique_words": int(unique_words),
+                    "vocab_richness": round(vocab_richness, 3),
+                    "avg_words_per_event": round(avg_words, 2),
+                    "top_words": top_words,
+                }
+            )
 
         rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True)
 
@@ -55,9 +60,7 @@ class InteractionAnalysis:
     def top_users(self) -> list:
         counts = (
-            self.df.groupby(["author", "source"])
-            .size()
-            .sort_values(ascending=False)
+            self.df.groupby(["author", "source"]).size().sort_values(ascending=False)
         )
 
         top_users = [
@@ -66,21 +69,31 @@ class InteractionAnalysis:
         ]
         return top_users
-
+
     def per_user_analysis(self) -> dict:
-        per_user = (
-            self.df.groupby(["author", "type"])
-            .size()
-            .unstack(fill_value=0)
-        )
+        per_user = self.df.groupby(["author", "type"]).size().unstack(fill_value=0)
+
+        emotion_cols = [col for col in self.df.columns if col.startswith("emotion_")]
+
+        avg_emotions_by_author = {}
+        if emotion_cols:
+            avg_emotions = self.df.groupby("author")[emotion_cols].mean().fillna(0.0)
+            avg_emotions_by_author = {
+                author: {emotion: float(score) for emotion, score in row.items()}
+                for author, row in avg_emotions.iterrows()
+            }
 
         # ensure columns always exist
         for col in ("post", "comment"):
             if col not in per_user.columns:
                 per_user[col] = 0
 
-        per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(0, 1)
-        per_user["comment_share"] = per_user["comment"] / (per_user["post"] + per_user["comment"]).replace(0, 1)
+        per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(
+            0, 1
+        )
+        per_user["comment_share"] = per_user["comment"] / (
+            per_user["post"] + per_user["comment"]
+        ).replace(0, 1)
 
         per_user = per_user.sort_values("comment_post_ratio", ascending=True)
         per_user_records = per_user.reset_index().to_dict(orient="records")
@@ -91,19 +104,22 @@ class InteractionAnalysis:
         merged_users = []
         for row in per_user_records:
             author = row["author"]
-            merged_users.append({
-                "author": author,
-                "post": int(row.get("post", 0)),
-                "comment": int(row.get("comment", 0)),
-                "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
-                "comment_share": float(row.get("comment_share", 0)),
-                "vocab": vocab_by_author.get(author)
-            })
+            merged_users.append(
+                {
+                    "author": author,
+                    "post": int(row.get("post", 0)),
+                    "comment": int(row.get("comment", 0)),
+                    "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
+                    "comment_share": float(row.get("comment_share", 0)),
+                    "avg_emotions": avg_emotions_by_author.get(author, {}),
+                    "vocab": vocab_by_author.get(author, {"vocab_richness": 0, "avg_words_per_event": 0, "top_words": []}),
+                }
+            )
 
         merged_users.sort(key=lambda u: u["comment_post_ratio"])
         return merged_users
-
+
     def interaction_graph(self):
         interactions = {a: {} for a in self.df["author"].dropna().unique()}
@@ -124,7 +140,7 @@ class InteractionAnalysis:
             interactions[a][b] = interactions[a].get(b, 0) + 1
 
         return interactions
-
+
     def average_thread_depth(self):
         depths = []
         id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
@@ -144,14 +160,15 @@ class InteractionAnalysis:
 
         if not depths:
             return 0
-
+
         return round(sum(depths) / len(depths), 2)
-
+
     def average_thread_length_by_emotion(self):
         emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
 
         emotion_cols = [
-            c for c in self.df.columns
+            c
+            for c in self.df.columns
             if c.startswith("emotion_") and c not in emotion_exclusions
         ]
 
@@ -174,14 +191,18 @@ class InteractionAnalysis:
 
             reply_to = id_to_reply.get(current)
 
-            if reply_to is None or (isinstance(reply_to, float) and pd.isna(reply_to)) or reply_to == "":
+            if (
+                reply_to is None
+                or (isinstance(reply_to, float) and pd.isna(reply_to))
+                or reply_to == ""
+            ):
                 break
 
             length += 1
             current = reply_to
 
             if current in length_cache:
-                length += (length_cache[current] - 1)
+                length += length_cache[current] - 1
                 break
 
         length_cache[start_id] = length
@@ -205,4 +226,4 @@ class InteractionAnalysis:
         return {
             emotion: round(sum(lengths) / len(lengths), 2)
             for emotion, lengths in emotion_to_lengths.items()
-        }
\ No newline at end of file
+        }
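[Reviewer aside — not part of the patch. The per_user_analysis change keeps the
same pivot logic; a toy illustration of the groupby/unstack and the guarded ratio,
with invented authors:]

    import pandas as pd

    df = pd.DataFrame({
        "author": ["ann", "ann", "bob", "bob", "bob"],
        "type":   ["post", "comment", "comment", "comment", "post"],
    })

    per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
    # replace(0, 1) guards the division for users with comments but no posts
    per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(0, 1)
    print(per_user)
    # type    comment  post  comment_post_ratio
    # author
    # ann           1     1                 1.0
    # bob           2     1                 2.0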
diff --git a/server/analysis/linguistic.py b/server/analysis/linguistic.py
index 8cfcc7f..5718edc 100644
--- a/server/analysis/linguistic.py
+++ b/server/analysis/linguistic.py
@@ -69,45 +69,4 @@ class LinguisticAnalysis:
             .sort_values("count", ascending=False)
             .head(limit)
             .to_dict(orient="records")
-        )
-
-    def identity_markers(self):
-        df = self.df.copy()
-        df["content"] = df["content"].fillna("").astype(str).str.lower()
-
-        in_group_words = {"we", "us", "our", "ourselves"}
-        out_group_words = {"they", "them", "their", "themselves"}
-
-        emotion_exclusions = [
-            "emotion_neutral",
-            "emotion_surprise"
-        ]
-
-        emotion_cols = [
-            col for col in self.df.columns
-            if col.startswith("emotion_") and col not in emotion_exclusions
-        ]
-        in_count = 0
-        out_count = 0
-        in_emotions = {e: 0 for e in emotion_cols}
-        out_emotions = {e: 0 for e in emotion_cols}
-        total = 0
-
-        for post in df:
-            text = post["content"]
-            tokens = re.findall(r"\b[a-z]{2,}\b", text)
-            total += len(tokens)
-            in_count += sum(t in in_group_words for t in tokens)
-            out_count += sum(t in out_group_words for t in tokens)
-
-            emotions = post[emotion_cols]
-            print(emotions)
-
-
-
-        return {
-            "in_group_usage": in_count,
-            "out_group_usage": out_count,
-            "in_group_ratio": round(in_count / max(total, 1), 5),
-            "out_group_ratio": round(out_count / max(total, 1), 5),
-        }
\ No newline at end of file
+        )
\ No newline at end of file
diff --git a/server/analysis/nlp.py b/server/analysis/nlp.py
index c3fcf89..4459851 100644
--- a/server/analysis/nlp.py
+++ b/server/analysis/nlp.py
@@ -200,6 +200,35 @@ class NLP:
             if column.startswith("emotion_") and column not in emotion_df.columns:
                 self.df[column] = 0.0
 
+        # drop neutral and surprise columns from df and normalize others to sum to 1
+        drop_cols = ["emotion_neutral", "emotion_surprise"]
+
+        existing_drop = [c for c in drop_cols if c in self.df.columns]
+        self.df.drop(columns=existing_drop, inplace=True)
+
+        remaining_emotion_cols = [
+            c for c in self.df.columns
+            if c.startswith("emotion_")
+        ]
+
+        if remaining_emotion_cols:
+            emotion_matrix = (
+                self.df[remaining_emotion_cols]
+                .apply(pd.to_numeric, errors="coerce")
+                .fillna(0.0)
+            )
+
+            row_sums = emotion_matrix.sum(axis=1)
+
+            # Avoid division by zero
+            row_sums = row_sums.replace(0, 1.0)
+
+            normalized = emotion_matrix.div(row_sums, axis=0)
+
+            self.df[remaining_emotion_cols] = normalized.values
+
+
     def add_topic_col(self, confidence_threshold: float = 0.3) -> None:
         titles = self.df[self.title_col].fillna("").astype(str)
         contents = self.df[self.content_col].fillna("").astype(str)
@@ -276,3 +305,5 @@ class NLP:
         self.df[col_name] = [
             d.get(label, 0) for d in entity_count_dicts
         ]
+
+
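[Reviewer aside — not part of the patch. The add_emotion_cols post-processing above
drops neutral/surprise and rescales each row to sum to 1; the same arithmetic on a
toy frame with invented values:]

    import pandas as pd

    emo = pd.DataFrame({
        "emotion_joy":     [0.2, 0.0],
        "emotion_anger":   [0.6, 0.0],
        "emotion_neutral": [0.2, 1.0],
    })

    emo = emo.drop(columns=["emotion_neutral"])
    row_sums = emo.sum(axis=1).replace(0, 1.0)  # avoid division by zero
    print(emo.div(row_sums, axis=0))
    #    emotion_joy  emotion_anger
    # 0         0.25           0.75
    # 1         0.00           0.00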
diff --git a/server/app.py b/server/app.py
index d032484..c9856d9 100644
--- a/server/app.py
+++ b/server/app.py
@@ -207,8 +207,8 @@ def get_interaction_analysis():
         print(traceback.format_exc())
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
 
-@app.route('/filter/search', methods=["POST"])
-def search_dataset():
+@app.route('/filter/query', methods=["POST"])
+def filter_query():
     if stat_obj is None:
         return jsonify({"error": "No data uploaded"}), 400
 
@@ -218,7 +218,7 @@ def search_dataset():
         return jsonify(stat_obj.df.to_dict(orient="records")), 200
 
     query = data["query"]
-    filtered_df = stat_obj.search(query)
+    filtered_df = stat_obj.filter_by_query(query)
 
     return jsonify(filtered_df), 200
diff --git a/server/stat_gen.py b/server/stat_gen.py
index 6ac7159..bbba747 100644
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -8,6 +8,7 @@ from server.analysis.temporal import TemporalAnalysis
 from server.analysis.emotional import EmotionalAnalysis
 from server.analysis.interactional import InteractionAnalysis
 from server.analysis.linguistic import LinguisticAnalysis
+from server.analysis.cultural import CulturalAnalysis
 
 DOMAIN_STOPWORDS = {
     "www", "https", "http",
@@ -15,8 +16,7 @@ DOMAIN_STOPWORDS = {
     "comment", "comments",
     "discussion", "thread",
     "post", "posts",
-    "would", "could", "should",
-    "like", "get", "one"
+    "would", "get", "one"
 }
 
 nltk.download('stopwords')
@@ -40,33 +40,32 @@ class StatGen:
         self.df.drop(columns=["post_id"], inplace=True, errors="ignore")
 
         self.nlp = NLP(self.df, "title", "content", domain_topics)
-        self._add_extra_cols(self.df)
+        self.nlp.add_emotion_cols()
+        self.nlp.add_topic_col()
+        self.nlp.add_ner_cols()
+
+        self._add_time_cols(self.df)
 
         self.temporal_analysis = TemporalAnalysis(self.df)
         self.emotional_analysis = EmotionalAnalysis(self.df)
         self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
         self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
+        self.cultural_analysis = CulturalAnalysis(self.df)
 
         self.original_df = self.df.copy(deep=True)
 
     ## Private Methods
-    def _add_extra_cols(self, df: pd.DataFrame) -> None:
-        df['timestamp'] = pd.to_numeric(self.df['timestamp'], errors='coerce')
+    def _add_time_cols(self, df: pd.DataFrame) -> None:
+        df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')
         df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date
 
         df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
         df["hour"] = df["dt"].dt.hour
         df["weekday"] = df["dt"].dt.day_name()
-
-        self.nlp.add_emotion_cols()
-        self.nlp.add_topic_col()
-        self.nlp.add_ner_cols()
 
     ## Public
-
     # topics over time
     # emotions over time
-    def get_time_analysis(self) -> pd.DataFrame:
+    def get_time_analysis(self) -> dict:
         return {
             "events_per_day": self.temporal_analysis.posts_per_day(),
             "weekday_hour_heatmap": self.temporal_analysis.heatmap()
@@ -87,24 +86,25 @@ class StatGen:
     def get_user_analysis(self) -> dict:
         return {
             "top_users": self.interaction_analysis.top_users(),
-            "users": self.interaction_analysis.per_user_analysis(),
-            "interaction_graph": self.interaction_analysis.interaction_graph()
+            "users": self.interaction_analysis.per_user_analysis()
         }
 
     # average / max thread depth
     # high engagment threads based on volume
-
     def get_interactional_analysis(self) -> dict:
         return {
             "average_thread_depth": self.interaction_analysis.average_thread_depth(),
-            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion()
+            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(),
+            "interaction_graph": self.interaction_analysis.interaction_graph()
         }
 
     # detect community jargon
     # in-group and out-group linguistic markers
     def get_cultural_analysis(self) -> dict:
         return {
-            "identity_markers": self.linguistic_analysis.identity_markers()
+            "identity_markers": self.cultural_analysis.get_identity_markers(),
+            "stance_markers": self.cultural_analysis.get_stance_markers(),
+            "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity()
         }
 
     def summary(self) -> dict:
@@ -127,7 +127,7 @@ class StatGen:
             "sources": self.df["source"].dropna().unique().tolist()
         }
 
-    def search(self, search_query: str) -> dict:
+    def filter_by_query(self, search_query: str) -> dict:
         self.df = self.df[
             self.df["content"].str.contains(search_query)
         ]
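[Reviewer aside — not part of the patch. With the rename, clients now POST to
/filter/query; a sketch assuming the Flask app is reachable at localhost:5000
(host and port are assumptions):]

    import requests

    resp = requests.post(
        "http://localhost:5000/filter/query",
        json={"query": "community"},
    )
    print(resp.status_code)
    print(resp.json())  # records remaining after the filter

Note that filter_by_query still calls str.contains with its defaults, so the query
is treated as a regex and NaN content can poison the boolean mask; passing
regex=False, na=False may be worth a follow-up.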