From 9d1e8960fcdcf46acee145e586f84f14b52a2c61 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Tue, 3 Mar 2026 14:25:25 +0000
Subject: [PATCH] perf: update cultural analysis to use regex instead of Counter

---
 server/analysis/cultural.py | 126 ++++++++++++++++++------------------
 1 file changed, 58 insertions(+), 68 deletions(-)

diff --git a/server/analysis/cultural.py b/server/analysis/cultural.py
index 909233e..fc4a93a 100644
--- a/server/analysis/cultural.py
+++ b/server/analysis/cultural.py
@@ -1,7 +1,6 @@
 import pandas as pd
 import re
 
-from collections import Counter
 from typing import Any
 
 
@@ -14,9 +13,6 @@ class CulturalAnalysis:
         df = original_df.copy()
         s = df[self.content_col].fillna("").astype(str).str.lower()
 
-        in_group_words = {"we", "us", "our", "ourselves"}
-        out_group_words = {"they", "them", "their", "themselves"}
-
         emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
         emotion_cols = [
             c for c in df.columns
@@ -24,11 +20,14 @@ class CulturalAnalysis:
         ]
 
         # Tokenize per row
-        tokens_per_row = s.apply(lambda txt: re.findall(r"\b[a-z]{2,}\b", txt))
+        in_pattern = re.compile(r"\b(we|us|our|ourselves)\b")
+        out_pattern = re.compile(r"\b(they|them|their|themselves)\b")
+        token_pattern = re.compile(r"\b[a-z]{2,}\b")
 
-        total_tokens = int(tokens_per_row.map(len).sum())
-        in_hits = tokens_per_row.map(lambda toks: sum(t in in_group_words for t in toks)).astype(int)
-        out_hits = tokens_per_row.map(lambda toks: sum(t in out_group_words for t in toks)).astype(int)
+        in_hits = s.str.count(in_pattern)
+        out_hits = s.str.count(out_pattern)
+        # int(): keep a plain Python int, as the original did.
+        total_tokens = int(s.str.count(token_pattern).sum())
 
         in_count = int(in_hits.sum())
         out_count = int(out_hits.sum())
@@ -62,33 +61,15 @@ class CulturalAnalysis:
     def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
         s = df[self.content_col].fillna("").astype(str)
 
-        hedges = {
-            "maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem",
-            "i think", "i feel", "i guess", "kind of", "sort of", "somewhat"
-        }
-        certainty = {
-            "definitely", "certainly", "clearly", "obviously", "undeniably", "always", "never"
-        }
+        hedge_pattern = re.compile(r"\b(maybe|perhaps|possibly|probably|likely|seems|seem|i think|i feel|i guess|kind of|sort of|somewhat)\b")
+        certainty_pattern = re.compile(r"\b(definitely|certainly|clearly|obviously|undeniably|always|never)\b")
+        deontic_pattern = re.compile(r"\b(must|should|need|needs|have to|has to|ought|required|require)\b")
+        permission_pattern = re.compile(r"\b(can|allowed|okay|ok|permitted)\b")
 
-        deontic = {
-            "must", "should", "need", "needs", "have to", "has to", "ought", "required", "require"
-        }
-
-        permission = {"can", "allowed", "okay", "ok", "permitted"}
-
-        def count_phrases(text: str, phrases: set[str]) -> int:
-            c = 0
-            for p in phrases:
-                if " " in p:
-                    c += len(re.findall(r"\b" + re.escape(p) + r"\b", text))
-                else:
-                    c += len(re.findall(r"\b" + re.escape(p) + r"\b", text))
-            return c
-
-        hedge_counts = s.apply(lambda t: count_phrases(t, hedges))
-        certainty_counts = s.apply(lambda t: count_phrases(t, certainty))
-        deontic_counts = s.apply(lambda t: count_phrases(t, deontic))
-        perm_counts = s.apply(lambda t: count_phrases(t, permission))
+        hedge_counts = s.str.count(hedge_pattern)
+        certainty_counts = s.str.count(certainty_pattern)
+        deontic_counts = s.str.count(deontic_pattern)
+        perm_counts = s.str.count(permission_pattern)
 
         token_counts = s.apply(lambda t: len(re.findall(r"\b[a-z]{2,}\b", t))).replace(0, 1)
 
@@ -108,44 +89,53 @@ class CulturalAnalysis:
 
     def get_entity_emotion(self, df: pd.DataFrame, top_n: int = 20, min_posts: int = 3) -> dict[str, Any]:
         if "entities" not in df.columns:
             return {"entity_emotion_avg": {}}
 
         emotion_cols = [c for c in df.columns if c.startswith("emotion_")]
-        entity_counter = Counter()
-        for row in df["entities"].dropna():
-            if isinstance(row, list):
-                for ent in row:
-                    if isinstance(ent, dict):
-                        text = ent.get("text")
-                        if isinstance(text, str):
-                            text = text.strip()
-                            if len(text) >= 3:  # filter short junk
-                                entity_counter[text] += 1
 
-        top_entities = entity_counter.most_common(top_n)
+        # Keep only rows whose "entities" value is a list (explode would pass
+        # other scalars through unchanged), then expand to one row per mention.
+        is_list = df["entities"].map(lambda v: isinstance(v, list))
+        entity_df = df.loc[is_list, ["entities"] + emotion_cols].explode("entities")
+
+        def _entity_text(ent):
+            # Same filter as the old Counter loop: strip BEFORE the length
+            # check, so "  ab  " is still rejected as short junk.
+            if isinstance(ent, dict):
+                text = ent.get("text")
+                if isinstance(text, str):
+                    text = text.strip()
+                    if len(text) >= 3:  # filter short junk
+                        return text
+            return None
+
+        entity_df["entity_text"] = entity_df["entities"].map(_entity_text)
+        entity_df = entity_df.dropna(subset=["entity_text"])
+
+        # Rank by mention count (same ordering as Counter.most_common).
+        top_entities = entity_df["entity_text"].value_counts().head(top_n).index
+
+        # One row per (post, entity) pair: a post mentioning an entity several
+        # times must still count once, as the old per-post mask did.
+        pair_index = pd.MultiIndex.from_arrays([entity_df.index, entity_df["entity_text"]])
+        per_post = entity_df.loc[~pair_index.duplicated()]
 
         entity_emotion_avg = {}
-        for entity_text, _ in top_entities:
-            mask = df["entities"].apply(
-                lambda ents: isinstance(ents, list) and
-                any(isinstance(e, dict) and e.get("text") == entity_text for e in ents)
-            )
-
-            post_count = int(mask.sum())
-
-            if post_count >= min_posts:
-                emo_means = (
-                    df.loc[mask, emotion_cols]
-                    .apply(pd.to_numeric, errors="coerce")
-                    .fillna(0.0)
-                    .mean()
-                    .to_dict()
-                )
-                entity_emotion_avg[entity_text] = {
-                    "post_count": post_count,
-                    "emotion_avg": emo_means
-                }
+        for entity_text in top_entities:
+            rows = per_post[per_post["entity_text"] == entity_text]
+            post_count = int(len(rows))
+            if post_count >= min_posts:
+                # Coerce to numeric and zero-fill, exactly as the original did.
+                emo_means = (
+                    rows[emotion_cols]
+                    .apply(pd.to_numeric, errors="coerce")
+                    .fillna(0.0)
+                    .mean()
+                    .to_dict()
+                )
+                entity_emotion_avg[entity_text] = {
+                    "post_count": post_count,
+                    "emotion_avg": emo_means,
+                }
 
-        return {
-            "entity_emotion_avg": entity_emotion_avg
-        }
\ No newline at end of file
+        return {"entity_emotion_avg": entity_emotion_avg}
\ No newline at end of file