import pandas as pd import re class InteractionAnalysis: def __init__(self, word_exclusions: set[str]): self.word_exclusions = word_exclusions def _tokenize(self, text: str): tokens = re.findall(r"\b[a-z]{3,}\b", text) return [t for t in tokens if t not in self.word_exclusions] def interaction_graph(self, df: pd.DataFrame): interactions = {a: {} for a in df["author"].dropna().unique()} # reply_to refers to the comment id, this allows us to map comment/post ids to usernames id_to_author = df.set_index("post_id")["author"].to_dict() for _, row in df.iterrows(): a = row["author"] reply_id = row["reply_to"] if pd.isna(a) or pd.isna(reply_id) or reply_id == "": continue b = id_to_author.get(reply_id) if b is None or a == b: continue interactions[a][b] = interactions[a].get(b, 0) + 1 return interactions def average_thread_depth(self, df: pd.DataFrame): depths = [] id_to_reply = df.set_index("id")["reply_to"].to_dict() for _, row in df.iterrows(): depth = 0 current_id = row["id"] while True: reply_to = id_to_reply.get(current_id) if pd.isna(reply_to) or reply_to == "": break depth += 1 current_id = reply_to depths.append(depth) if not depths: return 0 return round(sum(depths) / len(depths), 2) def average_thread_length_by_emotion(self, df: pd.DataFrame): emotion_exclusions = {"emotion_neutral", "emotion_surprise"} emotion_cols = [ c for c in df.columns if c.startswith("emotion_") and c not in emotion_exclusions ] id_to_reply = df.set_index("id")["reply_to"].to_dict() length_cache = {} def thread_length_from(start_id): if start_id in length_cache: return length_cache[start_id] seen = set() length = 1 current = start_id while True: if current in seen: # infinite loop shouldn't happen, but just in case break seen.add(current) reply_to = id_to_reply.get(current) if ( reply_to is None or (isinstance(reply_to, float) and pd.isna(reply_to)) or reply_to == "" ): break length += 1 current = reply_to if current in length_cache: length += length_cache[current] - 1 break length_cache[start_id] = length return length emotion_to_lengths = {} # Fill NaNs in emotion cols to avoid max() issues emo_df = df[["id"] + emotion_cols].copy() emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0) for _, row in emo_df.iterrows(): msg_id = row["id"] length = thread_length_from(msg_id) emotions = {c: row[c] for c in emotion_cols} dominant = max(emotions, key=emotions.get) emotion_to_lengths.setdefault(dominant, []).append(length) return { emotion: round(sum(lengths) / len(lengths), 2) for emotion, lengths in emotion_to_lengths.items() }