Files
crosspost/server/analysis/interactional.py

119 lines
3.5 KiB
Python

import pandas as pd
import re
class InteractionAnalysis:
    """Reply-graph and thread-shape statistics over a posts/comments frame.

    The methods read the following DataFrame columns:

    * ``author``   - user name of the row's author (may be missing),
    * ``id``       - identifier of the row itself,
    * ``reply_to`` - identifier of the row being replied to; ``None``,
      NaN/NA or ``""`` means "not a reply",
    * ``emotion_*`` - per-row emotion scores (only used by
      :meth:`average_thread_length_by_emotion`).
    """

    # Words made of three or more lower-case ASCII letters. Compiled once at
    # class level so repeated _tokenize calls do not re-resolve the pattern.
    _TOKEN_PATTERN = re.compile(r"\b[a-z]{3,}\b")

    def __init__(self, word_exclusions: set[str]):
        """Store the set of words that ``_tokenize`` must drop."""
        self.word_exclusions = word_exclusions

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True when *value* is None, NaN/NA or an empty string."""
        if isinstance(value, str):
            return value == ""
        # pd.isna covers None, float NaN, numpy NaN/NaT and pd.NA alike, so
        # every method treats "no parent" the same way.
        return bool(pd.isna(value))

    def _tokenize(self, text: str) -> list[str]:
        """Return the lower-case words (3+ letters) of *text*, minus exclusions.

        The text is not lower-cased here: a word containing an upper-case
        letter or a digit does not match the pattern and is skipped entirely.
        """
        return [
            token
            for token in self._TOKEN_PATTERN.findall(text)
            if token not in self.word_exclusions
        ]

    def interaction_graph(self, df: pd.DataFrame) -> dict:
        """Count, for every author, how often they replied to each other author.

        Returns a nested dict ``{replier: {replied_to: count}}``. Every
        non-missing author appears as an outer key, even with no replies.
        Self-replies, replies to unknown ids and replies to rows whose author
        is missing are ignored.
        """
        interactions = {author: {} for author in df["author"].dropna().unique()}

        # reply_to refers to a row id, so the lookup must be keyed by the same
        # identifier the other methods use ("id"). "post_id" is kept only as
        # a fallback for frames that do not carry an "id" column.
        key_column = "id" if "id" in df.columns else "post_id"
        id_to_author = df.set_index(key_column)["author"].to_dict()

        for replier, reply_id in zip(df["author"], df["reply_to"]):
            if pd.isna(replier) or self._is_missing(reply_id):
                continue

            target = id_to_author.get(reply_id)
            # A missing target author (None or NaN) must not become a graph
            # node; NaN would otherwise end up as a dictionary key.
            if pd.isna(target) or replier == target:
                continue

            replies = interactions[replier]
            replies[target] = replies.get(target, 0) + 1

        return interactions

    def average_thread_depth(self, df: pd.DataFrame):
        """Return the mean number of reply hops from each row to its root.

        A row that is not a reply has depth 0. The result is rounded to two
        decimals; an empty frame yields ``0``.
        """
        id_to_reply = df.set_index("id")["reply_to"].to_dict()
        depths = []

        for start_id in df["id"]:
            depth = 0
            current = start_id
            # Guard against cyclic reply chains (e.g. a row replying to
            # itself), which would otherwise loop forever.
            visited = {start_id}
            while True:
                parent = id_to_reply.get(current)
                if self._is_missing(parent):
                    break
                depth += 1
                if parent in visited:
                    break
                visited.add(parent)
                current = parent
            depths.append(depth)

        if not depths:
            return 0
        return round(sum(depths) / len(depths), 2)

    def average_thread_length_by_emotion(self, df: pd.DataFrame) -> dict:
        """Return the mean reply-chain length per dominant emotion.

        For each row the dominant emotion is the ``emotion_*`` column with the
        highest score (``emotion_neutral`` and ``emotion_surprise`` are not
        considered; missing scores count as 0; ties go to the first column).
        The chain length counts the row itself plus every ancestor reached by
        following ``reply_to``. Values are rounded to two decimals. Returns an
        empty dict when the frame has no usable emotion columns.
        """
        emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
        emotion_cols = [
            col
            for col in df.columns
            if col.startswith("emotion_") and col not in emotion_exclusions
        ]
        if not emotion_cols:
            # Without emotion columns there is no dominant emotion to pick;
            # max() over an empty mapping would raise ValueError.
            return {}

        id_to_reply = df.set_index("id")["reply_to"].to_dict()
        length_cache = {}

        def thread_length_from(start_id):
            """Count rows on the chain from *start_id* up to its root."""
            if start_id in length_cache:
                return length_cache[start_id]

            seen = set()
            length = 1
            current = start_id
            while current not in seen:  # stop if the chain loops back
                seen.add(current)
                parent = id_to_reply.get(current)
                if self._is_missing(parent):
                    break
                length += 1
                current = parent
                if current in length_cache:
                    # The cached value already counts `current` itself.
                    length += length_cache[current] - 1
                    break

            length_cache[start_id] = length
            return length

        # Fill NaNs so that max() compares plain numbers.
        scores = df[emotion_cols].fillna(0)

        emotion_to_lengths = {}
        rows = scores.itertuples(index=False, name=None)
        for msg_id, row_scores in zip(df["id"], rows):
            emotions = dict(zip(emotion_cols, row_scores))
            dominant = max(emotions, key=emotions.get)
            emotion_to_lengths.setdefault(dominant, []).append(
                thread_length_from(msg_id)
            )

        return {
            emotion: round(sum(lengths) / len(lengths), 2)
            for emotion, lengths in emotion_to_lengths.items()
        }