Compare commits

...

5 Commits

6 changed files with 188 additions and 12 deletions

View File

@@ -34,7 +34,7 @@ function ApiToGraphData(apiData: InteractionGraph) {
} }
const InteractionStats = (props: { data: UserAnalysisResponse }) => { const UserStats = (props: { data: UserAnalysisResponse }) => {
const graphData = ApiToGraphData(props.data.interaction_graph); const graphData = ApiToGraphData(props.data.interaction_graph);
return ( return (
@@ -44,7 +44,7 @@ const InteractionStats = (props: { data: UserAnalysisResponse }) => {
This graph visualizes interactions between users based on comments and replies. This graph visualizes interactions between users based on comments and replies.
Nodes represent users, and edges represent interactions (e.g., comments or replies) between them. Nodes represent users, and edges represent interactions (e.g., comments or replies) between them.
</p> </p>
<div style={{ height: "600px", border: "1px solid #ccc", borderRadius: 8, marginTop: 16 }}> <div>
<ForceGraph3D <ForceGraph3D
graphData={graphData} graphData={graphData}
nodeAutoColorBy="id" nodeAutoColorBy="id"
@@ -58,4 +58,4 @@ const InteractionStats = (props: { data: UserAnalysisResponse }) => {
); );
} }
export default InteractionStats; export default UserStats;

View File

@@ -3,7 +3,7 @@ import axios from "axios";
import StatsStyling from "../styles/stats_styling"; import StatsStyling from "../styles/stats_styling";
import SummaryStats from "../components/SummaryStats"; import SummaryStats from "../components/SummaryStats";
import EmotionalStats from "../components/EmotionalStats"; import EmotionalStats from "../components/EmotionalStats";
import InteractionStats from "../components/InteractionStats"; import InteractionStats from "../components/UserStats";
import { import {
type SummaryResponse, type SummaryResponse,

View File

@@ -124,3 +124,85 @@ class InteractionAnalysis:
interactions[a][b] = interactions[a].get(b, 0) + 1 interactions[a][b] = interactions[a].get(b, 0) + 1
return interactions return interactions
def average_thread_depth(self):
depths = []
id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
for _, row in self.df.iterrows():
depth = 0
current_id = row["id"]
while True:
reply_to = id_to_reply.get(current_id)
if pd.isna(reply_to) or reply_to == "":
break
depth += 1
current_id = reply_to
depths.append(depth)
if not depths:
return 0
return round(sum(depths) / len(depths), 2)
def average_thread_length_by_emotion(self):
emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
emotion_cols = [
c for c in self.df.columns
if c.startswith("emotion_") and c not in emotion_exclusions
]
id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
length_cache = {}
def thread_length_from(start_id):
if start_id in length_cache:
return length_cache[start_id]
seen = set()
length = 1
current = start_id
while True:
if current in seen:
# infinite loop shouldn't happen, but just in case
break
seen.add(current)
reply_to = id_to_reply.get(current)
if reply_to is None or (isinstance(reply_to, float) and pd.isna(reply_to)) or reply_to == "":
break
length += 1
current = reply_to
if current in length_cache:
length += (length_cache[current] - 1)
break
length_cache[start_id] = length
return length
emotion_to_lengths = {}
# Fill NaNs in emotion cols to avoid max() issues
emo_df = self.df[["id"] + emotion_cols].copy()
emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
for _, row in emo_df.iterrows():
msg_id = row["id"]
length = thread_length_from(msg_id)
emotions = {c: row[c] for c in emotion_cols}
dominant = max(emotions, key=emotions.get)
emotion_to_lengths.setdefault(dominant, []).append(length)
return {
emotion: round(sum(lengths) / len(lengths), 2)
for emotion, lengths in emotion_to_lengths.items()
}

View File

@@ -9,6 +9,10 @@ class LinguisticAnalysis:
self.df = df self.df = df
self.word_exclusions = word_exclusions self.word_exclusions = word_exclusions
def _tokenize(self, text: str):
tokens = re.findall(r"\b[a-z]{3,}\b", text)
return [t for t in tokens if t not in self.word_exclusions]
def _clean_text(self, text: str) -> str: def _clean_text(self, text: str) -> str:
text = re.sub(r"http\S+", "", text) # remove URLs text = re.sub(r"http\S+", "", text) # remove URLs
text = re.sub(r"www\S+", "", text) text = re.sub(r"www\S+", "", text)
@@ -66,3 +70,44 @@ class LinguisticAnalysis:
.head(limit) .head(limit)
.to_dict(orient="records") .to_dict(orient="records")
) )
def identity_markers(self):
df = self.df.copy()
df["content"] = df["content"].fillna("").astype(str).str.lower()
in_group_words = {"we", "us", "our", "ourselves"}
out_group_words = {"they", "them", "their", "themselves"}
emotion_exclusions = [
"emotion_neutral",
"emotion_surprise"
]
emotion_cols = [
col for col in self.df.columns
if col.startswith("emotion_") and col not in emotion_exclusions
]
in_count = 0
out_count = 0
in_emotions = {e: 0 for e in emotion_cols}
out_emotions = {e: 0 for e in emotion_cols}
total = 0
for post in df:
text = post["content"]
tokens = re.findall(r"\b[a-z]{2,}\b", text)
total += len(tokens)
in_count += sum(t in in_group_words for t in tokens)
out_count += sum(t in out_group_words for t in tokens)
emotions = post[emotion_cols]
print(emotions)
return {
"in_group_usage": in_count,
"out_group_usage": out_count,
"in_group_ratio": round(in_count / max(total, 1), 5),
"out_group_ratio": round(out_count / max(total, 1), 5),
}

View File

@@ -55,7 +55,7 @@ def word_frequencies():
return jsonify({"error": "No data uploaded"}), 400 return jsonify({"error": "No data uploaded"}), 400
try: try:
return jsonify(stat_obj.content_analysis()), 200 return jsonify(stat_obj.get_content_analysis()), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
@@ -80,7 +80,7 @@ def get_time_analysis():
return jsonify({"error": "No data uploaded"}), 400 return jsonify({"error": "No data uploaded"}), 400
try: try:
return jsonify(stat_obj.time_analysis()), 200 return jsonify(stat_obj.get_time_analysis()), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
@@ -93,7 +93,33 @@ def get_user_analysis():
return jsonify({"error": "No data uploaded"}), 400 return jsonify({"error": "No data uploaded"}), 400
try: try:
return jsonify(stat_obj.user_analysis()), 200 return jsonify(stat_obj.get_user_analysis()), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/cultural", methods=["GET"])
def get_cultural_analysis():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.get_cultural_analysis()), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/interaction", methods=["GET"])
def get_interaction_analysis():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.get_interactional_analysis()), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:

View File

@@ -62,13 +62,18 @@ class StatGen:
self.nlp.add_ner_cols() self.nlp.add_ner_cols()
## Public ## Public
def time_analysis(self) -> pd.DataFrame:
# topics over time
# emotions over time
def get_time_analysis(self) -> pd.DataFrame:
return { return {
"events_per_day": self.temporal_analysis.posts_per_day(), "events_per_day": self.temporal_analysis.posts_per_day(),
"weekday_hour_heatmap": self.temporal_analysis.heatmap() "weekday_hour_heatmap": self.temporal_analysis.heatmap()
} }
def content_analysis(self) -> dict: # average topic duration
def get_content_analysis(self) -> dict:
return { return {
"word_frequencies": self.linguistic_analysis.word_frequencies(), "word_frequencies": self.linguistic_analysis.word_frequencies(),
"common_two_phrases": self.linguistic_analysis.ngrams(), "common_two_phrases": self.linguistic_analysis.ngrams(),
@@ -77,13 +82,31 @@ class StatGen:
"reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion() "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
} }
def user_analysis(self) -> dict: # average emotion per user
# average chain length
def get_user_analysis(self) -> dict:
return { return {
"top_users": self.interaction_analysis.top_users(), "top_users": self.interaction_analysis.top_users(),
"users": self.interaction_analysis.per_user_analysis(), "users": self.interaction_analysis.per_user_analysis(),
"interaction_graph": self.interaction_analysis.interaction_graph() "interaction_graph": self.interaction_analysis.interaction_graph()
} }
# average / max thread depth
# high engagment threads based on volume
def get_interactional_analysis(self) -> dict:
return {
"average_thread_depth": self.interaction_analysis.average_thread_depth(),
"average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion()
}
# detect community jargon
# in-group and out-group linguistic markers
def get_cultural_analysis(self) -> dict:
return {
"identity_markers": self.linguistic_analysis.identity_markers()
}
def summary(self) -> dict: def summary(self) -> dict:
total_posts = (self.df["type"] == "post").sum() total_posts = (self.df["type"] == "post").sum()
total_comments = (self.df["type"] == "comment").sum() total_comments = (self.df["type"] == "comment").sum()