Compare commits
5 Commits
c11b4bb85b...main
| Author | SHA1 | Date |
|---|---|---|
| | 257eb80de7 | |
| | 3a23b1f0c8 | |
| | 8c76476cd3 | |
| | 397986dc89 | |
| | 04b7094036 | |
```diff
@@ -34,7 +34,7 @@ function ApiToGraphData(apiData: InteractionGraph) {
 }
 
 
-const InteractionStats = (props: { data: UserAnalysisResponse }) => {
+const UserStats = (props: { data: UserAnalysisResponse }) => {
   const graphData = ApiToGraphData(props.data.interaction_graph);
 
   return (
@@ -44,7 +44,7 @@ const InteractionStats = (props: { data: UserAnalysisResponse }) => {
         This graph visualizes interactions between users based on comments and replies.
         Nodes represent users, and edges represent interactions (e.g., comments or replies) between them.
       </p>
-      <div style={{ height: "600px", border: "1px solid #ccc", borderRadius: 8, marginTop: 16 }}>
+      <div>
         <ForceGraph3D
           graphData={graphData}
           nodeAutoColorBy="id"
@@ -58,4 +58,4 @@ const InteractionStats = (props: { data: UserAnalysisResponse }) => {
   );
 }
 
-export default InteractionStats;
+export default UserStats;
```
```diff
@@ -3,7 +3,7 @@ import axios from "axios";
 import StatsStyling from "../styles/stats_styling";
 import SummaryStats from "../components/SummaryStats";
 import EmotionalStats from "../components/EmotionalStats";
-import InteractionStats from "../components/InteractionStats";
+import InteractionStats from "../components/UserStats";
 
 import {
   type SummaryResponse,
```
```diff
@@ -124,3 +124,85 @@ class InteractionAnalysis:
                 interactions[a][b] = interactions[a].get(b, 0) + 1
 
         return interactions
+
+    def average_thread_depth(self):
+        depths = []
+        id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
+        for _, row in self.df.iterrows():
+            depth = 0
+            current_id = row["id"]
+
+            while True:
+                reply_to = id_to_reply.get(current_id)
+                if pd.isna(reply_to) or reply_to == "":
+                    break
+
+                depth += 1
+                current_id = reply_to
+
+            depths.append(depth)
+
+        if not depths:
+            return 0
+
+        return round(sum(depths) / len(depths), 2)
+
+    def average_thread_length_by_emotion(self):
+        emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
+
+        emotion_cols = [
+            c for c in self.df.columns
+            if c.startswith("emotion_") and c not in emotion_exclusions
+        ]
+
+        id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
+        length_cache = {}
+
+        def thread_length_from(start_id):
+            if start_id in length_cache:
+                return length_cache[start_id]
+
+            seen = set()
+            length = 1
+            current = start_id
+
+            while True:
+                if current in seen:
+                    # infinite loop shouldn't happen, but just in case
+                    break
+                seen.add(current)
+
+                reply_to = id_to_reply.get(current)
+
+                if reply_to is None or (isinstance(reply_to, float) and pd.isna(reply_to)) or reply_to == "":
+                    break
+
+                length += 1
+                current = reply_to
+
+                if current in length_cache:
+                    length += (length_cache[current] - 1)
+                    break
+
+            length_cache[start_id] = length
+            return length
+
+        emotion_to_lengths = {}
+
+        # Fill NaNs in emotion cols to avoid max() issues
+        emo_df = self.df[["id"] + emotion_cols].copy()
+        emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
+
+        for _, row in emo_df.iterrows():
+            msg_id = row["id"]
+            length = thread_length_from(msg_id)
+
+            emotions = {c: row[c] for c in emotion_cols}
+            dominant = max(emotions, key=emotions.get)
+
+            emotion_to_lengths.setdefault(dominant, []).append(length)
+
+        return {
+            emotion: round(sum(lengths) / len(lengths), 2)
+            for emotion, lengths in emotion_to_lengths.items()
+        }
```
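A quick usage sketch (not part of the diff) for the two new `InteractionAnalysis` methods. The toy DataFrame and the `InteractionAnalysis(df)` constructor call are assumptions; the column names (`id`, `reply_to`, `emotion_*`) follow the code above.

```python
import pandas as pd

# Hypothetical toy data: a <- b <- c is one reply chain, d is standalone.
df = pd.DataFrame({
    "id":            ["a", "b", "c", "d"],
    "reply_to":      ["",  "a", "b", ""],
    "emotion_joy":   [0.9, 0.1, 0.2, 0.7],
    "emotion_anger": [0.1, 0.8, 0.6, 0.1],
})

ia = InteractionAnalysis(df)  # constructor signature is an assumption

# Depths per row are 0, 1, 2, 0 -> mean 0.75
print(ia.average_thread_depth())

# Thread lengths: a=1, b=2, c=3, d=1; dominant emotions: joy, anger, anger, joy
# -> {"emotion_joy": 1.0, "emotion_anger": 2.5}
print(ia.average_thread_length_by_emotion())
```

Note how the `length_cache` memoization pays off in the sketch: computing `b`'s length caches it, so `c` only walks one hop before hitting the cache.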
```diff
@@ -9,6 +9,10 @@ class LinguisticAnalysis:
         self.df = df
         self.word_exclusions = word_exclusions
 
+    def _tokenize(self, text: str):
+        tokens = re.findall(r"\b[a-z]{3,}\b", text)
+        return [t for t in tokens if t not in self.word_exclusions]
+
     def _clean_text(self, text: str) -> str:
         text = re.sub(r"http\S+", "", text)  # remove URLs
         text = re.sub(r"www\S+", "", text)
```
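A small standalone check of the added `_tokenize` helper (re-stated outside the class, with a hypothetical exclusion set). The regex only matches lowercase runs of three or more letters, so callers are expected to lowercase text first; short pronouns like "we"/"us" never survive it, which is presumably why `identity_markers` below uses its own `{2,}` pattern.

```python
import re

word_exclusions = {"the", "and"}  # hypothetical exclusion set

def tokenize(text: str):
    # same pattern as LinguisticAnalysis._tokenize above
    tokens = re.findall(r"\b[a-z]{3,}\b", text)
    return [t for t in tokens if t not in word_exclusions]

print(tokenize("the cat and the dog ran"))  # ['cat', 'dog', 'ran']
```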
```diff
@@ -66,3 +70,44 @@ class LinguisticAnalysis:
             .head(limit)
             .to_dict(orient="records")
         )
+
+    def identity_markers(self):
+        df = self.df.copy()
+        df["content"] = df["content"].fillna("").astype(str).str.lower()
+
+        in_group_words = {"we", "us", "our", "ourselves"}
+        out_group_words = {"they", "them", "their", "themselves"}
+
+        emotion_exclusions = [
+            "emotion_neutral",
+            "emotion_surprise"
+        ]
+
+        emotion_cols = [
+            col for col in self.df.columns
+            if col.startswith("emotion_") and col not in emotion_exclusions
+        ]
+        in_count = 0
+        out_count = 0
+        in_emotions = {e: 0 for e in emotion_cols}
+        out_emotions = {e: 0 for e in emotion_cols}
+        total = 0
+
+        for _, post in df.iterrows():  # iterate rows; iterating df directly yields column names
+            text = post["content"]
+            tokens = re.findall(r"\b[a-z]{2,}\b", text)
+            total += len(tokens)
+            in_count += sum(t in in_group_words for t in tokens)
+            out_count += sum(t in out_group_words for t in tokens)
+
+            # per-post emotion scores; not yet folded into in_emotions / out_emotions
+            emotions = post[emotion_cols]
+
+        return {
+            "in_group_usage": in_count,
+            "out_group_usage": out_count,
+            "in_group_ratio": round(in_count / max(total, 1), 5),
+            "out_group_ratio": round(out_count / max(total, 1), 5),
+        }
```
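A hedged re-statement of what `identity_markers` currently reports (the emotion tallies above are initialized but not yet part of the return value). The sample posts are hypothetical.

```python
import re

posts = ["we think they are wrong", "our team helps us"]
in_group_words = {"we", "us", "our", "ourselves"}
out_group_words = {"they", "them", "their", "themselves"}

in_count = out_count = total = 0
for text in posts:
    tokens = re.findall(r"\b[a-z]{2,}\b", text.lower())
    total += len(tokens)
    in_count += sum(t in in_group_words for t in tokens)
    out_count += sum(t in out_group_words for t in tokens)

# 9 tokens total; "we", "our", "us" are in-group, "they" is out-group
print({
    "in_group_usage": in_count,                              # 3
    "out_group_usage": out_count,                            # 1
    "in_group_ratio": round(in_count / max(total, 1), 5),    # 0.33333
    "out_group_ratio": round(out_count / max(total, 1), 5),  # 0.11111
})
```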
```diff
@@ -55,7 +55,7 @@ def word_frequencies():
         return jsonify({"error": "No data uploaded"}), 400
 
     try:
-        return jsonify(stat_obj.content_analysis()), 200
+        return jsonify(stat_obj.get_content_analysis()), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -80,7 +80,7 @@ def get_time_analysis():
         return jsonify({"error": "No data uploaded"}), 400
 
     try:
-        return jsonify(stat_obj.time_analysis()), 200
+        return jsonify(stat_obj.get_time_analysis()), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -93,7 +93,33 @@ def get_user_analysis():
         return jsonify({"error": "No data uploaded"}), 400
 
     try:
-        return jsonify(stat_obj.user_analysis()), 200
+        return jsonify(stat_obj.get_user_analysis()), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
+        print(traceback.format_exc())
+        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
+
+@app.route("/stats/cultural", methods=["GET"])
+def get_cultural_analysis():
+    if stat_obj is None:
+        return jsonify({"error": "No data uploaded"}), 400
+
+    try:
+        return jsonify(stat_obj.get_cultural_analysis()), 200
+    except ValueError as e:
+        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
+    except Exception as e:
+        print(traceback.format_exc())
+        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
+
+@app.route("/stats/interaction", methods=["GET"])
+def get_interaction_analysis():
+    if stat_obj is None:
+        return jsonify({"error": "No data uploaded"}), 400
+
+    try:
+        return jsonify(stat_obj.get_interactional_analysis()), 200
+    except ValueError as e:
+        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
+    except Exception as e:
```
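A minimal client sketch for the two routes added above; the base URL and port are assumptions for a local dev server.

```python
import requests

BASE = "http://localhost:5000"  # assumed dev address

# New endpoints added in this change; both return 400 until data is uploaded.
for path in ("/stats/cultural", "/stats/interaction"):
    resp = requests.get(BASE + path)
    print(path, resp.status_code, resp.json())
```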
```diff
@@ -62,13 +62,18 @@ class StatGen:
         self.nlp.add_ner_cols()
 
     ## Public
-    def time_analysis(self) -> pd.DataFrame:
+
+
+    # topics over time
+    # emotions over time
+    def get_time_analysis(self) -> pd.DataFrame:
         return {
             "events_per_day": self.temporal_analysis.posts_per_day(),
             "weekday_hour_heatmap": self.temporal_analysis.heatmap()
         }
 
-    def content_analysis(self) -> dict:
+    # average topic duration
+    def get_content_analysis(self) -> dict:
         return {
             "word_frequencies": self.linguistic_analysis.word_frequencies(),
             "common_two_phrases": self.linguistic_analysis.ngrams(),
@@ -77,13 +82,31 @@ class StatGen:
             "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
         }
 
-    def user_analysis(self) -> dict:
+    # average emotion per user
+    # average chain length
+    def get_user_analysis(self) -> dict:
         return {
             "top_users": self.interaction_analysis.top_users(),
             "users": self.interaction_analysis.per_user_analysis(),
             "interaction_graph": self.interaction_analysis.interaction_graph()
         }
 
+    # average / max thread depth
+    # high engagement threads based on volume
+    def get_interactional_analysis(self) -> dict:
+        return {
+            "average_thread_depth": self.interaction_analysis.average_thread_depth(),
+            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion()
+        }
+
+    # detect community jargon
+    # in-group and out-group linguistic markers
+    def get_cultural_analysis(self) -> dict:
+        return {
+            "identity_markers": self.linguistic_analysis.identity_markers()
+        }
+
     def summary(self) -> dict:
         total_posts = (self.df["type"] == "post").sum()
         total_comments = (self.df["type"] == "comment").sum()
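For orientation, a sketch of how the renamed `StatGen` getters line up with the Flask handlers above. `StatGen(df)` and the first three route paths are assumptions (only `/stats/cultural` and `/stats/interaction` appear in this diff).

```python
stat_obj = StatGen(df)  # hypothetical construction; df is the uploaded frame

route_to_method = {
    "/stats/content":     stat_obj.get_content_analysis,       # assumed path
    "/stats/time":        stat_obj.get_time_analysis,          # assumed path
    "/stats/user":        stat_obj.get_user_analysis,          # assumed path
    "/stats/interaction": stat_obj.get_interactional_analysis,
    "/stats/cultural":    stat_obj.get_cultural_analysis,
}
```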