Compare commits

..

5 Commits

6 changed files with 188 additions and 12 deletions

View File

@@ -34,7 +34,7 @@ function ApiToGraphData(apiData: InteractionGraph) {
 }
-const InteractionStats = (props: { data: UserAnalysisResponse }) => {
+const UserStats = (props: { data: UserAnalysisResponse }) => {
   const graphData = ApiToGraphData(props.data.interaction_graph);
   return (
@@ -44,7 +44,7 @@ const InteractionStats = (props: { data: UserAnalysisResponse }) => {
         This graph visualizes interactions between users based on comments and replies.
         Nodes represent users, and edges represent interactions (e.g., comments or replies) between them.
       </p>
-      <div style={{ height: "600px", border: "1px solid #ccc", borderRadius: 8, marginTop: 16 }}>
+      <div>
        <ForceGraph3D
          graphData={graphData}
          nodeAutoColorBy="id"
@@ -58,4 +58,4 @@ const InteractionStats = (props: { data: UserAnalysisResponse }) => {
   );
 }
-export default InteractionStats;
+export default UserStats;

View File

@@ -3,7 +3,7 @@ import axios from "axios";
 import StatsStyling from "../styles/stats_styling";
 import SummaryStats from "../components/SummaryStats";
 import EmotionalStats from "../components/EmotionalStats";
-import InteractionStats from "../components/InteractionStats";
+import InteractionStats from "../components/UserStats";
 import {
   type SummaryResponse,

View File

@@ -124,3 +124,85 @@ class InteractionAnalysis:
                interactions[a][b] = interactions[a].get(b, 0) + 1
        return interactions
+
+    def average_thread_depth(self):
+        depths = []
+        id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
+        for _, row in self.df.iterrows():
+            depth = 0
+            current_id = row["id"]
+            seen = set()
+            while True:
+                if current_id in seen:
+                    # cyclic reply chains shouldn't happen, but guard against them
+                    break
+                seen.add(current_id)
+                reply_to = id_to_reply.get(current_id)
+                if pd.isna(reply_to) or reply_to == "":
+                    break
+                depth += 1
+                current_id = reply_to
+            depths.append(depth)
+        if not depths:
+            return 0
+        return round(sum(depths) / len(depths), 2)
+
+    def average_thread_length_by_emotion(self):
+        emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
+        emotion_cols = [
+            c for c in self.df.columns
+            if c.startswith("emotion_") and c not in emotion_exclusions
+        ]
+        if not emotion_cols:
+            # nothing to group by without emotion scores
+            return {}
+        id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
+        length_cache = {}
+
+        def thread_length_from(start_id):
+            if start_id in length_cache:
+                return length_cache[start_id]
+            seen = set()
+            length = 1
+            current = start_id
+            while True:
+                if current in seen:
+                    # infinite loop shouldn't happen, but just in case
+                    break
+                seen.add(current)
+                reply_to = id_to_reply.get(current)
+                if reply_to is None or (isinstance(reply_to, float) and pd.isna(reply_to)) or reply_to == "":
+                    break
+                length += 1
+                current = reply_to
+                if current in length_cache:
+                    length += (length_cache[current] - 1)
+                    break
+            length_cache[start_id] = length
+            return length
+
+        emotion_to_lengths = {}
+        # Fill NaNs in emotion cols to avoid max() issues
+        emo_df = self.df[["id"] + emotion_cols].copy()
+        emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
+        for _, row in emo_df.iterrows():
+            msg_id = row["id"]
+            length = thread_length_from(msg_id)
+            emotions = {c: row[c] for c in emotion_cols}
+            dominant = max(emotions, key=emotions.get)
+            emotion_to_lengths.setdefault(dominant, []).append(length)
+        return {
+            emotion: round(sum(lengths) / len(lengths), 2)
+            for emotion, lengths in emotion_to_lengths.items()
+        }
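
A quick sanity check of the two new methods on a toy reply chain; a minimal sketch assuming the class is constructed directly from the posts DataFrame (only self.df is visible in this hunk), with the project import elided:

    import pandas as pd

    # Toy data: "c" replies to "b", "b" replies to "a", "a" is a root post.
    df = pd.DataFrame({
        "id": ["a", "b", "c"],
        "reply_to": ["", "a", "b"],
        "emotion_joy": [0.9, 0.1, 0.4],
        "emotion_anger": [0.1, 0.8, 0.3],
    })
    analysis = InteractionAnalysis(df)  # assumed constructor
    print(analysis.average_thread_depth())
    # depths 0, 1, 2 -> 1.0
    print(analysis.average_thread_length_by_emotion())
    # joy dominates "a" (length 1) and "c" (length 3), anger dominates "b" (length 2)
    # -> {"emotion_joy": 2.0, "emotion_anger": 2.0}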

View File

@@ -9,6 +9,10 @@ class LinguisticAnalysis:
         self.df = df
         self.word_exclusions = word_exclusions
+
+    def _tokenize(self, text: str):
+        tokens = re.findall(r"\b[a-z]{3,}\b", text)
+        return [t for t in tokens if t not in self.word_exclusions]
 
     def _clean_text(self, text: str) -> str:
         text = re.sub(r"http\S+", "", text)  # remove URLs
         text = re.sub(r"www\S+", "", text)
@@ -66,3 +70,44 @@ class LinguisticAnalysis:
             .head(limit)
             .to_dict(orient="records")
         )
+
+    def identity_markers(self):
+        df = self.df.copy()
+        df["content"] = df["content"].fillna("").astype(str).str.lower()
+        in_group_words = {"we", "us", "our", "ourselves"}
+        out_group_words = {"they", "them", "their", "themselves"}
+        emotion_exclusions = [
+            "emotion_neutral",
+            "emotion_surprise"
+        ]
+        emotion_cols = [
+            col for col in self.df.columns
+            if col.startswith("emotion_") and col not in emotion_exclusions
+        ]
+        in_count = 0
+        out_count = 0
+        in_emotions = {e: 0 for e in emotion_cols}
+        out_emotions = {e: 0 for e in emotion_cols}
+        total = 0
+        for _, post in df.iterrows():
+            text = post["content"]
+            tokens = re.findall(r"\b[a-z]{2,}\b", text)
+            total += len(tokens)
+            in_count += sum(t in in_group_words for t in tokens)
+            out_count += sum(t in out_group_words for t in tokens)
+            # tally the dominant emotion of posts that use identity language
+            if emotion_cols:
+                dominant = post[emotion_cols].fillna(0).astype(float).idxmax()
+                if any(t in in_group_words for t in tokens):
+                    in_emotions[dominant] += 1
+                if any(t in out_group_words for t in tokens):
+                    out_emotions[dominant] += 1
+        return {
+            "in_group_usage": in_count,
+            "out_group_usage": out_count,
+            "in_group_ratio": round(in_count / max(total, 1), 5),
+            "out_group_ratio": round(out_count / max(total, 1), 5),
+            "in_group_emotions": in_emotions,
+            "out_group_emotions": out_emotions,
+        }
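
For reference, a minimal sketch of the shape identity_markers returns, assuming LinguisticAnalysis(df, word_exclusions) matches the fields assigned in __init__ above (project import elided):

    import pandas as pd

    df = pd.DataFrame({
        "id": [1, 2],
        "content": ["We should protect our community", "They never listen to them"],
        "emotion_joy": [0.7, 0.2],
        "emotion_anger": [0.3, 0.8],
    })
    la = LinguisticAnalysis(df, word_exclusions=set())  # assumed signature
    print(la.identity_markers())
    # "we"/"our" give in_group_usage=2, "they"/"them" give out_group_usage=2,
    # ratios 2/10 each; post 1 tallies under emotion_joy, post 2 under emotion_anger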

View File

@@ -55,7 +55,7 @@ def word_frequencies():
         return jsonify({"error": "No data uploaded"}), 400
     try:
-        return jsonify(stat_obj.content_analysis()), 200
+        return jsonify(stat_obj.get_content_analysis()), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -80,7 +80,7 @@ def get_time_analysis():
         return jsonify({"error": "No data uploaded"}), 400
     try:
-        return jsonify(stat_obj.time_analysis()), 200
+        return jsonify(stat_obj.get_time_analysis()), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -93,7 +93,33 @@ def get_user_analysis():
         return jsonify({"error": "No data uploaded"}), 400
     try:
-        return jsonify(stat_obj.user_analysis()), 200
+        return jsonify(stat_obj.get_user_analysis()), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
         print(traceback.format_exc())
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
+
+@app.route("/stats/cultural", methods=["GET"])
+def get_cultural_analysis():
+    if stat_obj is None:
+        return jsonify({"error": "No data uploaded"}), 400
+    try:
+        return jsonify(stat_obj.get_cultural_analysis()), 200
+    except ValueError as e:
+        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
+    except Exception as e:
+        print(traceback.format_exc())
+        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
+
+@app.route("/stats/interaction", methods=["GET"])
+def get_interaction_analysis():
+    if stat_obj is None:
+        return jsonify({"error": "No data uploaded"}), 400
+    try:
+        return jsonify(stat_obj.get_interactional_analysis()), 200
+    except ValueError as e:
+        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
+    except Exception as e:
+        print(traceback.format_exc())
+        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
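
Both new routes follow the existing error-handling pattern, so a hypothetical smoke test is just two GETs (base URL and port are assumptions, not shown in this diff):

    import requests

    BASE = "http://localhost:5000"  # assumed dev-server address
    for endpoint in ("/stats/cultural", "/stats/interaction"):
        resp = requests.get(BASE + endpoint)
        # expect 400 {"error": "No data uploaded"} before an upload, 200 with stats after
        print(endpoint, resp.status_code, resp.json())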

View File

@@ -62,13 +62,18 @@ class StatGen:
         self.nlp.add_ner_cols()
 
     ## Public
-    def time_analysis(self) -> pd.DataFrame:
+    # topics over time
+    # emotions over time
+    def get_time_analysis(self) -> dict:
         return {
             "events_per_day": self.temporal_analysis.posts_per_day(),
             "weekday_hour_heatmap": self.temporal_analysis.heatmap()
         }
 
-    def content_analysis(self) -> dict:
+    # average topic duration
+    def get_content_analysis(self) -> dict:
         return {
             "word_frequencies": self.linguistic_analysis.word_frequencies(),
             "common_two_phrases": self.linguistic_analysis.ngrams(),
@@ -77,13 +82,31 @@
             "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
         }
 
-    def user_analysis(self) -> dict:
+    # average emotion per user
+    # average chain length
+    def get_user_analysis(self) -> dict:
         return {
             "top_users": self.interaction_analysis.top_users(),
             "users": self.interaction_analysis.per_user_analysis(),
             "interaction_graph": self.interaction_analysis.interaction_graph()
         }
+
+    # average / max thread depth
+    # high engagement threads based on volume
+    def get_interactional_analysis(self) -> dict:
+        return {
+            "average_thread_depth": self.interaction_analysis.average_thread_depth(),
+            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion()
+        }
+
+    # detect community jargon
+    # in-group and out-group linguistic markers
+    def get_cultural_analysis(self) -> dict:
+        return {
+            "identity_markers": self.linguistic_analysis.identity_markers()
+        }
 
     def summary(self) -> dict:
         total_posts = (self.df["type"] == "post").sum()
         total_comments = (self.df["type"] == "comment").sum()
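
With the renames, every Flask route now calls a get_*-prefixed StatGen accessor of the same family; a sketch of the resulting surface (assuming stats is an initialized StatGen, whose construction is not shown in this hunk):

    # One accessor per analysis family after this change:
    payload = {
        "time": stats.get_time_analysis(),
        "content": stats.get_content_analysis(),
        "user": stats.get_user_analysis(),
        "interaction": stats.get_interactional_analysis(),
        "cultural": stats.get_cultural_analysis(),
    }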