diff --git a/server/analysis/linguistic.py b/server/analysis/linguistic.py
index c292328..8cfcc7f 100644
--- a/server/analysis/linguistic.py
+++ b/server/analysis/linguistic.py
@@ -9,6 +9,10 @@ class LinguisticAnalysis:
         self.df = df
         self.word_exclusions = word_exclusions
 
+    def _tokenize(self, text: str) -> list[str]:
+        tokens = re.findall(r"\b[a-z]{3,}\b", text)
+        return [t for t in tokens if t not in self.word_exclusions]
+
     def _clean_text(self, text: str) -> str:
         text = re.sub(r"http\S+", "", text)  # remove URLs
         text = re.sub(r"www\S+", "", text)
@@ -65,4 +69,36 @@ class LinguisticAnalysis:
             .sort_values("count", ascending=False)
             .head(limit)
             .to_dict(orient="records")
-        )
\ No newline at end of file
+        )
+
+    def identity_markers(self) -> dict:
+        """Measure in-group vs. out-group pronoun usage across all content.
+
+        Returns raw counts plus per-token ratios; ratios are rounded to
+        5 decimal places and guarded against division by zero.
+        """
+        contents = self.df["content"].fillna("").astype(str).str.lower()
+
+        in_group_words = {"we", "us", "our", "ourselves"}
+        out_group_words = {"they", "them", "their", "themselves"}
+
+        in_count = 0
+        out_count = 0
+        total = 0
+
+        # Iterate the Series of texts directly; iterating the DataFrame
+        # itself would only yield column labels.
+        for text in contents:
+            # {2,} (not _tokenize's {3,}) so two-letter markers such as
+            # "we" and "us" are kept.
+            tokens = re.findall(r"\b[a-z]{2,}\b", text)
+            total += len(tokens)
+            in_count += sum(t in in_group_words for t in tokens)
+            out_count += sum(t in out_group_words for t in tokens)
+
+        return {
+            "in_group_usage": in_count,
+            "out_group_usage": out_count,
+            "in_group_ratio": round(in_count / max(total, 1), 5),
+            "out_group_ratio": round(out_count / max(total, 1), 5),
+        }
diff --git a/server/app.py b/server/app.py
index 7b3730d..760e48d 100644
--- a/server/app.py
+++ b/server/app.py
@@ -100,6 +100,19 @@ def get_user_analysis():
         print(traceback.format_exc())
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
 
+@app.route("/stats/cultural", methods=["GET"])
+def get_cultural_analysis():
+    if stat_obj is None:
+        return jsonify({"error": "No data uploaded"}), 400
+
+    try:
+        return jsonify(stat_obj.cultural_analysis()), 200
+    except ValueError as e:
+        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
+    except Exception as e:
+        print(traceback.format_exc())
+        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
+
 @app.route('/filter/search', methods=["POST"])
 def search_dataset():
     if stat_obj is None:
diff --git a/server/stat_gen.py b/server/stat_gen.py
index 5d88cf6..014f95f 100644
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -62,12 +62,17 @@ class StatGen:
         self.nlp.add_ner_cols()
 
     ## Public
+
+
+    # topics over time
+    # emotions over time
     def time_analysis(self) -> pd.DataFrame:
         return {
             "events_per_day": self.temporal_analysis.posts_per_day(),
             "weekday_hour_heatmap": self.temporal_analysis.heatmap()
         }
 
+    # average topic duration
     def content_analysis(self) -> dict:
         return {
             "word_frequencies": self.linguistic_analysis.word_frequencies(),
@@ -77,6 +82,8 @@ class StatGen:
             "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
         }
 
+    # average emotion per user
+    # average chain length
     def user_analysis(self) -> dict:
         return {
             "top_users": self.interaction_analysis.top_users(),
@@ -84,6 +91,19 @@ class StatGen:
             "interaction_graph": self.interaction_analysis.interaction_graph()
         }
 
+    # average / max thread depth
+    # high engagement threads based on volume
+    def conversational_analysis(self) -> dict:
+        # TODO: thread-depth and engagement metrics not implemented yet
+        return {}
+
+    # detect community jargon
+    # in-group and out-group linguistic markers
+    def cultural_analysis(self) -> dict:
+        return {
+            "identity_markers": self.linguistic_analysis.identity_markers()
+        }
+
     def summary(self) -> dict:
         total_posts = (self.df["type"] == "post").sum()
         total_comments = (self.df["type"] == "comment").sum()