From 97fccd073b8e2da1b904e6d27898c7c5dfb9a4dc Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 16 Mar 2026 16:41:28 +0000 Subject: [PATCH 01/24] feat(emotional): add average emotion & dominant emotion stats --- server/analysis/emotional.py | 91 ++++++++++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 19 deletions(-) diff --git a/server/analysis/emotional.py b/server/analysis/emotional.py index 150aa20..8f78809 100644 --- a/server/analysis/emotional.py +++ b/server/analysis/emotional.py @@ -1,33 +1,86 @@ import pandas as pd + class EmotionalAnalysis: - def avg_emotion_by_topic(self, df: pd.DataFrame) -> dict: - emotion_cols = [ - col for col in df.columns - if col.startswith("emotion_") - ] + def _emotion_cols(self, df: pd.DataFrame) -> list[str]: + return [col for col in df.columns if col.startswith("emotion_")] + + def avg_emotion_by_topic(self, df: pd.DataFrame) -> list[dict]: + emotion_cols = self._emotion_cols(df) + + if not emotion_cols: + return [] counts = ( - df[ - (df["topic"] != "Misc") - ] - .groupby("topic") - .size() - .rename("n") + df[(df["topic"] != "Misc")].groupby("topic").size().reset_index(name="n") ) avg_emotion_by_topic = ( - df[ - (df["topic"] != "Misc") - ] + df[(df["topic"] != "Misc")] .groupby("topic")[emotion_cols] .mean() .reset_index() ) - avg_emotion_by_topic = avg_emotion_by_topic.merge( - counts, - on="topic" - ) + avg_emotion_by_topic = avg_emotion_by_topic.merge(counts, on="topic") - return avg_emotion_by_topic.to_dict(orient='records') \ No newline at end of file + return avg_emotion_by_topic.to_dict(orient="records") + + def overall_emotion_average(self, df: pd.DataFrame) -> list[dict]: + emotion_cols = self._emotion_cols(df) + + if not emotion_cols: + return [] + + means = df[emotion_cols].mean() + return [ + { + "emotion": col.replace("emotion_", ""), + "score": float(means[col]), + } + for col in emotion_cols + ] + + def dominant_emotion_distribution(self, df: pd.DataFrame) -> list[dict]: + emotion_cols = self._emotion_cols(df) + + if not emotion_cols or df.empty: + return [] + + dominant_per_row = df[emotion_cols].idxmax(axis=1) + counts = dominant_per_row.value_counts() + total = max(len(dominant_per_row), 1) + + return [ + { + "emotion": col.replace("emotion_", ""), + "count": int(count), + "ratio": round(float(count / total), 4), + } + for col, count in counts.items() + ] + + def emotion_by_source(self, df: pd.DataFrame) -> list[dict]: + emotion_cols = self._emotion_cols(df) + + if not emotion_cols or "source" not in df.columns or df.empty: + return [] + + source_counts = df.groupby("source").size() + source_means = df.groupby("source")[emotion_cols].mean().reset_index() + rows = source_means.to_dict(orient="records") + output = [] + + for row in rows: + source = row["source"] + dominant_col = max(emotion_cols, key=lambda col: float(row.get(col, 0))) + output.append( + { + "source": str(source), + "dominant_emotion": dominant_col.replace("emotion_", ""), + "dominant_score": round(float(row.get(dominant_col, 0)), 4), + "event_count": int(source_counts.get(source, 0)), + } + ) + + return output From 09a4f9036f67b22a278ff0556a22a44d0093500e Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 16 Mar 2026 16:43:24 +0000 Subject: [PATCH 02/24] refactor(stats): add summary and user stat classes for consistency --- server/analysis/stat_gen.py | 96 +++++++++++++------------------------ server/analysis/summary.py | 64 +++++++++++++++++++++++++ server/analysis/user.py | 20 ++++++++ 3 files changed, 118 insertions(+), 62 
deletions(-) create mode 100644 server/analysis/summary.py create mode 100644 server/analysis/user.py diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py index a9e9289..0d1ffc9 100644 --- a/server/analysis/stat_gen.py +++ b/server/analysis/stat_gen.py @@ -6,7 +6,9 @@ from server.analysis.cultural import CulturalAnalysis from server.analysis.emotional import EmotionalAnalysis from server.analysis.interactional import InteractionAnalysis from server.analysis.linguistic import LinguisticAnalysis +from server.analysis.summary import SummaryAnalysis from server.analysis.temporal import TemporalAnalysis +from server.analysis.user import UserAnalysis DOMAIN_STOPWORDS = { "www", @@ -36,12 +38,11 @@ class StatGen: self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS) self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS) self.cultural_analysis = CulturalAnalysis() + self.summary_analysis = SummaryAnalysis() + self.user_analysis = UserAnalysis(self.interaction_analysis) ## Private Methods - def _prepare_filtered_df(self, - df: pd.DataFrame, - filters: dict | None = None - ) -> pd.DataFrame: + def _prepare_filtered_df(self, df: pd.DataFrame, filters: dict | None = None) -> pd.DataFrame: filters = filters or {} filtered_df = df.copy() @@ -51,10 +52,9 @@ class StatGen: data_source_filter = filters.get("data_sources", None) if search_query: - mask = ( - filtered_df["content"].str.contains(search_query, case=False, na=False) - | filtered_df["author"].str.contains(search_query, case=False, na=False) - ) + mask = filtered_df["content"].str.contains( + search_query, case=False, na=False + ) | filtered_df["author"].str.contains(search_query, case=False, na=False) # Only include title if the column exists if "title" in filtered_df.columns: @@ -76,10 +76,10 @@ class StatGen: return filtered_df ## Public Methods - def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]: return self._prepare_filtered_df(df, filters).to_dict(orient="records") - def get_time_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) return { @@ -87,40 +87,43 @@ class StatGen: "weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df), } - def get_content_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def linguistic(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) return { "word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df), "common_two_phrases": self.linguistic_analysis.ngrams(filtered_df), "common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3), - "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic( - filtered_df - ) } - def get_user_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) return { - "top_users": self.interaction_analysis.top_users(filtered_df), - "users": self.interaction_analysis.per_user_analysis(filtered_df), + "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(filtered_df), + "overall_emotion_average": self.emotional_analysis.overall_emotion_average(filtered_df), + "dominant_emotion_distribution": 
self.emotional_analysis.dominant_emotion_distribution(filtered_df), + "emotion_by_source": self.emotional_analysis.emotion_by_source(filtered_df) + } + + def user(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + filtered_df = self._prepare_filtered_df(df, filters) + + return { + "top_users": self.user_analysis.top_users(filtered_df), + "users": self.user_analysis.users(filtered_df) + } + + def interactional(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + filtered_df = self._prepare_filtered_df(df, filters) + + return { + "average_thread_depth": self.interaction_analysis.average_thread_depth(filtered_df), + "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(filtered_df), "interaction_graph": self.interaction_analysis.interaction_graph(filtered_df) } - def get_interactional_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: - filtered_df = self._prepare_filtered_df(df, filters) - - return { - "average_thread_depth": self.interaction_analysis.average_thread_depth( - filtered_df - ), - "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion( - filtered_df - ), - } - - def get_cultural_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def cultural(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) return { @@ -136,35 +139,4 @@ class StatGen: def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) - total_posts = (filtered_df["type"] == "post").sum() - total_comments = (filtered_df["type"] == "comment").sum() - events_per_user = filtered_df.groupby("author").size() - - if filtered_df.empty: - return { - "total_events": 0, - "total_posts": 0, - "total_comments": 0, - "unique_users": 0, - "comments_per_post": 0, - "lurker_ratio": 0, - "time_range": { - "start": None, - "end": None, - }, - "sources": [], - } - - return { - "total_events": int(len(filtered_df)), - "total_posts": int(total_posts), - "total_comments": int(total_comments), - "unique_users": int(events_per_user.count()), - "comments_per_post": round(total_comments / max(total_posts, 1), 2), - "lurker_ratio": round((events_per_user == 1).mean(), 2), - "time_range": { - "start": int(filtered_df["dt"].min().timestamp()), - "end": int(filtered_df["dt"].max().timestamp()), - }, - "sources": filtered_df["source"].dropna().unique().tolist(), - } + return self.summary_analysis.summary(filtered_df) diff --git a/server/analysis/summary.py b/server/analysis/summary.py new file mode 100644 index 0000000..14cc8ca --- /dev/null +++ b/server/analysis/summary.py @@ -0,0 +1,64 @@ +import pandas as pd + + +class SummaryAnalysis: + def total_events(self, df: pd.DataFrame) -> int: + return int(len(df)) + + def total_posts(self, df: pd.DataFrame) -> int: + return int(len(df[df["type"] == "post"])) + + def total_comments(self, df: pd.DataFrame) -> int: + return int(len(df[df["type"] == "comment"])) + + def unique_users(self, df: pd.DataFrame) -> int: + return int(len(df["author"].dropna().unique())) + + def comments_per_post(self, total_comments: int, total_posts: int) -> float: + return round(total_comments / max(total_posts, 1), 2) + + def lurker_ratio(self, df: pd.DataFrame) -> float: + events_per_user = df.groupby("author").size() + return round((events_per_user == 1).mean(), 2) + + def time_range(self, df: pd.DataFrame) -> dict: + return { + "start": 
int(df["dt"].min().timestamp()), + "end": int(df["dt"].max().timestamp()), + } + + def sources(self, df: pd.DataFrame) -> list: + return df["source"].dropna().unique().tolist() + + def empty_summary(self) -> dict: + return { + "total_events": 0, + "total_posts": 0, + "total_comments": 0, + "unique_users": 0, + "comments_per_post": 0, + "lurker_ratio": 0, + "time_range": { + "start": None, + "end": None, + }, + "sources": [], + } + + def summary(self, df: pd.DataFrame) -> dict: + if df.empty: + return self.empty_summary() + + total_posts = self.total_posts(df) + total_comments = self.total_comments(df) + + return { + "total_events": self.total_events(df), + "total_posts": total_posts, + "total_comments": total_comments, + "unique_users": self.unique_users(df), + "comments_per_post": self.comments_per_post(total_comments, total_posts), + "lurker_ratio": self.lurker_ratio(df), + "time_range": self.time_range(df), + "sources": self.sources(df), + } diff --git a/server/analysis/user.py b/server/analysis/user.py new file mode 100644 index 0000000..57ddc1e --- /dev/null +++ b/server/analysis/user.py @@ -0,0 +1,20 @@ +import pandas as pd + +from server.analysis.interactional import InteractionAnalysis + + +class UserAnalysis: + def __init__(self, interaction_analysis: InteractionAnalysis): + self.interaction_analysis = interaction_analysis + + def top_users(self, df: pd.DataFrame) -> list: + return self.interaction_analysis.top_users(df) + + def users(self, df: pd.DataFrame) -> dict | list: + return self.interaction_analysis.per_user_analysis(df) + + def user(self, df: pd.DataFrame) -> dict: + return { + "top_users": self.top_users(df), + "users": self.users(df), + } From 3468fdc2ea35b870b3003108c9721b449ab80f62 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 16 Mar 2026 16:45:11 +0000 Subject: [PATCH 03/24] feat(api): add new user and linguistic endpoints --- server/app.py | 121 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 90 insertions(+), 31 deletions(-) diff --git a/server/app.py b/server/app.py index f373843..3ba9295 100644 --- a/server/app.py +++ b/server/app.py @@ -186,7 +186,7 @@ def scrape_data(): dataset_manager.set_dataset_status( dataset_id, "fetching", - f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}" + f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}", ) fetch_and_process_dataset.delay( @@ -198,12 +198,14 @@ def scrape_data(): print(traceback.format_exc()) return jsonify({"error": "Failed to queue dataset processing"}), 500 + return jsonify( + { + "message": "Dataset queued for processing", + "dataset_id": dataset_id, + "status": "processing", + } + ), 202 - return jsonify({ - "message": "Dataset queued for processing", - "dataset_id": dataset_id, - "status": "processing" - }), 202 @app.route("/datasets/upload", methods=["POST"]) @jwt_required() @@ -233,7 +235,9 @@ def upload_data(): posts_df = pd.read_json(post_file, lines=True, convert_dates=False) topics = json.load(topic_file) - dataset_id = dataset_manager.save_dataset_info(current_user, dataset_name, topics) + dataset_id = dataset_manager.save_dataset_info( + current_user, dataset_name, topics + ) process_dataset.delay(dataset_id, posts_df.to_dict(orient="records"), topics) @@ -249,6 +253,7 @@ def upload_data(): except Exception as e: return jsonify({"error": f"An unexpected error occurred"}), 500 + @app.route("/dataset/", methods=["GET"]) @jwt_required() def get_dataset(dataset_id): @@ -256,7 +261,9 @@ def 
get_dataset(dataset_id): user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_info = dataset_manager.get_dataset_info(dataset_id) included_cols = {"id", "name", "created_at"} @@ -269,7 +276,8 @@ def get_dataset(dataset_id): except Exception: print(traceback.format_exc()) return jsonify({"error": "An unexpected error occured"}), 500 - + + @app.route("/dataset/", methods=["PATCH"]) @jwt_required() def update_dataset(dataset_id): @@ -277,7 +285,9 @@ def update_dataset(dataset_id): user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) body = request.get_json() new_name = body.get("name") @@ -286,7 +296,9 @@ def update_dataset(dataset_id): return jsonify({"error": "A valid name must be provided"}), 400 dataset_manager.update_dataset_name(dataset_id, new_name.strip()) - return jsonify({"message": f"Dataset {dataset_id} renamed to '{new_name.strip()}'"}), 200 + return jsonify( + {"message": f"Dataset {dataset_id} renamed to '{new_name.strip()}'"} + ), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: @@ -294,7 +306,8 @@ def update_dataset(dataset_id): except Exception: print(traceback.format_exc()) return jsonify({"error": "An unexpected error occurred"}), 500 - + + @app.route("/dataset/", methods=["DELETE"]) @jwt_required() def delete_dataset(dataset_id): @@ -302,11 +315,17 @@ def delete_dataset(dataset_id): user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_manager.delete_dataset_info(dataset_id) dataset_manager.delete_dataset_content(dataset_id) - return jsonify({"message": f"Dataset {dataset_id} metadata and content successfully deleted"}), 200 + return jsonify( + { + "message": f"Dataset {dataset_id} metadata and content successfully deleted" + } + ), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: @@ -315,6 +334,7 @@ def delete_dataset(dataset_id): print(traceback.format_exc()) return jsonify({"error": "An unexpected error occured"}), 500 + @app.route("/dataset//status", methods=["GET"]) @jwt_required() def get_dataset_status(dataset_id): @@ -322,7 +342,9 @@ def get_dataset_status(dataset_id): user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_status = dataset_manager.get_dataset_status(dataset_id) return jsonify(dataset_status), 200 @@ -334,17 +356,44 @@ def get_dataset_status(dataset_id): print(traceback.format_exc()) return jsonify({"error": "An unexpected error occured"}), 500 -@app.route("/dataset//content", methods=["GET"]) + +@app.route("/dataset//linguistic", 
methods=["GET"]) @jwt_required() -def content_endpoint(dataset_id): +def get_linguistic_analysis(dataset_id): try: user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_content = dataset_manager.get_dataset_content(dataset_id) filters = get_request_filters() - return jsonify(stat_gen.get_content_analysis(dataset_content, filters)), 200 + return jsonify(stat_gen.linguistic(dataset_content, filters)), 200 + except NotAuthorisedException: + return jsonify({"error": "User is not authorised to access this content"}), 403 + except NonExistentDatasetException: + return jsonify({"error": "Dataset does not exist"}), 404 + except ValueError as e: + return jsonify({"error": f"Malformed or missing data"}), 400 + except Exception as e: + print(traceback.format_exc()) + return jsonify({"error": f"An unexpected error occurred"}), 500 + + +@app.route("/dataset//emotional", methods=["GET"]) +@jwt_required() +def get_emotional_analysis(dataset_id): + try: + user_id = int(get_jwt_identity()) + if not dataset_manager.authorize_user_dataset(dataset_id, user_id): + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) + + dataset_content = dataset_manager.get_dataset_content(dataset_id) + filters = get_request_filters() + return jsonify(stat_gen.emotional(dataset_content, filters)), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: @@ -362,7 +411,9 @@ def get_summary(dataset_id): try: user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_content = dataset_manager.get_dataset_content(dataset_id) filters = get_request_filters() @@ -378,17 +429,19 @@ def get_summary(dataset_id): return jsonify({"error": f"An unexpected error occurred"}), 500 -@app.route("/dataset//time", methods=["GET"]) +@app.route("/dataset//temporal", methods=["GET"]) @jwt_required() -def get_time_analysis(dataset_id): +def get_temporal_analysis(dataset_id): try: user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_content = dataset_manager.get_dataset_content(dataset_id) filters = get_request_filters() - return jsonify(stat_gen.get_time_analysis(dataset_content, filters)), 200 + return jsonify(stat_gen.temporal(dataset_content, filters)), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: @@ -406,11 +459,13 @@ def get_user_analysis(dataset_id): try: user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_content = dataset_manager.get_dataset_content(dataset_id) filters = get_request_filters() - return 
jsonify(stat_gen.get_user_analysis(dataset_content, filters)), 200 + return jsonify(stat_gen.user(dataset_content, filters)), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: @@ -428,11 +483,13 @@ def get_cultural_analysis(dataset_id): try: user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_content = dataset_manager.get_dataset_content(dataset_id) filters = get_request_filters() - return jsonify(stat_gen.get_cultural_analysis(dataset_content, filters)), 200 + return jsonify(stat_gen.cultural(dataset_content, filters)), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: @@ -444,17 +501,19 @@ def get_cultural_analysis(dataset_id): return jsonify({"error": f"An unexpected error occurred"}), 500 -@app.route("/dataset//interaction", methods=["GET"]) +@app.route("/dataset//interactional", methods=["GET"]) @jwt_required() def get_interaction_analysis(dataset_id): try: user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_content = dataset_manager.get_dataset_content(dataset_id) filters = get_request_filters() - return jsonify(stat_gen.get_interactional_analysis(dataset_content, filters)), 200 + return jsonify(stat_gen.interactional(dataset_content, filters)), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: From 8a13444b16a6cafd35093ffacc530ba478fccf1e Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 16 Mar 2026 16:46:07 +0000 Subject: [PATCH 04/24] chore(frontend): add new API types --- frontend/src/types/ApiTypes.ts | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/frontend/src/types/ApiTypes.ts b/frontend/src/types/ApiTypes.ts index 5feaddf..43b035b 100644 --- a/frontend/src/types/ApiTypes.ts +++ b/frontend/src/types/ApiTypes.ts @@ -71,6 +71,25 @@ type NGram = { type AverageEmotionByTopic = Emotion & { n: number; topic: string; + [key: string]: string | number; +}; + +type OverallEmotionAverage = { + emotion: string; + score: number; +}; + +type DominantEmotionDistribution = { + emotion: string; + count: number; + ratio: number; +}; + +type EmotionBySource = { + source: string; + dominant_emotion: string; + dominant_score: number; + event_count: number; }; @@ -79,6 +98,9 @@ type ContentAnalysisResponse = { average_emotion_by_topic: AverageEmotionByTopic[]; common_three_phrases: NGram[]; common_two_phrases: NGram[]; + overall_emotion_average?: OverallEmotionAverage[]; + dominant_emotion_distribution?: DominantEmotionDistribution[]; + emotion_by_source?: EmotionBySource[]; } // Summary @@ -110,6 +132,9 @@ export type { UserAnalysisResponse, FrequencyWord, AverageEmotionByTopic, + OverallEmotionAverage, + DominantEmotionDistribution, + EmotionBySource, SummaryResponse, TimeAnalysisResponse, ContentAnalysisResponse, From 9093059d05352ab9c039071fb7867bd861b7d97b Mon Sep 17 00:00:00 2001 From: Dylan De 
Faoite Date: Tue, 17 Mar 2026 12:23:03 +0000 Subject: [PATCH 05/24] refactor(stats): move user stats out of interactional into users --- server/analysis/interactional.py | 67 ---------------------------- server/analysis/stat_gen.py | 4 +- server/analysis/user.py | 76 ++++++++++++++++++++++++++------ 3 files changed, 65 insertions(+), 82 deletions(-) diff --git a/server/analysis/interactional.py b/server/analysis/interactional.py index 864980d..8220747 100644 --- a/server/analysis/interactional.py +++ b/server/analysis/interactional.py @@ -57,73 +57,6 @@ class InteractionAnalysis: return rows - def top_users(self, df: pd.DataFrame) -> list: - counts = df.groupby(["author", "source"]).size().sort_values(ascending=False) - - top_users = [ - {"author": author, "source": source, "count": int(count)} - for (author, source), count in counts.items() - ] - - return top_users - - def per_user_analysis(self, df: pd.DataFrame) -> dict: - per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0) - - emotion_cols = [col for col in df.columns if col.startswith("emotion_")] - - avg_emotions_by_author = {} - if emotion_cols: - avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0) - avg_emotions_by_author = { - author: {emotion: float(score) for emotion, score in row.items()} - for author, row in avg_emotions.iterrows() - } - - # ensure columns always exist - for col in ("post", "comment"): - if col not in per_user.columns: - per_user[col] = 0 - - per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace( - 0, 1 - ) - per_user["comment_share"] = per_user["comment"] / ( - per_user["post"] + per_user["comment"] - ).replace(0, 1) - per_user = per_user.sort_values("comment_post_ratio", ascending=True) - per_user_records = per_user.reset_index().to_dict(orient="records") - - vocab_rows = self._vocab_richness_per_user(df) - vocab_by_author = {row["author"]: row for row in vocab_rows} - - # merge vocab richness + per_user information - merged_users = [] - for row in per_user_records: - author = row["author"] - merged_users.append( - { - "author": author, - "post": int(row.get("post", 0)), - "comment": int(row.get("comment", 0)), - "comment_post_ratio": float(row.get("comment_post_ratio", 0)), - "comment_share": float(row.get("comment_share", 0)), - "avg_emotions": avg_emotions_by_author.get(author, {}), - "vocab": vocab_by_author.get( - author, - { - "vocab_richness": 0, - "avg_words_per_event": 0, - "top_words": [], - }, - ), - } - ) - - merged_users.sort(key=lambda u: u["comment_post_ratio"]) - - return merged_users - def interaction_graph(self, df: pd.DataFrame): interactions = {a: {} for a in df["author"].dropna().unique()} diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py index 0d1ffc9..bec7eeb 100644 --- a/server/analysis/stat_gen.py +++ b/server/analysis/stat_gen.py @@ -39,7 +39,7 @@ class StatGen: self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS) self.cultural_analysis = CulturalAnalysis() self.summary_analysis = SummaryAnalysis() - self.user_analysis = UserAnalysis(self.interaction_analysis) + self.user_analysis = UserAnalysis() ## Private Methods def _prepare_filtered_df(self, df: pd.DataFrame, filters: dict | None = None) -> pd.DataFrame: @@ -111,7 +111,7 @@ class StatGen: return { "top_users": self.user_analysis.top_users(filtered_df), - "users": self.user_analysis.users(filtered_df) + "users": self.user_analysis.per_user_analysis(filtered_df) } def interactional(self, df: pd.DataFrame, filters: dict | None = None) -> 
dict: diff --git a/server/analysis/user.py b/server/analysis/user.py index 57ddc1e..d5e9917 100644 --- a/server/analysis/user.py +++ b/server/analysis/user.py @@ -1,20 +1,70 @@ import pandas as pd -from server.analysis.interactional import InteractionAnalysis - class UserAnalysis: - def __init__(self, interaction_analysis: InteractionAnalysis): - self.interaction_analysis = interaction_analysis - def top_users(self, df: pd.DataFrame) -> list: - return self.interaction_analysis.top_users(df) + counts = df.groupby(["author", "source"]).size().sort_values(ascending=False) - def users(self, df: pd.DataFrame) -> dict | list: - return self.interaction_analysis.per_user_analysis(df) + top_users = [ + {"author": author, "source": source, "count": int(count)} + for (author, source), count in counts.items() + ] - def user(self, df: pd.DataFrame) -> dict: - return { - "top_users": self.top_users(df), - "users": self.users(df), - } + return top_users + + def per_user_analysis(self, df: pd.DataFrame) -> dict: + per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0) + + emotion_cols = [col for col in df.columns if col.startswith("emotion_")] + + avg_emotions_by_author = {} + if emotion_cols: + avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0) + avg_emotions_by_author = { + author: {emotion: float(score) for emotion, score in row.items()} + for author, row in avg_emotions.iterrows() + } + + # ensure columns always exist + for col in ("post", "comment"): + if col not in per_user.columns: + per_user[col] = 0 + + per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace( + 0, 1 + ) + per_user["comment_share"] = per_user["comment"] / ( + per_user["post"] + per_user["comment"] + ).replace(0, 1) + per_user = per_user.sort_values("comment_post_ratio", ascending=True) + per_user_records = per_user.reset_index().to_dict(orient="records") + + vocab_rows = self._vocab_richness_per_user(df) + vocab_by_author = {row["author"]: row for row in vocab_rows} + + # merge vocab richness + per_user information + merged_users = [] + for row in per_user_records: + author = row["author"] + merged_users.append( + { + "author": author, + "post": int(row.get("post", 0)), + "comment": int(row.get("comment", 0)), + "comment_post_ratio": float(row.get("comment_post_ratio", 0)), + "comment_share": float(row.get("comment_share", 0)), + "avg_emotions": avg_emotions_by_author.get(author, {}), + "vocab": vocab_by_author.get( + author, + { + "vocab_richness": 0, + "avg_words_per_event": 0, + "top_words": [], + }, + ), + } + ) + + merged_users.sort(key=lambda u: u["comment_post_ratio"]) + + return merged_users From 8a0f6e71e8aed5540111d9657da144b3b2fce921 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 17 Mar 2026 12:31:53 +0000 Subject: [PATCH 06/24] chore(api): rename cultural entity emotion endpoint --- server/analysis/stat_gen.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py index bec7eeb..36ef9a9 100644 --- a/server/analysis/stat_gen.py +++ b/server/analysis/stat_gen.py @@ -127,13 +127,9 @@ class StatGen: filtered_df = self._prepare_filtered_df(df, filters) return { - "identity_markers": self.cultural_analysis.get_identity_markers( - filtered_df - ), + "identity_markers": self.cultural_analysis.get_identity_markers(filtered_df), "stance_markers": self.cultural_analysis.get_stance_markers(filtered_df), - "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity( - 
filtered_df - ), + "avg_emotion_per_entity": self.cultural_analysis.get_avg_emotions_per_entity(filtered_df) } def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict: From 31fb275ee361a2c3cd2d5c6f7ab218c3377491cc Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 17 Mar 2026 12:53:30 +0000 Subject: [PATCH 07/24] fix(db): incorrect NER column being inserted --- server/core/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/core/datasets.py b/server/core/datasets.py index 4690454..a55445d 100644 --- a/server/core/datasets.py +++ b/server/core/datasets.py @@ -101,7 +101,7 @@ class DatasetManager: row["source"], row.get("topic"), row.get("topic_confidence"), - Json(row["ner_entities"]) if row.get("ner_entities") else None, + Json(row["entities"]) if row.get("entities") is not None else None, row.get("emotion_anger"), row.get("emotion_disgust"), row.get("emotion_fear"), From 2fa1dff4b70e42031191fed1284cf7b2f3958879 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 17 Mar 2026 13:27:39 +0000 Subject: [PATCH 08/24] feat(stat): add lexical diversity stat --- server/analysis/linguistic.py | 16 ++++++++++++++++ server/analysis/stat_gen.py | 1 + 2 files changed, 17 insertions(+) diff --git a/server/analysis/linguistic.py b/server/analysis/linguistic.py index dc91faf..7546bbf 100644 --- a/server/analysis/linguistic.py +++ b/server/analysis/linguistic.py @@ -61,3 +61,19 @@ class LinguisticAnalysis: .head(limit) .to_dict(orient="records") ) + + def lexical_diversity(self, df: pd.DataFrame) -> dict: + tokens = ( + df["content"].fillna("").astype(str).str.lower() + .str.findall(r"\b[a-z]{2,}\b") + .explode() + ) + tokens = tokens[~tokens.isin(self.word_exclusions)] + total = max(len(tokens), 1) + unique = int(tokens.nunique()) + + return { + "total_tokens": total, + "unique_tokens": unique, + "ttr": round(unique / total, 4), + } diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py index 36ef9a9..8435340 100644 --- a/server/analysis/stat_gen.py +++ b/server/analysis/stat_gen.py @@ -94,6 +94,7 @@ class StatGen: "word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df), "common_two_phrases": self.linguistic_analysis.ngrams(filtered_df), "common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3), + "lexical_diversity": self.linguistic_analysis.lexical_diversity(filtered_df) } def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict: From 7b5a9392715bf775d11949ff7b1074e1d1adf979 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 17 Mar 2026 13:36:10 +0000 Subject: [PATCH 09/24] fix(stats): missing private methods in User obj --- server/analysis/interactional.py | 48 ---------------------------- server/analysis/stat_gen.py | 2 +- server/analysis/user.py | 54 ++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 49 deletions(-) diff --git a/server/analysis/interactional.py b/server/analysis/interactional.py index 8220747..53d97dc 100644 --- a/server/analysis/interactional.py +++ b/server/analysis/interactional.py @@ -1,9 +1,6 @@ import pandas as pd import re -from collections import Counter - - class InteractionAnalysis: def __init__(self, word_exclusions: set[str]): self.word_exclusions = word_exclusions @@ -12,51 +9,6 @@ class InteractionAnalysis: tokens = re.findall(r"\b[a-z]{3,}\b", text) return [t for t in tokens if t not in self.word_exclusions] - def _vocab_richness_per_user( - self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100 
- ) -> list: - df = df.copy() - df["content"] = df["content"].fillna("").astype(str).str.lower() - df["tokens"] = df["content"].apply(self._tokenize) - - rows = [] - for author, group in df.groupby("author"): - all_tokens = [t for tokens in group["tokens"] for t in tokens] - - total_words = len(all_tokens) - unique_words = len(set(all_tokens)) - events = len(group) - - # Min amount of words for a user, any less than this might give weird results - if total_words < min_words: - continue - - # 100% = they never reused a word (excluding stop words) - vocab_richness = unique_words / total_words - avg_words = total_words / max(events, 1) - - counts = Counter(all_tokens) - top_words = [ - {"word": w, "count": int(c)} - for w, c in counts.most_common(top_most_used_words) - ] - - rows.append( - { - "author": author, - "events": int(events), - "total_words": int(total_words), - "unique_words": int(unique_words), - "vocab_richness": round(vocab_richness, 3), - "avg_words_per_event": round(avg_words, 2), - "top_words": top_words, - } - ) - - rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True) - - return rows - def interaction_graph(self, df: pd.DataFrame): interactions = {a: {} for a in df["author"].dropna().unique()} diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py index 8435340..c2f09ed 100644 --- a/server/analysis/stat_gen.py +++ b/server/analysis/stat_gen.py @@ -39,7 +39,7 @@ class StatGen: self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS) self.cultural_analysis = CulturalAnalysis() self.summary_analysis = SummaryAnalysis() - self.user_analysis = UserAnalysis() + self.user_analysis = UserAnalysis(EXCLUDE_WORDS) ## Private Methods def _prepare_filtered_df(self, df: pd.DataFrame, filters: dict | None = None) -> pd.DataFrame: diff --git a/server/analysis/user.py b/server/analysis/user.py index d5e9917..fc8e618 100644 --- a/server/analysis/user.py +++ b/server/analysis/user.py @@ -1,7 +1,61 @@ import pandas as pd +import re +from collections import Counter class UserAnalysis: + def __init__(self, word_exclusions: set[str]): + self.word_exclusions = word_exclusions + + def _tokenize(self, text: str): + tokens = re.findall(r"\b[a-z]{3,}\b", text) + return [t for t in tokens if t not in self.word_exclusions] + + def _vocab_richness_per_user( + self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100 + ) -> list: + df = df.copy() + df["content"] = df["content"].fillna("").astype(str).str.lower() + df["tokens"] = df["content"].apply(self._tokenize) + + rows = [] + for author, group in df.groupby("author"): + all_tokens = [t for tokens in group["tokens"] for t in tokens] + + total_words = len(all_tokens) + unique_words = len(set(all_tokens)) + events = len(group) + + # Min amount of words for a user, any less than this might give weird results + if total_words < min_words: + continue + + # 100% = they never reused a word (excluding stop words) + vocab_richness = unique_words / total_words + avg_words = total_words / max(events, 1) + + counts = Counter(all_tokens) + top_words = [ + {"word": w, "count": int(c)} + for w, c in counts.most_common(top_most_used_words) + ] + + rows.append( + { + "author": author, + "events": int(events), + "total_words": int(total_words), + "unique_words": int(unique_words), + "vocab_richness": round(vocab_richness, 3), + "avg_words_per_event": round(avg_words, 2), + "top_words": top_words, + } + ) + + rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True) + + return rows + def top_users(self, df: 
pd.DataFrame) -> list: counts = df.groupby(["author", "source"]).size().sort_values(ascending=False) From 8372aa727864a73db1a8f62228b3c539cbc37a29 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 17 Mar 2026 13:36:41 +0000 Subject: [PATCH 10/24] feat(api): add endpoint to view entire dataset --- server/app.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/server/app.py b/server/app.py index 3ba9295..7a5dea0 100644 --- a/server/app.py +++ b/server/app.py @@ -523,7 +523,28 @@ def get_interaction_analysis(dataset_id): except Exception as e: print(traceback.format_exc()) return jsonify({"error": f"An unexpected error occurred"}), 500 + +@app.route("/dataset//all", methods=["GET"]) +@jwt_required() +def get_full_dataset(dataset_id: int): + try: + user_id = int(get_jwt_identity()) + if not dataset_manager.authorize_user_dataset(dataset_id, user_id): + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) + dataset_content = dataset_manager.get_dataset_content(dataset_id) + return jsonify(dataset_content.to_dict(orient="records")), 200 + except NotAuthorisedException: + return jsonify({"error": "User is not authorised to access this content"}), 403 + except NonExistentDatasetException: + return jsonify({"error": "Dataset does not exist"}), 404 + except ValueError as e: + return jsonify({"error": f"Malformed or missing data"}), 400 + except Exception as e: + print(traceback.format_exc()) + return jsonify({"error": f"An unexpected error occurred"}), 500 if __name__ == "__main__": app.run(debug=True) From 2a00384a5557e51aab0e2b081f58a8f723ec1457 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 17 Mar 2026 19:03:56 +0000 Subject: [PATCH 11/24] feat(interaction): add top interaction pairs and initiator ratio methods --- server/analysis/interactional.py | 77 +++++++------------------------- server/analysis/stat_gen.py | 3 +- 2 files changed, 18 insertions(+), 62 deletions(-) diff --git a/server/analysis/interactional.py b/server/analysis/interactional.py index 53d97dc..7e0c081 100644 --- a/server/analysis/interactional.py +++ b/server/analysis/interactional.py @@ -51,68 +51,23 @@ class InteractionAnalysis: return 0 return round(sum(depths) / len(depths), 2) + + def top_interaction_pairs(self, df: pd.DataFrame, top_n=10): + graph = self.interaction_graph(df) + pairs = [] - def average_thread_length_by_emotion(self, df: pd.DataFrame): - emotion_exclusions = {"emotion_neutral", "emotion_surprise"} + for a, targets in graph.items(): + for b, count in targets.items(): + pairs.append(((a, b), count)) - emotion_cols = [ - c - for c in df.columns - if c.startswith("emotion_") and c not in emotion_exclusions - ] + pairs.sort(key=lambda x: x[1], reverse=True) + return pairs[:top_n] + + def initiator_ratio(self, df: pd.DataFrame): + starters = df["reply_to"].isna().sum() + total = len(df) - id_to_reply = df.set_index("id")["reply_to"].to_dict() - length_cache = {} + if total == 0: + return 0 - def thread_length_from(start_id): - if start_id in length_cache: - return length_cache[start_id] - - seen = set() - length = 1 - current = start_id - - while True: - if current in seen: - # infinite loop shouldn't happen, but just in case - break - seen.add(current) - - reply_to = id_to_reply.get(current) - - if ( - reply_to is None - or (isinstance(reply_to, float) and pd.isna(reply_to)) - or reply_to == "" - ): - break - - length += 1 - current = reply_to - - if current in length_cache: - length += length_cache[current] - 1 - break - - 
length_cache[start_id] = length
-            return length
-
-        emotion_to_lengths = {}
-
-        # Fill NaNs in emotion cols to avoid max() issues
-        emo_df = df[["id"] + emotion_cols].copy()
-        emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
-
-        for _, row in emo_df.iterrows():
-            msg_id = row["id"]
-            length = thread_length_from(msg_id)
-
-            emotions = {c: row[c] for c in emotion_cols}
-            dominant = max(emotions, key=emotions.get)
-
-            emotion_to_lengths.setdefault(dominant, []).append(length)
-
-        return {
-            emotion: round(sum(lengths) / len(lengths), 2)
-            for emotion, lengths in emotion_to_lengths.items()
-        }
+        return round(starters / total, 2)
diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py
index c2f09ed..f5b328d 100644
--- a/server/analysis/stat_gen.py
+++ b/server/analysis/stat_gen.py
@@ -120,7 +120,8 @@ class StatGen:
 
         return {
             "average_thread_depth": self.interaction_analysis.average_thread_depth(filtered_df),
-            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(filtered_df),
+            "top_interaction_pairs": self.interaction_analysis.top_interaction_pairs(filtered_df, top_n=100),
+            "initiator_ratio": self.interaction_analysis.initiator_ratio(filtered_df),
             "interaction_graph": self.interaction_analysis.interaction_graph(filtered_df)
         }

From 71998c450e13007d4e6254591415b467d9752661 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Tue, 17 Mar 2026 19:49:03 +0000
Subject: [PATCH 12/24] fix(db): change title type to text

Occasionally a Reddit post would have a title longer than 255 characters,
which broke the VARCHAR(255) column in the schema.
---
 server/db/schema.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/server/db/schema.sql b/server/db/schema.sql
index 4550633..5bec116 100644
--- a/server/db/schema.sql
+++ b/server/db/schema.sql
@@ -43,7 +43,7 @@ CREATE TABLE events (
     weekday VARCHAR(255) NOT NULL,
 
     /* Posts Only */
-    title VARCHAR(255),
+    title TEXT,
 
     /* Comments Only*/
     parent_id VARCHAR(255),

From 3e78a54388e4bc3cd2d2637a443736d29910fb48 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Wed, 18 Mar 2026 18:36:05 +0000
Subject: [PATCH 13/24] feat(stat): add conversation concentration metric

Remove the old `initiator_ratio` metric, which wasn't working because every
event has a `reply_to` value. The new conversation-concentration metric was
suggested by AI, and turned out to give surprisingly useful insights.
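For reviewers, a rough usage sketch of the new metric (the data is
illustrative; the constructor's `word_exclusions` argument is required
but unused by this method):

```python
import pandas as pd

from server.analysis.interactional import InteractionAnalysis

# Illustrative only: six comments from three authors, one of whom dominates.
df = pd.DataFrame(
    {
        "type": ["comment"] * 6,
        "author": ["ann", "ann", "ann", "ann", "bob", "cat"],
    }
)

analysis = InteractionAnalysis(word_exclusions=set())
print(analysis.conversation_concentration(df))
# {'total_commenting_authors': 3, 'top_10pct_author_count': 1,
#  'top_10pct_comment_share': 0.6667, 'single_comment_authors': 2,
#  'single_comment_author_ratio': 0.6667}
```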
--- server/analysis/interactional.py | 26 ++++++++++++++++++++------ server/analysis/stat_gen.py | 4 ++-- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/server/analysis/interactional.py b/server/analysis/interactional.py index 7e0c081..e15940e 100644 --- a/server/analysis/interactional.py +++ b/server/analysis/interactional.py @@ -63,11 +63,25 @@ class InteractionAnalysis: pairs.sort(key=lambda x: x[1], reverse=True) return pairs[:top_n] - def initiator_ratio(self, df: pd.DataFrame): - starters = df["reply_to"].isna().sum() - total = len(df) + def conversation_concentration(self, df: pd.DataFrame) -> dict: + if "type" not in df.columns: + return {} - if total == 0: - return 0 + comments = df[df["type"] == "comment"] + if comments.empty: + return {} - return round(starters / total, 2) + author_counts = comments["author"].value_counts() + total_comments = len(comments) + total_authors = len(author_counts) + + top_10_pct_n = max(1, int(total_authors * 0.1)) + top_10_pct_share = round(author_counts.head(top_10_pct_n).sum() / total_comments, 4) + + return { + "total_commenting_authors": total_authors, + "top_10pct_author_count": top_10_pct_n, + "top_10pct_comment_share": float(top_10_pct_share), + "single_comment_authors": int((author_counts == 1).sum()), + "single_comment_author_ratio": float(round((author_counts == 1).sum() / total_authors, 4)), + } \ No newline at end of file diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py index f5b328d..4368841 100644 --- a/server/analysis/stat_gen.py +++ b/server/analysis/stat_gen.py @@ -121,8 +121,8 @@ class StatGen: return { "average_thread_depth": self.interaction_analysis.average_thread_depth(filtered_df), "top_interaction_pairs": self.interaction_analysis.top_interaction_pairs(filtered_df, top_n=100), - "initiator_ratio": self.interaction_analysis.initiator_ratio(filtered_df), - "interaction_graph": self.interaction_analysis.interaction_graph(filtered_df) + "interaction_graph": self.interaction_analysis.interaction_graph(filtered_df), + "conversation_concentration": self.interaction_analysis.conversation_concentration(filtered_df) } def cultural(self, df: pd.DataFrame, filters: dict | None = None) -> dict: From 436549641fede45e1217e2fc020fd517ccd9da3b Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Wed, 18 Mar 2026 18:37:39 +0000 Subject: [PATCH 14/24] chore(frontend): add api types for new backend data --- frontend/src/types/ApiTypes.ts | 82 ++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/frontend/src/types/ApiTypes.ts b/frontend/src/types/ApiTypes.ts index 43b035b..f336fef 100644 --- a/frontend/src/types/ApiTypes.ts +++ b/frontend/src/types/ApiTypes.ts @@ -31,12 +31,25 @@ type User = { type InteractionGraph = Record>; +type ConversationConcentration = { + total_commenting_authors: number; + top_10pct_author_count: number; + top_10pct_comment_share: number; + single_comment_authors: number; + single_comment_author_ratio: number; +}; + type UserAnalysisResponse = { top_users: TopUser[]; users: User[]; interaction_graph: InteractionGraph; }; +type UserEndpointResponse = { + top_users: TopUser[]; + users: User[]; +}; + // Time Analysis type EventsPerDay = { date: Date; @@ -103,6 +116,65 @@ type ContentAnalysisResponse = { emotion_by_source?: EmotionBySource[]; } +type LinguisticAnalysisResponse = { + word_frequencies: FrequencyWord[]; + common_two_phrases: NGram[]; + common_three_phrases: NGram[]; + lexical_diversity?: Record; +}; + +type EmotionalAnalysisResponse = { + 
average_emotion_by_topic: AverageEmotionByTopic[]; + overall_emotion_average?: OverallEmotionAverage[]; + dominant_emotion_distribution?: DominantEmotionDistribution[]; + emotion_by_source?: EmotionBySource[]; +}; + +type InteractionAnalysisResponse = { + average_thread_depth?: number; + top_interaction_pairs?: [[string, string], number][]; + conversation_concentration?: ConversationConcentration; + interaction_graph: InteractionGraph; +}; + +type IdentityMarkers = { + in_group_usage: number; + out_group_usage: number; + in_group_ratio: number; + out_group_ratio: number; + in_group_posts: number; + out_group_posts: number; + tie_posts: number; + in_group_emotion_avg?: Record; + out_group_emotion_avg?: Record; +}; + +type StanceMarkers = { + hedge_total: number; + certainty_total: number; + deontic_total: number; + permission_total: number; + hedge_per_1k_tokens: number; + certainty_per_1k_tokens: number; + deontic_per_1k_tokens: number; + permission_per_1k_tokens: number; +}; + +type EntityEmotionAggregate = { + post_count: number; + emotion_avg: Record; +}; + +type AverageEmotionPerEntity = { + entity_emotion_avg: Record; +}; + +type CulturalAnalysisResponse = { + identity_markers?: IdentityMarkers; + stance_markers?: StanceMarkers; + avg_emotion_per_entity?: AverageEmotionPerEntity; +}; + // Summary type SummaryResponse = { total_events: number; @@ -129,7 +201,9 @@ export type { Vocab, User, InteractionGraph, + ConversationConcentration, UserAnalysisResponse, + UserEndpointResponse, FrequencyWord, AverageEmotionByTopic, OverallEmotionAverage, @@ -138,5 +212,13 @@ export type { SummaryResponse, TimeAnalysisResponse, ContentAnalysisResponse, + LinguisticAnalysisResponse, + EmotionalAnalysisResponse, + InteractionAnalysisResponse, + IdentityMarkers, + StanceMarkers, + EntityEmotionAggregate, + AverageEmotionPerEntity, + CulturalAnalysisResponse, FilterResponse } From 7e4a91bb5e92cef24e17b245b582650488ee6990 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Wed, 18 Mar 2026 18:40:39 +0000 Subject: [PATCH 15/24] style(frontend): style api types to be in order of the endpoint --- frontend/src/types/ApiTypes.ts | 172 +++++++++++++++++---------------- 1 file changed, 88 insertions(+), 84 deletions(-) diff --git a/frontend/src/types/ApiTypes.ts b/frontend/src/types/ApiTypes.ts index f336fef..0e41386 100644 --- a/frontend/src/types/ApiTypes.ts +++ b/frontend/src/types/ApiTypes.ts @@ -1,14 +1,28 @@ -// User Responses -type TopUser = { - author: string; - source: string; - count: number +// Shared types +type FrequencyWord = { + word: string; + count: number; }; -type FrequencyWord = { - word: string; - count: number; -} +type NGram = { + count: number; + ngram: string; +}; + +type Emotion = { + emotion_anger: number; + emotion_disgust: number; + emotion_fear: number; + emotion_joy: number; + emotion_sadness: number; +}; + +// User +type TopUser = { + author: string; + source: string; + count: number; +}; type Vocab = { author: string; @@ -31,12 +45,9 @@ type User = { type InteractionGraph = Record>; -type ConversationConcentration = { - total_commenting_authors: number; - top_10pct_author_count: number; - top_10pct_comment_share: number; - single_comment_authors: number; - single_comment_author_ratio: number; +type UserEndpointResponse = { + top_users: TopUser[]; + users: User[]; }; type UserAnalysisResponse = { @@ -45,42 +56,24 @@ type UserAnalysisResponse = { interaction_graph: InteractionGraph; }; -type UserEndpointResponse = { - top_users: TopUser[]; - users: User[]; -}; - -// Time Analysis 
+// Time type EventsPerDay = { - date: Date; - count: number; -} + date: Date; + count: number; +}; type HeatmapCell = { - date: Date; - hour: number; - count: number; -} - -type TimeAnalysisResponse = { - events_per_day: EventsPerDay[]; - weekday_hour_heatmap: HeatmapCell[]; -} - -// Content Analysis -type Emotion = { - emotion_anger: number; - emotion_disgust: number; - emotion_fear: number; - emotion_joy: number; - emotion_sadness: number; + date: Date; + hour: number; + count: number; }; -type NGram = { - count: number; - ngram: string; -} +type TimeAnalysisResponse = { + events_per_day: EventsPerDay[]; + weekday_hour_heatmap: HeatmapCell[]; +}; +// Content (combines emotional and linguistic) type AverageEmotionByTopic = Emotion & { n: number; topic: string; @@ -105,17 +98,17 @@ type EmotionBySource = { event_count: number; }; - type ContentAnalysisResponse = { - word_frequencies: FrequencyWord[]; - average_emotion_by_topic: AverageEmotionByTopic[]; - common_three_phrases: NGram[]; - common_two_phrases: NGram[]; - overall_emotion_average?: OverallEmotionAverage[]; - dominant_emotion_distribution?: DominantEmotionDistribution[]; - emotion_by_source?: EmotionBySource[]; -} + word_frequencies: FrequencyWord[]; + average_emotion_by_topic: AverageEmotionByTopic[]; + common_three_phrases: NGram[]; + common_two_phrases: NGram[]; + overall_emotion_average?: OverallEmotionAverage[]; + dominant_emotion_distribution?: DominantEmotionDistribution[]; + emotion_by_source?: EmotionBySource[]; +}; +// Linguistic type LinguisticAnalysisResponse = { word_frequencies: FrequencyWord[]; common_two_phrases: NGram[]; @@ -123,6 +116,7 @@ type LinguisticAnalysisResponse = { lexical_diversity?: Record; }; +// Emotional type EmotionalAnalysisResponse = { average_emotion_by_topic: AverageEmotionByTopic[]; overall_emotion_average?: OverallEmotionAverage[]; @@ -130,6 +124,15 @@ type EmotionalAnalysisResponse = { emotion_by_source?: EmotionBySource[]; }; +// Interactional +type ConversationConcentration = { + total_commenting_authors: number; + top_10pct_author_count: number; + top_10pct_comment_share: number; + single_comment_authors: number; + single_comment_author_ratio: number; +}; + type InteractionAnalysisResponse = { average_thread_depth?: number; top_interaction_pairs?: [[string, string], number][]; @@ -137,6 +140,7 @@ type InteractionAnalysisResponse = { interaction_graph: InteractionGraph; }; +// Cultural type IdentityMarkers = { in_group_usage: number; out_group_usage: number; @@ -175,7 +179,7 @@ type CulturalAnalysisResponse = { avg_emotion_per_entity?: AverageEmotionPerEntity; }; -// Summary +// Summary type SummaryResponse = { total_events: number; total_posts: number; @@ -190,35 +194,35 @@ type SummaryResponse = { sources: string[]; }; -// Filtering Response +// Filter type FilterResponse = { - rows: number - data: any; -} + rows: number; + data: any; +}; export type { - TopUser, - Vocab, - User, - InteractionGraph, - ConversationConcentration, - UserAnalysisResponse, - UserEndpointResponse, - FrequencyWord, - AverageEmotionByTopic, - OverallEmotionAverage, - DominantEmotionDistribution, - EmotionBySource, - SummaryResponse, - TimeAnalysisResponse, - ContentAnalysisResponse, - LinguisticAnalysisResponse, - EmotionalAnalysisResponse, - InteractionAnalysisResponse, - IdentityMarkers, - StanceMarkers, - EntityEmotionAggregate, - AverageEmotionPerEntity, - CulturalAnalysisResponse, - FilterResponse -} + TopUser, + Vocab, + User, + InteractionGraph, + ConversationConcentration, + UserAnalysisResponse, + 
UserEndpointResponse, + FrequencyWord, + AverageEmotionByTopic, + OverallEmotionAverage, + DominantEmotionDistribution, + EmotionBySource, + SummaryResponse, + TimeAnalysisResponse, + ContentAnalysisResponse, + LinguisticAnalysisResponse, + EmotionalAnalysisResponse, + InteractionAnalysisResponse, + IdentityMarkers, + StanceMarkers, + EntityEmotionAggregate, + AverageEmotionPerEntity, + CulturalAnalysisResponse, + FilterResponse, +}; From 17ef42e5489142eb9009e30e3fe784a61d7742fc Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Wed, 18 Mar 2026 18:43:49 +0000 Subject: [PATCH 16/24] feat!(frontend): add cultural, interactional and linguistic stat pages --- frontend/src/components/CulturalStats.tsx | 119 +++++++++++ .../src/components/InteractionalStats.tsx | 198 ++++++++++++++++++ frontend/src/components/LinguisticStats.tsx | 86 ++++++++ frontend/src/pages/Stats.tsx | 100 ++++++++- 4 files changed, 495 insertions(+), 8 deletions(-) create mode 100644 frontend/src/components/CulturalStats.tsx create mode 100644 frontend/src/components/InteractionalStats.tsx create mode 100644 frontend/src/components/LinguisticStats.tsx diff --git a/frontend/src/components/CulturalStats.tsx b/frontend/src/components/CulturalStats.tsx new file mode 100644 index 0000000..7f3a775 --- /dev/null +++ b/frontend/src/components/CulturalStats.tsx @@ -0,0 +1,119 @@ +import Card from "./Card"; +import StatsStyling from "../styles/stats_styling"; +import type { CulturalAnalysisResponse } from "../types/ApiTypes"; + +const styles = StatsStyling; + +type CulturalStatsProps = { + data: CulturalAnalysisResponse; +}; + +const CulturalStats = ({ data }: CulturalStatsProps) => { + const identity = data.identity_markers; + const stance = data.stance_markers; + const rawEntities = data.avg_emotion_per_entity?.entity_emotion_avg ?? {}; + const entities = Object.entries(rawEntities) + .sort((a, b) => (b[1].post_count - a[1].post_count)) + .slice(0, 20); + + const topEmotion = (emotionAvg: Record | undefined) => { + const entries = Object.entries(emotionAvg ?? {}); + if (!entries.length) { + return "—"; + } + + entries.sort((a, b) => b[1] - a[1]); + const dominant = entries[0] ?? ["emotion_unknown", 0]; + const dominantLabel = dominant[0].replace("emotion_", ""); + return `${dominantLabel} (${dominant[1].toFixed(3)})`; + }; + + return ( +
+  return (
+    <div style={styles.page}>
+      <div style={styles.grid}>
+        <Card>
+          <div style={styles.cardTitle}>In-Group Emotion Profile</div>
+          <div style={styles.cardSubtitle}>
+            Dominant average emotion where in-group framing is stronger.
+          </div>
+          <div>{topEmotion(identity?.in_group_emotion_avg)}</div>
+        </Card>
+
+        <Card>
+          <div style={styles.cardTitle}>Out-Group Emotion Profile</div>
+          <div style={styles.cardSubtitle}>
+            Dominant average emotion where out-group framing is stronger.
+          </div>
+          <div>{topEmotion(identity?.out_group_emotion_avg)}</div>
+        </Card>
+
+        <Card>
+          <div style={styles.cardTitle}>Entity Emotion Averages</div>
+          <div style={styles.cardSubtitle}>
+            Most frequent entities and their dominant average emotion signature.
+          </div>
+          {!entities.length ? (
+            <div>No entity-level cultural data available.</div>
+          ) : (
+            <div>
+              {entities.map(([entity, aggregate]) => (
+                <div key={entity} style={styles.topUserItem}>
+                  <div>{entity}</div>
+                  <div>
+                    {aggregate.post_count.toLocaleString()} posts • Dominant emotion:{" "}
+                    {topEmotion(aggregate.emotion_avg)}
+                  </div>
+                </div>
+              ))}
+            </div>
+          )}
+        </Card>
+      </div>
+    </div>
+  );
+};
+
+export default CulturalStats;

diff --git a/frontend/src/components/InteractionalStats.tsx b/frontend/src/components/InteractionalStats.tsx
new file mode 100644
index 0000000..43567c5
--- /dev/null
+++ b/frontend/src/components/InteractionalStats.tsx
@@ -0,0 +1,198 @@
+import Card from "./Card";
+import StatsStyling from "../styles/stats_styling";
+import type { InteractionAnalysisResponse } from "../types/ApiTypes";
+import {
+  ResponsiveContainer,
+  BarChart,
+  Bar,
+  XAxis,
+  YAxis,
+  CartesianGrid,
+  Tooltip,
+  PieChart,
+  Pie,
+  Cell,
+  Legend,
+} from "recharts";
+
+const styles = StatsStyling;
+
+type InteractionalStatsProps = {
+  data: InteractionAnalysisResponse;
+};
+
+const InteractionalStats = ({ data }: InteractionalStatsProps) => {
+  const graph = data.interaction_graph ?? {};
+  const userCount = Object.keys(graph).length;
+  const edges = Object.values(graph).flatMap((targets) => Object.values(targets));
+  const edgeCount = edges.length;
+  const interactionVolume = edges.reduce((sum, value) => sum + value, 0);
+  const concentration = data.conversation_concentration;
+  const topTenCommentShare = typeof concentration?.top_10pct_comment_share === "number"
+    ? concentration.top_10pct_comment_share
+    : null;
+  const topTenAuthorCount = typeof concentration?.top_10pct_author_count === "number"
+    ? concentration.top_10pct_author_count
+    : null;
+  const totalCommentingAuthors = typeof concentration?.total_commenting_authors === "number"
+    ? concentration.total_commenting_authors
+    : null;
+  const singleCommentAuthorRatio = typeof concentration?.single_comment_author_ratio === "number"
+    ? concentration.single_comment_author_ratio
+    : null;
+
+  const topPairs = (data.top_interaction_pairs ?? [])
+    .filter((item): item is [[string, string], number] => {
+      if (!Array.isArray(item) || item.length !== 2) {
+        return false;
+      }
+
+      const pair = item[0];
+      const count = item[1];
+
+      return Array.isArray(pair)
+        && pair.length === 2
+        && typeof pair[0] === "string"
+        && typeof pair[1] === "string"
+        && typeof count === "number";
+    })
+    .slice(0, 20);
+
+  const topPairChartData = topPairs.slice(0, 8).map(([[source, target], value], index) => ({
+    pair: `${source} -> ${target}`,
+    replies: value,
+    rank: index + 1,
+  }));
+
+  const topTenSharePercent = topTenCommentShare === null
+    ? null
+    : topTenCommentShare * 100;
+  const nonTopTenSharePercent = topTenSharePercent === null
+    ? null
+    : Math.max(0, 100 - topTenSharePercent);
+
+  let concentrationPieData: { name: string; value: number }[] = [];
+  if (topTenSharePercent !== null && nonTopTenSharePercent !== null) {
+    concentrationPieData = [
+      { name: "Top 10% authors", value: topTenSharePercent },
+      { name: "Other authors", value: nonTopTenSharePercent },
+    ];
+  }
+
+  const PIE_COLORS = ["#2b6777", "#c8d8e4"];
+
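+  /*
+    Shape assumption behind the metrics above: interaction_graph maps each
+    author to the users they replied to, with reply counts as values
+    (Record<string, Record<string, number>>). A small worked example:
+
+    const g = { alice: { bob: 3, cara: 1 }, bob: { alice: 2 } };
+    const weights = Object.values(g).flatMap((t) => Object.values(t));
+    // weights -> [3, 1, 2]: userCount 2, edgeCount 3, interactionVolume 6
+  */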
+  return (
+    <div style={styles.page}>
+      <div style={styles.grid}>
+        <Card>
+          <div style={styles.cardTitle}>Interaction Visuals</div>
+          <div style={styles.cardSubtitle}>
+            Quick charts for interaction direction and conversation concentration.
+          </div>
+          <div style={styles.cardSubtitle}>
+            {userCount.toLocaleString()} users, {edgeCount.toLocaleString()} reply
+            edges, {interactionVolume.toLocaleString()} replies
+            {totalCommentingAuthors !== null &&
+              ` • ${totalCommentingAuthors.toLocaleString()} commenting authors`}
+            {topTenAuthorCount !== null &&
+              ` (top 10% = ${topTenAuthorCount.toLocaleString()})`}
+            {singleCommentAuthorRatio !== null &&
+              ` • ${(singleCommentAuthorRatio * 100).toFixed(1)}% single-comment authors`}
+          </div>
+
+          <div>
+            <div>
+              <div style={styles.cardTitle}>Top Interaction Pairs</div>
+              <div>
+                <ResponsiveContainer width="100%" height={260}>
+                  <BarChart data={topPairChartData} layout="vertical">
+                    <CartesianGrid strokeDasharray="3 3" />
+                    <XAxis type="number" />
+                    <YAxis
+                      type="category"
+                      dataKey="rank"
+                      tickFormatter={(value) => `#${value}`}
+                      width={36}
+                    />
+                    <Tooltip />
+                    <Bar dataKey="replies" fill="#2b6777" />
+                  </BarChart>
+                </ResponsiveContainer>
+              </div>
+            </div>
+
+            <div>
+              <div style={styles.cardTitle}>Top 10% vs Other Comment Share</div>
+              <div>
+                <ResponsiveContainer width="100%" height={260}>
+                  <PieChart>
+                    <Pie data={concentrationPieData} dataKey="value" nameKey="name" label>
+                      {concentrationPieData.map((entry, index) => (
+                        <Cell key={entry.name} fill={PIE_COLORS[index % PIE_COLORS.length]} />
+                      ))}
+                    </Pie>
+                    <Tooltip />
+                    <Legend />
+                  </PieChart>
+                </ResponsiveContainer>
+              </div>
+            </div>
+          </div>
+        </Card>
+
+        <Card>
+          <div style={styles.cardTitle}>Top Interaction Pairs</div>
+          <div style={styles.cardSubtitle}>
+            Most frequent directed reply paths between users.
+          </div>
+          {!topPairs.length ? (
+            <div>No interaction pair data available.</div>
+          ) : (
+            <div>
+              {topPairs.map(([[source, target], value], index) => (
+                <div key={`${source}->${target}-${index}`} style={styles.topUserItem}>
+                  <div>{source} -> {target}</div>
+                  <div>{value.toLocaleString()} replies</div>
+                </div>
+              ))}
+            </div>
+          )}
+        </Card>
+      </div>
+    </div>
+  );
+};
+
+export default InteractionalStats;

diff --git a/frontend/src/components/LinguisticStats.tsx b/frontend/src/components/LinguisticStats.tsx
new file mode 100644
index 0000000..3569511
--- /dev/null
+++ b/frontend/src/components/LinguisticStats.tsx
@@ -0,0 +1,86 @@
+import Card from "./Card";
+import StatsStyling from "../styles/stats_styling";
+import type { LinguisticAnalysisResponse } from "../types/ApiTypes";
+
+const styles = StatsStyling;
+
+type LinguisticStatsProps = {
+  data: LinguisticAnalysisResponse;
+};
+
+const LinguisticStats = ({ data }: LinguisticStatsProps) => {
+  const lexical = data.lexical_diversity;
+  const words = data.word_frequencies ?? [];
+  const bigrams = data.common_two_phrases ?? [];
+  const trigrams = data.common_three_phrases ?? [];
+
+  const topWords = words.slice(0, 20);
+  const topBigrams = bigrams.slice(0, 10);
+  const topTrigrams = trigrams.slice(0, 10);
+
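+  /*
+    The slice(0, N) calls above assume the backend returns these lists
+    already sorted by descending count. If that guarantee ever changes, a
+    defensive variant (sketch) would sort client-side before slicing:
+
+    const topWords = [...words].sort((a, b) => b.count - a.count).slice(0, 20);
+  */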
+  return (
+    <div style={styles.page}>
+      <div style={styles.grid}>
+        <Card>
+          <div style={styles.cardTitle}>Top Words</div>
+          <div style={styles.cardSubtitle}>Most frequent filtered terms.</div>
+          <div>
+            {topWords.map((item) => (
+              <div key={item.word} style={styles.topUserItem}>
+                <div>{item.word}</div>
+                <div>{item.count.toLocaleString()} uses</div>
+              </div>
+            ))}
+          </div>
+        </Card>
+
+        <Card>
+          <div style={styles.cardTitle}>Top Bigrams</div>
+          <div style={styles.cardSubtitle}>Most frequent 2-word phrases.</div>
+          <div>
+            {topBigrams.map((item) => (
+              <div key={item.ngram} style={styles.topUserItem}>
+                <div>{item.ngram}</div>
+                <div>{item.count.toLocaleString()} uses</div>
+              </div>
+            ))}
+          </div>
+        </Card>
+
+        <Card>
+          <div style={styles.cardTitle}>Top Trigrams</div>
+          <div style={styles.cardSubtitle}>Most frequent 3-word phrases.</div>
+          <div>
+            {topTrigrams.map((item) => (
+              <div key={item.ngram} style={styles.topUserItem}>
+                <div>{item.ngram}</div>
+                <div>{item.count.toLocaleString()} uses</div>
+              </div>
+            ))}
+          </div>
+        </Card>
+      </div>
+    </div>
+  );
+};
+
+export default LinguisticStats;

diff --git a/frontend/src/pages/Stats.tsx b/frontend/src/pages/Stats.tsx
index 683584f..910b0a0 100644
--- a/frontend/src/pages/Stats.tsx
+++ b/frontend/src/pages/Stats.tsx
@@ -5,12 +5,20 @@ import StatsStyling from "../styles/stats_styling";
 import SummaryStats from "../components/SummaryStats";
 import EmotionalStats from "../components/EmotionalStats";
 import UserStats from "../components/UserStats";
+import LinguisticStats from "../components/LinguisticStats";
+import InteractionalStats from "../components/InteractionalStats";
+import CulturalStats from "../components/CulturalStats";
 
 import {
   type SummaryResponse,
   type UserAnalysisResponse,
   type TimeAnalysisResponse,
-  type ContentAnalysisResponse
+  type ContentAnalysisResponse,
+  type UserEndpointResponse,
+  type LinguisticAnalysisResponse,
+  type EmotionalAnalysisResponse,
+  type InteractionAnalysisResponse,
+  type CulturalAnalysisResponse
 } from '../types/ApiTypes'
 
 const API_BASE_URL = import.meta.env.VITE_BACKEND_URL
@@ -20,11 +28,14 @@ const StatPage = () => {
   const { datasetId: routeDatasetId } = useParams<{ datasetId: string }>();
   const [error, setError] = useState('');
   const [loading, setLoading] = useState(false);
-  const [activeView, setActiveView] = useState<"summary" | "emotional" | "user">("summary");
+  const [activeView, setActiveView] = useState<"summary" | "emotional" | "user" | "linguistic" | "interactional" | "cultural">("summary");
 
   const [userData, setUserData] = useState<UserAnalysisResponse | null>(null);
   const [timeData, setTimeData] = useState<TimeAnalysisResponse | null>(null);
   const [contentData, setContentData] = useState<ContentAnalysisResponse | null>(null);
+  const [linguisticData, setLinguisticData] = useState<LinguisticAnalysisResponse | null>(null);
+  const [interactionData, setInteractionData] = useState<InteractionAnalysisResponse | null>(null);
+  const [culturalData, setCulturalData] = useState<CulturalAnalysisResponse | null>(null);
 
   const [summary, setSummary] = useState<SummaryResponse | null>(null);
@@ -83,15 +94,23 @@ const StatPage = () => {
     setLoading(true);
 
     Promise.all([
-      axios.get<TimeAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/time`, {
+      axios.get<TimeAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/temporal`, {
         params,
         headers: authHeaders,
       }),
-      axios.get<UserAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/user`, {
+      axios.get<UserEndpointResponse>(`${API_BASE_URL}/dataset/${datasetId}/user`, {
         params,
         headers: authHeaders,
       }),
-      axios.get<ContentAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/content`, {
+      axios.get<LinguisticAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/linguistic`, {
+        params,
+        headers: authHeaders,
+      }),
+      axios.get<EmotionalAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/emotional`, {
+        params,
+        headers: authHeaders,
+      }),
+      axios.get<InteractionAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/interactional`, {
         params,
         headers: authHeaders,
       }),
       axios.get<SummaryResponse>(`${API_BASE_URL}/dataset/${datasetId}/summary`, {
         params,
         headers: authHeaders,
       }),
+      axios.get<CulturalAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/cultural`, {
+        params,
+        headers: authHeaders,
+      }),
     ])
-      .then(([timeRes, userRes, contentRes, summaryRes]) => {
-        setUserData(userRes.data || null);
+      .then(([timeRes, userRes, linguisticRes, emotionalRes, interactionRes, summaryRes, culturalRes]) => {
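+        /*
+          Note: the names in this destructuring are purely positional; they
+          must stay aligned with the order of the axios.get calls in the
+          Promise.all array above. A hypothetical safer variant would fetch
+          into named fields instead:
+
+          const responses = {
+            temporal: await axios.get<TimeAnalysisResponse>(temporalUrl),
+            user: await axios.get<UserEndpointResponse>(userUrl),
+          };
+        */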
+        const combinedUserData: UserAnalysisResponse = {
+          ...userRes.data,
+          interaction_graph: interactionRes.data?.interaction_graph ?? {},
+        };
+
+        const combinedContentData: ContentAnalysisResponse = {
+          ...linguisticRes.data,
+          ...emotionalRes.data,
+        };
+
+        setUserData(combinedUserData);
         setTimeData(timeRes.data || null);
-        setContentData(contentRes.data || null);
+        setContentData(combinedContentData);
+        setLinguisticData(linguisticRes.data || null);
+        setInteractionData(interactionRes.data || null);
+        setCulturalData(culturalRes.data || null);
         setSummary(summaryRes.data || null);
       })
       .catch((e) => setError("Failed to load statistics: " + String(e)))
       .finally(() => setLoading(false));
@@ -218,6 +254,24 @@ return (
         >
           Users
         </button>
+        <button
+          style={activeView === "linguistic" ? styles.activeTab : styles.tab}
+          onClick={() => setActiveView("linguistic")}
+        >
+          Linguistic
+        </button>
+        <button
+          style={activeView === "interactional" ? styles.activeTab : styles.tab}
+          onClick={() => setActiveView("interactional")}
+        >
+          Interactional
+        </button>
+        <button
+          style={activeView === "cultural" ? styles.activeTab : styles.tab}
+          onClick={() => setActiveView("cultural")}
+        >
+          Cultural
+        </button>
 
       {activeView === "summary" && (
@@ -243,6 +297,36 @@ return (
       )}
 
+      {activeView === "linguistic" && linguisticData && (
+        <LinguisticStats data={linguisticData} />
+      )}
+
+      {activeView === "linguistic" && !linguisticData && (
+        <div>
+          No linguistic data available.
+        </div>
+      )}
+
+      {activeView === "interactional" && interactionData && (
+        <InteractionalStats data={interactionData} />
+      )}
+
+      {activeView === "interactional" && !interactionData && (
+        <div>
+          No interactional data available.
+        </div>
+      )}
+
+      {activeView === "cultural" && culturalData && (
+        <CulturalStats data={culturalData} />
+      )}
+
+      {activeView === "cultural" && !culturalData && (
+        <div>
+          No cultural data available.
+        </div>
+      )}
+
     </div>
   );
 }

From c215024ef2ccc25a0261e0933ef1d49ab4f48a1 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Wed, 18 Mar 2026 18:50:51 +0000
Subject: [PATCH 17/24] feat(frontend): add deleted user filter

Reddit data often contains "[Deleted]" where a user has been banned or has
removed their post or comment. Keeping the backend faithful to the original
dataset is important, so the filtering is applied on the frontend instead.

---
 frontend/src/pages/Stats.tsx | 69 ++++++++++++++++++++++++++++++++++--
 1 file changed, 66 insertions(+), 3 deletions(-)

diff --git a/frontend/src/pages/Stats.tsx b/frontend/src/pages/Stats.tsx
index 910b0a0..f3f483f 100644
--- a/frontend/src/pages/Stats.tsx
+++ b/frontend/src/pages/Stats.tsx
@@ -23,6 +23,11 @@ import {
 const API_BASE_URL = import.meta.env.VITE_BACKEND_URL
 const styles = StatsStyling;
 
+const DELETED_USERS = ["[deleted]"];
+
+const isDeletedUser = (value: string | null | undefined) => (
+  DELETED_USERS.includes((value ?? "").trim().toLowerCase())
+);
 
 const StatPage = () => {
@@ -129,9 +134,56 @@ const StatPage = () => {
       .then(([timeRes, userRes, linguisticRes, emotionalRes, interactionRes, summaryRes, culturalRes]) => {
+        const usersList = userRes.data.users ?? [];
+        const topUsersList = userRes.data.top_users ?? [];
+        const interactionGraphRaw = interactionRes.data?.interaction_graph ?? {};
+        const topPairsRaw = interactionRes.data?.top_interaction_pairs ?? [];
+
+        const filteredUsers: typeof usersList = [];
+        for (const user of usersList) {
+          if (isDeletedUser(user.author)) continue;
+          filteredUsers.push(user);
+        }
+
+        const filteredTopUsers: typeof topUsersList = [];
+        for (const user of topUsersList) {
+          if (isDeletedUser(user.author)) continue;
+          filteredTopUsers.push(user);
+        }
+
+        const filteredInteractionGraph: Record<string, Record<string, number>> = {};
+        for (const [source, targets] of Object.entries(interactionGraphRaw)) {
+          if (isDeletedUser(source)) {
+            continue;
+          }
+
+          const nextTargets: Record<string, number> = {};
+          for (const [target, count] of Object.entries(targets)) {
+            if (isDeletedUser(target)) {
+              continue;
+            }
+            nextTargets[target] = count;
+          }
+
+          filteredInteractionGraph[source] = nextTargets;
+        }
+
+        const filteredTopInteractionPairs: typeof topPairsRaw = [];
+        for (const pairEntry of topPairsRaw) {
+          const pair = pairEntry[0];
+          const source = pair[0];
+          const target = pair[1];
+          if (isDeletedUser(source) || isDeletedUser(target)) {
+            continue;
+          }
+          filteredTopInteractionPairs.push(pairEntry);
+        }
+
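+        /*
+          The four filter loops above share one pattern; a reusable sketch
+          (hypothetical helper, not part of this patch):
+
+          const dropDeleted = <T,>(items: T[], key: (item: T) => string | null | undefined) =>
+            items.filter((item) => !isDeletedUser(key(item)));
+
+          const filteredUsers = dropDeleted(usersList, (u) => u.author);
+        */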
         const combinedUserData: UserAnalysisResponse = {
           ...userRes.data,
-          interaction_graph: interactionRes.data?.interaction_graph ?? {},
+          users: filteredUsers,
+          top_users: filteredTopUsers,
+          interaction_graph: filteredInteractionGraph,
         };
 
         const combinedContentData: ContentAnalysisResponse = {
           ...linguisticRes.data,
           ...emotionalRes.data,
         };
 
+        const filteredInteractionData: InteractionAnalysisResponse = {
+          ...interactionRes.data,
+          interaction_graph: filteredInteractionGraph,
+          top_interaction_pairs: filteredTopInteractionPairs,
+        };
+
+        const filteredSummary: SummaryResponse = {
+          ...summaryRes.data,
+          unique_users: filteredUsers.length,
+        };
+
         setUserData(combinedUserData);
         setTimeData(timeRes.data || null);
         setContentData(combinedContentData);
         setLinguisticData(linguisticRes.data || null);
-        setInteractionData(interactionRes.data || null);
+        setInteractionData(filteredInteractionData);
         setCulturalData(culturalRes.data || null);
-        setSummary(summaryRes.data || null);
+        setSummary(filteredSummary);
       })
       .catch((e) => setError("Failed to load statistics: " + String(e)))
       .finally(() => setLoading(false));

From 1446dd176d702c82de11293ba5c3bdea8f7d1775 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Wed, 18 Mar 2026 18:53:14 +0000
Subject: [PATCH 18/24] feat(frontend): center page selection

---
 frontend/src/pages/Stats.tsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/frontend/src/pages/Stats.tsx b/frontend/src/pages/Stats.tsx
index f3f483f..408cfa2 100644
--- a/frontend/src/pages/Stats.tsx
+++ b/frontend/src/pages/Stats.tsx
@@ -297,7 +297,7 @@ return (
       <div>Dataset #{datasetId ?? "-"}</div>
 
-      <div style={styles.tabRow}>
+      <div style={{ ...styles.tabRow, justifyContent: "center" }}>
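+        {/*
+          Centering sketch: this change assumes styles.tabRow (name
+          hypothetical; the stripped diff hides the real key) already sets
+          display: "flex", so spreading it and overriding justifyContent is
+          equivalent to a standalone style object:
+
+          const centeredTabRow: React.CSSProperties = {
+            ...styles.tabRow,
+            justifyContent: "center", // centers the view-selection buttons
+          };
+        */}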