diff --git a/db/database.py b/db/database.py
index 80cf5b3..30b8f34 100644
--- a/db/database.py
+++ b/db/database.py
@@ -110,7 +110,7 @@ class PostgresConnector:
                 row["source"],
                 row.get("topic"),
                 row.get("topic_confidence"),
-                Json(row["ner_entities"]) if row.get("ner_entities") else None,
+                Json(row["entities"]) if row.get("entities") else None,
                 row.get("emotion_anger"),
                 row.get("emotion_disgust"),
                 row.get("emotion_fear"),
diff --git a/server/app.py b/server/app.py
index b3830a3..eef9d7a 100644
--- a/server/app.py
+++ b/server/app.py
@@ -1,4 +1,5 @@
 import os
+import datetime
 
 from dotenv import load_dotenv
 from flask import Flask, jsonify, request
@@ -15,6 +16,7 @@
 from server.stat_gen import StatGen
 from server.dataset_processor import DatasetProcessor
 from db.database import PostgresConnector
 from server.auth import AuthManager
+from server.utils import get_request_filters, parse_datetime_filter
 import pandas as pd
 import traceback
@@ -42,7 +44,6 @@
 auth_manager = AuthManager(db, bcrypt)
 stat_gen = StatGen()
 
-
 @app.route("/register", methods=["POST"])
 def register_user():
     data = request.get_json()
@@ -112,7 +113,7 @@ def upload_data():
     post_file = request.files["posts"]
     topic_file = request.files["topics"]
 
-    if post_file.filename == "" or topic_file == "":
+    if post_file.filename == "" or topic_file.filename == "":
         return jsonify({"error": "Empty filename"}), 400
 
     if not post_file.filename.endswith(".jsonl") or not topic_file.filename.endswith(
@@ -136,7 +137,11 @@ def upload_data():
         db.save_dataset_content(dataset_id, enriched_df)
 
         return jsonify(
-            {"message": "File uploaded successfully", "event_count": len(enriched_df), "dataset_id": dataset_id}
+            {
+                "message": "File uploaded successfully",
+                "event_count": len(enriched_df),
+                "dataset_id": dataset_id,
+            }
         ), 200
     except ValueError as e:
         return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
@@ -158,7 +163,8 @@ def get_dataset(dataset_id):
     if dataset_content.empty:
         return jsonify({"error": "Dataset content not found"}), 404
 
-    return jsonify(dataset_content.to_dict(orient="records")), 200
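+    # Optional query-string filters: search_query/query, start_date/start,
+    # end_date/end, data_sources/sources; parsed in server/utils.py.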
+    filters = get_request_filters()
+    return jsonify(stat_gen.filter_dataset(dataset_content, filters)), 200
 
 
 @app.route("/dataset/<dataset_id>/content", methods=["GET"])
@@ -172,7 +178,8 @@ def content_endpoint(dataset_id):
     dataset_content = db.get_dataset_content(dataset_id)
 
     try:
-        return jsonify(stat_gen.get_content_analysis(dataset_content)), 200
+        filters = get_request_filters()
+        return jsonify(stat_gen.get_content_analysis(dataset_content, filters)), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -192,7 +199,8 @@ def get_summary(dataset_id):
     dataset_content = db.get_dataset_content(dataset_id)
 
     try:
-        return jsonify(stat_gen.summary(dataset_content)), 200
+        filters = get_request_filters()
+        return jsonify(stat_gen.summary(dataset_content, filters)), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -212,7 +220,8 @@ def get_time_analysis(dataset_id):
     dataset_content = db.get_dataset_content(dataset_id)
 
     try:
-        return jsonify(stat_gen.get_time_analysis(dataset_content)), 200
+        filters = get_request_filters()
+        return jsonify(stat_gen.get_time_analysis(dataset_content, filters)), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -232,7 +241,8 @@ def get_user_analysis(dataset_id):
     dataset_content = db.get_dataset_content(dataset_id)
 
     try:
-        return jsonify(stat_gen.get_user_analysis(dataset_content)), 200
+        filters = get_request_filters()
+        return jsonify(stat_gen.get_user_analysis(dataset_content, filters)), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -252,7 +262,8 @@ def get_cultural_analysis(dataset_id):
     dataset_content = db.get_dataset_content(dataset_id)
 
     try:
-        return jsonify(stat_gen.get_cultural_analysis(dataset_content)), 200
+        filters = get_request_filters()
+        return jsonify(stat_gen.get_cultural_analysis(dataset_content, filters)), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -272,84 +283,15 @@ def get_interaction_analysis(dataset_id):
     dataset_content = db.get_dataset_content(dataset_id)
 
     try:
-        return jsonify(stat_gen.get_interactional_analysis(dataset_content)), 200
+        filters = get_request_filters()
+        return jsonify(
+            stat_gen.get_interactional_analysis(dataset_content, filters)
+        ), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
         print(traceback.format_exc())
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
 
-
-# @app.route("/filter/query", methods=["POST"])
-# def filter_query():
-#     if stat_obj is None:
-#         return jsonify({"error": "No data uploaded"}), 400
-
-#     data = request.get_json(silent=True) or {}
-
-#     if "query" not in data:
-#         return jsonify(stat_obj.df.to_dict(orient="records")), 200
-
-#     query = data["query"]
-#     filtered_df = stat_obj.filter_by_query(query)
-
-#     return jsonify(filtered_df), 200
-
-
-# @app.route("/filter/time", methods=["POST"])
-# def filter_time():
-#     if stat_obj is None:
-#         return jsonify({"error": "No data uploaded"}), 400
-
-#     data = request.get_json(silent=True)
-#     if not data:
-#         return jsonify({"error": "Invalid or missing JSON body"}), 400
-
-#     if "start" not in data or "end" not in data:
-#         return jsonify({"error": "Please include both start and end dates"}), 400
-
-#     try:
-#         start = pd.to_datetime(data["start"], utc=True)
-#         end = pd.to_datetime(data["end"], utc=True)
-#         filtered_df = stat_obj.set_time_range(start, end)
-#         return jsonify(filtered_df), 200
-#     except Exception:
-#         return jsonify({"error": "Invalid datetime format"}), 400
-
-
-# @app.route("/filter/sources", methods=["POST"])
-# def filter_sources():
-#     if stat_obj is None:
-#         return jsonify({"error": "No data uploaded"}), 400
-
-#     data = request.get_json(silent=True)
-#     if not data:
-#         return jsonify({"error": "Invalid or missing JSON body"}), 400
-
-#     if "sources" not in data:
-#         return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400
-
-#     try:
-#         filtered_df = stat_obj.filter_data_sources(data["sources"])
-#         return jsonify(filtered_df), 200
-#     except ValueError:
-#         return jsonify({"error": "Please enable at least one data source"}), 400
-#     except Exception as e:
-#         return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500
-
-
-# @app.route("/filter/reset", methods=["GET"])
-# def reset_dataset():
-#     if stat_obj is None:
-#         return jsonify({"error": "No data uploaded"}), 400
-
-#     try:
-#         stat_obj.reset_dataset()
-#         return jsonify({"success": "Dataset successfully reset"})
-#     except Exception as e:
-#         print(traceback.format_exc())
-#         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
-
-
 if __name__ == "__main__":
     app.run(debug=True)
diff --git a/server/stat_gen.py b/server/stat_gen.py
index dc748d0..2ea5ac1 100644
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -39,97 +39,139 @@ class StatGen:
         self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS)
         self.cultural_analysis = CulturalAnalysis()
 
-    def get_time_analysis(self, df: pd.DataFrame) -> dict:
+    ## Private Methods
+    def _prepare_filtered_df(
+        self,
+        df: pd.DataFrame,
+        filters: dict | None = None,
+    ) -> pd.DataFrame:
+        filters = filters or {}
+        filtered_df = df.copy()
+
+        search_query = filters.get("search_query", None)
+        start_date_filter = filters.get("start_date", None)
+        end_date_filter = filters.get("end_date", None)
+        data_source_filter = filters.get("data_sources", None)
+
+        if search_query:
+            mask = (
+                filtered_df["content"].str.contains(
+                    search_query, case=False, na=False, regex=False
+                )
+                | filtered_df["author"].str.contains(
+                    search_query, case=False, na=False, regex=False
+                )
+            )
+
+            # Only include title if the column exists
+            if "title" in filtered_df.columns:
+                mask = mask | filtered_df["title"].str.contains(
+                    search_query, case=False, na=False, regex=False
+                )
+
+            filtered_df = filtered_df[mask]
+
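+        # Date bounds are inclusive; values must be comparable with the "dt"
+        # column (mixing naive and tz-aware datetimes raises in pandas).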
+        if start_date_filter:
+            filtered_df = filtered_df[filtered_df["dt"] >= start_date_filter]
+
+        if end_date_filter:
+            filtered_df = filtered_df[filtered_df["dt"] <= end_date_filter]
+
+        if data_source_filter:
+            filtered_df = filtered_df[filtered_df["source"].isin(data_source_filter)]
+
+        return filtered_df
+
+    ## Public Methods
+    def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+        return self._prepare_filtered_df(df, filters).to_dict(orient="records")
+
+    def get_time_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+        filtered_df = self._prepare_filtered_df(df, filters)
+
         return {
-            "events_per_day": self.temporal_analysis.posts_per_day(df),
-            "weekday_hour_heatmap": self.temporal_analysis.heatmap(df),
+            "events_per_day": self.temporal_analysis.posts_per_day(filtered_df),
+            "weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df),
         }
 
-    def get_content_analysis(self, df: pd.DataFrame) -> dict:
+    def get_content_analysis(
+        self, df: pd.DataFrame, filters: dict | None = None
+    ) -> dict:
+        filtered_df = self._prepare_filtered_df(df, filters)
+
         return {
-            "word_frequencies": self.linguistic_analysis.word_frequencies(df),
-            "common_two_phrases": self.linguistic_analysis.ngrams(df),
-            "common_three_phrases": self.linguistic_analysis.ngrams(df, n=3),
-            "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(df),
-            "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(df),
+            "word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df),
+            "common_two_phrases": self.linguistic_analysis.ngrams(filtered_df),
+            "common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3),
+            "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(
+                filtered_df
+            ),
+            "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(
+                filtered_df
+            ),
         }
 
-    def get_user_analysis(self, df: pd.DataFrame) -> dict:
+    def get_user_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+        filtered_df = self._prepare_filtered_df(df, filters)
+
         return {
-            "top_users": self.interaction_analysis.top_users(df),
-            "users": self.interaction_analysis.per_user_analysis(df),
-            "interaction_graph": self.interaction_analysis.interaction_graph(df),
+            "top_users": self.interaction_analysis.top_users(filtered_df),
+            "users": self.interaction_analysis.per_user_analysis(filtered_df),
+            "interaction_graph": self.interaction_analysis.interaction_graph(
+                filtered_df
+            ),
         }
 
-    def get_interactional_analysis(self, df: pd.DataFrame) -> dict:
+    def get_interactional_analysis(
+        self, df: pd.DataFrame, filters: dict | None = None
+    ) -> dict:
+        filtered_df = self._prepare_filtered_df(df, filters)
+
         return {
-            "average_thread_depth": self.interaction_analysis.average_thread_depth(df),
-            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(df),
+            "average_thread_depth": self.interaction_analysis.average_thread_depth(
+                filtered_df
+            ),
+            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(
+                filtered_df
+            ),
         }
 
-    def get_cultural_analysis(self, df: pd.DataFrame) -> dict:
+    def get_cultural_analysis(
+        self, df: pd.DataFrame, filters: dict | None = None
+    ) -> dict:
+        filtered_df = self._prepare_filtered_df(df, filters)
+
         return {
-            "identity_markers": self.cultural_analysis.get_identity_markers(df),
-            "stance_markers": self.cultural_analysis.get_stance_markers(df),
-            "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(df),
+            "identity_markers": self.cultural_analysis.get_identity_markers(
+                filtered_df
+            ),
+            "stance_markers": self.cultural_analysis.get_stance_markers(filtered_df),
+            "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(
+                filtered_df
+            ),
         }
 
-    def summary(self, df: pd.DataFrame) -> dict:
-        total_posts = (df["type"] == "post").sum()
-        total_comments = (df["type"] == "comment").sum()
-        events_per_user = df.groupby("author").size()
+    def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+        filtered_df = self._prepare_filtered_df(df, filters)
+
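+        # An empty filtered frame would make dt.min()/dt.max() return NaT
+        # (whose .timestamp() raises), so short-circuit with zeroed-out stats.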
+        if filtered_df.empty:
+            return {
+                "total_events": 0,
+                "total_posts": 0,
+                "total_comments": 0,
+                "unique_users": 0,
+                "comments_per_post": 0,
+                "lurker_ratio": 0,
+                "time_range": {
+                    "start": None,
+                    "end": None,
+                },
+                "sources": [],
+            }
+
+        total_posts = (filtered_df["type"] == "post").sum()
+        total_comments = (filtered_df["type"] == "comment").sum()
+        events_per_user = filtered_df.groupby("author").size()
 
         return {
-            "total_events": int(len(df)),
+            "total_events": int(len(filtered_df)),
             "total_posts": int(total_posts),
             "total_comments": int(total_comments),
             "unique_users": int(events_per_user.count()),
             "comments_per_post": round(total_comments / max(total_posts, 1), 2),
             "lurker_ratio": round((events_per_user == 1).mean(), 2),
             "time_range": {
-                "start": int(df["dt"].min().timestamp()),
-                "end": int(df["dt"].max().timestamp()),
+                "start": int(filtered_df["dt"].min().timestamp()),
+                "end": int(filtered_df["dt"].max().timestamp()),
             },
-            "sources": df["source"].dropna().unique().tolist(),
+            "sources": filtered_df["source"].dropna().unique().tolist(),
         }
-
-    # def filter_by_query(self, df: pd.DataFrame, search_query: str) -> dict:
-    #     filtered_df = df[df["content"].str.contains(search_query, na=False)]
-
-    #     return {
-    #         "rows": len(filtered_df),
-    #         "data": filtered_df.to_dict(orient="records"),
-    #     }
-
-    # def set_time_range(
-    #     self,
-    #     original_df: pd.DataFrame,
-    #     start: datetime.datetime,
-    #     end: datetime.datetime,
-    # ) -> dict:
-    #     df = self._prepare_df(original_df)
-    #     filtered_df = df[(df["dt"] >= start) & (df["dt"] <= end)]
-
-    #     return {
-    #         "rows": len(filtered_df),
-    #         "data": filtered_df.to_dict(orient="records"),
-    #     }
-
-    # def filter_data_sources(
-    #     self, original_df: pd.DataFrame, data_sources: dict
-    # ) -> dict:
-    #     df = self._prepare_df(original_df)
-    #     enabled_sources = [src for src, enabled in data_sources.items() if enabled]
-
-    #     if not enabled_sources:
-    #         raise ValueError("Please choose at least one data source")
-
-    #     filtered_df = df[df["source"].isin(enabled_sources)]
-
-    #     return {
-    #         "rows": len(filtered_df),
-    #         "data": filtered_df.to_dict(orient="records"),
-    #     }
-
-    # def reset_dataset(self, original_df: pd.DataFrame) -> pd.DataFrame:
-    #     return self._prepare_df(original_df)
diff --git a/server/utils.py b/server/utils.py
new file mode 100644
index 0000000..078d1e7
--- /dev/null
+++ b/server/utils.py
@@ -0,0 +1,50 @@
+import datetime
+
+from flask import request
+
+
+def parse_datetime_filter(value):
+    if not value:
+        return None
+
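+    # Try ISO-8601 first, then fall back to a numeric Unix timestamp. Note
+    # that fromtimestamp() without a tz argument returns a naive, local-time
+    # datetime.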
+    try:
+        return datetime.datetime.fromisoformat(value)
+    except ValueError:
+        try:
+            return datetime.datetime.fromtimestamp(float(value))
+        except ValueError as err:
+            raise ValueError(
+                "Date filters must be ISO-8601 strings or Unix timestamps"
+            ) from err
+
+
+def get_request_filters() -> dict:
+    filters = {}
+
+    search_query = request.args.get("search_query") or request.args.get("query")
+    if search_query:
+        filters["search_query"] = search_query
+
+    start_date = parse_datetime_filter(
+        request.args.get("start_date") or request.args.get("start")
+    )
+    if start_date:
+        filters["start_date"] = start_date
+
+    end_date = parse_datetime_filter(
+        request.args.get("end_date") or request.args.get("end")
+    )
+    if end_date:
+        filters["end_date"] = end_date
+
+    # Sources may arrive as repeated params or as one comma-separated value.
+    data_sources = request.args.getlist("data_sources")
+    if not data_sources:
+        data_sources = request.args.getlist("sources")
+
+    if len(data_sources) == 1 and "," in data_sources[0]:
+        data_sources = [
+            source.strip() for source in data_sources[0].split(",") if source.strip()
+        ]
+
+    if data_sources:
+        filters["data_sources"] = data_sources
+
+    return filters