diff --git a/server/app.py b/server/app.py
index 640cd73..64f4f53 100644
--- a/server/app.py
+++ b/server/app.py
@@ -43,6 +43,71 @@ auth_manager = AuthManager(db, bcrypt)
 
 stat_gen = StatGen()
 
+
+def _parse_datetime_filter(value):
+    """Parse a date filter given as an ISO-8601 string or a Unix timestamp.
+
+    Returns None when the value is missing or empty. Raises ValueError when
+    the value is neither valid ISO-8601 nor a usable Unix timestamp.
+    """
+    if not value:
+        return None
+
+    try:
+        return datetime.datetime.fromisoformat(value)
+    except ValueError:
+        # Not ISO-8601: fall back to a Unix timestamp. fromtimestamp() raises
+        # OverflowError/OSError for out-of-range values, so map those to the
+        # same ValueError instead of letting them surface as a 500.
+        try:
+            return datetime.datetime.fromtimestamp(float(value))
+        except (ValueError, OverflowError, OSError) as err:
+            raise ValueError(
+                "Date filters must be ISO-8601 strings or Unix timestamps"
+            ) from err
+
+
+def _get_request_filters() -> dict:
+    """Collect the optional analysis filters from the request query string.
+
+    Each filter is read from its long name first, then its short alias:
+    search_query/query, start_date/start, end_date/end, data_sources/sources.
+    Only filters that were supplied appear in the returned dict.
+    """
+    filters = {}
+
+    search_query = request.args.get("search_query") or request.args.get("query")
+    if search_query:
+        filters["search_query"] = search_query
+
+    start_date = _parse_datetime_filter(
+        request.args.get("start_date") or request.args.get("start")
+    )
+    if start_date:
+        filters["start_date"] = start_date
+
+    end_date = _parse_datetime_filter(
+        request.args.get("end_date") or request.args.get("end")
+    )
+    if end_date:
+        filters["end_date"] = end_date
+
+    data_sources = request.args.getlist("data_sources")
+    if not data_sources:
+        data_sources = request.args.getlist("sources")
+
+    # A single value may itself be a comma-separated list, e.g. "a,b".
+    if len(data_sources) == 1 and "," in data_sources[0]:
+        data_sources = [
+            source.strip() for source in data_sources[0].split(",") if source.strip()
+        ]
+
+    if data_sources:
+        filters["data_sources"] = data_sources
+
+    return filters
+
+
 @app.route("/register", methods=["POST"])
 def register_user():
     data = request.get_json()
@@ -136,7 +201,11 @@ def upload_data():
         db.save_dataset_content(dataset_id, enriched_df)
 
         return jsonify(
-            {"message": "File uploaded successfully", "event_count": len(enriched_df), "dataset_id": dataset_id}
+            {
+                "message": "File uploaded successfully",
+                "event_count": len(enriched_df),
+                "dataset_id": dataset_id,
+            }
         ), 200
     except ValueError as e:
         return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
@@ -172,7 +241,8 @@ def content_endpoint(dataset_id):
     dataset_content = db.get_dataset_content(dataset_id)
 
     try:
-        return jsonify(stat_gen.get_content_analysis(dataset_content)), 200
+        filters = _get_request_filters()
+        return jsonify(stat_gen.get_content_analysis(dataset_content, filters)), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -192,7 +262,8 @@ def get_summary(dataset_id):
     dataset_content = db.get_dataset_content(dataset_id)
 
     try:
-        return jsonify(stat_gen.summary(dataset_content)), 200
+        filters = _get_request_filters()
+        return jsonify(stat_gen.summary(dataset_content, filters)), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -212,7 +283,8 @@ def get_time_analysis(dataset_id):
     dataset_content = db.get_dataset_content(dataset_id)
 
     try:
-        return jsonify(stat_gen.get_time_analysis(dataset_content)), 200
+        filters = _get_request_filters()
+        return jsonify(stat_gen.get_time_analysis(dataset_content, filters)), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -232,7 +304,8 @@ def get_user_analysis(dataset_id):
     dataset_content = db.get_dataset_content(dataset_id)
 
     try:
-        return jsonify(stat_gen.get_user_analysis(dataset_content)), 200
+        filters = _get_request_filters()
+        return jsonify(stat_gen.get_user_analysis(dataset_content, filters)), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -252,7 +325,8 @@ def get_cultural_analysis(dataset_id):
     dataset_content = db.get_dataset_content(dataset_id)
 
     try:
-        return jsonify(stat_gen.get_cultural_analysis(dataset_content)), 200
+        filters = _get_request_filters()
+        return jsonify(stat_gen.get_cultural_analysis(dataset_content, filters)), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
@@ -272,43 +346,15 @@ def get_interaction_analysis(dataset_id):
     dataset_content = db.get_dataset_content(dataset_id)
 
     try:
-        return jsonify(stat_gen.get_interactional_analysis(dataset_content)), 200
+        filters = _get_request_filters()
+        return jsonify(
+            stat_gen.get_interactional_analysis(dataset_content, filters)
+        ), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
         print(traceback.format_exc())
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
 
-
-@app.route("/dataset/query", methods=["POST"])
-@jwt_required()
-def filter_query():
-    data = request.get_json()
-
-    if "query" in data:
-        stat_gen.set_search_query(data["query"])
-
-    if "start" in data:
-        start_timestamp = datetime.datetime.fromisoformat(data["start"])
-        stat_gen.set_start_date(start_timestamp)
-
-    if "end" in data:
-        end_timestamp = datetime.datetime.fromisoformat(data["end"])
-        stat_gen.set_end_date(end_timestamp)
-
-    if "sources" in data:
-        data_sources = set(data["sources"])
-        stat_gen.set_data_sources(data_sources)
-
-    return jsonify({"message": "Filters set successfully"}), 200
-
-
-@app.route("/database/query/reset", methods=["GET"])
-@jwt_required()
-def reset_dataset():
-    stat_gen.reset_filters()
-    return jsonify({"message": "Filters reset successfully"}), 200
-
-
 if __name__ == "__main__":
     app.run(debug=True)
diff --git a/server/stat_gen.py b/server/stat_gen.py
index 961757e..cb8dbaa 100644
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -39,105 +39,149 @@ class StatGen:
         self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS)
         self.cultural_analysis = CulturalAnalysis()
 
-        self.search_query = ""
-        self.start_date_filter = None
-        self.end_date_filter = None
-        self.data_source_filters = set()
-
     ## Private Methods
-    def _prepare_filtered_df(self, df: pd.DataFrame) -> pd.DataFrame:
+    def _prepare_filtered_df(
+        self, df: pd.DataFrame, filters: dict | None = None
+    ) -> pd.DataFrame:
+        """Return a copy of df narrowed by the optional filters.
+
+        Recognised keys: "search_query" (case-insensitive literal match
+        against content, author or title), "start_date" / "end_date"
+        (inclusive bounds on "dt") and "data_sources" (allowed "source"
+        values). Missing or falsy entries are ignored.
+        """
+        filters = filters or {}
         filtered_df = df.copy()
 
-        if self.search_query:
+        search_query = filters.get("search_query")
+        start_date_filter = filters.get("start_date")
+        end_date_filter = filters.get("end_date")
+        data_source_filter = filters.get("data_sources")
+
+        if search_query:
+            # The query is untrusted user text: match it literally in every
+            # column rather than compiling it as a regular expression.
             mask = (
-                filtered_df["content"].str.contains(self.search_query, case=False, na=False)
-                | filtered_df["author"].str.contains(self.search_query, case=False, na=False).fillna(False)
-                | filtered_df["title"].str.contains(self.search_query, case=False, na=False, regex=False).fillna(False)
+                filtered_df["content"].str.contains(
+                    search_query, case=False, na=False, regex=False
+                )
+                | filtered_df["author"]
+                .str.contains(search_query, case=False, na=False, regex=False)
+                .fillna(False)
+                | filtered_df["title"]
+                .str.contains(search_query, case=False, na=False, regex=False)
+                .fillna(False)
             )
             filtered_df = filtered_df[mask]
 
-        if self.start_date_filter and self.end_date_filter:
-            filtered_df = filtered_df[
-                (filtered_df["dt"] >= self.start_date_filter) & (filtered_df["dt"] <= self.end_date_filter)
-            ]
+        # NOTE(review): bounds are compared as-is; confirm "dt" and the filter
+        # values agree on timezone awareness (pandas rejects naive vs aware).
+        if start_date_filter:
+            filtered_df = filtered_df[(filtered_df["dt"] >= start_date_filter)]
 
-        if self.data_source_filters:
-            enabled_sources = [src for src, enabled in self.data_source_filters.items() if enabled]
-            if enabled_sources:
-                filtered_df = filtered_df[filtered_df["source"].isin(enabled_sources)]
+        if end_date_filter:
+            filtered_df = filtered_df[(filtered_df["dt"] <= end_date_filter)]
+
+        if data_source_filter:
+            filtered_df = filtered_df[filtered_df["source"].isin(data_source_filter)]
 
         return filtered_df
 
 
     ## Public Methods
-    def get_time_analysis(self, df: pd.DataFrame) -> dict:
+    def get_time_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+        filtered_df = self._prepare_filtered_df(df, filters)
+
         return {
-            "events_per_day": self.temporal_analysis.posts_per_day(df),
-            "weekday_hour_heatmap": self.temporal_analysis.heatmap(df),
+            "events_per_day": self.temporal_analysis.posts_per_day(filtered_df),
+            "weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df),
         }
 
-    def get_content_analysis(self, df: pd.DataFrame) -> dict:
+    def get_content_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+        filtered_df = self._prepare_filtered_df(df, filters)
+
         return {
-            "word_frequencies": self.linguistic_analysis.word_frequencies(df),
-            "common_two_phrases": self.linguistic_analysis.ngrams(df),
-            "common_three_phrases": self.linguistic_analysis.ngrams(df, n=3),
-            "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(df),
-            "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(df),
+            "word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df),
+            "common_two_phrases": self.linguistic_analysis.ngrams(filtered_df),
+            "common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3),
+            "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(
+                filtered_df
+            ),
+            "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(
+                filtered_df
+            ),
         }
 
-    def get_user_analysis(self, df: pd.DataFrame) -> dict:
+    def get_user_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+        filtered_df = self._prepare_filtered_df(df, filters)
+
         return {
-            "top_users": self.interaction_analysis.top_users(df),
-            "users": self.interaction_analysis.per_user_analysis(df),
-            "interaction_graph": self.interaction_analysis.interaction_graph(df),
+            "top_users": self.interaction_analysis.top_users(filtered_df),
+            "users": self.interaction_analysis.per_user_analysis(filtered_df),
+            "interaction_graph": self.interaction_analysis.interaction_graph(
+                filtered_df
+            ),
         }
 
-    def get_interactional_analysis(self, df: pd.DataFrame) -> dict:
+    def get_interactional_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+        filtered_df = self._prepare_filtered_df(df, filters)
+
         return {
-            "average_thread_depth": self.interaction_analysis.average_thread_depth(df),
-            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(df),
+            "average_thread_depth": self.interaction_analysis.average_thread_depth(
+                filtered_df
+            ),
+            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(
+                filtered_df
+            ),
         }
 
-    def get_cultural_analysis(self, df: pd.DataFrame) -> dict:
+    def get_cultural_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+        filtered_df = self._prepare_filtered_df(df, filters)
+
         return {
-            "identity_markers": self.cultural_analysis.get_identity_markers(df),
-            "stance_markers": self.cultural_analysis.get_stance_markers(df),
-            "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(df),
+            "identity_markers": self.cultural_analysis.get_identity_markers(
+                filtered_df
+            ),
+            "stance_markers": self.cultural_analysis.get_stance_markers(filtered_df),
+            "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(
+                filtered_df
+            ),
         }
 
-    def summary(self, df: pd.DataFrame) -> dict:
-        total_posts = (df["type"] == "post").sum()
-        total_comments = (df["type"] == "comment").sum()
-        events_per_user = df.groupby("author").size()
+    def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+        filtered_df = self._prepare_filtered_df(df, filters)
+
+        # Nothing matched: report zeros rather than deriving stats (and a
+        # time range) from an empty frame.
+        if filtered_df.empty:
+            return {
+                "total_events": 0,
+                "total_posts": 0,
+                "total_comments": 0,
+                "unique_users": 0,
+                "comments_per_post": 0,
+                "lurker_ratio": 0,
+                "time_range": {
+                    "start": None,
+                    "end": None,
+                },
+                "sources": [],
+            }
+
+        total_posts = (filtered_df["type"] == "post").sum()
+        total_comments = (filtered_df["type"] == "comment").sum()
+        events_per_user = filtered_df.groupby("author").size()
 
         return {
-            "total_events": int(len(df)),
+            "total_events": int(len(filtered_df)),
             "total_posts": int(total_posts),
             "total_comments": int(total_comments),
             "unique_users": int(events_per_user.count()),
             "comments_per_post": round(total_comments / max(total_posts, 1), 2),
             "lurker_ratio": round((events_per_user == 1).mean(), 2),
             "time_range": {
-                "start": int(df["dt"].min().timestamp()),
-                "end": int(df["dt"].max().timestamp()),
+                "start": int(filtered_df["dt"].min().timestamp()),
+                "end": int(filtered_df["dt"].max().timestamp()),
             },
-            "sources": df["source"].dropna().unique().tolist(),
+            "sources": filtered_df["source"].dropna().unique().tolist(),
         }
-
-    def set_search_query(self, search_query: str) -> None:
-        self.search_query = search_query
-
-    def set_start_date(self, start: datetime.datetime) -> None:
-        self.start_date_filter = start
-
-    def set_end_date(self, end: datetime.datetime) -> None:
-        self.end_date_filter = end
-
-    def set_data_sources(self, data_sources: set) -> None:
-        self.data_source_filters = data_sources
-
-    def reset_filters(self) -> None:
-        self.search_query = ""
-        self.start_date_filter = None
-        self.end_date_filter = None
-        self.data_source_filters = set()