From a6adea5a7dc55f81d5eb29d315f47a68ceed6899 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Sun, 1 Mar 2026 18:28:08 +0000 Subject: [PATCH 1/6] fix: broken stat_gen filter methods --- server/stat_gen.py | 82 +++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 38 deletions(-) diff --git a/server/stat_gen.py b/server/stat_gen.py index dc748d0..65ed954 100644 --- a/server/stat_gen.py +++ b/server/stat_gen.py @@ -39,6 +39,37 @@ class StatGen: self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS) self.cultural_analysis = CulturalAnalysis() + self.search_query = "" + self.start_date_filter = None + self.end_date_filter = None + self.data_source_filters = set() + + ## Private Methods + def _prepare_filtered_df(self, df: pd.DataFrame) -> pd.DataFrame: + filtered_df = df.copy() + + if self.search_query: + mask = ( + filtered_df["content"].str.contains(self.search_query, case=False, na=False) + | filtered_df["author"].str.contains(self.search_query, case=False, na=False).fillna(False) + | filtered_df["title"].str.contains(self.search_query, case=False, na=False, regex=False).fillna(False) + ) + filtered_df = filtered_df[mask] + + if self.start_date_filter and self.end_date_filter: + filtered_df = filtered_df[ + (filtered_df["dt"] >= self.start_date_filter) & (filtered_df["dt"] <= self.end_date_filter) + ] + + if self.data_source_filters: + enabled_sources = [src for src, enabled in self.data_source_filters.items() if enabled] + if enabled_sources: + filtered_df = filtered_df[filtered_df["source"].isin(enabled_sources)] + + return filtered_df + + ## Public Methods + def get_time_analysis(self, df: pd.DataFrame) -> dict: return { "events_per_day": self.temporal_analysis.posts_per_day(df), @@ -93,43 +124,18 @@ class StatGen: "sources": df["source"].dropna().unique().tolist(), } - # def filter_by_query(self, df: pd.DataFrame, search_query: str) -> dict: - # filtered_df = df[df["content"].str.contains(search_query, na=False)] + def filter_by_query(self, search_query: str) -> None: + self.search_query = search_query - # return { - # "rows": len(filtered_df), - # "data": filtered_df.to_dict(orient="records"), - # } + def set_time_range(self, start: datetime.datetime, end: datetime.datetime) -> None: + self.start_date_filter = start + self.end_date_filter = end - # def set_time_range( - # self, - # original_df: pd.DataFrame, - # start: datetime.datetime, - # end: datetime.datetime, - # ) -> dict: - # df = self._prepare_df(original_df) - # filtered_df = df[(df["dt"] >= start) & (df["dt"] <= end)] - - # return { - # "rows": len(filtered_df), - # "data": filtered_df.to_dict(orient="records"), - # } - - # def filter_data_sources( - # self, original_df: pd.DataFrame, data_sources: dict - # ) -> dict: - # df = self._prepare_df(original_df) - # enabled_sources = [src for src, enabled in data_sources.items() if enabled] - - # if not enabled_sources: - # raise ValueError("Please choose at least one data source") - - # filtered_df = df[df["source"].isin(enabled_sources)] - - # return { - # "rows": len(filtered_df), - # "data": filtered_df.to_dict(orient="records"), - # } - - # def reset_dataset(self, original_df: pd.DataFrame) -> pd.DataFrame: - # return self._prepare_df(original_df) + def filter_data_sources(self, data_sources: set) -> None: + self.data_source_filters = data_sources + + def reset_dataset(self) -> None: + self.search_query = "" + self.start_date_filter = None + self.end_date_filter = None + self.data_source_filters = set() From 8b4adf4a63c7dca8151946abab62882b8aa75ad2 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Sun, 1 Mar 2026 18:44:46 +0000 Subject: [PATCH 2/6] refactor: update filtering method names --- server/stat_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/stat_gen.py b/server/stat_gen.py index 65ed954..305c891 100644 --- a/server/stat_gen.py +++ b/server/stat_gen.py @@ -124,14 +124,14 @@ class StatGen: "sources": df["source"].dropna().unique().tolist(), } - def filter_by_query(self, search_query: str) -> None: + def set_search_query(self, search_query: str) -> None: self.search_query = search_query def set_time_range(self, start: datetime.datetime, end: datetime.datetime) -> None: self.start_date_filter = start self.end_date_filter = end - def filter_data_sources(self, data_sources: set) -> None: + def search_data_sources(self, data_sources: set) -> None: self.data_source_filters = data_sources def reset_dataset(self) -> None: From 82a98f84bd8d4170e8233ccce97208635c86c601 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Sun, 1 Mar 2026 19:06:49 +0000 Subject: [PATCH 3/6] refactor: combine query results into one endpoint --- server/app.py | 89 +++++++++++++--------------------------------- server/stat_gen.py | 8 +++-- 2 files changed, 29 insertions(+), 68 deletions(-) diff --git a/server/app.py b/server/app.py index b3830a3..640cd73 100644 --- a/server/app.py +++ b/server/app.py @@ -1,4 +1,5 @@ import os +import datetime from dotenv import load_dotenv from flask import Flask, jsonify, request @@ -42,7 +43,6 @@ auth_manager = AuthManager(db, bcrypt) stat_gen = StatGen() - @app.route("/register", methods=["POST"]) def register_user(): data = request.get_json() @@ -112,7 +112,7 @@ def upload_data(): post_file = request.files["posts"] topic_file = request.files["topics"] - if post_file.filename == "" or topic_file == "": + if post_file.filename == "" or topic_file.filename == "": return jsonify({"error": "Empty filename"}), 400 if not post_file.filename.endswith(".jsonl") or not topic_file.filename.endswith( @@ -280,75 +280,34 @@ def get_interaction_analysis(dataset_id): return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 -# @app.route("/filter/query", methods=["POST"]) -# def filter_query(): -# if stat_obj is None: -# return jsonify({"error": "No data uploaded"}), 400 +@app.route("/dataset/query", methods=["POST"]) +@jwt_required() +def filter_query(): + data = request.get_json() -# data = request.get_json(silent=True) or {} + if "query" in data: + stat_gen.set_search_query(data["query"]) -# if "query" not in data: -# return jsonify(stat_obj.df.to_dict(orient="records")), 200 + if "start" in data: + start_timestamp = datetime.datetime.fromisoformat(data["start"]) + stat_gen.set_start_date(start_timestamp) -# query = data["query"] -# filtered_df = stat_obj.filter_by_query(query) + if "end" in data: + end_timestamp = datetime.datetime.fromisoformat(data["end"]) + stat_gen.set_end_date(end_timestamp) -# return jsonify(filtered_df), 200 + if "sources" in data: + data_sources = set(data["sources"]) + stat_gen.set_data_sources(data_sources) + + return jsonify({"message": "Filters set successfully"}), 200 -# @app.route("/filter/time", methods=["POST"]) -# def filter_time(): -# if stat_obj is None: -# return jsonify({"error": "No data uploaded"}), 400 - -# data = request.get_json(silent=True) -# if not data: -# return jsonify({"error": "Invalid or missing JSON body"}), 400 - -# if "start" not in data or "end" not in data: -# return jsonify({"error": "Please include both start and end dates"}), 400 - -# try: -# start = pd.to_datetime(data["start"], utc=True) -# end = pd.to_datetime(data["end"], utc=True) -# filtered_df = stat_obj.set_time_range(start, end) -# return jsonify(filtered_df), 200 -# except Exception: -# return jsonify({"error": "Invalid datetime format"}), 400 - - -# @app.route("/filter/sources", methods=["POST"]) -# def filter_sources(): -# if stat_obj is None: -# return jsonify({"error": "No data uploaded"}), 400 - -# data = request.get_json(silent=True) -# if not data: -# return jsonify({"error": "Invalid or missing JSON body"}), 400 - -# if "sources" not in data: -# return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400 - -# try: -# filtered_df = stat_obj.filter_data_sources(data["sources"]) -# return jsonify(filtered_df), 200 -# except ValueError: -# return jsonify({"error": "Please enable at least one data source"}), 400 -# except Exception as e: -# return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500 - - -# @app.route("/filter/reset", methods=["GET"]) -# def reset_dataset(): -# if stat_obj is None: -# return jsonify({"error": "No data uploaded"}), 400 - -# try: -# stat_obj.reset_dataset() -# return jsonify({"success": "Dataset successfully reset"}) -# except Exception as e: -# print(traceback.format_exc()) -# return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 +@app.route("/database/query/reset", methods=["GET"]) +@jwt_required() +def reset_dataset(): + stat_gen.reset_filters() + return jsonify({"message": "Filters reset successfully"}), 200 if __name__ == "__main__": diff --git a/server/stat_gen.py b/server/stat_gen.py index 305c891..961757e 100644 --- a/server/stat_gen.py +++ b/server/stat_gen.py @@ -127,14 +127,16 @@ class StatGen: def set_search_query(self, search_query: str) -> None: self.search_query = search_query - def set_time_range(self, start: datetime.datetime, end: datetime.datetime) -> None: + def set_start_date(self, start: datetime.datetime) -> None: self.start_date_filter = start + + def set_end_date(self, end: datetime.datetime) -> None: self.end_date_filter = end - def search_data_sources(self, data_sources: set) -> None: + def set_data_sources(self, data_sources: set) -> None: self.data_source_filters = data_sources - def reset_dataset(self) -> None: + def reset_filters(self) -> None: self.search_query = "" self.start_date_filter = None self.end_date_filter = None From 37cb2c9ff47e8d62b990ceb7f64292dfa229ad74 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 2 Mar 2026 16:18:02 +0000 Subject: [PATCH 4/6] feat(querying): make filters stateless Stateless filters are required as the server cannot store them in the StatGen object --- server/app.py | 107 +++++++++++++++++++----------- server/stat_gen.py | 158 +++++++++++++++++++++++++++------------------ 2 files changed, 163 insertions(+), 102 deletions(-) diff --git a/server/app.py b/server/app.py index 640cd73..64f4f53 100644 --- a/server/app.py +++ b/server/app.py @@ -43,6 +43,56 @@ auth_manager = AuthManager(db, bcrypt) stat_gen = StatGen() + +def _parse_datetime_filter(value): + if not value: + return None + + try: + return datetime.datetime.fromisoformat(value) + except ValueError: + try: + return datetime.datetime.fromtimestamp(float(value)) + except ValueError as err: + raise ValueError( + "Date filters must be ISO-8601 strings or Unix timestamps" + ) from err + + +def _get_request_filters() -> dict: + filters = {} + + search_query = request.args.get("search_query") or request.args.get("query") + if search_query: + filters["search_query"] = search_query + + start_date = _parse_datetime_filter( + request.args.get("start_date") or request.args.get("start") + ) + if start_date: + filters["start_date"] = start_date + + end_date = _parse_datetime_filter( + request.args.get("end_date") or request.args.get("end") + ) + if end_date: + filters["end_date"] = end_date + + data_sources = request.args.getlist("data_sources") + if not data_sources: + data_sources = request.args.getlist("sources") + + if len(data_sources) == 1 and "," in data_sources[0]: + data_sources = [ + source.strip() for source in data_sources[0].split(",") if source.strip() + ] + + if data_sources: + filters["data_sources"] = data_sources + + return filters + + @app.route("/register", methods=["POST"]) def register_user(): data = request.get_json() @@ -136,7 +186,11 @@ def upload_data(): db.save_dataset_content(dataset_id, enriched_df) return jsonify( - {"message": "File uploaded successfully", "event_count": len(enriched_df), "dataset_id": dataset_id} + { + "message": "File uploaded successfully", + "event_count": len(enriched_df), + "dataset_id": dataset_id, + } ), 200 except ValueError as e: return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400 @@ -172,7 +226,8 @@ def content_endpoint(dataset_id): dataset_content = db.get_dataset_content(dataset_id) try: - return jsonify(stat_gen.get_content_analysis(dataset_content)), 200 + filters = _get_request_filters() + return jsonify(stat_gen.get_content_analysis(dataset_content, filters)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 except Exception as e: @@ -192,7 +247,8 @@ def get_summary(dataset_id): dataset_content = db.get_dataset_content(dataset_id) try: - return jsonify(stat_gen.summary(dataset_content)), 200 + filters = _get_request_filters() + return jsonify(stat_gen.summary(dataset_content, filters)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 except Exception as e: @@ -212,7 +268,8 @@ def get_time_analysis(dataset_id): dataset_content = db.get_dataset_content(dataset_id) try: - return jsonify(stat_gen.get_time_analysis(dataset_content)), 200 + filters = _get_request_filters() + return jsonify(stat_gen.get_time_analysis(dataset_content, filters)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 except Exception as e: @@ -232,7 +289,8 @@ def get_user_analysis(dataset_id): dataset_content = db.get_dataset_content(dataset_id) try: - return jsonify(stat_gen.get_user_analysis(dataset_content)), 200 + filters = _get_request_filters() + return jsonify(stat_gen.get_user_analysis(dataset_content, filters)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 except Exception as e: @@ -252,7 +310,8 @@ def get_cultural_analysis(dataset_id): dataset_content = db.get_dataset_content(dataset_id) try: - return jsonify(stat_gen.get_cultural_analysis(dataset_content)), 200 + filters = _get_request_filters() + return jsonify(stat_gen.get_cultural_analysis(dataset_content, filters)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 except Exception as e: @@ -272,43 +331,15 @@ def get_interaction_analysis(dataset_id): dataset_content = db.get_dataset_content(dataset_id) try: - return jsonify(stat_gen.get_interactional_analysis(dataset_content)), 200 + filters = _get_request_filters() + return jsonify( + stat_gen.get_interactional_analysis(dataset_content, filters) + ), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 except Exception as e: print(traceback.format_exc()) return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - -@app.route("/dataset/query", methods=["POST"]) -@jwt_required() -def filter_query(): - data = request.get_json() - - if "query" in data: - stat_gen.set_search_query(data["query"]) - - if "start" in data: - start_timestamp = datetime.datetime.fromisoformat(data["start"]) - stat_gen.set_start_date(start_timestamp) - - if "end" in data: - end_timestamp = datetime.datetime.fromisoformat(data["end"]) - stat_gen.set_end_date(end_timestamp) - - if "sources" in data: - data_sources = set(data["sources"]) - stat_gen.set_data_sources(data_sources) - - return jsonify({"message": "Filters set successfully"}), 200 - - -@app.route("/database/query/reset", methods=["GET"]) -@jwt_required() -def reset_dataset(): - stat_gen.reset_filters() - return jsonify({"message": "Filters reset successfully"}), 200 - - if __name__ == "__main__": app.run(debug=True) diff --git a/server/stat_gen.py b/server/stat_gen.py index 961757e..cb8dbaa 100644 --- a/server/stat_gen.py +++ b/server/stat_gen.py @@ -39,105 +39,135 @@ class StatGen: self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS) self.cultural_analysis = CulturalAnalysis() - self.search_query = "" - self.start_date_filter = None - self.end_date_filter = None - self.data_source_filters = set() - ## Private Methods - def _prepare_filtered_df(self, df: pd.DataFrame) -> pd.DataFrame: + def _prepare_filtered_df(self, + df: pd.DataFrame, + filters: dict | None = None + ) -> pd.DataFrame: + filters = filters or {} filtered_df = df.copy() - if self.search_query: + search_query = filters.get("search_query", None) + start_date_filter = filters.get("start_date", None) + end_date_filter = filters.get("end_date", None) + data_source_filter = filters.get("data_sources", None) + + if search_query: mask = ( - filtered_df["content"].str.contains(self.search_query, case=False, na=False) - | filtered_df["author"].str.contains(self.search_query, case=False, na=False).fillna(False) - | filtered_df["title"].str.contains(self.search_query, case=False, na=False, regex=False).fillna(False) + filtered_df["content"].str.contains(search_query, case=False, na=False) + | filtered_df["author"] + .str.contains(search_query, case=False, na=False) + .fillna(False) + | filtered_df["title"] + .str.contains(search_query, case=False, na=False, regex=False) + .fillna(False) ) filtered_df = filtered_df[mask] - if self.start_date_filter and self.end_date_filter: - filtered_df = filtered_df[ - (filtered_df["dt"] >= self.start_date_filter) & (filtered_df["dt"] <= self.end_date_filter) - ] + if start_date_filter: + filtered_df = filtered_df[(filtered_df["dt"] >= start_date_filter)] - if self.data_source_filters: - enabled_sources = [src for src, enabled in self.data_source_filters.items() if enabled] - if enabled_sources: - filtered_df = filtered_df[filtered_df["source"].isin(enabled_sources)] + if end_date_filter: + filtered_df = filtered_df[(filtered_df["dt"] <= end_date_filter)] + + if data_source_filter: + filtered_df = filtered_df[filtered_df["source"].isin(data_source_filter)] return filtered_df ## Public Methods - def get_time_analysis(self, df: pd.DataFrame) -> dict: + def get_time_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + filtered_df = self._prepare_filtered_df(df, filters) + return { - "events_per_day": self.temporal_analysis.posts_per_day(df), - "weekday_hour_heatmap": self.temporal_analysis.heatmap(df), + "events_per_day": self.temporal_analysis.posts_per_day(filtered_df), + "weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df), } - def get_content_analysis(self, df: pd.DataFrame) -> dict: + def get_content_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + filtered_df = self._prepare_filtered_df(df, filters) + return { - "word_frequencies": self.linguistic_analysis.word_frequencies(df), - "common_two_phrases": self.linguistic_analysis.ngrams(df), - "common_three_phrases": self.linguistic_analysis.ngrams(df, n=3), - "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(df), - "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(df), + "word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df), + "common_two_phrases": self.linguistic_analysis.ngrams(filtered_df), + "common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3), + "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic( + filtered_df + ), + "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion( + filtered_df + ), } - def get_user_analysis(self, df: pd.DataFrame) -> dict: + def get_user_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + filtered_df = self._prepare_filtered_df(df, filters) + return { - "top_users": self.interaction_analysis.top_users(df), - "users": self.interaction_analysis.per_user_analysis(df), - "interaction_graph": self.interaction_analysis.interaction_graph(df), + "top_users": self.interaction_analysis.top_users(filtered_df), + "users": self.interaction_analysis.per_user_analysis(filtered_df), + "interaction_graph": self.interaction_analysis.interaction_graph( + filtered_df + ), } - def get_interactional_analysis(self, df: pd.DataFrame) -> dict: + def get_interactional_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + filtered_df = self._prepare_filtered_df(df, filters) + return { - "average_thread_depth": self.interaction_analysis.average_thread_depth(df), - "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(df), + "average_thread_depth": self.interaction_analysis.average_thread_depth( + filtered_df + ), + "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion( + filtered_df + ), } - def get_cultural_analysis(self, df: pd.DataFrame) -> dict: + def get_cultural_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + filtered_df = self._prepare_filtered_df(df, filters) + return { - "identity_markers": self.cultural_analysis.get_identity_markers(df), - "stance_markers": self.cultural_analysis.get_stance_markers(df), - "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(df), + "identity_markers": self.cultural_analysis.get_identity_markers( + filtered_df + ), + "stance_markers": self.cultural_analysis.get_stance_markers(filtered_df), + "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity( + filtered_df + ), } - def summary(self, df: pd.DataFrame) -> dict: - total_posts = (df["type"] == "post").sum() - total_comments = (df["type"] == "comment").sum() - events_per_user = df.groupby("author").size() + def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + filtered_df = self._prepare_filtered_df(df, filters) + + total_posts = (filtered_df["type"] == "post").sum() + total_comments = (filtered_df["type"] == "comment").sum() + events_per_user = filtered_df.groupby("author").size() + + if filtered_df.empty: + return { + "total_events": 0, + "total_posts": 0, + "total_comments": 0, + "unique_users": 0, + "comments_per_post": 0, + "lurker_ratio": 0, + "time_range": { + "start": None, + "end": None, + }, + "sources": [], + } return { - "total_events": int(len(df)), + "total_events": int(len(filtered_df)), "total_posts": int(total_posts), "total_comments": int(total_comments), "unique_users": int(events_per_user.count()), "comments_per_post": round(total_comments / max(total_posts, 1), 2), "lurker_ratio": round((events_per_user == 1).mean(), 2), "time_range": { - "start": int(df["dt"].min().timestamp()), - "end": int(df["dt"].max().timestamp()), + "start": int(filtered_df["dt"].min().timestamp()), + "end": int(filtered_df["dt"].max().timestamp()), }, - "sources": df["source"].dropna().unique().tolist(), + "sources": filtered_df["source"].dropna().unique().tolist(), } - - def set_search_query(self, search_query: str) -> None: - self.search_query = search_query - - def set_start_date(self, start: datetime.datetime) -> None: - self.start_date_filter = start - - def set_end_date(self, end: datetime.datetime) -> None: - self.end_date_filter = end - - def set_data_sources(self, data_sources: set) -> None: - self.data_source_filters = data_sources - - def reset_filters(self) -> None: - self.search_query = "" - self.start_date_filter = None - self.end_date_filter = None - self.data_source_filters = set() From 5ea71023b55922cc554365344b430cd6fa5d0df6 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 2 Mar 2026 18:29:09 +0000 Subject: [PATCH 5/6] refactor: move query parameter extraction function out of flask app --- server/app.py | 66 +++++++--------------------------------------- server/stat_gen.py | 16 ++++++----- server/utils.py | 50 +++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 63 deletions(-) create mode 100644 server/utils.py diff --git a/server/app.py b/server/app.py index 64f4f53..eef9d7a 100644 --- a/server/app.py +++ b/server/app.py @@ -16,6 +16,7 @@ from server.stat_gen import StatGen from server.dataset_processor import DatasetProcessor from db.database import PostgresConnector from server.auth import AuthManager +from server.utils import get_request_filters, parse_datetime_filter import pandas as pd import traceback @@ -43,56 +44,6 @@ auth_manager = AuthManager(db, bcrypt) stat_gen = StatGen() - -def _parse_datetime_filter(value): - if not value: - return None - - try: - return datetime.datetime.fromisoformat(value) - except ValueError: - try: - return datetime.datetime.fromtimestamp(float(value)) - except ValueError as err: - raise ValueError( - "Date filters must be ISO-8601 strings or Unix timestamps" - ) from err - - -def _get_request_filters() -> dict: - filters = {} - - search_query = request.args.get("search_query") or request.args.get("query") - if search_query: - filters["search_query"] = search_query - - start_date = _parse_datetime_filter( - request.args.get("start_date") or request.args.get("start") - ) - if start_date: - filters["start_date"] = start_date - - end_date = _parse_datetime_filter( - request.args.get("end_date") or request.args.get("end") - ) - if end_date: - filters["end_date"] = end_date - - data_sources = request.args.getlist("data_sources") - if not data_sources: - data_sources = request.args.getlist("sources") - - if len(data_sources) == 1 and "," in data_sources[0]: - data_sources = [ - source.strip() for source in data_sources[0].split(",") if source.strip() - ] - - if data_sources: - filters["data_sources"] = data_sources - - return filters - - @app.route("/register", methods=["POST"]) def register_user(): data = request.get_json() @@ -212,7 +163,8 @@ def get_dataset(dataset_id): if dataset_content.empty: return jsonify({"error": "Dataset content not found"}), 404 - return jsonify(dataset_content.to_dict(orient="records")), 200 + filters = get_request_filters() + return jsonify(stat_gen.filter_dataset(dataset_content, filters)), 200 @app.route("/dataset//content", methods=["GET"]) @@ -226,7 +178,7 @@ def content_endpoint(dataset_id): dataset_content = db.get_dataset_content(dataset_id) try: - filters = _get_request_filters() + filters = get_request_filters() return jsonify(stat_gen.get_content_analysis(dataset_content, filters)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 @@ -247,7 +199,7 @@ def get_summary(dataset_id): dataset_content = db.get_dataset_content(dataset_id) try: - filters = _get_request_filters() + filters = get_request_filters() return jsonify(stat_gen.summary(dataset_content, filters)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 @@ -268,7 +220,7 @@ def get_time_analysis(dataset_id): dataset_content = db.get_dataset_content(dataset_id) try: - filters = _get_request_filters() + filters = get_request_filters() return jsonify(stat_gen.get_time_analysis(dataset_content, filters)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 @@ -289,7 +241,7 @@ def get_user_analysis(dataset_id): dataset_content = db.get_dataset_content(dataset_id) try: - filters = _get_request_filters() + filters = get_request_filters() return jsonify(stat_gen.get_user_analysis(dataset_content, filters)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 @@ -310,7 +262,7 @@ def get_cultural_analysis(dataset_id): dataset_content = db.get_dataset_content(dataset_id) try: - filters = _get_request_filters() + filters = get_request_filters() return jsonify(stat_gen.get_cultural_analysis(dataset_content, filters)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 @@ -331,7 +283,7 @@ def get_interaction_analysis(dataset_id): dataset_content = db.get_dataset_content(dataset_id) try: - filters = _get_request_filters() + filters = get_request_filters() return jsonify( stat_gen.get_interactional_analysis(dataset_content, filters) ), 200 diff --git a/server/stat_gen.py b/server/stat_gen.py index cb8dbaa..2ea5ac1 100644 --- a/server/stat_gen.py +++ b/server/stat_gen.py @@ -55,13 +55,15 @@ class StatGen: if search_query: mask = ( filtered_df["content"].str.contains(search_query, case=False, na=False) - | filtered_df["author"] - .str.contains(search_query, case=False, na=False) - .fillna(False) - | filtered_df["title"] - .str.contains(search_query, case=False, na=False, regex=False) - .fillna(False) + | filtered_df["author"].str.contains(search_query, case=False, na=False) ) + + # Only include title if the column exists + if "title" in filtered_df.columns: + mask = mask | filtered_df["title"].str.contains( + search_query, case=False, na=False, regex=False + ) + filtered_df = filtered_df[mask] if start_date_filter: @@ -76,6 +78,8 @@ class StatGen: return filtered_df ## Public Methods + def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + return self._prepare_filtered_df(df, filters).to_dict(orient="records") def get_time_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) diff --git a/server/utils.py b/server/utils.py new file mode 100644 index 0000000..078d1e7 --- /dev/null +++ b/server/utils.py @@ -0,0 +1,50 @@ +import datetime +from flask import request + +def parse_datetime_filter(value): + if not value: + return None + + try: + return datetime.datetime.fromisoformat(value) + except ValueError: + try: + return datetime.datetime.fromtimestamp(float(value)) + except ValueError as err: + raise ValueError( + "Date filters must be ISO-8601 strings or Unix timestamps" + ) from err + + +def get_request_filters() -> dict: + filters = {} + + search_query = request.args.get("search_query") or request.args.get("query") + if search_query: + filters["search_query"] = search_query + + start_date = parse_datetime_filter( + request.args.get("start_date") or request.args.get("start") + ) + if start_date: + filters["start_date"] = start_date + + end_date = parse_datetime_filter( + request.args.get("end_date") or request.args.get("end") + ) + if end_date: + filters["end_date"] = end_date + + data_sources = request.args.getlist("data_sources") + if not data_sources: + data_sources = request.args.getlist("sources") + + if len(data_sources) == 1 and "," in data_sources[0]: + data_sources = [ + source.strip() for source in data_sources[0].split(",") if source.strip() + ] + + if data_sources: + filters["data_sources"] = data_sources + + return filters \ No newline at end of file From dd44fad29469233aa6b203521699a723abf61a38 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Mon, 2 Mar 2026 18:30:52 +0000 Subject: [PATCH 6/6] fix(db): incorrect NER column name in database saving --- db/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/database.py b/db/database.py index 80cf5b3..30b8f34 100644 --- a/db/database.py +++ b/db/database.py @@ -110,7 +110,7 @@ class PostgresConnector: row["source"], row.get("topic"), row.get("topic_confidence"), - Json(row["ner_entities"]) if row.get("ner_entities") else None, + Json(row["entities"]) if row.get("entities") else None, row.get("emotion_anger"), row.get("emotion_disgust"), row.get("emotion_fear"),