From 07ab7529a96000915542c93059173fececf03ae3 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Sun, 1 Mar 2026 16:25:39 +0000 Subject: [PATCH] refactor: update analysis classes to accept DataFrame as parameter instead of instance variable --- server/analysis/cultural.py | 17 +- server/analysis/emotional.py | 15 +- server/analysis/interactional.py | 54 ++--- server/analysis/linguistic.py | 35 ++- server/analysis/temporal.py | 63 +++--- server/app.py | 365 ++++++++++++++++++------------- server/stat_gen.py | 243 +++++++++----------- 7 files changed, 403 insertions(+), 389 deletions(-) diff --git a/server/analysis/cultural.py b/server/analysis/cultural.py index ae55774..909233e 100644 --- a/server/analysis/cultural.py +++ b/server/analysis/cultural.py @@ -6,13 +6,12 @@ from typing import Any class CulturalAnalysis: - def __init__(self, df: pd.DataFrame, content_col: str = "content", topic_col: str = "topic"): - self.df = df + def __init__(self, content_col: str = "content", topic_col: str = "topic"): self.content_col = content_col self.topic_col = topic_col - def get_identity_markers(self): - df = self.df.copy() + def get_identity_markers(self, original_df: pd.DataFrame) -> dict[str, Any]: + df = original_df.copy() s = df[self.content_col].fillna("").astype(str).str.lower() in_group_words = {"we", "us", "our", "ourselves"} @@ -60,8 +59,8 @@ class CulturalAnalysis: return result - def get_stance_markers(self) -> dict[str, Any]: - s = self.df[self.content_col].fillna("").astype(str) + def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]: + s = df[self.content_col].fillna("").astype(str) hedges = { "maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem", @@ -104,13 +103,11 @@ class CulturalAnalysis: "permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3), } - def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]: - if "entities" not in self.df.columns: + def get_avg_emotions_per_entity(self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]: + if "entities" not in df.columns: return {"entity_emotion_avg": {}} - df = self.df emotion_cols = [c for c in df.columns if c.startswith("emotion_")] - entity_counter = Counter() for row in df["entities"].dropna(): diff --git a/server/analysis/emotional.py b/server/analysis/emotional.py index 10e897d..150aa20 100644 --- a/server/analysis/emotional.py +++ b/server/analysis/emotional.py @@ -1,18 +1,15 @@ import pandas as pd class EmotionalAnalysis: - def __init__(self, df: pd.DataFrame): - self.df = df - - def avg_emotion_by_topic(self) -> dict: + def avg_emotion_by_topic(self, df: pd.DataFrame) -> dict: emotion_cols = [ - col for col in self.df.columns + col for col in df.columns if col.startswith("emotion_") ] counts = ( - self.df[ - (self.df["topic"] != "Misc") + df[ + (df["topic"] != "Misc") ] .groupby("topic") .size() @@ -20,8 +17,8 @@ class EmotionalAnalysis: ) avg_emotion_by_topic = ( - self.df[ - (self.df["topic"] != "Misc") + df[ + (df["topic"] != "Misc") ] .groupby("topic")[emotion_cols] .mean() diff --git a/server/analysis/interactional.py b/server/analysis/interactional.py index 6fd5b3f..5c8ac3d 100644 --- a/server/analysis/interactional.py +++ b/server/analysis/interactional.py @@ -5,8 +5,7 @@ from collections import Counter class InteractionAnalysis: - def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): - self.df = df + def __init__(self, word_exclusions: set[str]): self.word_exclusions = word_exclusions def 
_tokenize(self, text: str): @@ -14,9 +13,9 @@ class InteractionAnalysis: return [t for t in tokens if t not in self.word_exclusions] def _vocab_richness_per_user( - self, min_words: int = 20, top_most_used_words: int = 100 + self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100 ) -> list: - df = self.df.copy() + df = df.copy() df["content"] = df["content"].fillna("").astype(str).str.lower() df["tokens"] = df["content"].apply(self._tokenize) @@ -58,10 +57,8 @@ class InteractionAnalysis: return rows - def top_users(self) -> list: - counts = ( - self.df.groupby(["author", "source"]).size().sort_values(ascending=False) - ) + def top_users(self, df: pd.DataFrame) -> list: + counts = df.groupby(["author", "source"]).size().sort_values(ascending=False) top_users = [ {"author": author, "source": source, "count": int(count)} @@ -70,14 +67,14 @@ class InteractionAnalysis: return top_users - def per_user_analysis(self) -> dict: - per_user = self.df.groupby(["author", "type"]).size().unstack(fill_value=0) + def per_user_analysis(self, df: pd.DataFrame) -> dict: + per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0) - emotion_cols = [col for col in self.df.columns if col.startswith("emotion_")] + emotion_cols = [col for col in df.columns if col.startswith("emotion_")] avg_emotions_by_author = {} if emotion_cols: - avg_emotions = self.df.groupby("author")[emotion_cols].mean().fillna(0.0) + avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0) avg_emotions_by_author = { author: {emotion: float(score) for emotion, score in row.items()} for author, row in avg_emotions.iterrows() @@ -97,7 +94,7 @@ class InteractionAnalysis: per_user = per_user.sort_values("comment_post_ratio", ascending=True) per_user_records = per_user.reset_index().to_dict(orient="records") - vocab_rows = self._vocab_richness_per_user() + vocab_rows = self._vocab_richness_per_user(df) vocab_by_author = {row["author"]: row for row in vocab_rows} # merge vocab richness + per_user information @@ -112,7 +109,14 @@ class InteractionAnalysis: "comment_post_ratio": float(row.get("comment_post_ratio", 0)), "comment_share": float(row.get("comment_share", 0)), "avg_emotions": avg_emotions_by_author.get(author, {}), - "vocab": vocab_by_author.get(author, {"vocab_richness": 0, "avg_words_per_event": 0, "top_words": []}), + "vocab": vocab_by_author.get( + author, + { + "vocab_richness": 0, + "avg_words_per_event": 0, + "top_words": [], + }, + ), } ) @@ -120,13 +124,13 @@ class InteractionAnalysis: return merged_users - def interaction_graph(self): - interactions = {a: {} for a in self.df["author"].dropna().unique()} + def interaction_graph(self, df: pd.DataFrame): + interactions = {a: {} for a in df["author"].dropna().unique()} # reply_to refers to the comment id, this allows us to map comment ids to usernames - id_to_author = self.df.set_index("id")["author"].to_dict() + id_to_author = df.set_index("id")["author"].to_dict() - for _, row in self.df.iterrows(): + for _, row in df.iterrows(): a = row["author"] reply_id = row["reply_to"] @@ -141,10 +145,10 @@ class InteractionAnalysis: return interactions - def average_thread_depth(self): + def average_thread_depth(self, df: pd.DataFrame): depths = [] - id_to_reply = self.df.set_index("id")["reply_to"].to_dict() - for _, row in self.df.iterrows(): + id_to_reply = df.set_index("id")["reply_to"].to_dict() + for _, row in df.iterrows(): depth = 0 current_id = row["id"] @@ -163,16 +167,16 @@ class InteractionAnalysis: return round(sum(depths) / 
len(depths), 2) - def average_thread_length_by_emotion(self): + def average_thread_length_by_emotion(self, df: pd.DataFrame): emotion_exclusions = {"emotion_neutral", "emotion_surprise"} emotion_cols = [ c - for c in self.df.columns + for c in df.columns if c.startswith("emotion_") and c not in emotion_exclusions ] - id_to_reply = self.df.set_index("id")["reply_to"].to_dict() + id_to_reply = df.set_index("id")["reply_to"].to_dict() length_cache = {} def thread_length_from(start_id): @@ -211,7 +215,7 @@ class InteractionAnalysis: emotion_to_lengths = {} # Fill NaNs in emotion cols to avoid max() issues - emo_df = self.df[["id"] + emotion_cols].copy() + emo_df = df[["id"] + emotion_cols].copy() emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0) for _, row in emo_df.iterrows(): diff --git a/server/analysis/linguistic.py b/server/analysis/linguistic.py index 5718edc..dc91faf 100644 --- a/server/analysis/linguistic.py +++ b/server/analysis/linguistic.py @@ -4,9 +4,9 @@ import re from collections import Counter from itertools import islice + class LinguisticAnalysis: - def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): - self.df = df + def __init__(self, word_exclusions: set[str]): self.word_exclusions = word_exclusions def _tokenize(self, text: str): @@ -14,29 +14,20 @@ class LinguisticAnalysis: return [t for t in tokens if t not in self.word_exclusions] def _clean_text(self, text: str) -> str: - text = re.sub(r"http\S+", "", text) # remove URLs + text = re.sub(r"http\S+", "", text) # remove URLs text = re.sub(r"www\S+", "", text) - text = re.sub(r"&\w+;", "", text) # remove HTML entities - text = re.sub(r"\bamp\b", "", text) # remove stray amp + text = re.sub(r"&\w+;", "", text) # remove HTML entities + text = re.sub(r"\bamp\b", "", text) # remove stray amp text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text) return text - def word_frequencies(self, limit: int = 100) -> dict: - texts = ( - self.df["content"] - .dropna() - .astype(str) - .str.lower() - ) + def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]: + texts = df["content"].dropna().astype(str).str.lower() words = [] for text in texts: tokens = re.findall(r"\b[a-z]{3,}\b", text) - words.extend( - w for w in tokens - if w not in self.word_exclusions - ) - + words.extend(w for w in tokens if w not in self.word_exclusions) counts = Counter(words) @@ -48,16 +39,16 @@ class LinguisticAnalysis: ) return word_frequencies.to_dict(orient="records") - - def ngrams(self, n=2, limit=100): - texts = self.df["content"].dropna().astype(str).apply(self._clean_text).str.lower() + + def ngrams(self, df: pd.DataFrame, n=2, limit=100): + texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower() all_ngrams = [] for text in texts: tokens = re.findall(r"\b[a-z]{3,}\b", text) # stop word removal causes strange behaviors in ngrams - #tokens = [w for w in tokens if w not in self.word_exclusions] + # tokens = [w for w in tokens if w not in self.word_exclusions] ngrams = zip(*(islice(tokens, i, None) for i in range(n))) all_ngrams.extend([" ".join(ng) for ng in ngrams]) @@ -69,4 +60,4 @@ class LinguisticAnalysis: .sort_values("count", ascending=False) .head(limit) .to_dict(orient="records") - ) \ No newline at end of file + ) diff --git a/server/analysis/temporal.py b/server/analysis/temporal.py index ba5105f..0ab579f 100644 --- a/server/analysis/temporal.py +++ b/server/analysis/temporal.py @@ -1,16 +1,14 @@ import pandas as pd + class TemporalAnalysis: - def __init__(self, df: pd.DataFrame): - 
self.df = df - - def avg_reply_time_per_emotion(self) -> dict: - df = self.df.copy() + def avg_reply_time_per_emotion(self, df: pd.DataFrame) -> list[dict]: + df = df.copy() replies = df[ - (df["type"] == "comment") & - (df["reply_to"].notna()) & - (df["reply_to"] != "") + (df["type"] == "comment") + & (df["reply_to"].notna()) + & (df["reply_to"] != "") ] id_to_time = df.set_index("id")["dt"].to_dict() @@ -23,48 +21,51 @@ class TemporalAnalysis: return None return (row["dt"] - parent_time).total_seconds() - + replies["reply_time"] = replies.apply(compute_reply_time, axis=1) - emotion_cols = [col for col in df.columns if col.startswith("emotion_") and col not in ("emotion_neutral", "emotion_surprise")] + emotion_cols = [ + col + for col in df.columns + if col.startswith("emotion_") + and col not in ("emotion_neutral", "emotion_surprise") + ] replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1) - + grouped = ( - replies - .groupby("dominant_emotion")["reply_time"] + replies.groupby("dominant_emotion")["reply_time"] .agg(["mean", "count"]) .reset_index() ) return grouped.to_dict(orient="records") - - def posts_per_day(self) -> dict: - per_day = ( - self.df.groupby("date") - .size() - .reset_index(name="count") - ) + + def posts_per_day(self, df: pd.DataFrame) -> list[dict]: + per_day = df.groupby("date").size().reset_index(name="count") return per_day.to_dict(orient="records") - - def heatmap(self) -> dict: + + def heatmap(self, df: pd.DataFrame) -> list[dict]: weekday_order = [ - "Monday", "Tuesday", "Wednesday", - "Thursday", "Friday", "Saturday", "Sunday" + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", ] - self.df["weekday"] = pd.Categorical( - self.df["weekday"], - categories=weekday_order, - ordered=True + df = df.copy() + df["weekday"] = pd.Categorical( + df["weekday"], categories=weekday_order, ordered=True ) heatmap = ( - self.df - .groupby(["weekday", "hour"], observed=True) + df.groupby(["weekday", "hour"], observed=True) .size() .unstack(fill_value=0) .reindex(columns=range(24), fill_value=0) ) heatmap.columns = heatmap.columns.map(str) - return heatmap.to_dict(orient="records") \ No newline at end of file + return heatmap.to_dict(orient="records") diff --git a/server/app.py b/server/app.py index 961fd88..2f8a9e8 100644 --- a/server/app.py +++ b/server/app.py @@ -8,7 +8,7 @@ from flask_jwt_extended import ( JWTManager, create_access_token, jwt_required, - get_jwt_identity + get_jwt_identity, ) from server.stat_gen import StatGen @@ -27,31 +27,34 @@ db = PostgresConnector() load_dotenv() frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173") jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this") -jwt_access_token_expires = int(os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)) # Default to 20 minutes +jwt_access_token_expires = int( + os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200) +) # Default to 20 minutes # Flask Configuration CORS(app, resources={r"/*": {"origins": frontend_url}}) app.config["JWT_SECRET_KEY"] = jwt_secret_key -app.config["JWT_ACCESS_TOKEN_EXPIRES"] = jwt_access_token_expires +app.config["JWT_ACCESS_TOKEN_EXPIRES"] = jwt_access_token_expires bcrypt = Bcrypt(app) jwt = JWTManager(app) auth_manager = AuthManager(db, bcrypt) -# Global State -# posts_df = pd.read_json('small.jsonl', lines=True) -# with open("topic_buckets.json", "r", encoding="utf-8") as f: -# domain_topics = json.load(f) -# stat_obj = StatGen(posts_df, domain_topics) -stat_obj = None +stat_gen = StatGen() 
-@app.route('/register', methods=['POST']) + +@app.route("/register", methods=["POST"]) def register_user(): data = request.get_json() - if not data or "username" not in data or "email" not in data or "password" not in data: + if ( + not data + or "username" not in data + or "email" not in data + or "password" not in data + ): return jsonify({"error": "Missing username, email, or password"}), 400 - + username = data["username"] email = data["email"] password = data["password"] @@ -67,39 +70,40 @@ def register_user(): print(f"Registered new user: {username}") return jsonify({"message": f"User '{username}' registered successfully"}), 200 -@app.route('/login', methods=['POST']) + +@app.route("/login", methods=["POST"]) def login_user(): data = request.get_json() if not data or "username" not in data or "password" not in data: return jsonify({"error": "Missing username or password"}), 400 - + username = data["username"] password = data["password"] try: user = auth_manager.authenticate_user(username, password) if user: - access_token = create_access_token(identity=str(user['id'])) + access_token = create_access_token(identity=str(user["id"])) return jsonify({"access_token": access_token}), 200 else: return jsonify({"error": "Invalid username or password"}), 401 except Exception as e: print(traceback.format_exc()) return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - + + @app.route("/profile", methods=["GET"]) @jwt_required() def profile(): current_user = get_jwt_identity() return jsonify( - message="Access granted", - user=auth_manager.get_user_by_id(current_user) + message="Access granted", user=auth_manager.get_user_by_id(current_user) ), 200 -@app.route('/upload', methods=['POST']) +@app.route("/upload", methods=["POST"]) @jwt_required() def upload_data(): if "posts" not in request.files or "topics" not in request.files: @@ -111,27 +115,36 @@ def upload_data(): if post_file.filename == "" or topic_file == "": return jsonify({"error": "Empty filename"}), 400 - if not post_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'): - return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400 - + if not post_file.filename.endswith(".jsonl") or not topic_file.filename.endswith( + ".json" + ): + return jsonify( + {"error": "Invalid file type. 
Only .jsonl and .json files are allowed."} + ), 400 + try: current_user = get_jwt_identity() posts_df = pd.read_json(post_file, lines=True, convert_dates=False) topics = json.load(topic_file) - + processor = DatasetProcessor(posts_df, topics) enriched_df = processor.enrich() - dataset_id = db.save_dataset_info(current_user, f"dataset_{current_user}", topics) + dataset_id = db.save_dataset_info( + current_user, f"dataset_{current_user}", topics + ) db.save_dataset_content(dataset_id, enriched_df) - return jsonify({"message": "File uploaded successfully", "event_count": len(enriched_df)}), 200 + return jsonify( + {"message": "File uploaded successfully", "event_count": len(enriched_df)} + ), 200 except ValueError as e: return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400 except Exception as e: return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - -@app.route('/dataset/', methods=['GET']) + + +@app.route("/dataset/", methods=["GET"]) @jwt_required() def get_dataset(dataset_id): current_user = get_jwt_identity() @@ -139,159 +152,205 @@ def get_dataset(dataset_id): if dataset.get("user_id") != int(current_user): return jsonify({"error": "Unauthorized access to dataset"}), 403 - + dataset_content = db.get_dataset_content(dataset_id) - + if dataset_content.empty: return jsonify({"error": "Dataset content not found"}), 404 return jsonify(dataset_content.to_dict(orient="records")), 200 -@app.route('/stats/content', methods=['GET']) -def word_frequencies(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - + +@app.route("/dataset//content", methods=["GET"]) +@jwt_required() +def content_endpoint(dataset_id): + current_user = get_jwt_identity() + dataset = db.get_dataset_info(dataset_id) + + if dataset.get("user_id") != int(current_user): + return jsonify({"error": "Unauthorized access to dataset"}), 403 + + dataset_content = db.get_dataset_content(dataset_id) + try: - return jsonify(stat_obj.get_content_analysis()), 200 - except ValueError as e: - return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 - except Exception as e: - print(traceback.format_exc()) - return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - -@app.route('/stats/summary', methods=["GET"]) -def get_summary(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - - try: - return jsonify(stat_obj.summary()), 200 - except ValueError as e: - return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 - except Exception as e: - return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - -@app.route("/stats/time", methods=["GET"]) -def get_time_analysis(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - - try: - return jsonify(stat_obj.get_time_analysis()), 200 - except ValueError as e: - return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 - except Exception as e: - print(traceback.format_exc()) - return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - -@app.route("/stats/user", methods=["GET"]) -def get_user_analysis(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - - try: - return jsonify(stat_obj.get_user_analysis()), 200 - except ValueError as e: - return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 - except Exception as e: - print(traceback.format_exc()) - return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - -@app.route("/stats/cultural", 
methods=["GET"]) -def get_cultural_analysis(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - - try: - return jsonify(stat_obj.get_cultural_analysis()), 200 + return jsonify(stat_gen.get_content_analysis(dataset_content)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 except Exception as e: print(traceback.format_exc()) return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 -@app.route("/stats/interaction", methods=["GET"]) -def get_interaction_analysis(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - + +@app.route("/dataset//summary", methods=["GET"]) +@jwt_required() +def get_summary(dataset_id): + current_user = get_jwt_identity() + dataset = db.get_dataset_info(dataset_id) + + if dataset.get("user_id") != int(current_user): + return jsonify({"error": "Unauthorized access to dataset"}), 403 + + dataset_content = db.get_dataset_content(dataset_id) + try: - return jsonify(stat_obj.get_interactional_analysis()), 200 + return jsonify(stat_gen.summary(dataset_content)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 except Exception as e: print(traceback.format_exc()) return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 -@app.route('/filter/query', methods=["POST"]) -def filter_query(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - data = request.get_json(silent=True) or {} +@app.route("/dataset//time", methods=["GET"]) +@jwt_required() +def get_time_analysis(dataset_id): + current_user = get_jwt_identity() + dataset = db.get_dataset_info(dataset_id) - if "query" not in data: - return jsonify(stat_obj.df.to_dict(orient="records")), 200 - - query = data["query"] - filtered_df = stat_obj.filter_by_query(query) + if dataset.get("user_id") != int(current_user): + return jsonify({"error": "Unauthorized access to dataset"}), 403 - return jsonify(filtered_df), 200 + dataset_content = db.get_dataset_content(dataset_id) -@app.route('/filter/time', methods=["POST"]) -def filter_time(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - - data = request.get_json(silent=True) - if not data: - return jsonify({"error": "Invalid or missing JSON body"}), 400 - - if "start" not in data or "end" not in data: - return jsonify({"error": "Please include both start and end dates"}), 400 - try: - start = pd.to_datetime(data["start"], utc=True) - end = pd.to_datetime(data["end"], utc=True) - filtered_df = stat_obj.set_time_range(start, end) - return jsonify(filtered_df), 200 - except Exception: - return jsonify({"error": "Invalid datetime format"}), 400 - -@app.route('/filter/sources', methods=["POST"]) -def filter_sources(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - - data = request.get_json(silent=True) - if not data: - return jsonify({"error": "Invalid or missing JSON body"}), 400 - - if "sources" not in data: - return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400 - - try: - filtered_df = stat_obj.filter_data_sources(data["sources"]) - return jsonify(filtered_df), 200 - except ValueError: - return jsonify({"error": "Please enable at least one data source"}), 400 - except Exception as e: - return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500 - - -@app.route('/filter/reset', methods=["GET"]) -def reset_dataset(): - if stat_obj is None: - return jsonify({"error": "No 
data uploaded"}), 400 - - try: - stat_obj.reset_dataset() - return jsonify({"success": "Dataset successfully reset"}) + return jsonify(stat_gen.get_time_analysis(dataset_content)), 200 + except ValueError as e: + return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 except Exception as e: print(traceback.format_exc()) return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - + + +@app.route("/dataset//user", methods=["GET"]) +@jwt_required() +def get_user_analysis(dataset_id): + current_user = get_jwt_identity() + dataset = db.get_dataset_info(dataset_id) + + if dataset.get("user_id") != int(current_user): + return jsonify({"error": "Unauthorized access to dataset"}), 403 + + dataset_content = db.get_dataset_content(dataset_id) + + try: + return jsonify(stat_gen.get_user_analysis(dataset_content)), 200 + except ValueError as e: + return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 + except Exception as e: + print(traceback.format_exc()) + return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 + + +@app.route("/dataset//cultural", methods=["GET"]) +@jwt_required() +def get_cultural_analysis(dataset_id): + current_user = get_jwt_identity() + dataset = db.get_dataset_info(dataset_id) + + if dataset.get("user_id") != int(current_user): + return jsonify({"error": "Unauthorized access to dataset"}), 403 + + dataset_content = db.get_dataset_content(dataset_id) + + try: + return jsonify(stat_gen.get_cultural_analysis(dataset_content)), 200 + except ValueError as e: + return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 + except Exception as e: + print(traceback.format_exc()) + return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 + + +@app.route("/dataset//interaction", methods=["GET"]) +@jwt_required() +def get_interaction_analysis(dataset_id): + current_user = get_jwt_identity() + dataset = db.get_dataset_info(dataset_id) + + if dataset.get("user_id") != int(current_user): + return jsonify({"error": "Unauthorized access to dataset"}), 403 + + dataset_content = db.get_dataset_content(dataset_id) + + try: + return jsonify(stat_gen.get_interactional_analysis(dataset_content)), 200 + except ValueError as e: + return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 + except Exception as e: + print(traceback.format_exc()) + return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 + + +# @app.route("/filter/query", methods=["POST"]) +# def filter_query(): +# if stat_obj is None: +# return jsonify({"error": "No data uploaded"}), 400 + +# data = request.get_json(silent=True) or {} + +# if "query" not in data: +# return jsonify(stat_obj.df.to_dict(orient="records")), 200 + +# query = data["query"] +# filtered_df = stat_obj.filter_by_query(query) + +# return jsonify(filtered_df), 200 + + +# @app.route("/filter/time", methods=["POST"]) +# def filter_time(): +# if stat_obj is None: +# return jsonify({"error": "No data uploaded"}), 400 + +# data = request.get_json(silent=True) +# if not data: +# return jsonify({"error": "Invalid or missing JSON body"}), 400 + +# if "start" not in data or "end" not in data: +# return jsonify({"error": "Please include both start and end dates"}), 400 + +# try: +# start = pd.to_datetime(data["start"], utc=True) +# end = pd.to_datetime(data["end"], utc=True) +# filtered_df = stat_obj.set_time_range(start, end) +# return jsonify(filtered_df), 200 +# except Exception: +# return jsonify({"error": "Invalid datetime format"}), 400 + + +# 
@app.route("/filter/sources", methods=["POST"]) +# def filter_sources(): +# if stat_obj is None: +# return jsonify({"error": "No data uploaded"}), 400 + +# data = request.get_json(silent=True) +# if not data: +# return jsonify({"error": "Invalid or missing JSON body"}), 400 + +# if "sources" not in data: +# return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400 + +# try: +# filtered_df = stat_obj.filter_data_sources(data["sources"]) +# return jsonify(filtered_df), 200 +# except ValueError: +# return jsonify({"error": "Please enable at least one data source"}), 400 +# except Exception as e: +# return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500 + + +# @app.route("/filter/reset", methods=["GET"]) +# def reset_dataset(): +# if stat_obj is None: +# return jsonify({"error": "No data uploaded"}), 400 + +# try: +# stat_obj.reset_dataset() +# return jsonify({"success": "Dataset successfully reset"}) +# except Exception as e: +# print(traceback.format_exc()) +# return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 + if __name__ == "__main__": - app.run(debug=True) \ No newline at end of file + app.run(debug=True) diff --git a/server/stat_gen.py b/server/stat_gen.py index bbba747..dc748d0 100644 --- a/server/stat_gen.py +++ b/server/stat_gen.py @@ -1,170 +1,135 @@ -import pandas as pd import datetime -import nltk +import nltk +import pandas as pd from nltk.corpus import stopwords -from server.analysis.nlp import NLP -from server.analysis.temporal import TemporalAnalysis + +from server.analysis.cultural import CulturalAnalysis from server.analysis.emotional import EmotionalAnalysis from server.analysis.interactional import InteractionAnalysis from server.analysis.linguistic import LinguisticAnalysis -from server.analysis.cultural import CulturalAnalysis +from server.analysis.temporal import TemporalAnalysis DOMAIN_STOPWORDS = { - "www", "https", "http", - "boards", "boardsie", - "comment", "comments", - "discussion", "thread", - "post", "posts", - "would", "get", "one" + "www", + "https", + "http", + "boards", + "boardsie", + "comment", + "comments", + "discussion", + "thread", + "post", + "posts", + "would", + "get", + "one", } -nltk.download('stopwords') -EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS +nltk.download("stopwords") +EXCLUDE_WORDS = set(stopwords.words("english")) | DOMAIN_STOPWORDS + class StatGen: - def __init__(self, df: pd.DataFrame, domain_topics: dict) -> None: - comments_df = df[["id", "comments"]].explode("comments") - comments_df = comments_df[comments_df["comments"].apply(lambda x: isinstance(x, dict))] - comments_df = pd.json_normalize(comments_df["comments"]) + def __init__(self) -> None: + self.temporal_analysis = TemporalAnalysis() + self.emotional_analysis = EmotionalAnalysis() + self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS) + self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS) + self.cultural_analysis = CulturalAnalysis() - posts_df = df.drop(columns=["comments"]) - posts_df["type"] = "post" - posts_df["parent_id"] = None - - comments_df["type"] = "comment" - comments_df["parent_id"] = comments_df.get("post_id") - self.domain_topics = domain_topics - - self.df = pd.concat([posts_df, comments_df]) - self.df.drop(columns=["post_id"], inplace=True, errors="ignore") - - self.nlp = NLP(self.df, "title", "content", domain_topics) - self.nlp.add_emotion_cols() - self.nlp.add_topic_col() - self.nlp.add_ner_cols() - self._add_time_cols(self.df) - - 
self.temporal_analysis = TemporalAnalysis(self.df) - self.emotional_analysis = EmotionalAnalysis(self.df) - self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS) - self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS) - self.cultural_analysis = CulturalAnalysis(self.df) - - self.original_df = self.df.copy(deep=True) - - ## Private Methods - def _add_time_cols(self, df: pd.DataFrame) -> None: - df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce') - df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date - df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True) - df["hour"] = df["dt"].dt.hour - df["weekday"] = df["dt"].dt.day_name() - - ## Public - - # topics over time - # emotions over time - def get_time_analysis(self) -> dict: + def get_time_analysis(self, df: pd.DataFrame) -> dict: return { - "events_per_day": self.temporal_analysis.posts_per_day(), - "weekday_hour_heatmap": self.temporal_analysis.heatmap() + "events_per_day": self.temporal_analysis.posts_per_day(df), + "weekday_hour_heatmap": self.temporal_analysis.heatmap(df), } - # average topic duration - def get_content_analysis(self) -> dict: + def get_content_analysis(self, df: pd.DataFrame) -> dict: return { - "word_frequencies": self.linguistic_analysis.word_frequencies(), - "common_two_phrases": self.linguistic_analysis.ngrams(), - "common_three_phrases": self.linguistic_analysis.ngrams(n=3), - "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(), - "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion() + "word_frequencies": self.linguistic_analysis.word_frequencies(df), + "common_two_phrases": self.linguistic_analysis.ngrams(df), + "common_three_phrases": self.linguistic_analysis.ngrams(df, n=3), + "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(df), + "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(df), } - - # average emotion per user - # average chain length - def get_user_analysis(self) -> dict: - return { - "top_users": self.interaction_analysis.top_users(), - "users": self.interaction_analysis.per_user_analysis() - } - - # average / max thread depth - # high engagment threads based on volume - def get_interactional_analysis(self) -> dict: - return { - "average_thread_depth": self.interaction_analysis.average_thread_depth(), - "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(), - "interaction_graph": self.interaction_analysis.interaction_graph() - } - - # detect community jargon - # in-group and out-group linguistic markers - def get_cultural_analysis(self) -> dict: - return { - "identity_markers": self.cultural_analysis.get_identity_markers(), - "stance_markers": self.cultural_analysis.get_stance_markers(), - "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity() - } - - def summary(self) -> dict: - total_posts = (self.df["type"] == "post").sum() - total_comments = (self.df["type"] == "comment").sum() - events_per_user = self.df.groupby("author").size() + def get_user_analysis(self, df: pd.DataFrame) -> dict: + return { + "top_users": self.interaction_analysis.top_users(df), + "users": self.interaction_analysis.per_user_analysis(df), + "interaction_graph": self.interaction_analysis.interaction_graph(df), + } + + def get_interactional_analysis(self, df: pd.DataFrame) -> dict: + return { + "average_thread_depth": self.interaction_analysis.average_thread_depth(df), + "average_thread_length_by_emotion": 
self.interaction_analysis.average_thread_length_by_emotion(df), + } + + def get_cultural_analysis(self, df: pd.DataFrame) -> dict: + return { + "identity_markers": self.cultural_analysis.get_identity_markers(df), + "stance_markers": self.cultural_analysis.get_stance_markers(df), + "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(df), + } + + def summary(self, df: pd.DataFrame) -> dict: + total_posts = (df["type"] == "post").sum() + total_comments = (df["type"] == "comment").sum() + events_per_user = df.groupby("author").size() return { - "total_events": int(len(self.df)), + "total_events": int(len(df)), "total_posts": int(total_posts), "total_comments": int(total_comments), "unique_users": int(events_per_user.count()), "comments_per_post": round(total_comments / max(total_posts, 1), 2), "lurker_ratio": round((events_per_user == 1).mean(), 2), "time_range": { - "start": int(self.df["dt"].min().timestamp()), - "end": int(self.df["dt"].max().timestamp()) + "start": int(df["dt"].min().timestamp()), + "end": int(df["dt"].max().timestamp()), }, - "sources": self.df["source"].dropna().unique().tolist() - } - - def filter_by_query(self, search_query: str) -> dict: - self.df = self.df[ - self.df["content"].str.contains(search_query) - ] - - return { - "rows": len(self.df), - "data": self.df.to_dict(orient="records") - } - - def set_time_range(self, start: datetime.datetime, end: datetime.datetime) -> dict: - self.df = self.df[ - (self.df["dt"] >= start) & - (self.df["dt"] <= end) - ] - - return { - "rows": len(self.df), - "data": self.df.to_dict(orient="records") - } - - """ - Input is a hash map (source_name: str -> enabled: bool) - """ - def filter_data_sources(self, data_sources: dict) -> dict: - enabled_sources = [src for src, enabled in data_sources.items() if enabled] - - if not enabled_sources: - raise ValueError("Please choose at least one data source") - - self.df = self.df[self.df["source"].isin(enabled_sources)] - - return { - "rows": len(self.df), - "data": self.df.to_dict(orient="records") + "sources": df["source"].dropna().unique().tolist(), } - - def reset_dataset(self) -> None: - self.df = self.original_df.copy(deep=True) + # def filter_by_query(self, df: pd.DataFrame, search_query: str) -> dict: + # filtered_df = df[df["content"].str.contains(search_query, na=False)] + # return { + # "rows": len(filtered_df), + # "data": filtered_df.to_dict(orient="records"), + # } + + # def set_time_range( + # self, + # original_df: pd.DataFrame, + # start: datetime.datetime, + # end: datetime.datetime, + # ) -> dict: + # df = self._prepare_df(original_df) + # filtered_df = df[(df["dt"] >= start) & (df["dt"] <= end)] + + # return { + # "rows": len(filtered_df), + # "data": filtered_df.to_dict(orient="records"), + # } + + # def filter_data_sources( + # self, original_df: pd.DataFrame, data_sources: dict + # ) -> dict: + # df = self._prepare_df(original_df) + # enabled_sources = [src for src, enabled in data_sources.items() if enabled] + + # if not enabled_sources: + # raise ValueError("Please choose at least one data source") + + # filtered_df = df[df["source"].isin(enabled_sources)] + + # return { + # "rows": len(filtered_df), + # "data": filtered_df.to_dict(orient="records"), + # } + + # def reset_dataset(self, original_df: pd.DataFrame) -> pd.DataFrame: + # return self._prepare_df(original_df)
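
Usage note (illustrative, not part of the patch): after this refactor StatGen holds no dataset state, so a single module-level instance can serve every request and each endpoint simply passes in the DataFrame it loads for the requested dataset. A minimal sketch of that call pattern follows; the PostgresConnector import path and the summarise() helper are assumptions for illustration, while StatGen, db.get_dataset_content, summary, get_time_analysis, and get_content_analysis are taken from the diff above.

    import pandas as pd

    from server.database import PostgresConnector  # assumed import path, not shown in the diff
    from server.stat_gen import StatGen

    stat_gen = StatGen()  # stateless, so it is safe to construct once at startup
    db = PostgresConnector()

    def summarise(dataset_id: int) -> dict:
        # Load the enriched events for one dataset, then run the analyses on that frame.
        # Nothing is cached on stat_gen, so concurrent requests do not share state.
        df: pd.DataFrame = db.get_dataset_content(dataset_id)
        return {
            "summary": stat_gen.summary(df),
            "time": stat_gen.get_time_analysis(df),
            "content": stat_gen.get_content_analysis(df),
        }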