diff --git a/server/app.py b/server/app.py
index 484807d..9eb2478 100644
--- a/server/app.py
+++ b/server/app.py
@@ -2,6 +2,7 @@ from flask import Flask, jsonify, request
 from flask_cors import CORS
 from nltk.corpus import stopwords
 from datetime import datetime
+from server.stat_gen import StatGen
 
 import nltk
 import pandas as pd
@@ -12,8 +13,7 @@ app = Flask(__name__)
 CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}})
 
 # Global State
-posts_df = None
-comments_df = None
+stat_obj = None
 
 nltk.download('stopwords')
 EXCLUDE_WORDS = set(stopwords.words('english'))
@@ -33,112 +33,51 @@ def upload_data():
         return jsonify({"error": "Invalid file type. Only .jsonl files are allowed."}), 400
 
     try:
-        global posts_df, comments_df
-        posts_df = pd.read_json(post_file, lines=True)
-        comments_df = pd.read_json(comment_file, lines=True)
+        global stat_obj
+        stat_obj = StatGen(post_file, comment_file)
     except ValueError as e:
         return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
     except Exception as e:
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
 
-    return jsonify({"message": "File uploaded successfully", "posts_count": len(posts_df), "comments_count": len(comments_df)}), 200
+    return jsonify({"message": "File uploaded successfully", "event_count": len(stat_obj.df)}), 200
 
 @app.route('/stats/posts_per_day', methods=['GET'])
 def posts_per_day():
-    if posts_df is None:
+    if stat_obj is None:
         return jsonify({"error": "No data uploaded"}), 400
 
     try:
-        posts_df['date'] = pd.to_datetime(posts_df['timestamp'], unit='s').dt.date
-        posts_per_day = (
-            posts_df
-            .groupby('date')
-            .size()
-            .reset_index(name='posts_count')
-        )
+        return jsonify(stat_obj.get_events_per_day().to_dict(orient='records')), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
 
-    return jsonify(posts_per_day.to_dict(orient='records')), 200
-
-@app.route('/stats/comments_per_day', methods=['GET'])
-def comments_per_day():
-    if comments_df is None:
-        return jsonify({"error": "No data uploaded"}), 400
-
-    try:
-        comments_df['date'] = pd.to_datetime(comments_df['timestamp'], unit='s').dt.date
-        comments_per_day = (
-            comments_df
-            .groupby('date')
-            .size()
-            .reset_index(name='comments_count')
-        )
-    except ValueError as e:
-        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
-    except Exception as e:
-        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
-
-    return jsonify(comments_per_day.to_dict(orient='records')), 200
-
-@app.route("/stats/heatmap", methods=['GET'])
+@app.route("/stats/heatmap", methods=["GET"])
 def get_heatmap():
+    if stat_obj is None:
+        return jsonify({"error": "No data uploaded"}), 400
+
     try:
-        posts_df["dt"] = pd.to_datetime(posts_df["timestamp"], unit='s', utc=True)
-        posts_df["hour"] = posts_df["dt"].dt.hour
-        posts_df["weekday"] = posts_df["dt"].dt.day_name()
-
-        weekday_order = [
-            "Monday", "Tuesday", "Wednesday",
-            "Thursday", "Friday", "Saturday", "Sunday"
-        ]
-
-        posts_df["weekday"] = pd.Categorical(
-            posts_df["weekday"],
-            categories=weekday_order,
-            ordered=True
-        )
-
-        heatmap = (
-            posts_df
-            .groupby(["weekday", "hour"])
-            .size()
-            .unstack(fill_value=0)
-            .reindex(columns=range(24), fill_value=0)
-        )
+        return jsonify(stat_obj.get_heatmap().to_dict(orient="records")), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
 
-    return jsonify(heatmap.to_dict(orient="records")), 200
 
 @app.route('/stats/word_frequencies', methods=['GET'])
 def word_frequencies():
-    if posts_df is None:
+    if stat_obj is None:
         return jsonify({"error": "No data uploaded"}), 400
 
     try:
-        all_text = " ".join(posts_df['content'].fillna(''))
-        words = all_text.split()
-        word_freq = {}
-        for word in words:
-            clean_word = ''.join(c.lower() for c in word if c.isalnum())
-            if clean_word and clean_word not in EXCLUDE_WORDS:
-                word_freq[clean_word] = word_freq.get(clean_word, 0) + 1
-
-        sorted_words = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
-
-        # Get top 100 words and their frequencies and return as list of dicts
-        sorted_words = [{"word": word, "frequency": freq} for word, freq in sorted_words]
+        return jsonify(stat_obj.get_word_frequencies().to_dict(orient='records')), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
 
-    return jsonify(sorted_words[:100]), 200
-
 if __name__ == "__main__":
     app.run(debug=True)
\ No newline at end of file
diff --git a/server/stat_gen.py b/server/stat_gen.py
new file mode 100644
index 0000000..2d6f4c7
--- /dev/null
+++ b/server/stat_gen.py
@@ -0,0 +1,141 @@
+import re
+from collections import Counter
+from typing import IO, Union
+
+import nltk
+import pandas as pd
+from nltk.corpus import stopwords
+
+DOMAIN_STOPWORDS = {
+    "www", "https", "http",
+    "boards", "boardsie",
+    "comment", "comments",
+    "discussion", "thread",
+    "post", "posts",
+    "would", "could", "should",
+    "like", "get", "one"
+}
+
+WEEKDAY_ORDER = [
+    "Monday", "Tuesday", "Wednesday",
+    "Thursday", "Friday", "Saturday", "Sunday"
+]
+
+# Lowercase alphabetic tokens of at least three letters.
+WORD_PATTERN = re.compile(r"\b[a-z]{3,}\b")
+
+# Download the corpus only when it is missing, not on every import.
+try:
+    _ENGLISH_STOPWORDS = stopwords.words('english')
+except LookupError:
+    nltk.download('stopwords')
+    _ENGLISH_STOPWORDS = stopwords.words('english')
+
+EXCLUDE_WORDS = set(_ENGLISH_STOPWORDS) | DOMAIN_STOPWORDS
+
+
+class StatGen:
+    """Precompute activity statistics for uploaded posts and comments."""
+
+    def __init__(self, posts: Union[str, IO], comments: Union[str, IO]) -> None:
+        """Load both JSONL sources and build every dataset up front.
+
+        Args:
+            posts: Path or file-like object holding one post per line.
+            comments: Path or file-like object holding one comment per line.
+        """
+        posts_df = pd.read_json(posts, lines=True)
+        comments_df = pd.read_json(comments, lines=True)
+
+        posts_df["type"] = "post"
+        posts_df["parent_id"] = None
+
+        comments_df["type"] = "comment"
+        comments_df["parent_id"] = comments_df.get("post_id")
+
+        # Renumber rows: both frames start at 0, and duplicate labels make
+        # label-aligned assignments on the merged frame raise.
+        self.df = pd.concat([posts_df, comments_df], ignore_index=True)
+        self._add_date_cols(self.df)
+
+        # Datasets
+        self.heatmap = self._generate_heatmap()
+        self.word_frequencies = self._get_word_frequencies(100)
+        self.events_per_day = self._get_events_per_day()
+
+    ## Private Methods
+    def _add_date_cols(self, df: pd.DataFrame) -> None:
+        """Add UTC ``date``, ``dt``, ``hour`` and ``weekday`` columns in place."""
+        # Parse the epoch seconds once and derive every column from it.
+        dt = pd.to_datetime(df["timestamp"], unit="s", utc=True)
+        df["date"] = dt.dt.date
+        df["dt"] = dt
+        df["hour"] = dt.dt.hour
+        df["weekday"] = dt.dt.day_name()
+
+    def _get_events_per_day(self) -> pd.DataFrame:
+        """Count posts and comments together for each calendar date."""
+        # NOTE: the column keeps its old "posts_count" name so the JSON key
+        # served by /stats/posts_per_day does not change.
+        return (
+            self.df
+            .groupby('date')
+            .size()
+            .reset_index(name='posts_count')
+        )
+
+    def _generate_heatmap(self) -> pd.DataFrame:
+        """Build a weekday-by-hour table of event counts."""
+        self.df["weekday"] = pd.Categorical(
+            self.df["weekday"],
+            categories=WEEKDAY_ORDER,
+            ordered=True
+        )
+
+        # observed=False keeps all seven weekday rows even when a day has no
+        # events; the endpoint serialises rows without their index, so the
+        # Monday-to-Sunday order is the only way to tell them apart.
+        return (
+            self.df
+            .groupby(["weekday", "hour"], observed=False)
+            .size()
+            .unstack(fill_value=0)
+            .reindex(columns=range(24), fill_value=0)
+        )
+
+    def _get_word_frequencies(self, limit: int) -> pd.DataFrame:
+        """Return the ``limit`` most frequent non-stopword tokens."""
+        texts = (
+            self.df["content"]
+            .dropna()
+            .astype(str)
+            .str.lower()
+        )
+
+        counts = Counter()
+        for text in texts:
+            counts.update(
+                w for w in WORD_PATTERN.findall(text)
+                if w not in EXCLUDE_WORDS
+            )
+
+        # most_common() breaks ties by first appearance, so the output is
+        # reproducible and only the top ``limit`` entries reach pandas.
+        # NOTE(review): the inline implementation this replaces emitted a
+        # "frequency" key; confirm API clients expect "count".
+        return pd.DataFrame(
+            counts.most_common(limit),
+            columns=["word", "count"]
+        )
+
+    ## Public
+    def get_heatmap(self) -> pd.DataFrame:
+        """Return the precomputed weekday-by-hour table."""
+        return self.heatmap
+
+    def get_word_frequencies(self) -> pd.DataFrame:
+        """Return the precomputed word-frequency table."""
+        return self.word_frequencies
+
+    def get_events_per_day(self) -> pd.DataFrame:
+        """Return the precomputed per-day event counts."""
+        return self.events_per_day