From 43ce58fd401fb872d1c1b577b94c0468e8eb8404 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Wed, 11 Feb 2026 19:00:59 +0000 Subject: [PATCH] feat: combine post and comment uploads into a single file --- frontend/src/components/SummaryStats.tsx | 2 ++ frontend/src/pages/Upload.tsx | 8 +------- server/app.py | 15 ++++++--------- server/nlp.py | 3 +-- server/stat_gen.py | 10 ++++++++-- 5 files changed, 18 insertions(+), 20 deletions(-) diff --git a/frontend/src/components/SummaryStats.tsx b/frontend/src/components/SummaryStats.tsx index 3ca414a..0ff46ac 100644 --- a/frontend/src/components/SummaryStats.tsx +++ b/frontend/src/components/SummaryStats.tsx @@ -58,6 +58,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr const [selectedUser, setSelectedUser] = useState(null); const selectedUserData: User | null = userData?.users.find((u) => u.author === selectedUser) ?? null; + console.log(summary) + return (
diff --git a/frontend/src/pages/Upload.tsx b/frontend/src/pages/Upload.tsx index db1ad3d..2218231 100644 --- a/frontend/src/pages/Upload.tsx +++ b/frontend/src/pages/Upload.tsx @@ -8,20 +8,18 @@ const styles = StatsStyling; const UploadPage = () => { let postFile: File | undefined; - let commentFile: File | undefined; let topicBucketFile: File | undefined; const [returnMessage, setReturnMessage] = useState('') const navigate = useNavigate() const uploadFiles = async () => { - if (!postFile || !commentFile || !topicBucketFile) { + if (!postFile || !topicBucketFile) { alert('Please upload all files before uploading.') return } const formData = new FormData() formData.append('posts', postFile) - formData.append('comments', commentFile) formData.append('topics', topicBucketFile) try { @@ -44,10 +42,6 @@ const UploadPage = () => {

Posts File

postFile = e.target.files?.[0]}>
-
-

Comments File

- commentFile = e.target.files?.[0]}> -

Topic Buckets File

topicBucketFile = e.target.files?.[0]}> diff --git a/server/app.py b/server/app.py index 5c31c99..36f3ac5 100644 --- a/server/app.py +++ b/server/app.py @@ -12,33 +12,30 @@ app = Flask(__name__) CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}}) # Global State -posts_df = pd.read_json('posts.jsonl', lines=True) -comments_df = pd.read_json('comments.jsonl', lines=True) +posts_df = pd.read_json('posts_test.jsonl', lines=True) with open("topic_buckets.json", "r", encoding="utf-8") as f: domain_topics = json.load(f) -stat_obj = StatGen(posts_df, comments_df, domain_topics) +stat_obj = StatGen(posts_df, domain_topics) @app.route('/upload', methods=['POST']) def upload_data(): - if "posts" not in request.files or "comments" not in request.files or "topics" not in request.files: + if "posts" not in request.files or "topics" not in request.files: return jsonify({"error": "Missing required files or form data"}), 400 post_file = request.files["posts"] - comment_file = request.files["comments"] topic_file = request.files["topics"] - if post_file.filename == "" or comment_file.filename == "" or topic_file == "": + if post_file.filename == "" or topic_file.filename == "": return jsonify({"error": "Empty filename"}), 400 - if not post_file.filename.endswith('.jsonl') or not comment_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'): + if not post_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'): return jsonify({"error": "Invalid file type. 
Only .jsonl and .json files are allowed."}), 400 try: global stat_obj posts_df = pd.read_json(post_file, lines=True) - comments_df = pd.read_json(comment_file, lines=True) - stat_obj = StatGen(posts_df, comments_df, json.load(topic_file)) + stat_obj = StatGen(posts_df, json.load(topic_file)) return jsonify({"message": "File uploaded successfully", "event_count": len(stat_obj.df)}), 200 except ValueError as e: return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400 diff --git a/server/nlp.py b/server/nlp.py index b6a1d46..98a7d54 100644 --- a/server/nlp.py +++ b/server/nlp.py @@ -1,12 +1,11 @@ import torch import pandas as pd import numpy as np -from typing import Any +from typing import Any from transformers import pipeline from sentence_transformers import SentenceTransformer - class NLP: _topic_models: dict[str, SentenceTransformer] = {} _emotion_classifiers: dict[str, Any] = {} diff --git a/server/stat_gen.py b/server/stat_gen.py index 2ffa5fb..8eff312 100644 --- a/server/stat_gen.py +++ b/server/stat_gen.py @@ -21,7 +21,11 @@ nltk.download('stopwords') EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS class StatGen: - def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: dict) -> None: + def __init__(self, df: pd.DataFrame, domain_topics: dict) -> None: + comments_df = df[["id", "comments"]].explode("comments") + comments_df = pd.json_normalize(comments_df["comments"]) + + posts_df = df.drop(columns=["comments"]) posts_df["type"] = "post" posts_df["parent_id"] = None @@ -30,6 +34,7 @@ class StatGen: self.domain_topics = domain_topics self.df = pd.concat([posts_df, comments_df]) + self.df.drop(columns=["post_id"], inplace=True, errors="ignore") self.nlp = NLP(self.df, "title", "content", domain_topics) self._add_extra_cols(self.df) @@ -37,6 +42,7 @@ class StatGen: ## Private Methods def _add_extra_cols(self, df: pd.DataFrame) -> None: + df['timestamp'] = pd.to_numeric(df['timestamp'], 
errors='coerce') df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True) df["hour"] = df["dt"].dt.hour @@ -165,7 +171,7 @@ class StatGen: "start": int(self.df["dt"].min().timestamp()), "end": int(self.df["dt"].max().timestamp()) }, - "sources": self.df["source"].unique().tolist() + "sources": self.df["source"].dropna().unique().tolist() } def content_analysis(self, limit: int = 100) -> dict: