From 43ce58fd401fb872d1c1b577b94c0468e8eb8404 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite <dylanseandefaoite@gmail.com>
Date: Wed, 11 Feb 2026 19:00:59 +0000
Subject: [PATCH] feat: combine post and comment uploads into a single file

---
 frontend/src/components/SummaryStats.tsx |  2 ++
 frontend/src/pages/Upload.tsx            |  8 +-------
 server/app.py                            | 15 ++++++---------
 server/nlp.py                            |  3 +--
 server/stat_gen.py                       | 10 ++++++++--
 5 files changed, 18 insertions(+), 20 deletions(-)
diff --git a/frontend/src/components/SummaryStats.tsx b/frontend/src/components/SummaryStats.tsx
index 3ca414a..0ff46ac 100644
--- a/frontend/src/components/SummaryStats.tsx
+++ b/frontend/src/components/SummaryStats.tsx
@@ -58,6 +58,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
     const [selectedUser, setSelectedUser] = useState<string | null>(null);
     const selectedUserData: User | null = userData?.users.find((u) => u.author === selectedUser) ?? null;
 
+    console.log(summary)
+
     return (
     <div style={styles.page}>
 
diff --git a/frontend/src/pages/Upload.tsx b/frontend/src/pages/Upload.tsx
index db1ad3d..2218231 100644
--- a/frontend/src/pages/Upload.tsx
+++ b/frontend/src/pages/Upload.tsx
@@ -8,20 +8,18 @@ const styles = StatsStyling;
 
 const UploadPage = () => {
   let postFile: File | undefined;
-  let commentFile: File | undefined;
   let topicBucketFile: File | undefined;
   const [returnMessage, setReturnMessage] = useState('')
   const navigate = useNavigate()
 
   const uploadFiles = async () => {
-    if (!postFile || !commentFile || !topicBucketFile) {
+    if (!postFile || !topicBucketFile) {
       alert('Please upload all files before uploading.')
       return
     }
 
     const formData = new FormData()
     formData.append('posts', postFile)
-    formData.append('comments', commentFile)
     formData.append('topics', topicBucketFile)
 
     try {
@@ -44,10 +42,6 @@ const UploadPage = () => {
         <h2 style={{color: "black" }}>Posts File</h2>
         <input style={{color: "black" }} type="file" onChange={(e) => postFile = e.target.files?.[0]}></input>
       </div>
-      <div style={{ ...styles.card }}>
-        <h2 style={{color: "black" }}>Comments File</h2>
-        <input style={{color: "black" }} type="file" onChange={(e) => commentFile = e.target.files?.[0]}></input>
-      </div>
       <div style={{ ...styles.card }}>
         <h2 style={{color: "black" }}>Topic Buckets File</h2>
         <input style={{color: "black" }} type="file" onChange={(e) => topicBucketFile = e.target.files?.[0]}></input>
diff --git a/server/app.py b/server/app.py
index 5c31c99..36f3ac5 100644
--- a/server/app.py
+++ b/server/app.py
@@ -12,33 +12,30 @@ app = Flask(__name__)
 CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}})
 
 # Global State
-posts_df = pd.read_json('posts.jsonl', lines=True)
-comments_df = pd.read_json('comments.jsonl', lines=True)
+posts_df = pd.read_json('posts_test.jsonl', lines=True)
 with open("topic_buckets.json", "r", encoding="utf-8") as f:
     domain_topics = json.load(f)
-stat_obj = StatGen(posts_df, comments_df, domain_topics)
+stat_obj = StatGen(posts_df, domain_topics)
 
 @app.route('/upload', methods=['POST'])
 def upload_data():
-    if "posts" not in request.files or "comments" not in request.files or "topics" not in request.files:
+    if "posts" not in request.files or "topics" not in request.files:
         return jsonify({"error": "Missing required files or form data"}), 400
 
     post_file = request.files["posts"]
-    comment_file = request.files["comments"]
     topic_file = request.files["topics"]
 
-    if post_file.filename == "" or comment_file.filename == "" or topic_file == "":
+    if post_file.filename == "" or topic_file.filename == "":  # compare the filename, not the uploaded file object
         return jsonify({"error": "Empty filename"}), 400
 
-    if not post_file.filename.endswith('.jsonl') or not comment_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'):
+    if not post_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'):
         return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400
     
     try:
         global stat_obj
 
         posts_df = pd.read_json(post_file, lines=True)
-        comments_df = pd.read_json(comment_file, lines=True)
-        stat_obj = StatGen(posts_df, comments_df, json.load(topic_file))
+        stat_obj = StatGen(posts_df, json.load(topic_file))
         return jsonify({"message": "File uploaded successfully", "event_count": len(stat_obj.df)}), 200
     except ValueError as e:
         return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
diff --git a/server/nlp.py b/server/nlp.py
index b6a1d46..98a7d54 100644
--- a/server/nlp.py
+++ b/server/nlp.py
@@ -1,12 +1,11 @@
 import torch
 import pandas as pd
 import numpy as np
-from typing import Any
 
+from typing import Any
 from transformers import pipeline
 from sentence_transformers import SentenceTransformer
 
-
 class NLP:
     _topic_models: dict[str, SentenceTransformer] = {}
     _emotion_classifiers: dict[str, Any] = {}
diff --git a/server/stat_gen.py b/server/stat_gen.py
index 2ffa5fb..8eff312 100644
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -21,7 +21,11 @@ nltk.download('stopwords')
 EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS
 
 class StatGen:
-    def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: dict) -> None:
+    def __init__(self, df: pd.DataFrame, domain_topics: dict) -> None:
+        comments_df = df[["id", "comments"]].explode("comments").dropna(subset=["comments"])  # explode() yields NaN for empty/missing comment lists
+        comments_df = pd.json_normalize(comments_df["comments"])  # flatten each remaining comment into its own row
+
+        posts_df = df.drop(columns=["comments"])
         posts_df["type"] = "post"
         posts_df["parent_id"] = None
 
@@ -30,6 +34,7 @@ class StatGen:
         self.domain_topics = domain_topics
 
         self.df = pd.concat([posts_df, comments_df])
+        self.df.drop(columns=["post_id"], inplace=True, errors="ignore")
         self.nlp = NLP(self.df, "title", "content", domain_topics)
         self._add_extra_cols(self.df)
 
@@ -37,6 +42,7 @@ class StatGen:
 
     ## Private Methods
     def _add_extra_cols(self, df: pd.DataFrame) -> None:
+        df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')  # read from the df argument (not self.df); unparseable values become NaN
         df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date
         df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
         df["hour"] = df["dt"].dt.hour
@@ -165,7 +171,7 @@ class StatGen:
                 "start": int(self.df["dt"].min().timestamp()),
                 "end": int(self.df["dt"].max().timestamp())
             },
-            "sources": self.df["source"].unique().tolist()
+            "sources": self.df["source"].dropna().unique().tolist()
         }
 
     def content_analysis(self, limit: int = 100) -> dict: