feat: combine post and comment uploads into a single file

This commit is contained in:
2026-02-11 19:00:59 +00:00
parent 9d7569cfc5
commit 43ce58fd40
5 changed files with 18 additions and 20 deletions

View File

@@ -58,6 +58,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
const [selectedUser, setSelectedUser] = useState<string | null>(null); const [selectedUser, setSelectedUser] = useState<string | null>(null);
const selectedUserData: User | null = userData?.users.find((u) => u.author === selectedUser) ?? null; const selectedUserData: User | null = userData?.users.find((u) => u.author === selectedUser) ?? null;
console.log(summary)
return ( return (
<div style={styles.page}> <div style={styles.page}>

View File

@@ -8,20 +8,18 @@ const styles = StatsStyling;
const UploadPage = () => { const UploadPage = () => {
let postFile: File | undefined; let postFile: File | undefined;
let commentFile: File | undefined;
let topicBucketFile: File | undefined; let topicBucketFile: File | undefined;
const [returnMessage, setReturnMessage] = useState('') const [returnMessage, setReturnMessage] = useState('')
const navigate = useNavigate() const navigate = useNavigate()
const uploadFiles = async () => { const uploadFiles = async () => {
if (!postFile || !commentFile || !topicBucketFile) { if (!postFile || !topicBucketFile) {
alert('Please upload all files before uploading.') alert('Please upload all files before uploading.')
return return
} }
const formData = new FormData() const formData = new FormData()
formData.append('posts', postFile) formData.append('posts', postFile)
formData.append('comments', commentFile)
formData.append('topics', topicBucketFile) formData.append('topics', topicBucketFile)
try { try {
@@ -44,10 +42,6 @@ const UploadPage = () => {
<h2 style={{color: "black" }}>Posts File</h2> <h2 style={{color: "black" }}>Posts File</h2>
<input style={{color: "black" }} type="file" onChange={(e) => postFile = e.target.files?.[0]}></input> <input style={{color: "black" }} type="file" onChange={(e) => postFile = e.target.files?.[0]}></input>
</div> </div>
<div style={{ ...styles.card }}>
<h2 style={{color: "black" }}>Comments File</h2>
<input style={{color: "black" }} type="file" onChange={(e) => commentFile = e.target.files?.[0]}></input>
</div>
<div style={{ ...styles.card }}> <div style={{ ...styles.card }}>
<h2 style={{color: "black" }}>Topic Buckets File</h2> <h2 style={{color: "black" }}>Topic Buckets File</h2>
<input style={{color: "black" }} type="file" onChange={(e) => topicBucketFile = e.target.files?.[0]}></input> <input style={{color: "black" }} type="file" onChange={(e) => topicBucketFile = e.target.files?.[0]}></input>

View File

@@ -12,33 +12,30 @@ app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}}) CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}})
# Global State # Global State
posts_df = pd.read_json('posts.jsonl', lines=True) posts_df = pd.read_json('posts_test.jsonl', lines=True)
comments_df = pd.read_json('comments.jsonl', lines=True)
with open("topic_buckets.json", "r", encoding="utf-8") as f: with open("topic_buckets.json", "r", encoding="utf-8") as f:
domain_topics = json.load(f) domain_topics = json.load(f)
stat_obj = StatGen(posts_df, comments_df, domain_topics) stat_obj = StatGen(posts_df, domain_topics)
@app.route('/upload', methods=['POST']) @app.route('/upload', methods=['POST'])
def upload_data(): def upload_data():
if "posts" not in request.files or "comments" not in request.files or "topics" not in request.files: if "posts" not in request.files or "topics" not in request.files:
return jsonify({"error": "Missing required files or form data"}), 400 return jsonify({"error": "Missing required files or form data"}), 400
post_file = request.files["posts"] post_file = request.files["posts"]
comment_file = request.files["comments"]
topic_file = request.files["topics"] topic_file = request.files["topics"]
if post_file.filename == "" or comment_file.filename == "" or topic_file == "": if post_file.filename == "" or topic_file == "":
return jsonify({"error": "Empty filename"}), 400 return jsonify({"error": "Empty filename"}), 400
if not post_file.filename.endswith('.jsonl') or not comment_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'): if not post_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'):
return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400 return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400
try: try:
global stat_obj global stat_obj
posts_df = pd.read_json(post_file, lines=True) posts_df = pd.read_json(post_file, lines=True)
comments_df = pd.read_json(comment_file, lines=True) stat_obj = StatGen(posts_df, json.load(topic_file))
stat_obj = StatGen(posts_df, comments_df, json.load(topic_file))
return jsonify({"message": "File uploaded successfully", "event_count": len(stat_obj.df)}), 200 return jsonify({"message": "File uploaded successfully", "event_count": len(stat_obj.df)}), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400 return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400

View File

@@ -1,12 +1,11 @@
import torch import torch
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from typing import Any
from typing import Any
from transformers import pipeline from transformers import pipeline
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
class NLP: class NLP:
_topic_models: dict[str, SentenceTransformer] = {} _topic_models: dict[str, SentenceTransformer] = {}
_emotion_classifiers: dict[str, Any] = {} _emotion_classifiers: dict[str, Any] = {}

View File

@@ -21,7 +21,11 @@ nltk.download('stopwords')
EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS
class StatGen: class StatGen:
def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: dict) -> None: def __init__(self, df: pd.DataFrame, domain_topics: dict) -> None:
comments_df = df[["id", "comments"]].explode("comments")
comments_df = pd.json_normalize(comments_df["comments"])
posts_df = df.drop(columns=["comments"])
posts_df["type"] = "post" posts_df["type"] = "post"
posts_df["parent_id"] = None posts_df["parent_id"] = None
@@ -30,6 +34,7 @@ class StatGen:
self.domain_topics = domain_topics self.domain_topics = domain_topics
self.df = pd.concat([posts_df, comments_df]) self.df = pd.concat([posts_df, comments_df])
self.df.drop(columns=["post_id"], inplace=True, errors="ignore")
self.nlp = NLP(self.df, "title", "content", domain_topics) self.nlp = NLP(self.df, "title", "content", domain_topics)
self._add_extra_cols(self.df) self._add_extra_cols(self.df)
@@ -37,6 +42,7 @@ class StatGen:
## Private Methods ## Private Methods
def _add_extra_cols(self, df: pd.DataFrame) -> None: def _add_extra_cols(self, df: pd.DataFrame) -> None:
df['timestamp'] = pd.to_numeric(self.df['timestamp'], errors='coerce')
df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date
df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True) df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
df["hour"] = df["dt"].dt.hour df["hour"] = df["dt"].dt.hour
@@ -165,7 +171,7 @@ class StatGen:
"start": int(self.df["dt"].min().timestamp()), "start": int(self.df["dt"].min().timestamp()),
"end": int(self.df["dt"].max().timestamp()) "end": int(self.df["dt"].max().timestamp())
}, },
"sources": self.df["source"].unique().tolist() "sources": self.df["source"].dropna().unique().tolist()
} }
def content_analysis(self, limit: int = 100) -> dict: def content_analysis(self, limit: int = 100) -> dict: