feat: combine post and comment uploads into a single file
This commit is contained in:
@@ -58,6 +58,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
|
|||||||
const [selectedUser, setSelectedUser] = useState<string | null>(null);
|
const [selectedUser, setSelectedUser] = useState<string | null>(null);
|
||||||
const selectedUserData: User | null = userData?.users.find((u) => u.author === selectedUser) ?? null;
|
const selectedUserData: User | null = userData?.users.find((u) => u.author === selectedUser) ?? null;
|
||||||
|
|
||||||
|
console.log(summary)
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div style={styles.page}>
|
<div style={styles.page}>
|
||||||
|
|
||||||
|
|||||||
@@ -8,20 +8,18 @@ const styles = StatsStyling;
|
|||||||
|
|
||||||
const UploadPage = () => {
|
const UploadPage = () => {
|
||||||
let postFile: File | undefined;
|
let postFile: File | undefined;
|
||||||
let commentFile: File | undefined;
|
|
||||||
let topicBucketFile: File | undefined;
|
let topicBucketFile: File | undefined;
|
||||||
const [returnMessage, setReturnMessage] = useState('')
|
const [returnMessage, setReturnMessage] = useState('')
|
||||||
const navigate = useNavigate()
|
const navigate = useNavigate()
|
||||||
|
|
||||||
const uploadFiles = async () => {
|
const uploadFiles = async () => {
|
||||||
if (!postFile || !commentFile || !topicBucketFile) {
|
if (!postFile || !topicBucketFile) {
|
||||||
alert('Please upload all files before uploading.')
|
alert('Please upload all files before uploading.')
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
const formData = new FormData()
|
const formData = new FormData()
|
||||||
formData.append('posts', postFile)
|
formData.append('posts', postFile)
|
||||||
formData.append('comments', commentFile)
|
|
||||||
formData.append('topics', topicBucketFile)
|
formData.append('topics', topicBucketFile)
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -44,10 +42,6 @@ const UploadPage = () => {
|
|||||||
<h2 style={{color: "black" }}>Posts File</h2>
|
<h2 style={{color: "black" }}>Posts File</h2>
|
||||||
<input style={{color: "black" }} type="file" onChange={(e) => postFile = e.target.files?.[0]}></input>
|
<input style={{color: "black" }} type="file" onChange={(e) => postFile = e.target.files?.[0]}></input>
|
||||||
</div>
|
</div>
|
||||||
<div style={{ ...styles.card }}>
|
|
||||||
<h2 style={{color: "black" }}>Comments File</h2>
|
|
||||||
<input style={{color: "black" }} type="file" onChange={(e) => commentFile = e.target.files?.[0]}></input>
|
|
||||||
</div>
|
|
||||||
<div style={{ ...styles.card }}>
|
<div style={{ ...styles.card }}>
|
||||||
<h2 style={{color: "black" }}>Topic Buckets File</h2>
|
<h2 style={{color: "black" }}>Topic Buckets File</h2>
|
||||||
<input style={{color: "black" }} type="file" onChange={(e) => topicBucketFile = e.target.files?.[0]}></input>
|
<input style={{color: "black" }} type="file" onChange={(e) => topicBucketFile = e.target.files?.[0]}></input>
|
||||||
|
|||||||
@@ -12,33 +12,30 @@ app = Flask(__name__)
|
|||||||
CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}})
|
CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}})
|
||||||
|
|
||||||
# Global State
|
# Global State
|
||||||
posts_df = pd.read_json('posts.jsonl', lines=True)
|
posts_df = pd.read_json('posts_test.jsonl', lines=True)
|
||||||
comments_df = pd.read_json('comments.jsonl', lines=True)
|
|
||||||
with open("topic_buckets.json", "r", encoding="utf-8") as f:
|
with open("topic_buckets.json", "r", encoding="utf-8") as f:
|
||||||
domain_topics = json.load(f)
|
domain_topics = json.load(f)
|
||||||
stat_obj = StatGen(posts_df, comments_df, domain_topics)
|
stat_obj = StatGen(posts_df, domain_topics)
|
||||||
|
|
||||||
@app.route('/upload', methods=['POST'])
|
@app.route('/upload', methods=['POST'])
|
||||||
def upload_data():
|
def upload_data():
|
||||||
if "posts" not in request.files or "comments" not in request.files or "topics" not in request.files:
|
if "posts" not in request.files or "topics" not in request.files:
|
||||||
return jsonify({"error": "Missing required files or form data"}), 400
|
return jsonify({"error": "Missing required files or form data"}), 400
|
||||||
|
|
||||||
post_file = request.files["posts"]
|
post_file = request.files["posts"]
|
||||||
comment_file = request.files["comments"]
|
|
||||||
topic_file = request.files["topics"]
|
topic_file = request.files["topics"]
|
||||||
|
|
||||||
if post_file.filename == "" or comment_file.filename == "" or topic_file == "":
|
if post_file.filename == "" or topic_file == "":
|
||||||
return jsonify({"error": "Empty filename"}), 400
|
return jsonify({"error": "Empty filename"}), 400
|
||||||
|
|
||||||
if not post_file.filename.endswith('.jsonl') or not comment_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'):
|
if not post_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'):
|
||||||
return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400
|
return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400
|
||||||
|
|
||||||
try:
|
try:
|
||||||
global stat_obj
|
global stat_obj
|
||||||
|
|
||||||
posts_df = pd.read_json(post_file, lines=True)
|
posts_df = pd.read_json(post_file, lines=True)
|
||||||
comments_df = pd.read_json(comment_file, lines=True)
|
stat_obj = StatGen(posts_df, json.load(topic_file))
|
||||||
stat_obj = StatGen(posts_df, comments_df, json.load(topic_file))
|
|
||||||
return jsonify({"message": "File uploaded successfully", "event_count": len(stat_obj.df)}), 200
|
return jsonify({"message": "File uploaded successfully", "event_count": len(stat_obj.df)}), 200
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
|
return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
|
||||||
|
|||||||
@@ -1,12 +1,11 @@
|
|||||||
import torch
|
import torch
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
from transformers import pipeline
|
from transformers import pipeline
|
||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
|
||||||
class NLP:
|
class NLP:
|
||||||
_topic_models: dict[str, SentenceTransformer] = {}
|
_topic_models: dict[str, SentenceTransformer] = {}
|
||||||
_emotion_classifiers: dict[str, Any] = {}
|
_emotion_classifiers: dict[str, Any] = {}
|
||||||
|
|||||||
@@ -21,7 +21,11 @@ nltk.download('stopwords')
|
|||||||
EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS
|
EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS
|
||||||
|
|
||||||
class StatGen:
|
class StatGen:
|
||||||
def __init__(self, posts_df: pd.DataFrame, comments_df: pd.DataFrame, domain_topics: dict) -> None:
|
def __init__(self, df: pd.DataFrame, domain_topics: dict) -> None:
|
||||||
|
comments_df = df[["id", "comments"]].explode("comments")
|
||||||
|
comments_df = pd.json_normalize(comments_df["comments"])
|
||||||
|
|
||||||
|
posts_df = df.drop(columns=["comments"])
|
||||||
posts_df["type"] = "post"
|
posts_df["type"] = "post"
|
||||||
posts_df["parent_id"] = None
|
posts_df["parent_id"] = None
|
||||||
|
|
||||||
@@ -30,6 +34,7 @@ class StatGen:
|
|||||||
self.domain_topics = domain_topics
|
self.domain_topics = domain_topics
|
||||||
|
|
||||||
self.df = pd.concat([posts_df, comments_df])
|
self.df = pd.concat([posts_df, comments_df])
|
||||||
|
self.df.drop(columns=["post_id"], inplace=True, errors="ignore")
|
||||||
self.nlp = NLP(self.df, "title", "content", domain_topics)
|
self.nlp = NLP(self.df, "title", "content", domain_topics)
|
||||||
self._add_extra_cols(self.df)
|
self._add_extra_cols(self.df)
|
||||||
|
|
||||||
@@ -37,6 +42,7 @@ class StatGen:
|
|||||||
|
|
||||||
## Private Methods
|
## Private Methods
|
||||||
def _add_extra_cols(self, df: pd.DataFrame) -> None:
|
def _add_extra_cols(self, df: pd.DataFrame) -> None:
|
||||||
|
df['timestamp'] = pd.to_numeric(self.df['timestamp'], errors='coerce')
|
||||||
df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date
|
df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date
|
||||||
df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
|
df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
|
||||||
df["hour"] = df["dt"].dt.hour
|
df["hour"] = df["dt"].dt.hour
|
||||||
@@ -165,7 +171,7 @@ class StatGen:
|
|||||||
"start": int(self.df["dt"].min().timestamp()),
|
"start": int(self.df["dt"].min().timestamp()),
|
||||||
"end": int(self.df["dt"].max().timestamp())
|
"end": int(self.df["dt"].max().timestamp())
|
||||||
},
|
},
|
||||||
"sources": self.df["source"].unique().tolist()
|
"sources": self.df["source"].dropna().unique().tolist()
|
||||||
}
|
}
|
||||||
|
|
||||||
def content_analysis(self, limit: int = 100) -> dict:
|
def content_analysis(self, limit: int = 100) -> dict:
|
||||||
|
|||||||
Reference in New Issue
Block a user