Storage of user data and datasets in PostgreSQL #2

Merged
dylan merged 19 commits from feat/database-integration into main 2026-03-01 16:47:25 +00:00
14 changed files with 774 additions and 387 deletions

2
.gitignore vendored
View File

@@ -9,3 +9,5 @@ __pycache__/
# React App Vite # React App Vite
node_modules/ node_modules/
dist/ dist/
*.sh

138
db/database.py Normal file
View File

@@ -0,0 +1,138 @@
import os
import psycopg2
import pandas as pd
from psycopg2.extras import RealDictCursor
from psycopg2.extras import execute_batch, Json
class PostgresConnector:
    """
    Simple PostgreSQL connector (single connection).

    Connection parameters are read from the POSTGRES_* environment
    variables, with local-development defaults. The connection runs with
    autocommit disabled, so every write helper commits explicitly and
    rolls back on failure to avoid leaving the connection in an
    aborted-transaction state.
    """

    def __init__(self):
        self.connection = psycopg2.connect(
            host=os.getenv("POSTGRES_HOST", "localhost"),
            port=os.getenv("POSTGRES_PORT", 5432),
            user=os.getenv("POSTGRES_USER", "postgres"),
            password=os.getenv("POSTGRES_PASSWORD", "postgres"),
            database=os.getenv("POSTGRES_DB", "postgres"),
        )
        self.connection.autocommit = False

    def execute(self, query, params=None, fetch=False) -> list | None:
        """Run a single statement and commit.

        Returns the rows as a list of dicts when ``fetch`` is True,
        otherwise ``None``.

        BUG FIX vs. original: the old code returned ``fetchall()`` before
        reaching ``commit()``, so with autocommit off any
        ``INSERT ... RETURNING`` executed with ``fetch=True`` (e.g.
        ``save_dataset_info``) was silently never committed. We now
        always commit, and roll back on error so a failed statement does
        not poison subsequent queries on this connection.
        """
        try:
            with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
                cursor.execute(query, params)
                rows = cursor.fetchall() if fetch else None
            self.connection.commit()
            return rows
        except Exception:
            self.connection.rollback()
            raise

    def executemany(self, query, param_list) -> None:
        """Run the statement once per parameter tuple, then commit.

        Rolls back on any failure. (Original annotated ``-> list`` but
        never returned one.)
        """
        try:
            with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
                cursor.executemany(query, param_list)
            self.connection.commit()
        except Exception:
            self.connection.rollback()
            raise

    ## User Management Methods
    def save_user(self, username, email, password_hash):
        """Insert a new user row; raises on duplicate username/email."""
        query = """
            INSERT INTO users (username, email, password_hash)
            VALUES (%s, %s, %s)
        """
        self.execute(query, (username, email, password_hash))

    def get_user_by_username(self, username) -> dict | None:
        """Return the user row as a dict, or None if not found."""
        query = "SELECT id, username, email, password_hash FROM users WHERE username = %s"
        result = self.execute(query, (username,), fetch=True)
        return result[0] if result else None

    def get_user_by_email(self, email) -> dict | None:
        """Return the user row as a dict, or None if not found."""
        query = "SELECT id, username, email, password_hash FROM users WHERE email = %s"
        result = self.execute(query, (email,), fetch=True)
        return result[0] if result else None

    # Dataset Management Methods
    def save_dataset_info(self, user_id: int, dataset_name: str, topics: dict) -> int | None:
        """Insert a dataset record and return its generated id.

        ``topics`` is stored as JSONB via psycopg2's Json adapter.
        """
        query = """
            INSERT INTO datasets (user_id, name, topics)
            VALUES (%s, %s, %s)
            RETURNING id
        """
        result = self.execute(query, (user_id, dataset_name, Json(topics)), fetch=True)
        return result[0]["id"] if result else None

    def save_dataset_content(self, dataset_id: int, event_data: pd.DataFrame):
        """Bulk-insert the enriched event rows belonging to ``dataset_id``.

        Uses ``execute_batch`` to cut round-trips versus plain
        ``executemany``. Optional columns are read with ``row.get`` so
        missing values become NULL; ``ner_entities`` is wrapped in Json
        to hit the JSONB column.
        """
        query = """
            INSERT INTO events (
                dataset_id,
                type,
                parent_id,
                author,
                content,
                timestamp,
                date,
                dt,
                hour,
                weekday,
                reply_to,
                source,
                topic,
                topic_confidence,
                ner_entities,
                emotion_anger,
                emotion_disgust,
                emotion_fear,
                emotion_joy,
                emotion_sadness
            )
            VALUES (
                %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s
            )
        """
        values = []
        for _, row in event_data.iterrows():
            values.append((
                dataset_id,
                row["type"],
                row["parent_id"],
                row["author"],
                row["content"],
                row["timestamp"],
                row["date"],
                row["dt"],
                row["hour"],
                row["weekday"],
                row.get("reply_to"),
                row["source"],
                row.get("topic"),
                row.get("topic_confidence"),
                Json(row["ner_entities"]) if row.get("ner_entities") else None,
                row.get("emotion_anger"),
                row.get("emotion_disgust"),
                row.get("emotion_fear"),
                row.get("emotion_joy"),
                row.get("emotion_sadness"),
            ))
        try:
            with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
                execute_batch(cursor, query, values)
            self.connection.commit()
        except Exception:
            # Roll back so a partially-applied batch is not half-committed
            # and the connection stays usable.
            self.connection.rollback()
            raise

    def get_dataset_content(self, dataset_id: int) -> pd.DataFrame:
        """Return every event of the dataset as a DataFrame (may be empty)."""
        query = "SELECT * FROM events WHERE dataset_id = %s"
        result = self.execute(query, (dataset_id,), fetch=True)
        return pd.DataFrame(result)

    def get_dataset_info(self, dataset_id: int) -> dict | None:
        """Return the dataset metadata row as a dict, or None if not found."""
        query = "SELECT * FROM datasets WHERE id = %s"
        result = self.execute(query, (dataset_id,), fetch=True)
        return result[0] if result else None

    def close(self):
        """Close the underlying connection; safe to call once."""
        if self.connection:
            self.connection.close()

51
db/schema.sql Normal file
View File

@@ -0,0 +1,51 @@
-- Application accounts. username and email are both unique login handles.
CREATE TABLE users (
    id SERIAL PRIMARY KEY,
    username VARCHAR(255) NOT NULL UNIQUE,
    email VARCHAR(255) NOT NULL UNIQUE,
    password_hash VARCHAR(255) NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- One uploaded dataset per row; topics holds the uploaded topic-bucket JSON.
CREATE TABLE datasets (
    id SERIAL PRIMARY KEY,
    user_id INTEGER NOT NULL,
    name VARCHAR(255) NOT NULL,
    description TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    topics JSONB,
    FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE
);

-- Enriched posts/comments belonging to a dataset.
CREATE TABLE events (
    /* Required Fields */
    id SERIAL PRIMARY KEY,
    dataset_id INTEGER NOT NULL,
    type VARCHAR(255) NOT NULL,
    author VARCHAR(255) NOT NULL,
    content TEXT NOT NULL,
    timestamp BIGINT NOT NULL,
    date DATE NOT NULL,
    dt TIMESTAMP NOT NULL,
    hour INTEGER NOT NULL,
    weekday VARCHAR(255) NOT NULL,
    /* Comments and Replies */
    parent_id VARCHAR(255),
    reply_to VARCHAR(255),
    source VARCHAR(255) NOT NULL,
    /* NLP Fields */
    topic VARCHAR(255),
    topic_confidence FLOAT,
    ner_entities JSONB,
    emotion_anger FLOAT,
    emotion_disgust FLOAT,
    emotion_fear FLOAT,
    emotion_joy FLOAT,
    emotion_sadness FLOAT,
    FOREIGN KEY (dataset_id) REFERENCES datasets(id) ON DELETE CASCADE
);

-- PostgreSQL does not index FK columns automatically; every content read
-- filters events by dataset_id, and dataset listings filter by user_id,
-- so index both to avoid sequential scans as data grows.
CREATE INDEX idx_events_dataset_id ON events (dataset_id);
CREATE INDEX idx_datasets_user_id ON datasets (user_id);

15
docker-compose.yml Normal file
View File

@@ -0,0 +1,15 @@
# Local development PostgreSQL. Credentials/DB name come from .env
# (POSTGRES_USER / POSTGRES_PASSWORD / POSTGRES_DB).
services:
  postgres:
    image: postgres:16
    container_name: postgres_db
    restart: unless-stopped
    env_file:
      - .env
    ports:
      - "5432:5432"
    volumes:
      # FIX: the named volume postgres_data was declared but never used —
      # the data dir was bind-mounted to ./db/postgres_vol instead. Use the
      # named volume so the declaration is meaningful and data survives
      # container recreation without polluting the repo tree.
      - postgres_data:/var/lib/postgresql/data
      # schema.sql runs automatically on FIRST initialization only.
      - ./db/schema.sql:/docker-entrypoint-initdb.d/schema.sql

volumes:
  postgres_data:

View File

@@ -1,10 +1,12 @@
beautifulsoup4==4.14.3 beautifulsoup4==4.14.3
Flask==3.1.2 Flask==3.1.3
flask_cors==6.0.2 flask_cors==6.0.2
google_api_python_client==2.188.0 google_api_python_client==2.188.0
keybert==0.9.0
nltk==3.9.2 nltk==3.9.2
pandas==3.0.0 numpy==2.4.2
pandas==3.0.1
psycopg2==2.9.11
psycopg2_binary==2.9.11
python-dotenv==1.2.1 python-dotenv==1.2.1
Requests==2.32.5 Requests==2.32.5
sentence_transformers==5.2.2 sentence_transformers==5.2.2

View File

@@ -6,13 +6,12 @@ from typing import Any
class CulturalAnalysis: class CulturalAnalysis:
def __init__(self, df: pd.DataFrame, content_col: str = "content", topic_col: str = "topic"): def __init__(self, content_col: str = "content", topic_col: str = "topic"):
self.df = df
self.content_col = content_col self.content_col = content_col
self.topic_col = topic_col self.topic_col = topic_col
def get_identity_markers(self): def get_identity_markers(self, original_df: pd.DataFrame) -> dict[str, Any]:
df = self.df.copy() df = original_df.copy()
s = df[self.content_col].fillna("").astype(str).str.lower() s = df[self.content_col].fillna("").astype(str).str.lower()
in_group_words = {"we", "us", "our", "ourselves"} in_group_words = {"we", "us", "our", "ourselves"}
@@ -60,8 +59,8 @@ class CulturalAnalysis:
return result return result
def get_stance_markers(self) -> dict[str, Any]: def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
s = self.df[self.content_col].fillna("").astype(str) s = df[self.content_col].fillna("").astype(str)
hedges = { hedges = {
"maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem", "maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem",
@@ -104,13 +103,11 @@ class CulturalAnalysis:
"permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3), "permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3),
} }
def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]: def get_avg_emotions_per_entity(self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]:
if "entities" not in self.df.columns: if "entities" not in df.columns:
return {"entity_emotion_avg": {}} return {"entity_emotion_avg": {}}
df = self.df
emotion_cols = [c for c in df.columns if c.startswith("emotion_")] emotion_cols = [c for c in df.columns if c.startswith("emotion_")]
entity_counter = Counter() entity_counter = Counter()
for row in df["entities"].dropna(): for row in df["entities"].dropna():

View File

@@ -1,18 +1,15 @@
import pandas as pd import pandas as pd
class EmotionalAnalysis: class EmotionalAnalysis:
def __init__(self, df: pd.DataFrame): def avg_emotion_by_topic(self, df: pd.DataFrame) -> dict:
self.df = df
def avg_emotion_by_topic(self) -> dict:
emotion_cols = [ emotion_cols = [
col for col in self.df.columns col for col in df.columns
if col.startswith("emotion_") if col.startswith("emotion_")
] ]
counts = ( counts = (
self.df[ df[
(self.df["topic"] != "Misc") (df["topic"] != "Misc")
] ]
.groupby("topic") .groupby("topic")
.size() .size()
@@ -20,8 +17,8 @@ class EmotionalAnalysis:
) )
avg_emotion_by_topic = ( avg_emotion_by_topic = (
self.df[ df[
(self.df["topic"] != "Misc") (df["topic"] != "Misc")
] ]
.groupby("topic")[emotion_cols] .groupby("topic")[emotion_cols]
.mean() .mean()

View File

@@ -5,8 +5,7 @@ from collections import Counter
class InteractionAnalysis: class InteractionAnalysis:
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): def __init__(self, word_exclusions: set[str]):
self.df = df
self.word_exclusions = word_exclusions self.word_exclusions = word_exclusions
def _tokenize(self, text: str): def _tokenize(self, text: str):
@@ -14,9 +13,9 @@ class InteractionAnalysis:
return [t for t in tokens if t not in self.word_exclusions] return [t for t in tokens if t not in self.word_exclusions]
def _vocab_richness_per_user( def _vocab_richness_per_user(
self, min_words: int = 20, top_most_used_words: int = 100 self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100
) -> list: ) -> list:
df = self.df.copy() df = df.copy()
df["content"] = df["content"].fillna("").astype(str).str.lower() df["content"] = df["content"].fillna("").astype(str).str.lower()
df["tokens"] = df["content"].apply(self._tokenize) df["tokens"] = df["content"].apply(self._tokenize)
@@ -58,10 +57,8 @@ class InteractionAnalysis:
return rows return rows
def top_users(self) -> list: def top_users(self, df: pd.DataFrame) -> list:
counts = ( counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)
self.df.groupby(["author", "source"]).size().sort_values(ascending=False)
)
top_users = [ top_users = [
{"author": author, "source": source, "count": int(count)} {"author": author, "source": source, "count": int(count)}
@@ -70,14 +67,14 @@ class InteractionAnalysis:
return top_users return top_users
def per_user_analysis(self) -> dict: def per_user_analysis(self, df: pd.DataFrame) -> dict:
per_user = self.df.groupby(["author", "type"]).size().unstack(fill_value=0) per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
emotion_cols = [col for col in self.df.columns if col.startswith("emotion_")] emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
avg_emotions_by_author = {} avg_emotions_by_author = {}
if emotion_cols: if emotion_cols:
avg_emotions = self.df.groupby("author")[emotion_cols].mean().fillna(0.0) avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
avg_emotions_by_author = { avg_emotions_by_author = {
author: {emotion: float(score) for emotion, score in row.items()} author: {emotion: float(score) for emotion, score in row.items()}
for author, row in avg_emotions.iterrows() for author, row in avg_emotions.iterrows()
@@ -97,7 +94,7 @@ class InteractionAnalysis:
per_user = per_user.sort_values("comment_post_ratio", ascending=True) per_user = per_user.sort_values("comment_post_ratio", ascending=True)
per_user_records = per_user.reset_index().to_dict(orient="records") per_user_records = per_user.reset_index().to_dict(orient="records")
vocab_rows = self._vocab_richness_per_user() vocab_rows = self._vocab_richness_per_user(df)
vocab_by_author = {row["author"]: row for row in vocab_rows} vocab_by_author = {row["author"]: row for row in vocab_rows}
# merge vocab richness + per_user information # merge vocab richness + per_user information
@@ -112,7 +109,14 @@ class InteractionAnalysis:
"comment_post_ratio": float(row.get("comment_post_ratio", 0)), "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
"comment_share": float(row.get("comment_share", 0)), "comment_share": float(row.get("comment_share", 0)),
"avg_emotions": avg_emotions_by_author.get(author, {}), "avg_emotions": avg_emotions_by_author.get(author, {}),
"vocab": vocab_by_author.get(author, {"vocab_richness": 0, "avg_words_per_event": 0, "top_words": []}), "vocab": vocab_by_author.get(
author,
{
"vocab_richness": 0,
"avg_words_per_event": 0,
"top_words": [],
},
),
} }
) )
@@ -120,13 +124,13 @@ class InteractionAnalysis:
return merged_users return merged_users
def interaction_graph(self): def interaction_graph(self, df: pd.DataFrame):
interactions = {a: {} for a in self.df["author"].dropna().unique()} interactions = {a: {} for a in df["author"].dropna().unique()}
# reply_to refers to the comment id, this allows us to map comment ids to usernames # reply_to refers to the comment id, this allows us to map comment ids to usernames
id_to_author = self.df.set_index("id")["author"].to_dict() id_to_author = df.set_index("id")["author"].to_dict()
for _, row in self.df.iterrows(): for _, row in df.iterrows():
a = row["author"] a = row["author"]
reply_id = row["reply_to"] reply_id = row["reply_to"]
@@ -141,10 +145,10 @@ class InteractionAnalysis:
return interactions return interactions
def average_thread_depth(self): def average_thread_depth(self, df: pd.DataFrame):
depths = [] depths = []
id_to_reply = self.df.set_index("id")["reply_to"].to_dict() id_to_reply = df.set_index("id")["reply_to"].to_dict()
for _, row in self.df.iterrows(): for _, row in df.iterrows():
depth = 0 depth = 0
current_id = row["id"] current_id = row["id"]
@@ -163,16 +167,16 @@ class InteractionAnalysis:
return round(sum(depths) / len(depths), 2) return round(sum(depths) / len(depths), 2)
def average_thread_length_by_emotion(self): def average_thread_length_by_emotion(self, df: pd.DataFrame):
emotion_exclusions = {"emotion_neutral", "emotion_surprise"} emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
emotion_cols = [ emotion_cols = [
c c
for c in self.df.columns for c in df.columns
if c.startswith("emotion_") and c not in emotion_exclusions if c.startswith("emotion_") and c not in emotion_exclusions
] ]
id_to_reply = self.df.set_index("id")["reply_to"].to_dict() id_to_reply = df.set_index("id")["reply_to"].to_dict()
length_cache = {} length_cache = {}
def thread_length_from(start_id): def thread_length_from(start_id):
@@ -211,7 +215,7 @@ class InteractionAnalysis:
emotion_to_lengths = {} emotion_to_lengths = {}
# Fill NaNs in emotion cols to avoid max() issues # Fill NaNs in emotion cols to avoid max() issues
emo_df = self.df[["id"] + emotion_cols].copy() emo_df = df[["id"] + emotion_cols].copy()
emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0) emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
for _, row in emo_df.iterrows(): for _, row in emo_df.iterrows():

View File

@@ -4,9 +4,9 @@ import re
from collections import Counter from collections import Counter
from itertools import islice from itertools import islice
class LinguisticAnalysis: class LinguisticAnalysis:
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): def __init__(self, word_exclusions: set[str]):
self.df = df
self.word_exclusions = word_exclusions self.word_exclusions = word_exclusions
def _tokenize(self, text: str): def _tokenize(self, text: str):
@@ -21,22 +21,13 @@ class LinguisticAnalysis:
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text) text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
return text return text
def word_frequencies(self, limit: int = 100) -> dict: def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
texts = ( texts = df["content"].dropna().astype(str).str.lower()
self.df["content"]
.dropna()
.astype(str)
.str.lower()
)
words = [] words = []
for text in texts: for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text) tokens = re.findall(r"\b[a-z]{3,}\b", text)
words.extend( words.extend(w for w in tokens if w not in self.word_exclusions)
w for w in tokens
if w not in self.word_exclusions
)
counts = Counter(words) counts = Counter(words)
@@ -49,8 +40,8 @@ class LinguisticAnalysis:
return word_frequencies.to_dict(orient="records") return word_frequencies.to_dict(orient="records")
def ngrams(self, n=2, limit=100): def ngrams(self, df: pd.DataFrame, n=2, limit=100):
texts = self.df["content"].dropna().astype(str).apply(self._clean_text).str.lower() texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
all_ngrams = [] all_ngrams = []
for text in texts: for text in texts:

View File

@@ -1,16 +1,14 @@
import pandas as pd import pandas as pd
class TemporalAnalysis:
def __init__(self, df: pd.DataFrame):
self.df = df
def avg_reply_time_per_emotion(self) -> dict: class TemporalAnalysis:
df = self.df.copy() def avg_reply_time_per_emotion(self, df: pd.DataFrame) -> list[dict]:
df = df.copy()
replies = df[ replies = df[
(df["type"] == "comment") & (df["type"] == "comment")
(df["reply_to"].notna()) & & (df["reply_to"].notna())
(df["reply_to"] != "") & (df["reply_to"] != "")
] ]
id_to_time = df.set_index("id")["dt"].to_dict() id_to_time = df.set_index("id")["dt"].to_dict()
@@ -25,42 +23,45 @@ class TemporalAnalysis:
return (row["dt"] - parent_time).total_seconds() return (row["dt"] - parent_time).total_seconds()
replies["reply_time"] = replies.apply(compute_reply_time, axis=1) replies["reply_time"] = replies.apply(compute_reply_time, axis=1)
emotion_cols = [col for col in df.columns if col.startswith("emotion_") and col not in ("emotion_neutral", "emotion_surprise")] emotion_cols = [
col
for col in df.columns
if col.startswith("emotion_")
and col not in ("emotion_neutral", "emotion_surprise")
]
replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1) replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1)
grouped = ( grouped = (
replies replies.groupby("dominant_emotion")["reply_time"]
.groupby("dominant_emotion")["reply_time"]
.agg(["mean", "count"]) .agg(["mean", "count"])
.reset_index() .reset_index()
) )
return grouped.to_dict(orient="records") return grouped.to_dict(orient="records")
def posts_per_day(self) -> dict: def posts_per_day(self, df: pd.DataFrame) -> list[dict]:
per_day = ( per_day = df.groupby("date").size().reset_index(name="count")
self.df.groupby("date")
.size()
.reset_index(name="count")
)
return per_day.to_dict(orient="records") return per_day.to_dict(orient="records")
def heatmap(self) -> dict: def heatmap(self, df: pd.DataFrame) -> list[dict]:
weekday_order = [ weekday_order = [
"Monday", "Tuesday", "Wednesday", "Monday",
"Thursday", "Friday", "Saturday", "Sunday" "Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday",
] ]
self.df["weekday"] = pd.Categorical( df = df.copy()
self.df["weekday"], df["weekday"] = pd.Categorical(
categories=weekday_order, df["weekday"], categories=weekday_order, ordered=True
ordered=True
) )
heatmap = ( heatmap = (
self.df df.groupby(["weekday", "hour"], observed=True)
.groupby(["weekday", "hour"], observed=True)
.size() .size()
.unstack(fill_value=0) .unstack(fill_value=0)
.reindex(columns=range(24), fill_value=0) .reindex(columns=range(24), fill_value=0)

View File

@@ -1,23 +1,110 @@
import os
from dotenv import load_dotenv
from flask import Flask, jsonify, request from flask import Flask, jsonify, request
from flask_cors import CORS from flask_cors import CORS
from flask_bcrypt import Bcrypt
from flask_jwt_extended import (
JWTManager,
create_access_token,
jwt_required,
get_jwt_identity,
)
from server.stat_gen import StatGen from server.stat_gen import StatGen
from server.dataset_processor import DatasetProcessor
from db.database import PostgresConnector
from server.auth import AuthManager
import pandas as pd import pandas as pd
import traceback import traceback
import json import json
app = Flask(__name__) app = Flask(__name__)
db = PostgresConnector()
# Allow for CORS from localhost:5173 # Env Variables
CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}}) load_dotenv()
frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173")
jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this")
jwt_access_token_expires = int(
os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)
) # Default to 20 minutes
# Global State # Flask Configuration
posts_df = pd.read_json('small.jsonl', lines=True) CORS(app, resources={r"/*": {"origins": frontend_url}})
with open("topic_buckets.json", "r", encoding="utf-8") as f: app.config["JWT_SECRET_KEY"] = jwt_secret_key
domain_topics = json.load(f) app.config["JWT_ACCESS_TOKEN_EXPIRES"] = jwt_access_token_expires
stat_obj = StatGen(posts_df, domain_topics)
@app.route('/upload', methods=['POST']) bcrypt = Bcrypt(app)
jwt = JWTManager(app)
auth_manager = AuthManager(db, bcrypt)
stat_gen = StatGen()
@app.route("/register", methods=["POST"])
def register_user():
data = request.get_json()
if (
not data
or "username" not in data
or "email" not in data
or "password" not in data
):
return jsonify({"error": "Missing username, email, or password"}), 400
username = data["username"]
email = data["email"]
password = data["password"]
try:
auth_manager.register_user(username, email, password)
except ValueError as e:
return jsonify({"error": str(e)}), 400
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
print(f"Registered new user: {username}")
return jsonify({"message": f"User '{username}' registered successfully"}), 200
@app.route("/login", methods=["POST"])
def login_user():
data = request.get_json()
if not data or "username" not in data or "password" not in data:
return jsonify({"error": "Missing username or password"}), 400
username = data["username"]
password = data["password"]
try:
user = auth_manager.authenticate_user(username, password)
if user:
access_token = create_access_token(identity=str(user["id"]))
return jsonify({"access_token": access_token}), 200
else:
return jsonify({"error": "Invalid username or password"}), 401
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/profile", methods=["GET"])
@jwt_required()
def profile():
current_user = get_jwt_identity()
return jsonify(
message="Access granted", user=auth_manager.get_user_by_id(current_user)
), 200
@app.route("/upload", methods=["POST"])
@jwt_required()
def upload_data(): def upload_data():
if "posts" not in request.files or "topics" not in request.files: if "posts" not in request.files or "topics" not in request.files:
return jsonify({"error": "Missing required files or form data"}), 400 return jsonify({"error": "Missing required files or form data"}), 400
@@ -28,172 +115,241 @@ def upload_data():
if post_file.filename == "" or topic_file == "": if post_file.filename == "" or topic_file == "":
return jsonify({"error": "Empty filename"}), 400 return jsonify({"error": "Empty filename"}), 400
if not post_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'): if not post_file.filename.endswith(".jsonl") or not topic_file.filename.endswith(
return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400 ".json"
):
return jsonify(
{"error": "Invalid file type. Only .jsonl and .json files are allowed."}
), 400
try: try:
global stat_obj current_user = get_jwt_identity()
posts_df = pd.read_json(post_file, lines=True) posts_df = pd.read_json(post_file, lines=True, convert_dates=False)
stat_obj = StatGen(posts_df, json.load(topic_file)) topics = json.load(topic_file)
return jsonify({"message": "File uploaded successfully", "event_count": len(stat_obj.df)}), 200
processor = DatasetProcessor(posts_df, topics)
enriched_df = processor.enrich()
dataset_id = db.save_dataset_info(
current_user, f"dataset_{current_user}", topics
)
db.save_dataset_content(dataset_id, enriched_df)
return jsonify(
{"message": "File uploaded successfully", "event_count": len(enriched_df), "dataset_id": dataset_id}
), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400 return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
except Exception as e: except Exception as e:
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/dataset', methods=['GET'])
def get_dataset():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
return stat_obj.df.to_json(orient="records"), 200, {"Content-Type": "application/json"} @app.route("/dataset/<int:dataset_id>", methods=["GET"])
@jwt_required()
def get_dataset(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
@app.route('/stats/content', methods=['GET']) if dataset.get("user_id") != int(current_user):
def word_frequencies(): return jsonify({"error": "Unauthorized access to dataset"}), 403
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
dataset_content = db.get_dataset_content(dataset_id)
if dataset_content.empty:
return jsonify({"error": "Dataset content not found"}), 404
return jsonify(dataset_content.to_dict(orient="records")), 200
@app.route("/dataset/<int:dataset_id>/content", methods=["GET"])
@jwt_required()
def content_endpoint(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_content_analysis()), 200 return jsonify(stat_gen.get_content_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/stats/summary', methods=["GET"])
def get_summary(): @app.route("/dataset/<int:dataset_id>/summary", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_summary(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.summary()), 200 return jsonify(stat_gen.summary(dataset_content)), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/time", methods=["GET"])
def get_time_analysis():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.get_time_analysis()), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/user", methods=["GET"])
def get_user_analysis(): @app.route("/dataset/<int:dataset_id>/time", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_time_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_user_analysis()), 200 return jsonify(stat_gen.get_time_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/cultural", methods=["GET"])
def get_cultural_analysis(): @app.route("/dataset/<int:dataset_id>/user", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_user_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_cultural_analysis()), 200 return jsonify(stat_gen.get_user_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/interaction", methods=["GET"])
def get_interaction_analysis(): @app.route("/dataset/<int:dataset_id>/cultural", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_cultural_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_interactional_analysis()), 200 return jsonify(stat_gen.get_cultural_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/filter/query', methods=["POST"])
def filter_query():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
data = request.get_json(silent=True) or {} @app.route("/dataset/<int:dataset_id>/interaction", methods=["GET"])
@jwt_required()
def get_interaction_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if "query" not in data: if dataset.get("user_id") != int(current_user):
return jsonify(stat_obj.df.to_dict(orient="records")), 200 return jsonify({"error": "Unauthorized access to dataset"}), 403
query = data["query"] dataset_content = db.get_dataset_content(dataset_id)
filtered_df = stat_obj.filter_by_query(query)
return jsonify(filtered_df), 200
@app.route('/filter/time', methods=["POST"])
def filter_time():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
data = request.get_json(silent=True)
if not data:
return jsonify({"error": "Invalid or missing JSON body"}), 400
if "start" not in data or "end" not in data:
return jsonify({"error": "Please include both start and end dates"}), 400
try: try:
start = pd.to_datetime(data["start"], utc=True) return jsonify(stat_gen.get_interactional_analysis(dataset_content)), 200
end = pd.to_datetime(data["end"], utc=True) except ValueError as e:
filtered_df = stat_obj.set_time_range(start, end) return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
return jsonify(filtered_df), 200
except Exception:
return jsonify({"error": "Invalid datetime format"}), 400
@app.route('/filter/sources', methods=["POST"])
def filter_sources():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
data = request.get_json(silent=True)
if not data:
return jsonify({"error": "Invalid or missing JSON body"}), 400
if "sources" not in data:
return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400
try:
filtered_df = stat_obj.filter_data_sources(data["sources"])
return jsonify(filtered_df), 200
except ValueError:
return jsonify({"error": "Please enable at least one data source"}), 400
except Exception as e:
return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500
@app.route('/filter/reset', methods=["GET"])
def reset_dataset():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
try:
stat_obj.reset_dataset()
return jsonify({"success": "Dataset successfully reset"})
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
# @app.route("/filter/query", methods=["POST"])
# def filter_query():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# data = request.get_json(silent=True) or {}
# if "query" not in data:
# return jsonify(stat_obj.df.to_dict(orient="records")), 200
# query = data["query"]
# filtered_df = stat_obj.filter_by_query(query)
# return jsonify(filtered_df), 200
# @app.route("/filter/time", methods=["POST"])
# def filter_time():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# data = request.get_json(silent=True)
# if not data:
# return jsonify({"error": "Invalid or missing JSON body"}), 400
# if "start" not in data or "end" not in data:
# return jsonify({"error": "Please include both start and end dates"}), 400
# try:
# start = pd.to_datetime(data["start"], utc=True)
# end = pd.to_datetime(data["end"], utc=True)
# filtered_df = stat_obj.set_time_range(start, end)
# return jsonify(filtered_df), 200
# except Exception:
# return jsonify({"error": "Invalid datetime format"}), 400
# @app.route("/filter/sources", methods=["POST"])
# def filter_sources():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# data = request.get_json(silent=True)
# if not data:
# return jsonify({"error": "Invalid or missing JSON body"}), 400
# if "sources" not in data:
# return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400
# try:
# filtered_df = stat_obj.filter_data_sources(data["sources"])
# return jsonify(filtered_df), 200
# except ValueError:
# return jsonify({"error": "Please enable at least one data source"}), 400
# except Exception as e:
# return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500
# @app.route("/filter/reset", methods=["GET"])
# def reset_dataset():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# try:
# stat_obj.reset_dataset()
# return jsonify({"success": "Dataset successfully reset"})
# except Exception as e:
# print(traceback.format_exc())
# return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
if __name__ == "__main__": if __name__ == "__main__":
app.run(debug=True) app.run(debug=True)

29
server/auth.py Normal file
View File

@@ -0,0 +1,29 @@
from db.database import PostgresConnector
from flask_bcrypt import Bcrypt
class AuthManager:
    """User registration and credential verification on top of the DB layer."""

    def __init__(self, db: "PostgresConnector", bcrypt: "Bcrypt"):
        self.db = db          # persistence layer (user lookups / inserts)
        self.bcrypt = bcrypt  # password hashing backend

    def register_user(self, username, email, password):
        """Create a new user with a bcrypt-hashed password.

        Raises:
            ValueError: if the email or the username is already registered.
        """
        # Validate uniqueness BEFORE hashing: bcrypt is deliberately slow,
        # so don't pay its cost for a registration that will be rejected.
        if self.db.get_user_by_email(email):
            raise ValueError("Email already registered")
        if self.db.get_user_by_username(username):
            raise ValueError("Username already taken")
        hashed_password = self.bcrypt.generate_password_hash(password).decode("utf-8")
        self.db.save_user(username, email, hashed_password)

    def authenticate_user(self, username, password):
        """Return the stored user record if credentials match, else None."""
        user = self.db.get_user_by_username(username)
        if user and self.bcrypt.check_password_hash(user["password_hash"], password):
            return user
        return None

    def get_user_by_id(self, user_id):
        """Fetch a user's public fields (no password hash) by primary key, or None."""
        query = "SELECT id, username, email FROM users WHERE id = %s"
        result = self.db.execute(query, (user_id,), fetch=True)
        return result[0] if result else None

View File

@@ -0,0 +1,39 @@
import pandas as pd
from server.analysis.nlp import NLP
class DatasetProcessor:
    """Flatten a posts DataFrame (with nested comment lists) into one
    event-per-row frame, then enrich it with time and NLP features."""

    def __init__(self, df, topics):
        self.df = self._explode_comments(df)
        self.topics = topics
        self.nlp = NLP(self.df, "title", "content", self.topics)

    def _explode_comments(self, df) -> pd.DataFrame:
        """Return posts and their comments as a single flat frame.

        Posts get type='post' / parent_id=None; comments get type='comment'
        with parent_id taken from the comment's own post_id field.
        """
        exploded = df[["id", "comments"]].explode("comments")
        keep = exploded["comments"].map(lambda c: isinstance(c, dict))
        comments = pd.json_normalize(exploded.loc[keep, "comments"])
        comments["type"] = "comment"
        comments["parent_id"] = comments.get("post_id")

        posts = df.drop(columns=["comments"])
        posts["type"] = "post"
        posts["parent_id"] = None

        merged = pd.concat([posts, comments])
        return merged.drop(columns=["post_id"], errors="ignore")

    def enrich(self) -> pd.DataFrame:
        """Add time-derived and NLP-derived columns in place; return the frame."""
        self.df["timestamp"] = pd.to_numeric(self.df["timestamp"], errors="raise")
        self.df["date"] = pd.to_datetime(self.df["timestamp"], unit="s").dt.date
        stamps = pd.to_datetime(self.df["timestamp"], unit="s", utc=True)
        self.df["dt"] = stamps
        self.df["hour"] = stamps.dt.hour
        self.df["weekday"] = stamps.dt.day_name()
        self.nlp.add_emotion_cols()
        self.nlp.add_topic_col()
        self.nlp.add_ner_cols()
        return self.df

View File

@@ -1,170 +1,135 @@
import pandas as pd
import datetime import datetime
import nltk
import nltk
import pandas as pd
from nltk.corpus import stopwords from nltk.corpus import stopwords
from server.analysis.nlp import NLP
from server.analysis.temporal import TemporalAnalysis from server.analysis.cultural import CulturalAnalysis
from server.analysis.emotional import EmotionalAnalysis from server.analysis.emotional import EmotionalAnalysis
from server.analysis.interactional import InteractionAnalysis from server.analysis.interactional import InteractionAnalysis
from server.analysis.linguistic import LinguisticAnalysis from server.analysis.linguistic import LinguisticAnalysis
from server.analysis.cultural import CulturalAnalysis from server.analysis.temporal import TemporalAnalysis
DOMAIN_STOPWORDS = { DOMAIN_STOPWORDS = {
"www", "https", "http", "www",
"boards", "boardsie", "https",
"comment", "comments", "http",
"discussion", "thread", "boards",
"post", "posts", "boardsie",
"would", "get", "one" "comment",
"comments",
"discussion",
"thread",
"post",
"posts",
"would",
"get",
"one",
} }
nltk.download('stopwords') nltk.download("stopwords")
EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS EXCLUDE_WORDS = set(stopwords.words("english")) | DOMAIN_STOPWORDS
class StatGen: class StatGen:
def __init__(self, df: pd.DataFrame, domain_topics: dict) -> None: def __init__(self) -> None:
comments_df = df[["id", "comments"]].explode("comments") self.temporal_analysis = TemporalAnalysis()
comments_df = comments_df[comments_df["comments"].apply(lambda x: isinstance(x, dict))] self.emotional_analysis = EmotionalAnalysis()
comments_df = pd.json_normalize(comments_df["comments"]) self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS)
self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS)
self.cultural_analysis = CulturalAnalysis()
posts_df = df.drop(columns=["comments"]) def get_time_analysis(self, df: pd.DataFrame) -> dict:
posts_df["type"] = "post"
posts_df["parent_id"] = None
comments_df["type"] = "comment"
comments_df["parent_id"] = comments_df.get("post_id")
self.domain_topics = domain_topics
self.df = pd.concat([posts_df, comments_df])
self.df.drop(columns=["post_id"], inplace=True, errors="ignore")
self.nlp = NLP(self.df, "title", "content", domain_topics)
self.nlp.add_emotion_cols()
self.nlp.add_topic_col()
self.nlp.add_ner_cols()
self._add_time_cols(self.df)
self.temporal_analysis = TemporalAnalysis(self.df)
self.emotional_analysis = EmotionalAnalysis(self.df)
self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
self.cultural_analysis = CulturalAnalysis(self.df)
self.original_df = self.df.copy(deep=True)
## Private Methods
def _add_time_cols(self, df: pd.DataFrame) -> None:
df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')
df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date
df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
df["hour"] = df["dt"].dt.hour
df["weekday"] = df["dt"].dt.day_name()
## Public
# topics over time
# emotions over time
def get_time_analysis(self) -> dict:
return { return {
"events_per_day": self.temporal_analysis.posts_per_day(), "events_per_day": self.temporal_analysis.posts_per_day(df),
"weekday_hour_heatmap": self.temporal_analysis.heatmap() "weekday_hour_heatmap": self.temporal_analysis.heatmap(df),
} }
# average topic duration def get_content_analysis(self, df: pd.DataFrame) -> dict:
def get_content_analysis(self) -> dict:
return { return {
"word_frequencies": self.linguistic_analysis.word_frequencies(), "word_frequencies": self.linguistic_analysis.word_frequencies(df),
"common_two_phrases": self.linguistic_analysis.ngrams(), "common_two_phrases": self.linguistic_analysis.ngrams(df),
"common_three_phrases": self.linguistic_analysis.ngrams(n=3), "common_three_phrases": self.linguistic_analysis.ngrams(df, n=3),
"average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(), "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(df),
"reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion() "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(df),
} }
# average emotion per user def get_user_analysis(self, df: pd.DataFrame) -> dict:
# average chain length
def get_user_analysis(self) -> dict:
return { return {
"top_users": self.interaction_analysis.top_users(), "top_users": self.interaction_analysis.top_users(df),
"users": self.interaction_analysis.per_user_analysis() "users": self.interaction_analysis.per_user_analysis(df),
"interaction_graph": self.interaction_analysis.interaction_graph(df),
} }
# average / max thread depth def get_interactional_analysis(self, df: pd.DataFrame) -> dict:
# high engagment threads based on volume
def get_interactional_analysis(self) -> dict:
return { return {
"average_thread_depth": self.interaction_analysis.average_thread_depth(), "average_thread_depth": self.interaction_analysis.average_thread_depth(df),
"average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(), "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(df),
"interaction_graph": self.interaction_analysis.interaction_graph()
} }
# detect community jargon def get_cultural_analysis(self, df: pd.DataFrame) -> dict:
# in-group and out-group linguistic markers
def get_cultural_analysis(self) -> dict:
return { return {
"identity_markers": self.cultural_analysis.get_identity_markers(), "identity_markers": self.cultural_analysis.get_identity_markers(df),
"stance_markers": self.cultural_analysis.get_stance_markers(), "stance_markers": self.cultural_analysis.get_stance_markers(df),
"entity_salience": self.cultural_analysis.get_avg_emotions_per_entity() "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(df),
} }
def summary(self) -> dict: def summary(self, df: pd.DataFrame) -> dict:
total_posts = (self.df["type"] == "post").sum() total_posts = (df["type"] == "post").sum()
total_comments = (self.df["type"] == "comment").sum() total_comments = (df["type"] == "comment").sum()
events_per_user = df.groupby("author").size()
events_per_user = self.df.groupby("author").size()
return { return {
"total_events": int(len(self.df)), "total_events": int(len(df)),
"total_posts": int(total_posts), "total_posts": int(total_posts),
"total_comments": int(total_comments), "total_comments": int(total_comments),
"unique_users": int(events_per_user.count()), "unique_users": int(events_per_user.count()),
"comments_per_post": round(total_comments / max(total_posts, 1), 2), "comments_per_post": round(total_comments / max(total_posts, 1), 2),
"lurker_ratio": round((events_per_user == 1).mean(), 2), "lurker_ratio": round((events_per_user == 1).mean(), 2),
"time_range": { "time_range": {
"start": int(self.df["dt"].min().timestamp()), "start": int(df["dt"].min().timestamp()),
"end": int(self.df["dt"].max().timestamp()) "end": int(df["dt"].max().timestamp()),
}, },
"sources": self.df["source"].dropna().unique().tolist() "sources": df["source"].dropna().unique().tolist(),
} }
def filter_by_query(self, search_query: str) -> dict: # def filter_by_query(self, df: pd.DataFrame, search_query: str) -> dict:
self.df = self.df[ # filtered_df = df[df["content"].str.contains(search_query, na=False)]
self.df["content"].str.contains(search_query)
]
return { # return {
"rows": len(self.df), # "rows": len(filtered_df),
"data": self.df.to_dict(orient="records") # "data": filtered_df.to_dict(orient="records"),
} # }
def set_time_range(self, start: datetime.datetime, end: datetime.datetime) -> dict: # def set_time_range(
self.df = self.df[ # self,
(self.df["dt"] >= start) & # original_df: pd.DataFrame,
(self.df["dt"] <= end) # start: datetime.datetime,
] # end: datetime.datetime,
# ) -> dict:
# df = self._prepare_df(original_df)
# filtered_df = df[(df["dt"] >= start) & (df["dt"] <= end)]
return { # return {
"rows": len(self.df), # "rows": len(filtered_df),
"data": self.df.to_dict(orient="records") # "data": filtered_df.to_dict(orient="records"),
} # }
""" # def filter_data_sources(
Input is a hash map (source_name: str -> enabled: bool) # self, original_df: pd.DataFrame, data_sources: dict
""" # ) -> dict:
def filter_data_sources(self, data_sources: dict) -> dict: # df = self._prepare_df(original_df)
enabled_sources = [src for src, enabled in data_sources.items() if enabled] # enabled_sources = [src for src, enabled in data_sources.items() if enabled]
if not enabled_sources: # if not enabled_sources:
raise ValueError("Please choose at least one data source") # raise ValueError("Please choose at least one data source")
self.df = self.df[self.df["source"].isin(enabled_sources)] # filtered_df = df[df["source"].isin(enabled_sources)]
return { # return {
"rows": len(self.df), # "rows": len(filtered_df),
"data": self.df.to_dict(orient="records") # "data": filtered_df.to_dict(orient="records"),
} # }
def reset_dataset(self) -> None:
self.df = self.original_df.copy(deep=True)
# def reset_dataset(self, original_df: pd.DataFrame) -> pd.DataFrame:
# return self._prepare_df(original_df)