From 07ab7529a96000915542c93059173fececf03ae3 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Sun, 1 Mar 2026 16:25:39 +0000 Subject: [PATCH] refactor: update analysis classes to accept DataFrame as parameter instead of instance variable --- server/analysis/cultural.py | 17 +- server/analysis/emotional.py | 15 +- server/analysis/interactional.py | 54 ++--- server/analysis/linguistic.py | 35 ++- server/analysis/temporal.py | 63 +++--- server/app.py | 365 ++++++++++++++++++------------- server/stat_gen.py | 243 +++++++++----------- 7 files changed, 403 insertions(+), 389 deletions(-) diff --git a/server/analysis/cultural.py b/server/analysis/cultural.py index ae55774..909233e 100644 --- a/server/analysis/cultural.py +++ b/server/analysis/cultural.py @@ -6,13 +6,12 @@ from typing import Any class CulturalAnalysis: - def __init__(self, df: pd.DataFrame, content_col: str = "content", topic_col: str = "topic"): - self.df = df + def __init__(self, content_col: str = "content", topic_col: str = "topic"): self.content_col = content_col self.topic_col = topic_col - def get_identity_markers(self): - df = self.df.copy() + def get_identity_markers(self, original_df: pd.DataFrame) -> dict[str, Any]: + df = original_df.copy() s = df[self.content_col].fillna("").astype(str).str.lower() in_group_words = {"we", "us", "our", "ourselves"} @@ -60,8 +59,8 @@ class CulturalAnalysis: return result - def get_stance_markers(self) -> dict[str, Any]: - s = self.df[self.content_col].fillna("").astype(str) + def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]: + s = df[self.content_col].fillna("").astype(str) hedges = { "maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem", @@ -104,13 +103,11 @@ class CulturalAnalysis: "permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3), } - def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]: - if "entities" not in self.df.columns: + def get_avg_emotions_per_entity(self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]: + if "entities" not in df.columns: return {"entity_emotion_avg": {}} - df = self.df emotion_cols = [c for c in df.columns if c.startswith("emotion_")] - entity_counter = Counter() for row in df["entities"].dropna(): diff --git a/server/analysis/emotional.py b/server/analysis/emotional.py index 10e897d..150aa20 100644 --- a/server/analysis/emotional.py +++ b/server/analysis/emotional.py @@ -1,18 +1,15 @@ import pandas as pd class EmotionalAnalysis: - def __init__(self, df: pd.DataFrame): - self.df = df - - def avg_emotion_by_topic(self) -> dict: + def avg_emotion_by_topic(self, df: pd.DataFrame) -> dict: emotion_cols = [ - col for col in self.df.columns + col for col in df.columns if col.startswith("emotion_") ] counts = ( - self.df[ - (self.df["topic"] != "Misc") + df[ + (df["topic"] != "Misc") ] .groupby("topic") .size() @@ -20,8 +17,8 @@ class EmotionalAnalysis: ) avg_emotion_by_topic = ( - self.df[ - (self.df["topic"] != "Misc") + df[ + (df["topic"] != "Misc") ] .groupby("topic")[emotion_cols] .mean() diff --git a/server/analysis/interactional.py b/server/analysis/interactional.py index 6fd5b3f..5c8ac3d 100644 --- a/server/analysis/interactional.py +++ b/server/analysis/interactional.py @@ -5,8 +5,7 @@ from collections import Counter class InteractionAnalysis: - def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): - self.df = df + def __init__(self, word_exclusions: set[str]): self.word_exclusions = word_exclusions def 
_tokenize(self, text: str): @@ -14,9 +13,9 @@ class InteractionAnalysis: return [t for t in tokens if t not in self.word_exclusions] def _vocab_richness_per_user( - self, min_words: int = 20, top_most_used_words: int = 100 + self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100 ) -> list: - df = self.df.copy() + df = df.copy() df["content"] = df["content"].fillna("").astype(str).str.lower() df["tokens"] = df["content"].apply(self._tokenize) @@ -58,10 +57,8 @@ class InteractionAnalysis: return rows - def top_users(self) -> list: - counts = ( - self.df.groupby(["author", "source"]).size().sort_values(ascending=False) - ) + def top_users(self, df: pd.DataFrame) -> list: + counts = df.groupby(["author", "source"]).size().sort_values(ascending=False) top_users = [ {"author": author, "source": source, "count": int(count)} @@ -70,14 +67,14 @@ class InteractionAnalysis: return top_users - def per_user_analysis(self) -> dict: - per_user = self.df.groupby(["author", "type"]).size().unstack(fill_value=0) + def per_user_analysis(self, df: pd.DataFrame) -> dict: + per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0) - emotion_cols = [col for col in self.df.columns if col.startswith("emotion_")] + emotion_cols = [col for col in df.columns if col.startswith("emotion_")] avg_emotions_by_author = {} if emotion_cols: - avg_emotions = self.df.groupby("author")[emotion_cols].mean().fillna(0.0) + avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0) avg_emotions_by_author = { author: {emotion: float(score) for emotion, score in row.items()} for author, row in avg_emotions.iterrows() @@ -97,7 +94,7 @@ class InteractionAnalysis: per_user = per_user.sort_values("comment_post_ratio", ascending=True) per_user_records = per_user.reset_index().to_dict(orient="records") - vocab_rows = self._vocab_richness_per_user() + vocab_rows = self._vocab_richness_per_user(df) vocab_by_author = {row["author"]: row for row in vocab_rows} # merge vocab richness + per_user information @@ -112,7 +109,14 @@ class InteractionAnalysis: "comment_post_ratio": float(row.get("comment_post_ratio", 0)), "comment_share": float(row.get("comment_share", 0)), "avg_emotions": avg_emotions_by_author.get(author, {}), - "vocab": vocab_by_author.get(author, {"vocab_richness": 0, "avg_words_per_event": 0, "top_words": []}), + "vocab": vocab_by_author.get( + author, + { + "vocab_richness": 0, + "avg_words_per_event": 0, + "top_words": [], + }, + ), } ) @@ -120,13 +124,13 @@ class InteractionAnalysis: return merged_users - def interaction_graph(self): - interactions = {a: {} for a in self.df["author"].dropna().unique()} + def interaction_graph(self, df: pd.DataFrame): + interactions = {a: {} for a in df["author"].dropna().unique()} # reply_to refers to the comment id, this allows us to map comment ids to usernames - id_to_author = self.df.set_index("id")["author"].to_dict() + id_to_author = df.set_index("id")["author"].to_dict() - for _, row in self.df.iterrows(): + for _, row in df.iterrows(): a = row["author"] reply_id = row["reply_to"] @@ -141,10 +145,10 @@ class InteractionAnalysis: return interactions - def average_thread_depth(self): + def average_thread_depth(self, df: pd.DataFrame): depths = [] - id_to_reply = self.df.set_index("id")["reply_to"].to_dict() - for _, row in self.df.iterrows(): + id_to_reply = df.set_index("id")["reply_to"].to_dict() + for _, row in df.iterrows(): depth = 0 current_id = row["id"] @@ -163,16 +167,16 @@ class InteractionAnalysis: return round(sum(depths) / 
len(depths), 2) - def average_thread_length_by_emotion(self): + def average_thread_length_by_emotion(self, df: pd.DataFrame): emotion_exclusions = {"emotion_neutral", "emotion_surprise"} emotion_cols = [ c - for c in self.df.columns + for c in df.columns if c.startswith("emotion_") and c not in emotion_exclusions ] - id_to_reply = self.df.set_index("id")["reply_to"].to_dict() + id_to_reply = df.set_index("id")["reply_to"].to_dict() length_cache = {} def thread_length_from(start_id): @@ -211,7 +215,7 @@ class InteractionAnalysis: emotion_to_lengths = {} # Fill NaNs in emotion cols to avoid max() issues - emo_df = self.df[["id"] + emotion_cols].copy() + emo_df = df[["id"] + emotion_cols].copy() emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0) for _, row in emo_df.iterrows(): diff --git a/server/analysis/linguistic.py b/server/analysis/linguistic.py index 5718edc..dc91faf 100644 --- a/server/analysis/linguistic.py +++ b/server/analysis/linguistic.py @@ -4,9 +4,9 @@ import re from collections import Counter from itertools import islice + class LinguisticAnalysis: - def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): - self.df = df + def __init__(self, word_exclusions: set[str]): self.word_exclusions = word_exclusions def _tokenize(self, text: str): @@ -14,29 +14,20 @@ class LinguisticAnalysis: return [t for t in tokens if t not in self.word_exclusions] def _clean_text(self, text: str) -> str: - text = re.sub(r"http\S+", "", text) # remove URLs + text = re.sub(r"http\S+", "", text) # remove URLs text = re.sub(r"www\S+", "", text) - text = re.sub(r"&\w+;", "", text) # remove HTML entities - text = re.sub(r"\bamp\b", "", text) # remove stray amp + text = re.sub(r"&\w+;", "", text) # remove HTML entities + text = re.sub(r"\bamp\b", "", text) # remove stray amp text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text) return text - def word_frequencies(self, limit: int = 100) -> dict: - texts = ( - self.df["content"] - .dropna() - .astype(str) - .str.lower() - ) + def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]: + texts = df["content"].dropna().astype(str).str.lower() words = [] for text in texts: tokens = re.findall(r"\b[a-z]{3,}\b", text) - words.extend( - w for w in tokens - if w not in self.word_exclusions - ) - + words.extend(w for w in tokens if w not in self.word_exclusions) counts = Counter(words) @@ -48,16 +39,16 @@ class LinguisticAnalysis: ) return word_frequencies.to_dict(orient="records") - - def ngrams(self, n=2, limit=100): - texts = self.df["content"].dropna().astype(str).apply(self._clean_text).str.lower() + + def ngrams(self, df: pd.DataFrame, n=2, limit=100): + texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower() all_ngrams = [] for text in texts: tokens = re.findall(r"\b[a-z]{3,}\b", text) # stop word removal causes strange behaviors in ngrams - #tokens = [w for w in tokens if w not in self.word_exclusions] + # tokens = [w for w in tokens if w not in self.word_exclusions] ngrams = zip(*(islice(tokens, i, None) for i in range(n))) all_ngrams.extend([" ".join(ng) for ng in ngrams]) @@ -69,4 +60,4 @@ class LinguisticAnalysis: .sort_values("count", ascending=False) .head(limit) .to_dict(orient="records") - ) \ No newline at end of file + ) diff --git a/server/analysis/temporal.py b/server/analysis/temporal.py index ba5105f..0ab579f 100644 --- a/server/analysis/temporal.py +++ b/server/analysis/temporal.py @@ -1,16 +1,14 @@ import pandas as pd + class TemporalAnalysis: - def __init__(self, df: pd.DataFrame): - 
self.df = df - - def avg_reply_time_per_emotion(self) -> dict: - df = self.df.copy() + def avg_reply_time_per_emotion(self, df: pd.DataFrame) -> list[dict]: + df = df.copy() replies = df[ - (df["type"] == "comment") & - (df["reply_to"].notna()) & - (df["reply_to"] != "") + (df["type"] == "comment") + & (df["reply_to"].notna()) + & (df["reply_to"] != "") ] id_to_time = df.set_index("id")["dt"].to_dict() @@ -23,48 +21,51 @@ class TemporalAnalysis: return None return (row["dt"] - parent_time).total_seconds() - + replies["reply_time"] = replies.apply(compute_reply_time, axis=1) - emotion_cols = [col for col in df.columns if col.startswith("emotion_") and col not in ("emotion_neutral", "emotion_surprise")] + emotion_cols = [ + col + for col in df.columns + if col.startswith("emotion_") + and col not in ("emotion_neutral", "emotion_surprise") + ] replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1) - + grouped = ( - replies - .groupby("dominant_emotion")["reply_time"] + replies.groupby("dominant_emotion")["reply_time"] .agg(["mean", "count"]) .reset_index() ) return grouped.to_dict(orient="records") - - def posts_per_day(self) -> dict: - per_day = ( - self.df.groupby("date") - .size() - .reset_index(name="count") - ) + + def posts_per_day(self, df: pd.DataFrame) -> list[dict]: + per_day = df.groupby("date").size().reset_index(name="count") return per_day.to_dict(orient="records") - - def heatmap(self) -> dict: + + def heatmap(self, df: pd.DataFrame) -> list[dict]: weekday_order = [ - "Monday", "Tuesday", "Wednesday", - "Thursday", "Friday", "Saturday", "Sunday" + "Monday", + "Tuesday", + "Wednesday", + "Thursday", + "Friday", + "Saturday", + "Sunday", ] - self.df["weekday"] = pd.Categorical( - self.df["weekday"], - categories=weekday_order, - ordered=True + df = df.copy() + df["weekday"] = pd.Categorical( + df["weekday"], categories=weekday_order, ordered=True ) heatmap = ( - self.df - .groupby(["weekday", "hour"], observed=True) + df.groupby(["weekday", "hour"], observed=True) .size() .unstack(fill_value=0) .reindex(columns=range(24), fill_value=0) ) heatmap.columns = heatmap.columns.map(str) - return heatmap.to_dict(orient="records") \ No newline at end of file + return heatmap.to_dict(orient="records") diff --git a/server/app.py b/server/app.py index 961fd88..2f8a9e8 100644 --- a/server/app.py +++ b/server/app.py @@ -8,7 +8,7 @@ from flask_jwt_extended import ( JWTManager, create_access_token, jwt_required, - get_jwt_identity + get_jwt_identity, ) from server.stat_gen import StatGen @@ -27,31 +27,34 @@ db = PostgresConnector() load_dotenv() frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173") jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this") -jwt_access_token_expires = int(os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)) # Default to 20 minutes +jwt_access_token_expires = int( + os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200) +) # Default to 20 minutes # Flask Configuration CORS(app, resources={r"/*": {"origins": frontend_url}}) app.config["JWT_SECRET_KEY"] = jwt_secret_key -app.config["JWT_ACCESS_TOKEN_EXPIRES"] = jwt_access_token_expires +app.config["JWT_ACCESS_TOKEN_EXPIRES"] = jwt_access_token_expires bcrypt = Bcrypt(app) jwt = JWTManager(app) auth_manager = AuthManager(db, bcrypt) -# Global State -# posts_df = pd.read_json('small.jsonl', lines=True) -# with open("topic_buckets.json", "r", encoding="utf-8") as f: -# domain_topics = json.load(f) -# stat_obj = StatGen(posts_df, domain_topics) -stat_obj = None +stat_gen = StatGen() 
-@app.route('/register', methods=['POST']) + +@app.route("/register", methods=["POST"]) def register_user(): data = request.get_json() - if not data or "username" not in data or "email" not in data or "password" not in data: + if ( + not data + or "username" not in data + or "email" not in data + or "password" not in data + ): return jsonify({"error": "Missing username, email, or password"}), 400 - + username = data["username"] email = data["email"] password = data["password"] @@ -67,39 +70,40 @@ def register_user(): print(f"Registered new user: {username}") return jsonify({"message": f"User '{username}' registered successfully"}), 200 -@app.route('/login', methods=['POST']) + +@app.route("/login", methods=["POST"]) def login_user(): data = request.get_json() if not data or "username" not in data or "password" not in data: return jsonify({"error": "Missing username or password"}), 400 - + username = data["username"] password = data["password"] try: user = auth_manager.authenticate_user(username, password) if user: - access_token = create_access_token(identity=str(user['id'])) + access_token = create_access_token(identity=str(user["id"])) return jsonify({"access_token": access_token}), 200 else: return jsonify({"error": "Invalid username or password"}), 401 except Exception as e: print(traceback.format_exc()) return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - + + @app.route("/profile", methods=["GET"]) @jwt_required() def profile(): current_user = get_jwt_identity() return jsonify( - message="Access granted", - user=auth_manager.get_user_by_id(current_user) + message="Access granted", user=auth_manager.get_user_by_id(current_user) ), 200 -@app.route('/upload', methods=['POST']) +@app.route("/upload", methods=["POST"]) @jwt_required() def upload_data(): if "posts" not in request.files or "topics" not in request.files: @@ -111,27 +115,36 @@ def upload_data(): if post_file.filename == "" or topic_file == "": return jsonify({"error": "Empty filename"}), 400 - if not post_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'): - return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400 - + if not post_file.filename.endswith(".jsonl") or not topic_file.filename.endswith( + ".json" + ): + return jsonify( + {"error": "Invalid file type. 
Only .jsonl and .json files are allowed."} + ), 400 + try: current_user = get_jwt_identity() posts_df = pd.read_json(post_file, lines=True, convert_dates=False) topics = json.load(topic_file) - + processor = DatasetProcessor(posts_df, topics) enriched_df = processor.enrich() - dataset_id = db.save_dataset_info(current_user, f"dataset_{current_user}", topics) + dataset_id = db.save_dataset_info( + current_user, f"dataset_{current_user}", topics + ) db.save_dataset_content(dataset_id, enriched_df) - return jsonify({"message": "File uploaded successfully", "event_count": len(enriched_df)}), 200 + return jsonify( + {"message": "File uploaded successfully", "event_count": len(enriched_df)} + ), 200 except ValueError as e: return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400 except Exception as e: return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - -@app.route('/dataset/', methods=['GET']) + + +@app.route("/dataset/", methods=["GET"]) @jwt_required() def get_dataset(dataset_id): current_user = get_jwt_identity() @@ -139,159 +152,205 @@ def get_dataset(dataset_id): if dataset.get("user_id") != int(current_user): return jsonify({"error": "Unauthorized access to dataset"}), 403 - + dataset_content = db.get_dataset_content(dataset_id) - + if dataset_content.empty: return jsonify({"error": "Dataset content not found"}), 404 return jsonify(dataset_content.to_dict(orient="records")), 200 -@app.route('/stats/content', methods=['GET']) -def word_frequencies(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - + +@app.route("/dataset//content", methods=["GET"]) +@jwt_required() +def content_endpoint(dataset_id): + current_user = get_jwt_identity() + dataset = db.get_dataset_info(dataset_id) + + if dataset.get("user_id") != int(current_user): + return jsonify({"error": "Unauthorized access to dataset"}), 403 + + dataset_content = db.get_dataset_content(dataset_id) + try: - return jsonify(stat_obj.get_content_analysis()), 200 - except ValueError as e: - return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 - except Exception as e: - print(traceback.format_exc()) - return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - -@app.route('/stats/summary', methods=["GET"]) -def get_summary(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - - try: - return jsonify(stat_obj.summary()), 200 - except ValueError as e: - return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 - except Exception as e: - return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - -@app.route("/stats/time", methods=["GET"]) -def get_time_analysis(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - - try: - return jsonify(stat_obj.get_time_analysis()), 200 - except ValueError as e: - return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 - except Exception as e: - print(traceback.format_exc()) - return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - -@app.route("/stats/user", methods=["GET"]) -def get_user_analysis(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - - try: - return jsonify(stat_obj.get_user_analysis()), 200 - except ValueError as e: - return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 - except Exception as e: - print(traceback.format_exc()) - return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - -@app.route("/stats/cultural", 
methods=["GET"]) -def get_cultural_analysis(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - - try: - return jsonify(stat_obj.get_cultural_analysis()), 200 + return jsonify(stat_gen.get_content_analysis(dataset_content)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 except Exception as e: print(traceback.format_exc()) return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 -@app.route("/stats/interaction", methods=["GET"]) -def get_interaction_analysis(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - + +@app.route("/dataset//summary", methods=["GET"]) +@jwt_required() +def get_summary(dataset_id): + current_user = get_jwt_identity() + dataset = db.get_dataset_info(dataset_id) + + if dataset.get("user_id") != int(current_user): + return jsonify({"error": "Unauthorized access to dataset"}), 403 + + dataset_content = db.get_dataset_content(dataset_id) + try: - return jsonify(stat_obj.get_interactional_analysis()), 200 + return jsonify(stat_gen.summary(dataset_content)), 200 except ValueError as e: return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 except Exception as e: print(traceback.format_exc()) return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 -@app.route('/filter/query', methods=["POST"]) -def filter_query(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - data = request.get_json(silent=True) or {} +@app.route("/dataset//time", methods=["GET"]) +@jwt_required() +def get_time_analysis(dataset_id): + current_user = get_jwt_identity() + dataset = db.get_dataset_info(dataset_id) - if "query" not in data: - return jsonify(stat_obj.df.to_dict(orient="records")), 200 - - query = data["query"] - filtered_df = stat_obj.filter_by_query(query) + if dataset.get("user_id") != int(current_user): + return jsonify({"error": "Unauthorized access to dataset"}), 403 - return jsonify(filtered_df), 200 + dataset_content = db.get_dataset_content(dataset_id) -@app.route('/filter/time', methods=["POST"]) -def filter_time(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - - data = request.get_json(silent=True) - if not data: - return jsonify({"error": "Invalid or missing JSON body"}), 400 - - if "start" not in data or "end" not in data: - return jsonify({"error": "Please include both start and end dates"}), 400 - try: - start = pd.to_datetime(data["start"], utc=True) - end = pd.to_datetime(data["end"], utc=True) - filtered_df = stat_obj.set_time_range(start, end) - return jsonify(filtered_df), 200 - except Exception: - return jsonify({"error": "Invalid datetime format"}), 400 - -@app.route('/filter/sources', methods=["POST"]) -def filter_sources(): - if stat_obj is None: - return jsonify({"error": "No data uploaded"}), 400 - - data = request.get_json(silent=True) - if not data: - return jsonify({"error": "Invalid or missing JSON body"}), 400 - - if "sources" not in data: - return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400 - - try: - filtered_df = stat_obj.filter_data_sources(data["sources"]) - return jsonify(filtered_df), 200 - except ValueError: - return jsonify({"error": "Please enable at least one data source"}), 400 - except Exception as e: - return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500 - - -@app.route('/filter/reset', methods=["GET"]) -def reset_dataset(): - if stat_obj is None: - return jsonify({"error": "No 
data uploaded"}), 400 - - try: - stat_obj.reset_dataset() - return jsonify({"success": "Dataset successfully reset"}) + return jsonify(stat_gen.get_time_analysis(dataset_content)), 200 + except ValueError as e: + return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 except Exception as e: print(traceback.format_exc()) return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 - + + +@app.route("/dataset//user", methods=["GET"]) +@jwt_required() +def get_user_analysis(dataset_id): + current_user = get_jwt_identity() + dataset = db.get_dataset_info(dataset_id) + + if dataset.get("user_id") != int(current_user): + return jsonify({"error": "Unauthorized access to dataset"}), 403 + + dataset_content = db.get_dataset_content(dataset_id) + + try: + return jsonify(stat_gen.get_user_analysis(dataset_content)), 200 + except ValueError as e: + return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 + except Exception as e: + print(traceback.format_exc()) + return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 + + +@app.route("/dataset//cultural", methods=["GET"]) +@jwt_required() +def get_cultural_analysis(dataset_id): + current_user = get_jwt_identity() + dataset = db.get_dataset_info(dataset_id) + + if dataset.get("user_id") != int(current_user): + return jsonify({"error": "Unauthorized access to dataset"}), 403 + + dataset_content = db.get_dataset_content(dataset_id) + + try: + return jsonify(stat_gen.get_cultural_analysis(dataset_content)), 200 + except ValueError as e: + return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 + except Exception as e: + print(traceback.format_exc()) + return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 + + +@app.route("/dataset//interaction", methods=["GET"]) +@jwt_required() +def get_interaction_analysis(dataset_id): + current_user = get_jwt_identity() + dataset = db.get_dataset_info(dataset_id) + + if dataset.get("user_id") != int(current_user): + return jsonify({"error": "Unauthorized access to dataset"}), 403 + + dataset_content = db.get_dataset_content(dataset_id) + + try: + return jsonify(stat_gen.get_interactional_analysis(dataset_content)), 200 + except ValueError as e: + return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 + except Exception as e: + print(traceback.format_exc()) + return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 + + +# @app.route("/filter/query", methods=["POST"]) +# def filter_query(): +# if stat_obj is None: +# return jsonify({"error": "No data uploaded"}), 400 + +# data = request.get_json(silent=True) or {} + +# if "query" not in data: +# return jsonify(stat_obj.df.to_dict(orient="records")), 200 + +# query = data["query"] +# filtered_df = stat_obj.filter_by_query(query) + +# return jsonify(filtered_df), 200 + + +# @app.route("/filter/time", methods=["POST"]) +# def filter_time(): +# if stat_obj is None: +# return jsonify({"error": "No data uploaded"}), 400 + +# data = request.get_json(silent=True) +# if not data: +# return jsonify({"error": "Invalid or missing JSON body"}), 400 + +# if "start" not in data or "end" not in data: +# return jsonify({"error": "Please include both start and end dates"}), 400 + +# try: +# start = pd.to_datetime(data["start"], utc=True) +# end = pd.to_datetime(data["end"], utc=True) +# filtered_df = stat_obj.set_time_range(start, end) +# return jsonify(filtered_df), 200 +# except Exception: +# return jsonify({"error": "Invalid datetime format"}), 400 + + +# 
@app.route("/filter/sources", methods=["POST"]) +# def filter_sources(): +# if stat_obj is None: +# return jsonify({"error": "No data uploaded"}), 400 + +# data = request.get_json(silent=True) +# if not data: +# return jsonify({"error": "Invalid or missing JSON body"}), 400 + +# if "sources" not in data: +# return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400 + +# try: +# filtered_df = stat_obj.filter_data_sources(data["sources"]) +# return jsonify(filtered_df), 200 +# except ValueError: +# return jsonify({"error": "Please enable at least one data source"}), 400 +# except Exception as e: +# return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500 + + +# @app.route("/filter/reset", methods=["GET"]) +# def reset_dataset(): +# if stat_obj is None: +# return jsonify({"error": "No data uploaded"}), 400 + +# try: +# stat_obj.reset_dataset() +# return jsonify({"success": "Dataset successfully reset"}) +# except Exception as e: +# print(traceback.format_exc()) +# return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 + if __name__ == "__main__": - app.run(debug=True) \ No newline at end of file + app.run(debug=True) diff --git a/server/stat_gen.py b/server/stat_gen.py index bbba747..dc748d0 100644 --- a/server/stat_gen.py +++ b/server/stat_gen.py @@ -1,170 +1,135 @@ -import pandas as pd import datetime -import nltk +import nltk +import pandas as pd from nltk.corpus import stopwords -from server.analysis.nlp import NLP -from server.analysis.temporal import TemporalAnalysis + +from server.analysis.cultural import CulturalAnalysis from server.analysis.emotional import EmotionalAnalysis from server.analysis.interactional import InteractionAnalysis from server.analysis.linguistic import LinguisticAnalysis -from server.analysis.cultural import CulturalAnalysis +from server.analysis.temporal import TemporalAnalysis DOMAIN_STOPWORDS = { - "www", "https", "http", - "boards", "boardsie", - "comment", "comments", - "discussion", "thread", - "post", "posts", - "would", "get", "one" + "www", + "https", + "http", + "boards", + "boardsie", + "comment", + "comments", + "discussion", + "thread", + "post", + "posts", + "would", + "get", + "one", } -nltk.download('stopwords') -EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS +nltk.download("stopwords") +EXCLUDE_WORDS = set(stopwords.words("english")) | DOMAIN_STOPWORDS + class StatGen: - def __init__(self, df: pd.DataFrame, domain_topics: dict) -> None: - comments_df = df[["id", "comments"]].explode("comments") - comments_df = comments_df[comments_df["comments"].apply(lambda x: isinstance(x, dict))] - comments_df = pd.json_normalize(comments_df["comments"]) + def __init__(self) -> None: + self.temporal_analysis = TemporalAnalysis() + self.emotional_analysis = EmotionalAnalysis() + self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS) + self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS) + self.cultural_analysis = CulturalAnalysis() - posts_df = df.drop(columns=["comments"]) - posts_df["type"] = "post" - posts_df["parent_id"] = None - - comments_df["type"] = "comment" - comments_df["parent_id"] = comments_df.get("post_id") - self.domain_topics = domain_topics - - self.df = pd.concat([posts_df, comments_df]) - self.df.drop(columns=["post_id"], inplace=True, errors="ignore") - - self.nlp = NLP(self.df, "title", "content", domain_topics) - self.nlp.add_emotion_cols() - self.nlp.add_topic_col() - self.nlp.add_ner_cols() - self._add_time_cols(self.df) - - 
self.temporal_analysis = TemporalAnalysis(self.df) - self.emotional_analysis = EmotionalAnalysis(self.df) - self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS) - self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS) - self.cultural_analysis = CulturalAnalysis(self.df) - - self.original_df = self.df.copy(deep=True) - - ## Private Methods - def _add_time_cols(self, df: pd.DataFrame) -> None: - df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce') - df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date - df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True) - df["hour"] = df["dt"].dt.hour - df["weekday"] = df["dt"].dt.day_name() - - ## Public - - # topics over time - # emotions over time - def get_time_analysis(self) -> dict: + def get_time_analysis(self, df: pd.DataFrame) -> dict: return { - "events_per_day": self.temporal_analysis.posts_per_day(), - "weekday_hour_heatmap": self.temporal_analysis.heatmap() + "events_per_day": self.temporal_analysis.posts_per_day(df), + "weekday_hour_heatmap": self.temporal_analysis.heatmap(df), } - # average topic duration - def get_content_analysis(self) -> dict: + def get_content_analysis(self, df: pd.DataFrame) -> dict: return { - "word_frequencies": self.linguistic_analysis.word_frequencies(), - "common_two_phrases": self.linguistic_analysis.ngrams(), - "common_three_phrases": self.linguistic_analysis.ngrams(n=3), - "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(), - "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion() + "word_frequencies": self.linguistic_analysis.word_frequencies(df), + "common_two_phrases": self.linguistic_analysis.ngrams(df), + "common_three_phrases": self.linguistic_analysis.ngrams(df, n=3), + "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(df), + "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(df), } - - # average emotion per user - # average chain length - def get_user_analysis(self) -> dict: - return { - "top_users": self.interaction_analysis.top_users(), - "users": self.interaction_analysis.per_user_analysis() - } - - # average / max thread depth - # high engagment threads based on volume - def get_interactional_analysis(self) -> dict: - return { - "average_thread_depth": self.interaction_analysis.average_thread_depth(), - "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(), - "interaction_graph": self.interaction_analysis.interaction_graph() - } - - # detect community jargon - # in-group and out-group linguistic markers - def get_cultural_analysis(self) -> dict: - return { - "identity_markers": self.cultural_analysis.get_identity_markers(), - "stance_markers": self.cultural_analysis.get_stance_markers(), - "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity() - } - - def summary(self) -> dict: - total_posts = (self.df["type"] == "post").sum() - total_comments = (self.df["type"] == "comment").sum() - events_per_user = self.df.groupby("author").size() + def get_user_analysis(self, df: pd.DataFrame) -> dict: + return { + "top_users": self.interaction_analysis.top_users(df), + "users": self.interaction_analysis.per_user_analysis(df), + "interaction_graph": self.interaction_analysis.interaction_graph(df), + } + + def get_interactional_analysis(self, df: pd.DataFrame) -> dict: + return { + "average_thread_depth": self.interaction_analysis.average_thread_depth(df), + "average_thread_length_by_emotion": 
self.interaction_analysis.average_thread_length_by_emotion(df), + } + + def get_cultural_analysis(self, df: pd.DataFrame) -> dict: + return { + "identity_markers": self.cultural_analysis.get_identity_markers(df), + "stance_markers": self.cultural_analysis.get_stance_markers(df), + "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(df), + } + + def summary(self, df: pd.DataFrame) -> dict: + total_posts = (df["type"] == "post").sum() + total_comments = (df["type"] == "comment").sum() + events_per_user = df.groupby("author").size() return { - "total_events": int(len(self.df)), + "total_events": int(len(df)), "total_posts": int(total_posts), "total_comments": int(total_comments), "unique_users": int(events_per_user.count()), "comments_per_post": round(total_comments / max(total_posts, 1), 2), "lurker_ratio": round((events_per_user == 1).mean(), 2), "time_range": { - "start": int(self.df["dt"].min().timestamp()), - "end": int(self.df["dt"].max().timestamp()) + "start": int(df["dt"].min().timestamp()), + "end": int(df["dt"].max().timestamp()), }, - "sources": self.df["source"].dropna().unique().tolist() - } - - def filter_by_query(self, search_query: str) -> dict: - self.df = self.df[ - self.df["content"].str.contains(search_query) - ] - - return { - "rows": len(self.df), - "data": self.df.to_dict(orient="records") - } - - def set_time_range(self, start: datetime.datetime, end: datetime.datetime) -> dict: - self.df = self.df[ - (self.df["dt"] >= start) & - (self.df["dt"] <= end) - ] - - return { - "rows": len(self.df), - "data": self.df.to_dict(orient="records") - } - - """ - Input is a hash map (source_name: str -> enabled: bool) - """ - def filter_data_sources(self, data_sources: dict) -> dict: - enabled_sources = [src for src, enabled in data_sources.items() if enabled] - - if not enabled_sources: - raise ValueError("Please choose at least one data source") - - self.df = self.df[self.df["source"].isin(enabled_sources)] - - return { - "rows": len(self.df), - "data": self.df.to_dict(orient="records") + "sources": df["source"].dropna().unique().tolist(), } - - def reset_dataset(self) -> None: - self.df = self.original_df.copy(deep=True) + # def filter_by_query(self, df: pd.DataFrame, search_query: str) -> dict: + # filtered_df = df[df["content"].str.contains(search_query, na=False)] + # return { + # "rows": len(filtered_df), + # "data": filtered_df.to_dict(orient="records"), + # } + + # def set_time_range( + # self, + # original_df: pd.DataFrame, + # start: datetime.datetime, + # end: datetime.datetime, + # ) -> dict: + # df = self._prepare_df(original_df) + # filtered_df = df[(df["dt"] >= start) & (df["dt"] <= end)] + + # return { + # "rows": len(filtered_df), + # "data": filtered_df.to_dict(orient="records"), + # } + + # def filter_data_sources( + # self, original_df: pd.DataFrame, data_sources: dict + # ) -> dict: + # df = self._prepare_df(original_df) + # enabled_sources = [src for src, enabled in data_sources.items() if enabled] + + # if not enabled_sources: + # raise ValueError("Please choose at least one data source") + + # filtered_df = df[df["source"].isin(enabled_sources)] + + # return { + # "rows": len(filtered_df), + # "data": filtered_df.to_dict(orient="records"), + # } + + # def reset_dataset(self, original_df: pd.DataFrame) -> pd.DataFrame: + # return self._prepare_df(original_df)
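
Usage note (illustrative, not part of the patch): after this refactor StatGen holds no dataset state, so a single module-level instance can serve every request and each endpoint simply passes in the DataFrame it loads for the requested dataset. A minimal sketch of that call pattern follows; the PostgresConnector import path and the summarise() helper are assumptions for illustration, while StatGen, db.get_dataset_content, summary, get_time_analysis, and get_content_analysis are taken from the diff above.

    import pandas as pd

    from server.database import PostgresConnector  # assumed import path, not shown in the diff
    from server.stat_gen import StatGen

    stat_gen = StatGen()  # stateless, so it is safe to construct once at startup
    db = PostgresConnector()

    def summarise(dataset_id: int) -> dict:
        # Load the enriched events for one dataset, then run the analyses on that frame.
        # Nothing is cached on stat_gen, so concurrent requests do not share state.
        df: pd.DataFrame = db.get_dataset_content(dataset_id)
        return {
            "summary": stat_gen.summary(df),
            "time": stat_gen.get_time_analysis(df),
            "content": stat_gen.get_content_analysis(df),
        }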