Storage of user data and datasets in PostgreSQL #2
@@ -6,13 +6,12 @@ from typing import Any


 class CulturalAnalysis:
-    def __init__(self, df: pd.DataFrame, content_col: str = "content", topic_col: str = "topic"):
-        self.df = df
+    def __init__(self, content_col: str = "content", topic_col: str = "topic"):
         self.content_col = content_col
         self.topic_col = topic_col

-    def get_identity_markers(self):
-        df = self.df.copy()
+    def get_identity_markers(self, original_df: pd.DataFrame) -> dict[str, Any]:
+        df = original_df.copy()
         s = df[self.content_col].fillna("").astype(str).str.lower()

         in_group_words = {"we", "us", "our", "ourselves"}
@@ -60,8 +59,8 @@ class CulturalAnalysis:

         return result

-    def get_stance_markers(self) -> dict[str, Any]:
-        s = self.df[self.content_col].fillna("").astype(str)
+    def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
+        s = df[self.content_col].fillna("").astype(str)

         hedges = {
             "maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem",
@@ -104,13 +103,11 @@ class CulturalAnalysis:
             "permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3),
         }

-    def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]:
-        if "entities" not in self.df.columns:
+    def get_avg_emotions_per_entity(self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]:
+        if "entities" not in df.columns:
             return {"entity_emotion_avg": {}}

-        df = self.df
         emotion_cols = [c for c in df.columns if c.startswith("emotion_")]

         entity_counter = Counter()

         for row in df["entities"].dropna():
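With this change the analysis classes no longer hold a DataFrame: each one is constructed once and the per-request frame is passed into every call. A minimal usage sketch, assuming the DataFrame is whatever db.get_dataset_content returns:

    analysis = CulturalAnalysis()
    identity = analysis.get_identity_markers(df)  # df supplied per request
    stance = analysis.get_stance_markers(df)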
@@ -1,18 +1,15 @@
 import pandas as pd


 class EmotionalAnalysis:
-    def __init__(self, df: pd.DataFrame):
-        self.df = df
-
-    def avg_emotion_by_topic(self) -> dict:
+    def avg_emotion_by_topic(self, df: pd.DataFrame) -> dict:
         emotion_cols = [
-            col for col in self.df.columns
+            col for col in df.columns
             if col.startswith("emotion_")
         ]

         counts = (
-            self.df[
-                (self.df["topic"] != "Misc")
+            df[
+                (df["topic"] != "Misc")
             ]
             .groupby("topic")
             .size()
@@ -20,8 +17,8 @@ class EmotionalAnalysis:
         )

         avg_emotion_by_topic = (
-            self.df[
-                (self.df["topic"] != "Misc")
+            df[
+                (df["topic"] != "Misc")
             ]
             .groupby("topic")[emotion_cols]
             .mean()
@@ -5,8 +5,7 @@ from collections import Counter


 class InteractionAnalysis:
-    def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
-        self.df = df
+    def __init__(self, word_exclusions: set[str]):
         self.word_exclusions = word_exclusions

     def _tokenize(self, text: str):
@@ -14,9 +13,9 @@ class InteractionAnalysis:
         return [t for t in tokens if t not in self.word_exclusions]

     def _vocab_richness_per_user(
-        self, min_words: int = 20, top_most_used_words: int = 100
+        self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100
     ) -> list:
-        df = self.df.copy()
+        df = df.copy()
         df["content"] = df["content"].fillna("").astype(str).str.lower()
         df["tokens"] = df["content"].apply(self._tokenize)

@@ -58,10 +57,8 @@ class InteractionAnalysis:

         return rows

-    def top_users(self) -> list:
-        counts = (
-            self.df.groupby(["author", "source"]).size().sort_values(ascending=False)
-        )
+    def top_users(self, df: pd.DataFrame) -> list:
+        counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)

         top_users = [
             {"author": author, "source": source, "count": int(count)}
@@ -70,14 +67,14 @@ class InteractionAnalysis:

         return top_users

-    def per_user_analysis(self) -> dict:
-        per_user = self.df.groupby(["author", "type"]).size().unstack(fill_value=0)
+    def per_user_analysis(self, df: pd.DataFrame) -> dict:
+        per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)

-        emotion_cols = [col for col in self.df.columns if col.startswith("emotion_")]
+        emotion_cols = [col for col in df.columns if col.startswith("emotion_")]

         avg_emotions_by_author = {}
         if emotion_cols:
-            avg_emotions = self.df.groupby("author")[emotion_cols].mean().fillna(0.0)
+            avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
             avg_emotions_by_author = {
                 author: {emotion: float(score) for emotion, score in row.items()}
                 for author, row in avg_emotions.iterrows()
@@ -97,7 +94,7 @@ class InteractionAnalysis:
         per_user = per_user.sort_values("comment_post_ratio", ascending=True)
         per_user_records = per_user.reset_index().to_dict(orient="records")

-        vocab_rows = self._vocab_richness_per_user()
+        vocab_rows = self._vocab_richness_per_user(df)
         vocab_by_author = {row["author"]: row for row in vocab_rows}

         # merge vocab richness + per_user information
@@ -112,7 +109,14 @@ class InteractionAnalysis:
                     "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
                     "comment_share": float(row.get("comment_share", 0)),
                     "avg_emotions": avg_emotions_by_author.get(author, {}),
-                    "vocab": vocab_by_author.get(author, {"vocab_richness": 0, "avg_words_per_event": 0, "top_words": []}),
+                    "vocab": vocab_by_author.get(
+                        author,
+                        {
+                            "vocab_richness": 0,
+                            "avg_words_per_event": 0,
+                            "top_words": [],
+                        },
+                    ),
                 }
             )

@@ -120,13 +124,13 @@ class InteractionAnalysis:

         return merged_users

-    def interaction_graph(self):
-        interactions = {a: {} for a in self.df["author"].dropna().unique()}
+    def interaction_graph(self, df: pd.DataFrame):
+        interactions = {a: {} for a in df["author"].dropna().unique()}

         # reply_to refers to the comment id, this allows us to map comment ids to usernames
-        id_to_author = self.df.set_index("id")["author"].to_dict()
+        id_to_author = df.set_index("id")["author"].to_dict()

-        for _, row in self.df.iterrows():
+        for _, row in df.iterrows():
             a = row["author"]
             reply_id = row["reply_to"]

@@ -141,10 +145,10 @@ class InteractionAnalysis:

         return interactions

-    def average_thread_depth(self):
+    def average_thread_depth(self, df: pd.DataFrame):
         depths = []
-        id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
-        for _, row in self.df.iterrows():
+        id_to_reply = df.set_index("id")["reply_to"].to_dict()
+        for _, row in df.iterrows():
             depth = 0
             current_id = row["id"]

@@ -163,16 +167,16 @@ class InteractionAnalysis:

         return round(sum(depths) / len(depths), 2)

-    def average_thread_length_by_emotion(self):
+    def average_thread_length_by_emotion(self, df: pd.DataFrame):
         emotion_exclusions = {"emotion_neutral", "emotion_surprise"}

         emotion_cols = [
             c
-            for c in self.df.columns
+            for c in df.columns
             if c.startswith("emotion_") and c not in emotion_exclusions
         ]

-        id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
+        id_to_reply = df.set_index("id")["reply_to"].to_dict()
         length_cache = {}

         def thread_length_from(start_id):
@@ -211,7 +215,7 @@ class InteractionAnalysis:
         emotion_to_lengths = {}

         # Fill NaNs in emotion cols to avoid max() issues
-        emo_df = self.df[["id"] + emotion_cols].copy()
+        emo_df = df[["id"] + emotion_cols].copy()
         emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)

         for _, row in emo_df.iterrows():
@@ -4,9 +4,9 @@ import re
 from collections import Counter
 from itertools import islice


 class LinguisticAnalysis:
-    def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
-        self.df = df
+    def __init__(self, word_exclusions: set[str]):
         self.word_exclusions = word_exclusions

     def _tokenize(self, text: str):
@@ -14,29 +14,20 @@ class LinguisticAnalysis:
         return [t for t in tokens if t not in self.word_exclusions]

     def _clean_text(self, text: str) -> str:
-        text = re.sub(r"http\S+", "", text) # remove URLs
+        text = re.sub(r"http\S+", "", text)  # remove URLs
         text = re.sub(r"www\S+", "", text)
-        text = re.sub(r"&\w+;", "", text) # remove HTML entities
-        text = re.sub(r"\bamp\b", "", text) # remove stray amp
+        text = re.sub(r"&\w+;", "", text)  # remove HTML entities
+        text = re.sub(r"\bamp\b", "", text)  # remove stray amp
         text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
         return text

-    def word_frequencies(self, limit: int = 100) -> dict:
-        texts = (
-            self.df["content"]
-            .dropna()
-            .astype(str)
-            .str.lower()
-        )
+    def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
+        texts = df["content"].dropna().astype(str).str.lower()

         words = []
         for text in texts:
             tokens = re.findall(r"\b[a-z]{3,}\b", text)
-            words.extend(
-                w for w in tokens
-                if w not in self.word_exclusions
-            )
+            words.extend(w for w in tokens if w not in self.word_exclusions)

         counts = Counter(words)

@@ -48,16 +39,16 @@ class LinguisticAnalysis:
         )

         return word_frequencies.to_dict(orient="records")

-    def ngrams(self, n=2, limit=100):
-        texts = self.df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
-
+    def ngrams(self, df: pd.DataFrame, n=2, limit=100):
+        texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
         all_ngrams = []

         for text in texts:
             tokens = re.findall(r"\b[a-z]{3,}\b", text)

             # stop word removal causes strange behaviors in ngrams
-            #tokens = [w for w in tokens if w not in self.word_exclusions]
+            # tokens = [w for w in tokens if w not in self.word_exclusions]

             ngrams = zip(*(islice(tokens, i, None) for i in range(n)))
             all_ngrams.extend([" ".join(ng) for ng in ngrams])
@@ -69,4 +60,4 @@ class LinguisticAnalysis:
             .sort_values("count", ascending=False)
             .head(limit)
             .to_dict(orient="records")
-        )
+        )
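The islice/zip expression in ngrams() builds overlapping n-grams lazily, without copying the token list. A small worked example of the same idiom:

    from itertools import islice
    tokens = ["dublin", "city", "centre"]
    bigrams = zip(*(islice(tokens, i, None) for i in range(2)))
    print([" ".join(ng) for ng in bigrams])  # ['dublin city', 'city centre']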
@@ -1,16 +1,14 @@
 import pandas as pd


 class TemporalAnalysis:
-    def __init__(self, df: pd.DataFrame):
-        self.df = df
-
-    def avg_reply_time_per_emotion(self) -> dict:
-        df = self.df.copy()
+    def avg_reply_time_per_emotion(self, df: pd.DataFrame) -> list[dict]:
+        df = df.copy()

         replies = df[
-            (df["type"] == "comment") &
-            (df["reply_to"].notna()) &
-            (df["reply_to"] != "")
+            (df["type"] == "comment")
+            & (df["reply_to"].notna())
+            & (df["reply_to"] != "")
         ]

         id_to_time = df.set_index("id")["dt"].to_dict()
@@ -23,48 +21,51 @@ class TemporalAnalysis:
                 return None

             return (row["dt"] - parent_time).total_seconds()

         replies["reply_time"] = replies.apply(compute_reply_time, axis=1)
-        emotion_cols = [col for col in df.columns if col.startswith("emotion_") and col not in ("emotion_neutral", "emotion_surprise")]
+        emotion_cols = [
+            col
+            for col in df.columns
+            if col.startswith("emotion_")
+            and col not in ("emotion_neutral", "emotion_surprise")
+        ]
         replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1)

         grouped = (
-            replies
-            .groupby("dominant_emotion")["reply_time"]
+            replies.groupby("dominant_emotion")["reply_time"]
             .agg(["mean", "count"])
             .reset_index()
         )

         return grouped.to_dict(orient="records")

-    def posts_per_day(self) -> dict:
-        per_day = (
-            self.df.groupby("date")
-            .size()
-            .reset_index(name="count")
-        )
+    def posts_per_day(self, df: pd.DataFrame) -> list[dict]:
+        per_day = df.groupby("date").size().reset_index(name="count")

         return per_day.to_dict(orient="records")

-    def heatmap(self) -> dict:
+    def heatmap(self, df: pd.DataFrame) -> list[dict]:
         weekday_order = [
-            "Monday", "Tuesday", "Wednesday",
-            "Thursday", "Friday", "Saturday", "Sunday"
+            "Monday",
+            "Tuesday",
+            "Wednesday",
+            "Thursday",
+            "Friday",
+            "Saturday",
+            "Sunday",
         ]

-        self.df["weekday"] = pd.Categorical(
-            self.df["weekday"],
-            categories=weekday_order,
-            ordered=True
+        df = df.copy()
+        df["weekday"] = pd.Categorical(
+            df["weekday"], categories=weekday_order, ordered=True
         )

         heatmap = (
-            self.df
-            .groupby(["weekday", "hour"], observed=True)
+            df.groupby(["weekday", "hour"], observed=True)
             .size()
             .unstack(fill_value=0)
             .reindex(columns=range(24), fill_value=0)
         )

         heatmap.columns = heatmap.columns.map(str)
-        return heatmap.to_dict(orient="records")
+        return heatmap.to_dict(orient="records")
server/app.py
@@ -8,7 +8,7 @@ from flask_jwt_extended import (
     JWTManager,
     create_access_token,
     jwt_required,
-    get_jwt_identity
+    get_jwt_identity,
 )

 from server.stat_gen import StatGen
@@ -27,31 +27,34 @@ db = PostgresConnector()
 load_dotenv()
 frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173")
 jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this")
-jwt_access_token_expires = int(os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)) # Default to 20 minutes
+jwt_access_token_expires = int(
+    os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)
+)  # Default to 20 minutes

 # Flask Configuration
 CORS(app, resources={r"/*": {"origins": frontend_url}})
 app.config["JWT_SECRET_KEY"] = jwt_secret_key
-app.config["JWT_ACCESS_TOKEN_EXPIRES"] = jwt_access_token_expires
+app.config["JWT_ACCESS_TOKEN_EXPIRES"] = jwt_access_token_expires

 bcrypt = Bcrypt(app)
 jwt = JWTManager(app)
 auth_manager = AuthManager(db, bcrypt)

 # Global State
-# posts_df = pd.read_json('small.jsonl', lines=True)
-# with open("topic_buckets.json", "r", encoding="utf-8") as f:
-#     domain_topics = json.load(f)
-# stat_obj = StatGen(posts_df, domain_topics)
-stat_obj = None
+stat_gen = StatGen()

-@app.route('/register', methods=['POST'])
+
+@app.route("/register", methods=["POST"])
 def register_user():
     data = request.get_json()

-    if not data or "username" not in data or "email" not in data or "password" not in data:
+    if (
+        not data
+        or "username" not in data
+        or "email" not in data
+        or "password" not in data
+    ):
         return jsonify({"error": "Missing username, email, or password"}), 400

     username = data["username"]
     email = data["email"]
     password = data["password"]
@@ -67,39 +70,40 @@ def register_user():
     print(f"Registered new user: {username}")
     return jsonify({"message": f"User '{username}' registered successfully"}), 200

-@app.route('/login', methods=['POST'])
+
+@app.route("/login", methods=["POST"])
 def login_user():
     data = request.get_json()

     if not data or "username" not in data or "password" not in data:
         return jsonify({"error": "Missing username or password"}), 400

     username = data["username"]
     password = data["password"]

     try:
         user = auth_manager.authenticate_user(username, password)
         if user:
-            access_token = create_access_token(identity=str(user['id']))
+            access_token = create_access_token(identity=str(user["id"]))
             return jsonify({"access_token": access_token}), 200
         else:
             return jsonify({"error": "Invalid username or password"}), 401
     except Exception as e:
         print(traceback.format_exc())
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500


 @app.route("/profile", methods=["GET"])
 @jwt_required()
 def profile():
     current_user = get_jwt_identity()

     return jsonify(
-        message="Access granted",
-        user=auth_manager.get_user_by_id(current_user)
+        message="Access granted", user=auth_manager.get_user_by_id(current_user)
     ), 200


-@app.route('/upload', methods=['POST'])
+@app.route("/upload", methods=["POST"])
 @jwt_required()
 def upload_data():
     if "posts" not in request.files or "topics" not in request.files:
@@ -111,27 +115,36 @@ def upload_data():
     if post_file.filename == "" or topic_file == "":
         return jsonify({"error": "Empty filename"}), 400

-    if not post_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'):
-        return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400
+    if not post_file.filename.endswith(".jsonl") or not topic_file.filename.endswith(
+        ".json"
+    ):
+        return jsonify(
+            {"error": "Invalid file type. Only .jsonl and .json files are allowed."}
+        ), 400

     try:
         current_user = get_jwt_identity()

         posts_df = pd.read_json(post_file, lines=True, convert_dates=False)
         topics = json.load(topic_file)

         processor = DatasetProcessor(posts_df, topics)
         enriched_df = processor.enrich()
-        dataset_id = db.save_dataset_info(current_user, f"dataset_{current_user}", topics)
+        dataset_id = db.save_dataset_info(
+            current_user, f"dataset_{current_user}", topics
+        )
         db.save_dataset_content(dataset_id, enriched_df)

-        return jsonify({"message": "File uploaded successfully", "event_count": len(enriched_df)}), 200
+        return jsonify(
+            {"message": "File uploaded successfully", "event_count": len(enriched_df)}
+        ), 200
     except ValueError as e:
         return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
     except Exception as e:
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500

-@app.route('/dataset/<int:dataset_id>', methods=['GET'])
+
+@app.route("/dataset/<int:dataset_id>", methods=["GET"])
 @jwt_required()
 def get_dataset(dataset_id):
     current_user = get_jwt_identity()
@@ -139,159 +152,205 @@ def get_dataset(dataset_id):

     if dataset.get("user_id") != int(current_user):
         return jsonify({"error": "Unauthorized access to dataset"}), 403

     dataset_content = db.get_dataset_content(dataset_id)

     if dataset_content.empty:
         return jsonify({"error": "Dataset content not found"}), 404

     return jsonify(dataset_content.to_dict(orient="records")), 200

-@app.route('/stats/content', methods=['GET'])
-def word_frequencies():
-    if stat_obj is None:
-        return jsonify({"error": "No data uploaded"}), 400

+
+@app.route("/dataset/<int:dataset_id>/content", methods=["GET"])
+@jwt_required()
+def content_endpoint(dataset_id):
+    current_user = get_jwt_identity()
+    dataset = db.get_dataset_info(dataset_id)
+
+    if dataset.get("user_id") != int(current_user):
+        return jsonify({"error": "Unauthorized access to dataset"}), 403
+
+    dataset_content = db.get_dataset_content(dataset_id)
+
     try:
-        return jsonify(stat_obj.get_content_analysis()), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
         print(traceback.format_exc())
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500

-@app.route('/stats/summary', methods=["GET"])
-def get_summary():
-    if stat_obj is None:
-        return jsonify({"error": "No data uploaded"}), 400
-
-    try:
-        return jsonify(stat_obj.summary()), 200
-    except ValueError as e:
-        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
-    except Exception as e:
-        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
-
-@app.route("/stats/time", methods=["GET"])
-def get_time_analysis():
-    if stat_obj is None:
-        return jsonify({"error": "No data uploaded"}), 400
-
-    try:
-        return jsonify(stat_obj.get_time_analysis()), 200
-    except ValueError as e:
-        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
-    except Exception as e:
-        print(traceback.format_exc())
-        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
-
-@app.route("/stats/user", methods=["GET"])
-def get_user_analysis():
-    if stat_obj is None:
-        return jsonify({"error": "No data uploaded"}), 400
-
-    try:
-        return jsonify(stat_obj.get_user_analysis()), 200
-    except ValueError as e:
-        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
-    except Exception as e:
-        print(traceback.format_exc())
-        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
-
-@app.route("/stats/cultural", methods=["GET"])
-def get_cultural_analysis():
-    if stat_obj is None:
-        return jsonify({"error": "No data uploaded"}), 400
-
-    try:
-        return jsonify(stat_obj.get_cultural_analysis()), 200
+        return jsonify(stat_gen.get_content_analysis(dataset_content)), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
         print(traceback.format_exc())
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500

-@app.route("/stats/interaction", methods=["GET"])
-def get_interaction_analysis():
-    if stat_obj is None:
-        return jsonify({"error": "No data uploaded"}), 400

+
+@app.route("/dataset/<int:dataset_id>/summary", methods=["GET"])
+@jwt_required()
+def get_summary(dataset_id):
+    current_user = get_jwt_identity()
+    dataset = db.get_dataset_info(dataset_id)
+
+    if dataset.get("user_id") != int(current_user):
+        return jsonify({"error": "Unauthorized access to dataset"}), 403
+
+    dataset_content = db.get_dataset_content(dataset_id)
+
     try:
-        return jsonify(stat_obj.get_interactional_analysis()), 200
+        return jsonify(stat_gen.summary(dataset_content)), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
         print(traceback.format_exc())
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500

-@app.route('/filter/query', methods=["POST"])
-def filter_query():
-    if stat_obj is None:
-        return jsonify({"error": "No data uploaded"}), 400
-
-    data = request.get_json(silent=True) or {}
+@app.route("/dataset/<int:dataset_id>/time", methods=["GET"])
+@jwt_required()
+def get_time_analysis(dataset_id):
+    current_user = get_jwt_identity()
+    dataset = db.get_dataset_info(dataset_id)

-    if "query" not in data:
-        return jsonify(stat_obj.df.to_dict(orient="records")), 200
-
-    query = data["query"]
-    filtered_df = stat_obj.filter_by_query(query)
+    if dataset.get("user_id") != int(current_user):
+        return jsonify({"error": "Unauthorized access to dataset"}), 403

-    return jsonify(filtered_df), 200
+    dataset_content = db.get_dataset_content(dataset_id)

-@app.route('/filter/time', methods=["POST"])
-def filter_time():
-    if stat_obj is None:
-        return jsonify({"error": "No data uploaded"}), 400
-
-    data = request.get_json(silent=True)
-    if not data:
-        return jsonify({"error": "Invalid or missing JSON body"}), 400
-
-    if "start" not in data or "end" not in data:
-        return jsonify({"error": "Please include both start and end dates"}), 400
-
     try:
-        start = pd.to_datetime(data["start"], utc=True)
-        end = pd.to_datetime(data["end"], utc=True)
-        filtered_df = stat_obj.set_time_range(start, end)
-        return jsonify(filtered_df), 200
-    except Exception:
-        return jsonify({"error": "Invalid datetime format"}), 400
-
-@app.route('/filter/sources', methods=["POST"])
-def filter_sources():
-    if stat_obj is None:
-        return jsonify({"error": "No data uploaded"}), 400
-
-    data = request.get_json(silent=True)
-    if not data:
-        return jsonify({"error": "Invalid or missing JSON body"}), 400
-
-    if "sources" not in data:
-        return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400
-
-    try:
-        filtered_df = stat_obj.filter_data_sources(data["sources"])
-        return jsonify(filtered_df), 200
-    except ValueError:
-        return jsonify({"error": "Please enable at least one data source"}), 400
-    except Exception as e:
-        return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500
-
-
-@app.route('/filter/reset', methods=["GET"])
-def reset_dataset():
-    if stat_obj is None:
-        return jsonify({"error": "No data uploaded"}), 400
-
-    try:
-        stat_obj.reset_dataset()
-        return jsonify({"success": "Dataset successfully reset"})
+        return jsonify(stat_gen.get_time_analysis(dataset_content)), 200
     except ValueError as e:
         return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
     except Exception as e:
         print(traceback.format_exc())
         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500


+@app.route("/dataset/<int:dataset_id>/user", methods=["GET"])
+@jwt_required()
+def get_user_analysis(dataset_id):
+    current_user = get_jwt_identity()
+    dataset = db.get_dataset_info(dataset_id)
+
+    if dataset.get("user_id") != int(current_user):
+        return jsonify({"error": "Unauthorized access to dataset"}), 403
+
+    dataset_content = db.get_dataset_content(dataset_id)
+
+    try:
+        return jsonify(stat_gen.get_user_analysis(dataset_content)), 200
+    except ValueError as e:
+        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
+    except Exception as e:
+        print(traceback.format_exc())
+        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
+
+
+@app.route("/dataset/<int:dataset_id>/cultural", methods=["GET"])
+@jwt_required()
+def get_cultural_analysis(dataset_id):
+    current_user = get_jwt_identity()
+    dataset = db.get_dataset_info(dataset_id)
+
+    if dataset.get("user_id") != int(current_user):
+        return jsonify({"error": "Unauthorized access to dataset"}), 403
+
+    dataset_content = db.get_dataset_content(dataset_id)
+
+    try:
+        return jsonify(stat_gen.get_cultural_analysis(dataset_content)), 200
+    except ValueError as e:
+        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
+    except Exception as e:
+        print(traceback.format_exc())
+        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
+
+
+@app.route("/dataset/<int:dataset_id>/interaction", methods=["GET"])
+@jwt_required()
+def get_interaction_analysis(dataset_id):
+    current_user = get_jwt_identity()
+    dataset = db.get_dataset_info(dataset_id)
+
+    if dataset.get("user_id") != int(current_user):
+        return jsonify({"error": "Unauthorized access to dataset"}), 403
+
+    dataset_content = db.get_dataset_content(dataset_id)
+
+    try:
+        return jsonify(stat_gen.get_interactional_analysis(dataset_content)), 200
+    except ValueError as e:
+        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
+    except Exception as e:
+        print(traceback.format_exc())
+        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
+
+
+# @app.route("/filter/query", methods=["POST"])
+# def filter_query():
+#     if stat_obj is None:
+#         return jsonify({"error": "No data uploaded"}), 400

+#     data = request.get_json(silent=True) or {}

+#     if "query" not in data:
+#         return jsonify(stat_obj.df.to_dict(orient="records")), 200

+#     query = data["query"]
+#     filtered_df = stat_obj.filter_by_query(query)

+#     return jsonify(filtered_df), 200


+# @app.route("/filter/time", methods=["POST"])
+# def filter_time():
+#     if stat_obj is None:
+#         return jsonify({"error": "No data uploaded"}), 400

+#     data = request.get_json(silent=True)
+#     if not data:
+#         return jsonify({"error": "Invalid or missing JSON body"}), 400

+#     if "start" not in data or "end" not in data:
+#         return jsonify({"error": "Please include both start and end dates"}), 400

+#     try:
+#         start = pd.to_datetime(data["start"], utc=True)
+#         end = pd.to_datetime(data["end"], utc=True)
+#         filtered_df = stat_obj.set_time_range(start, end)
+#         return jsonify(filtered_df), 200
+#     except Exception:
+#         return jsonify({"error": "Invalid datetime format"}), 400


+# @app.route("/filter/sources", methods=["POST"])
+# def filter_sources():
+#     if stat_obj is None:
+#         return jsonify({"error": "No data uploaded"}), 400

+#     data = request.get_json(silent=True)
+#     if not data:
+#         return jsonify({"error": "Invalid or missing JSON body"}), 400

+#     if "sources" not in data:
+#         return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400

+#     try:
+#         filtered_df = stat_obj.filter_data_sources(data["sources"])
+#         return jsonify(filtered_df), 200
+#     except ValueError:
+#         return jsonify({"error": "Please enable at least one data source"}), 400
+#     except Exception as e:
+#         return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500


+# @app.route("/filter/reset", methods=["GET"])
+# def reset_dataset():
+#     if stat_obj is None:
+#         return jsonify({"error": "No data uploaded"}), 400

+#     try:
+#         stat_obj.reset_dataset()
+#         return jsonify({"success": "Dataset successfully reset"})
+#     except Exception as e:
+#         print(traceback.format_exc())
+#         return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500


 if __name__ == "__main__":
-    app.run(debug=True)
+    app.run(debug=True)
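The PostgresConnector these routes rely on is not part of this diff; a minimal sketch of the interface they assume, using psycopg2 with one JSONB row per event (table names, columns, and connection handling here are assumptions, not the actual server code):

    import json
    import pandas as pd
    import psycopg2


    class PostgresConnector:
        # Sketch only: the real connector and schema are not shown in this PR.
        def __init__(self, dsn: str = "dbname=app user=app host=localhost"):
            self.conn = psycopg2.connect(dsn)

        def save_dataset_info(self, user_id: int, name: str, topics: dict) -> int:
            with self.conn, self.conn.cursor() as cur:
                cur.execute(
                    "INSERT INTO datasets (user_id, name, topics) VALUES (%s, %s, %s::jsonb) RETURNING id",
                    (user_id, name, json.dumps(topics)),
                )
                return cur.fetchone()[0]

        def save_dataset_content(self, dataset_id: int, df: pd.DataFrame) -> None:
            # One JSONB row per enriched event keeps the frame schema-flexible.
            rows = [(dataset_id, json.dumps(r, default=str)) for r in df.to_dict(orient="records")]
            with self.conn, self.conn.cursor() as cur:
                cur.executemany(
                    "INSERT INTO dataset_events (dataset_id, event) VALUES (%s, %s::jsonb)", rows
                )

        def get_dataset_info(self, dataset_id: int) -> dict:
            with self.conn, self.conn.cursor() as cur:
                cur.execute("SELECT id, user_id, name FROM datasets WHERE id = %s", (dataset_id,))
                row = cur.fetchone()
                return {"id": row[0], "user_id": row[1], "name": row[2]} if row else {}

        def get_dataset_content(self, dataset_id: int) -> pd.DataFrame:
            with self.conn, self.conn.cursor() as cur:
                cur.execute("SELECT event FROM dataset_events WHERE dataset_id = %s", (dataset_id,))
                return pd.DataFrame([r[0] for r in cur.fetchall()])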
@@ -1,170 +1,135 @@
-import pandas as pd
-import datetime
-import nltk

+import nltk
+import pandas as pd
 from nltk.corpus import stopwords
-from server.analysis.nlp import NLP
-from server.analysis.temporal import TemporalAnalysis

+from server.analysis.cultural import CulturalAnalysis
 from server.analysis.emotional import EmotionalAnalysis
 from server.analysis.interactional import InteractionAnalysis
 from server.analysis.linguistic import LinguisticAnalysis
-from server.analysis.cultural import CulturalAnalysis
+from server.analysis.temporal import TemporalAnalysis


 DOMAIN_STOPWORDS = {
-    "www", "https", "http",
-    "boards", "boardsie",
-    "comment", "comments",
-    "discussion", "thread",
-    "post", "posts",
-    "would", "get", "one"
+    "www",
+    "https",
+    "http",
+    "boards",
+    "boardsie",
+    "comment",
+    "comments",
+    "discussion",
+    "thread",
+    "post",
+    "posts",
+    "would",
+    "get",
+    "one",
 }

-nltk.download('stopwords')
-EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS
+nltk.download("stopwords")
+EXCLUDE_WORDS = set(stopwords.words("english")) | DOMAIN_STOPWORDS


 class StatGen:
-    def __init__(self, df: pd.DataFrame, domain_topics: dict) -> None:
-        comments_df = df[["id", "comments"]].explode("comments")
-        comments_df = comments_df[comments_df["comments"].apply(lambda x: isinstance(x, dict))]
-        comments_df = pd.json_normalize(comments_df["comments"])
+    def __init__(self) -> None:
+        self.temporal_analysis = TemporalAnalysis()
+        self.emotional_analysis = EmotionalAnalysis()
+        self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS)
+        self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS)
+        self.cultural_analysis = CulturalAnalysis()

-        posts_df = df.drop(columns=["comments"])
-        posts_df["type"] = "post"
-        posts_df["parent_id"] = None
-
-        comments_df["type"] = "comment"
-        comments_df["parent_id"] = comments_df.get("post_id")
-        self.domain_topics = domain_topics
-
-        self.df = pd.concat([posts_df, comments_df])
-        self.df.drop(columns=["post_id"], inplace=True, errors="ignore")
-
-        self.nlp = NLP(self.df, "title", "content", domain_topics)
-        self.nlp.add_emotion_cols()
-        self.nlp.add_topic_col()
-        self.nlp.add_ner_cols()
-        self._add_time_cols(self.df)
-
-        self.temporal_analysis = TemporalAnalysis(self.df)
-        self.emotional_analysis = EmotionalAnalysis(self.df)
-        self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
-        self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
-        self.cultural_analysis = CulturalAnalysis(self.df)
-
-        self.original_df = self.df.copy(deep=True)
-
-    ## Private Methods
-    def _add_time_cols(self, df: pd.DataFrame) -> None:
-        df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')
-        df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date
-        df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
-        df["hour"] = df["dt"].dt.hour
-        df["weekday"] = df["dt"].dt.day_name()
-
-    ## Public
-
-    # topics over time
-    # emotions over time
-    def get_time_analysis(self) -> dict:
+    def get_time_analysis(self, df: pd.DataFrame) -> dict:
         return {
-            "events_per_day": self.temporal_analysis.posts_per_day(),
-            "weekday_hour_heatmap": self.temporal_analysis.heatmap()
+            "events_per_day": self.temporal_analysis.posts_per_day(df),
+            "weekday_hour_heatmap": self.temporal_analysis.heatmap(df),
         }

-    # average topic duration
-    def get_content_analysis(self) -> dict:
+    def get_content_analysis(self, df: pd.DataFrame) -> dict:
         return {
-            "word_frequencies": self.linguistic_analysis.word_frequencies(),
-            "common_two_phrases": self.linguistic_analysis.ngrams(),
-            "common_three_phrases": self.linguistic_analysis.ngrams(n=3),
-            "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(),
-            "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
+            "word_frequencies": self.linguistic_analysis.word_frequencies(df),
+            "common_two_phrases": self.linguistic_analysis.ngrams(df),
+            "common_three_phrases": self.linguistic_analysis.ngrams(df, n=3),
+            "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(df),
+            "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(df),
         }

-    # average emotion per user
-    # average chain length
-    def get_user_analysis(self) -> dict:
-        return {
-            "top_users": self.interaction_analysis.top_users(),
-            "users": self.interaction_analysis.per_user_analysis()
-        }
-
-    # average / max thread depth
-    # high engagment threads based on volume
-    def get_interactional_analysis(self) -> dict:
-        return {
-            "average_thread_depth": self.interaction_analysis.average_thread_depth(),
-            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(),
-            "interaction_graph": self.interaction_analysis.interaction_graph()
-        }
-
-    # detect community jargon
-    # in-group and out-group linguistic markers
-    def get_cultural_analysis(self) -> dict:
-        return {
-            "identity_markers": self.cultural_analysis.get_identity_markers(),
-            "stance_markers": self.cultural_analysis.get_stance_markers(),
-            "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity()
-        }
-
-    def summary(self) -> dict:
-        total_posts = (self.df["type"] == "post").sum()
-        total_comments = (self.df["type"] == "comment").sum()
-
-        events_per_user = self.df.groupby("author").size()
+    def get_user_analysis(self, df: pd.DataFrame) -> dict:
+        return {
+            "top_users": self.interaction_analysis.top_users(df),
+            "users": self.interaction_analysis.per_user_analysis(df),
+            "interaction_graph": self.interaction_analysis.interaction_graph(df),
+        }
+
+    def get_interactional_analysis(self, df: pd.DataFrame) -> dict:
+        return {
+            "average_thread_depth": self.interaction_analysis.average_thread_depth(df),
+            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(df),
+        }
+
+    def get_cultural_analysis(self, df: pd.DataFrame) -> dict:
+        return {
+            "identity_markers": self.cultural_analysis.get_identity_markers(df),
+            "stance_markers": self.cultural_analysis.get_stance_markers(df),
+            "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(df),
+        }
+
+    def summary(self, df: pd.DataFrame) -> dict:
+        total_posts = (df["type"] == "post").sum()
+        total_comments = (df["type"] == "comment").sum()
+        events_per_user = df.groupby("author").size()

         return {
-            "total_events": int(len(self.df)),
+            "total_events": int(len(df)),
             "total_posts": int(total_posts),
             "total_comments": int(total_comments),
             "unique_users": int(events_per_user.count()),
             "comments_per_post": round(total_comments / max(total_posts, 1), 2),
             "lurker_ratio": round((events_per_user == 1).mean(), 2),
             "time_range": {
-                "start": int(self.df["dt"].min().timestamp()),
-                "end": int(self.df["dt"].max().timestamp())
+                "start": int(df["dt"].min().timestamp()),
+                "end": int(df["dt"].max().timestamp()),
             },
-            "sources": self.df["source"].dropna().unique().tolist()
-        }
-
-    def filter_by_query(self, search_query: str) -> dict:
-        self.df = self.df[
-            self.df["content"].str.contains(search_query)
-        ]
-
-        return {
-            "rows": len(self.df),
-            "data": self.df.to_dict(orient="records")
-        }
-
-    def set_time_range(self, start: datetime.datetime, end: datetime.datetime) -> dict:
-        self.df = self.df[
-            (self.df["dt"] >= start) &
-            (self.df["dt"] <= end)
-        ]
-
-        return {
-            "rows": len(self.df),
-            "data": self.df.to_dict(orient="records")
-        }
-
-    """
-    Input is a hash map (source_name: str -> enabled: bool)
-    """
-    def filter_data_sources(self, data_sources: dict) -> dict:
-        enabled_sources = [src for src, enabled in data_sources.items() if enabled]
-
-        if not enabled_sources:
-            raise ValueError("Please choose at least one data source")
-
-        self.df = self.df[self.df["source"].isin(enabled_sources)]
-
-        return {
-            "rows": len(self.df),
-            "data": self.df.to_dict(orient="records")
+            "sources": df["source"].dropna().unique().tolist(),
+        }


-    def reset_dataset(self) -> None:
-        self.df = self.original_df.copy(deep=True)
+    # def filter_by_query(self, df: pd.DataFrame, search_query: str) -> dict:
+    #     filtered_df = df[df["content"].str.contains(search_query, na=False)]

+    #     return {
+    #         "rows": len(filtered_df),
+    #         "data": filtered_df.to_dict(orient="records"),
+    #     }

+    # def set_time_range(
+    #     self,
+    #     original_df: pd.DataFrame,
+    #     start: datetime.datetime,
+    #     end: datetime.datetime,
+    # ) -> dict:
+    #     df = self._prepare_df(original_df)
+    #     filtered_df = df[(df["dt"] >= start) & (df["dt"] <= end)]

+    #     return {
+    #         "rows": len(filtered_df),
+    #         "data": filtered_df.to_dict(orient="records"),
+    #     }

+    # def filter_data_sources(
+    #     self, original_df: pd.DataFrame, data_sources: dict
+    # ) -> dict:
+    #     df = self._prepare_df(original_df)
+    #     enabled_sources = [src for src, enabled in data_sources.items() if enabled]

+    #     if not enabled_sources:
+    #         raise ValueError("Please choose at least one data source")

+    #     filtered_df = df[df["source"].isin(enabled_sources)]

+    #     return {
+    #         "rows": len(filtered_df),
+    #         "data": filtered_df.to_dict(orient="records"),
+    #     }

+    # def reset_dataset(self, original_df: pd.DataFrame) -> pd.DataFrame:
+    #     return self._prepare_df(original_df)
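End-to-end, the new flow is: register, log in for a JWT, upload a .jsonl posts file plus a .json topics file, then hit the per-dataset stats routes. A rough client sketch; the base URL and the dataset id are assumptions, since the /upload response currently returns only a message and event_count rather than the new dataset_id:

    import requests

    BASE = "http://localhost:5000"  # assumed dev server address

    requests.post(f"{BASE}/register", json={"username": "alice", "email": "alice@example.com", "password": "pw"})
    token = requests.post(f"{BASE}/login", json={"username": "alice", "password": "pw"}).json()["access_token"]
    headers = {"Authorization": f"Bearer {token}"}

    with open("small.jsonl", "rb") as posts, open("topic_buckets.json", "rb") as topics:
        print(requests.post(f"{BASE}/upload", headers=headers, files={"posts": posts, "topics": topics}).json())

    dataset_id = 1  # assumed: the id is looked up separately since /upload does not return it
    print(requests.get(f"{BASE}/dataset/{dataset_id}/summary", headers=headers).json())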