Compare commits

..

3 Commits

9 changed files with 422 additions and 396 deletions

View File

@@ -64,6 +64,7 @@ class PostgresConnector:
query = """ query = """
INSERT INTO events ( INSERT INTO events (
dataset_id, dataset_id,
type,
parent_id, parent_id,
author, author,
content, content,
@@ -87,7 +88,7 @@ class PostgresConnector:
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
%s, %s, %s, %s %s, %s, %s, %s, %s
) )
""" """
@@ -96,6 +97,7 @@ class PostgresConnector:
for _, row in event_data.iterrows(): for _, row in event_data.iterrows():
values.append(( values.append((
dataset_id, dataset_id,
row["type"],
row["parent_id"], row["parent_id"],
row["author"], row["author"],
row["content"], row["content"],
@@ -121,14 +123,15 @@ class PostgresConnector:
execute_batch(cursor, query, values) execute_batch(cursor, query, values)
self.connection.commit() self.connection.commit()
def get_dataset_by_id(self, dataset_id: int) -> pd.DataFrame: def get_dataset_content(self, dataset_id: int) -> pd.DataFrame:
query = "SELECT * FROM events WHERE dataset_id = %s" query = "SELECT * FROM events WHERE dataset_id = %s"
result = self.execute(query, (dataset_id,), fetch=True) result = self.execute(query, (dataset_id,), fetch=True)
return pd.DataFrame(result) return pd.DataFrame(result)
def get_datasets_for_user(self, user_id: int) -> list: def get_dataset_info(self, dataset_id: int) -> dict:
query = "SELECT * FROM datasets WHERE user_id = %s" query = "SELECT * FROM datasets WHERE id = %s"
return self.execute(query, (user_id,), fetch=True) result = self.execute(query, (dataset_id,), fetch=True)
return result[0] if result else None
def close(self): def close(self):
if self.connection: if self.connection:

View File

@@ -20,6 +20,7 @@ CREATE TABLE events (
/* Required Fields */ /* Required Fields */
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
dataset_id INTEGER NOT NULL, dataset_id INTEGER NOT NULL,
type VARCHAR(255) NOT NULL,
author VARCHAR(255) NOT NULL, author VARCHAR(255) NOT NULL,
content TEXT NOT NULL, content TEXT NOT NULL,

View File

@@ -6,13 +6,12 @@ from typing import Any
class CulturalAnalysis: class CulturalAnalysis:
def __init__(self, df: pd.DataFrame, content_col: str = "content", topic_col: str = "topic"): def __init__(self, content_col: str = "content", topic_col: str = "topic"):
self.df = df
self.content_col = content_col self.content_col = content_col
self.topic_col = topic_col self.topic_col = topic_col
def get_identity_markers(self): def get_identity_markers(self, original_df: pd.DataFrame) -> dict[str, Any]:
df = self.df.copy() df = original_df.copy()
s = df[self.content_col].fillna("").astype(str).str.lower() s = df[self.content_col].fillna("").astype(str).str.lower()
in_group_words = {"we", "us", "our", "ourselves"} in_group_words = {"we", "us", "our", "ourselves"}
@@ -60,8 +59,8 @@ class CulturalAnalysis:
return result return result
def get_stance_markers(self) -> dict[str, Any]: def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
s = self.df[self.content_col].fillna("").astype(str) s = df[self.content_col].fillna("").astype(str)
hedges = { hedges = {
"maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem", "maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem",
@@ -104,13 +103,11 @@ class CulturalAnalysis:
"permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3), "permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3),
} }
def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]: def get_avg_emotions_per_entity(self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]:
if "entities" not in self.df.columns: if "entities" not in df.columns:
return {"entity_emotion_avg": {}} return {"entity_emotion_avg": {}}
df = self.df
emotion_cols = [c for c in df.columns if c.startswith("emotion_")] emotion_cols = [c for c in df.columns if c.startswith("emotion_")]
entity_counter = Counter() entity_counter = Counter()
for row in df["entities"].dropna(): for row in df["entities"].dropna():

View File

@@ -1,18 +1,15 @@
import pandas as pd import pandas as pd
class EmotionalAnalysis: class EmotionalAnalysis:
def __init__(self, df: pd.DataFrame): def avg_emotion_by_topic(self, df: pd.DataFrame) -> dict:
self.df = df
def avg_emotion_by_topic(self) -> dict:
emotion_cols = [ emotion_cols = [
col for col in self.df.columns col for col in df.columns
if col.startswith("emotion_") if col.startswith("emotion_")
] ]
counts = ( counts = (
self.df[ df[
(self.df["topic"] != "Misc") (df["topic"] != "Misc")
] ]
.groupby("topic") .groupby("topic")
.size() .size()
@@ -20,8 +17,8 @@ class EmotionalAnalysis:
) )
avg_emotion_by_topic = ( avg_emotion_by_topic = (
self.df[ df[
(self.df["topic"] != "Misc") (df["topic"] != "Misc")
] ]
.groupby("topic")[emotion_cols] .groupby("topic")[emotion_cols]
.mean() .mean()

View File

@@ -5,8 +5,7 @@ from collections import Counter
class InteractionAnalysis: class InteractionAnalysis:
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): def __init__(self, word_exclusions: set[str]):
self.df = df
self.word_exclusions = word_exclusions self.word_exclusions = word_exclusions
def _tokenize(self, text: str): def _tokenize(self, text: str):
@@ -14,9 +13,9 @@ class InteractionAnalysis:
return [t for t in tokens if t not in self.word_exclusions] return [t for t in tokens if t not in self.word_exclusions]
def _vocab_richness_per_user( def _vocab_richness_per_user(
self, min_words: int = 20, top_most_used_words: int = 100 self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100
) -> list: ) -> list:
df = self.df.copy() df = df.copy()
df["content"] = df["content"].fillna("").astype(str).str.lower() df["content"] = df["content"].fillna("").astype(str).str.lower()
df["tokens"] = df["content"].apply(self._tokenize) df["tokens"] = df["content"].apply(self._tokenize)
@@ -58,10 +57,8 @@ class InteractionAnalysis:
return rows return rows
def top_users(self) -> list: def top_users(self, df: pd.DataFrame) -> list:
counts = ( counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)
self.df.groupby(["author", "source"]).size().sort_values(ascending=False)
)
top_users = [ top_users = [
{"author": author, "source": source, "count": int(count)} {"author": author, "source": source, "count": int(count)}
@@ -70,14 +67,14 @@ class InteractionAnalysis:
return top_users return top_users
def per_user_analysis(self) -> dict: def per_user_analysis(self, df: pd.DataFrame) -> dict:
per_user = self.df.groupby(["author", "type"]).size().unstack(fill_value=0) per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
emotion_cols = [col for col in self.df.columns if col.startswith("emotion_")] emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
avg_emotions_by_author = {} avg_emotions_by_author = {}
if emotion_cols: if emotion_cols:
avg_emotions = self.df.groupby("author")[emotion_cols].mean().fillna(0.0) avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
avg_emotions_by_author = { avg_emotions_by_author = {
author: {emotion: float(score) for emotion, score in row.items()} author: {emotion: float(score) for emotion, score in row.items()}
for author, row in avg_emotions.iterrows() for author, row in avg_emotions.iterrows()
@@ -97,7 +94,7 @@ class InteractionAnalysis:
per_user = per_user.sort_values("comment_post_ratio", ascending=True) per_user = per_user.sort_values("comment_post_ratio", ascending=True)
per_user_records = per_user.reset_index().to_dict(orient="records") per_user_records = per_user.reset_index().to_dict(orient="records")
vocab_rows = self._vocab_richness_per_user() vocab_rows = self._vocab_richness_per_user(df)
vocab_by_author = {row["author"]: row for row in vocab_rows} vocab_by_author = {row["author"]: row for row in vocab_rows}
# merge vocab richness + per_user information # merge vocab richness + per_user information
@@ -112,7 +109,14 @@ class InteractionAnalysis:
"comment_post_ratio": float(row.get("comment_post_ratio", 0)), "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
"comment_share": float(row.get("comment_share", 0)), "comment_share": float(row.get("comment_share", 0)),
"avg_emotions": avg_emotions_by_author.get(author, {}), "avg_emotions": avg_emotions_by_author.get(author, {}),
"vocab": vocab_by_author.get(author, {"vocab_richness": 0, "avg_words_per_event": 0, "top_words": []}), "vocab": vocab_by_author.get(
author,
{
"vocab_richness": 0,
"avg_words_per_event": 0,
"top_words": [],
},
),
} }
) )
@@ -120,13 +124,13 @@ class InteractionAnalysis:
return merged_users return merged_users
def interaction_graph(self): def interaction_graph(self, df: pd.DataFrame):
interactions = {a: {} for a in self.df["author"].dropna().unique()} interactions = {a: {} for a in df["author"].dropna().unique()}
# reply_to refers to the comment id, this allows us to map comment ids to usernames # reply_to refers to the comment id, this allows us to map comment ids to usernames
id_to_author = self.df.set_index("id")["author"].to_dict() id_to_author = df.set_index("id")["author"].to_dict()
for _, row in self.df.iterrows(): for _, row in df.iterrows():
a = row["author"] a = row["author"]
reply_id = row["reply_to"] reply_id = row["reply_to"]
@@ -141,10 +145,10 @@ class InteractionAnalysis:
return interactions return interactions
def average_thread_depth(self): def average_thread_depth(self, df: pd.DataFrame):
depths = [] depths = []
id_to_reply = self.df.set_index("id")["reply_to"].to_dict() id_to_reply = df.set_index("id")["reply_to"].to_dict()
for _, row in self.df.iterrows(): for _, row in df.iterrows():
depth = 0 depth = 0
current_id = row["id"] current_id = row["id"]
@@ -163,16 +167,16 @@ class InteractionAnalysis:
return round(sum(depths) / len(depths), 2) return round(sum(depths) / len(depths), 2)
def average_thread_length_by_emotion(self): def average_thread_length_by_emotion(self, df: pd.DataFrame):
emotion_exclusions = {"emotion_neutral", "emotion_surprise"} emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
emotion_cols = [ emotion_cols = [
c c
for c in self.df.columns for c in df.columns
if c.startswith("emotion_") and c not in emotion_exclusions if c.startswith("emotion_") and c not in emotion_exclusions
] ]
id_to_reply = self.df.set_index("id")["reply_to"].to_dict() id_to_reply = df.set_index("id")["reply_to"].to_dict()
length_cache = {} length_cache = {}
def thread_length_from(start_id): def thread_length_from(start_id):
@@ -211,7 +215,7 @@ class InteractionAnalysis:
emotion_to_lengths = {} emotion_to_lengths = {}
# Fill NaNs in emotion cols to avoid max() issues # Fill NaNs in emotion cols to avoid max() issues
emo_df = self.df[["id"] + emotion_cols].copy() emo_df = df[["id"] + emotion_cols].copy()
emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0) emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
for _, row in emo_df.iterrows(): for _, row in emo_df.iterrows():

View File

@@ -4,9 +4,9 @@ import re
from collections import Counter from collections import Counter
from itertools import islice from itertools import islice
class LinguisticAnalysis: class LinguisticAnalysis:
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): def __init__(self, word_exclusions: set[str]):
self.df = df
self.word_exclusions = word_exclusions self.word_exclusions = word_exclusions
def _tokenize(self, text: str): def _tokenize(self, text: str):
@@ -14,29 +14,20 @@ class LinguisticAnalysis:
return [t for t in tokens if t not in self.word_exclusions] return [t for t in tokens if t not in self.word_exclusions]
def _clean_text(self, text: str) -> str: def _clean_text(self, text: str) -> str:
text = re.sub(r"http\S+", "", text) # remove URLs text = re.sub(r"http\S+", "", text) # remove URLs
text = re.sub(r"www\S+", "", text) text = re.sub(r"www\S+", "", text)
text = re.sub(r"&\w+;", "", text) # remove HTML entities text = re.sub(r"&\w+;", "", text) # remove HTML entities
text = re.sub(r"\bamp\b", "", text) # remove stray amp text = re.sub(r"\bamp\b", "", text) # remove stray amp
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text) text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
return text return text
def word_frequencies(self, limit: int = 100) -> dict: def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
texts = ( texts = df["content"].dropna().astype(str).str.lower()
self.df["content"]
.dropna()
.astype(str)
.str.lower()
)
words = [] words = []
for text in texts: for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text) tokens = re.findall(r"\b[a-z]{3,}\b", text)
words.extend( words.extend(w for w in tokens if w not in self.word_exclusions)
w for w in tokens
if w not in self.word_exclusions
)
counts = Counter(words) counts = Counter(words)
@@ -49,15 +40,15 @@ class LinguisticAnalysis:
return word_frequencies.to_dict(orient="records") return word_frequencies.to_dict(orient="records")
def ngrams(self, n=2, limit=100): def ngrams(self, df: pd.DataFrame, n=2, limit=100):
texts = self.df["content"].dropna().astype(str).apply(self._clean_text).str.lower() texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
all_ngrams = [] all_ngrams = []
for text in texts: for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text) tokens = re.findall(r"\b[a-z]{3,}\b", text)
# stop word removal causes strange behaviors in ngrams # stop word removal causes strange behaviors in ngrams
#tokens = [w for w in tokens if w not in self.word_exclusions] # tokens = [w for w in tokens if w not in self.word_exclusions]
ngrams = zip(*(islice(tokens, i, None) for i in range(n))) ngrams = zip(*(islice(tokens, i, None) for i in range(n)))
all_ngrams.extend([" ".join(ng) for ng in ngrams]) all_ngrams.extend([" ".join(ng) for ng in ngrams])

View File

@@ -1,16 +1,14 @@
import pandas as pd import pandas as pd
class TemporalAnalysis:
def __init__(self, df: pd.DataFrame):
self.df = df
def avg_reply_time_per_emotion(self) -> dict: class TemporalAnalysis:
df = self.df.copy() def avg_reply_time_per_emotion(self, df: pd.DataFrame) -> list[dict]:
df = df.copy()
replies = df[ replies = df[
(df["type"] == "comment") & (df["type"] == "comment")
(df["reply_to"].notna()) & & (df["reply_to"].notna())
(df["reply_to"] != "") & (df["reply_to"] != "")
] ]
id_to_time = df.set_index("id")["dt"].to_dict() id_to_time = df.set_index("id")["dt"].to_dict()
@@ -25,42 +23,45 @@ class TemporalAnalysis:
return (row["dt"] - parent_time).total_seconds() return (row["dt"] - parent_time).total_seconds()
replies["reply_time"] = replies.apply(compute_reply_time, axis=1) replies["reply_time"] = replies.apply(compute_reply_time, axis=1)
emotion_cols = [col for col in df.columns if col.startswith("emotion_") and col not in ("emotion_neutral", "emotion_surprise")] emotion_cols = [
col
for col in df.columns
if col.startswith("emotion_")
and col not in ("emotion_neutral", "emotion_surprise")
]
replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1) replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1)
grouped = ( grouped = (
replies replies.groupby("dominant_emotion")["reply_time"]
.groupby("dominant_emotion")["reply_time"]
.agg(["mean", "count"]) .agg(["mean", "count"])
.reset_index() .reset_index()
) )
return grouped.to_dict(orient="records") return grouped.to_dict(orient="records")
def posts_per_day(self) -> dict: def posts_per_day(self, df: pd.DataFrame) -> list[dict]:
per_day = ( per_day = df.groupby("date").size().reset_index(name="count")
self.df.groupby("date")
.size()
.reset_index(name="count")
)
return per_day.to_dict(orient="records") return per_day.to_dict(orient="records")
def heatmap(self) -> dict: def heatmap(self, df: pd.DataFrame) -> list[dict]:
weekday_order = [ weekday_order = [
"Monday", "Tuesday", "Wednesday", "Monday",
"Thursday", "Friday", "Saturday", "Sunday" "Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday",
] ]
self.df["weekday"] = pd.Categorical( df = df.copy()
self.df["weekday"], df["weekday"] = pd.Categorical(
categories=weekday_order, df["weekday"], categories=weekday_order, ordered=True
ordered=True
) )
heatmap = ( heatmap = (
self.df df.groupby(["weekday", "hour"], observed=True)
.groupby(["weekday", "hour"], observed=True)
.size() .size()
.unstack(fill_value=0) .unstack(fill_value=0)
.reindex(columns=range(24), fill_value=0) .reindex(columns=range(24), fill_value=0)

View File

@@ -8,7 +8,7 @@ from flask_jwt_extended import (
JWTManager, JWTManager,
create_access_token, create_access_token,
jwt_required, jwt_required,
get_jwt_identity get_jwt_identity,
) )
from server.stat_gen import StatGen from server.stat_gen import StatGen
@@ -27,7 +27,9 @@ db = PostgresConnector()
load_dotenv() load_dotenv()
frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173") frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173")
jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this") jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this")
jwt_access_token_expires = int(os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)) # Default to 20 minutes jwt_access_token_expires = int(
os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)
) # Default to 20 minutes
# Flask Configuration # Flask Configuration
CORS(app, resources={r"/*": {"origins": frontend_url}}) CORS(app, resources={r"/*": {"origins": frontend_url}})
@@ -38,18 +40,19 @@ bcrypt = Bcrypt(app)
jwt = JWTManager(app) jwt = JWTManager(app)
auth_manager = AuthManager(db, bcrypt) auth_manager = AuthManager(db, bcrypt)
# Global State stat_gen = StatGen()
# posts_df = pd.read_json('small.jsonl', lines=True)
# with open("topic_buckets.json", "r", encoding="utf-8") as f:
# domain_topics = json.load(f)
# stat_obj = StatGen(posts_df, domain_topics)
stat_obj = None
@app.route('/register', methods=['POST'])
@app.route("/register", methods=["POST"])
def register_user(): def register_user():
data = request.get_json() data = request.get_json()
if not data or "username" not in data or "email" not in data or "password" not in data: if (
not data
or "username" not in data
or "email" not in data
or "password" not in data
):
return jsonify({"error": "Missing username, email, or password"}), 400 return jsonify({"error": "Missing username, email, or password"}), 400
username = data["username"] username = data["username"]
@@ -67,7 +70,8 @@ def register_user():
print(f"Registered new user: {username}") print(f"Registered new user: {username}")
return jsonify({"message": f"User '{username}' registered successfully"}), 200 return jsonify({"message": f"User '{username}' registered successfully"}), 200
@app.route('/login', methods=['POST'])
@app.route("/login", methods=["POST"])
def login_user(): def login_user():
data = request.get_json() data = request.get_json()
@@ -80,7 +84,7 @@ def login_user():
try: try:
user = auth_manager.authenticate_user(username, password) user = auth_manager.authenticate_user(username, password)
if user: if user:
access_token = create_access_token(identity=str(user['id'])) access_token = create_access_token(identity=str(user["id"]))
return jsonify({"access_token": access_token}), 200 return jsonify({"access_token": access_token}), 200
else: else:
return jsonify({"error": "Invalid username or password"}), 401 return jsonify({"error": "Invalid username or password"}), 401
@@ -88,18 +92,18 @@ def login_user():
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/profile", methods=["GET"]) @app.route("/profile", methods=["GET"])
@jwt_required() @jwt_required()
def profile(): def profile():
current_user = get_jwt_identity() current_user = get_jwt_identity()
return jsonify( return jsonify(
message="Access granted", message="Access granted", user=auth_manager.get_user_by_id(current_user)
user=auth_manager.get_user_by_id(current_user)
), 200 ), 200
@app.route('/upload', methods=['POST']) @app.route("/upload", methods=["POST"])
@jwt_required() @jwt_required()
def upload_data(): def upload_data():
if "posts" not in request.files or "topics" not in request.files: if "posts" not in request.files or "topics" not in request.files:
@@ -111,8 +115,12 @@ def upload_data():
if post_file.filename == "" or topic_file == "": if post_file.filename == "" or topic_file == "":
return jsonify({"error": "Empty filename"}), 400 return jsonify({"error": "Empty filename"}), 400
if not post_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'): if not post_file.filename.endswith(".jsonl") or not topic_file.filename.endswith(
return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400 ".json"
):
return jsonify(
{"error": "Invalid file type. Only .jsonl and .json files are allowed."}
), 400
try: try:
current_user = get_jwt_identity() current_user = get_jwt_identity()
@@ -122,167 +130,226 @@ def upload_data():
processor = DatasetProcessor(posts_df, topics) processor = DatasetProcessor(posts_df, topics)
enriched_df = processor.enrich() enriched_df = processor.enrich()
dataset_id = db.save_dataset_info(current_user, f"dataset_{current_user}", topics) dataset_id = db.save_dataset_info(
current_user, f"dataset_{current_user}", topics
)
db.save_dataset_content(dataset_id, enriched_df) db.save_dataset_content(dataset_id, enriched_df)
return jsonify({"message": "File uploaded successfully", "event_count": len(enriched_df)}), 200 return jsonify(
{"message": "File uploaded successfully", "event_count": len(enriched_df), "dataset_id": dataset_id}
), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400 return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
except Exception as e: except Exception as e:
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/dataset/<int:dataset_id>', methods=['GET'])
@app.route("/dataset/<int:dataset_id>", methods=["GET"])
@jwt_required()
def get_dataset(dataset_id): def get_dataset(dataset_id):
if stat_obj is None: current_user = get_jwt_identity()
return jsonify({"error": "No data uploaded"}), 400 dataset = db.get_dataset_info(dataset_id)
return stat_obj.df.to_json(orient="records"), 200, {"Content-Type": "application/json"} if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
@app.route('/stats/content', methods=['GET']) dataset_content = db.get_dataset_content(dataset_id)
def word_frequencies():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
if dataset_content.empty:
return jsonify({"error": "Dataset content not found"}), 404
return jsonify(dataset_content.to_dict(orient="records")), 200
@app.route("/dataset/<int:dataset_id>/content", methods=["GET"])
@jwt_required()
def content_endpoint(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_content_analysis()), 200 return jsonify(stat_gen.get_content_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/stats/summary', methods=["GET"])
def get_summary(): @app.route("/dataset/<int:dataset_id>/summary", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_summary(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.summary()), 200 return jsonify(stat_gen.summary(dataset_content)), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/time", methods=["GET"])
def get_time_analysis():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.get_time_analysis()), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/user", methods=["GET"])
def get_user_analysis(): @app.route("/dataset/<int:dataset_id>/time", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_time_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_user_analysis()), 200 return jsonify(stat_gen.get_time_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/cultural", methods=["GET"])
def get_cultural_analysis(): @app.route("/dataset/<int:dataset_id>/user", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_user_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_cultural_analysis()), 200 return jsonify(stat_gen.get_user_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/interaction", methods=["GET"])
def get_interaction_analysis(): @app.route("/dataset/<int:dataset_id>/cultural", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_cultural_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_interactional_analysis()), 200 return jsonify(stat_gen.get_cultural_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/filter/query', methods=["POST"])
def filter_query():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
@app.route("/dataset/<int:dataset_id>/interaction", methods=["GET"])
@jwt_required()
def get_interaction_analysis(dataset_id):
    """Return interaction (thread-structure) analysis for one dataset.

    The caller must be authenticated and must own the dataset; ownership is
    checked against the dataset row's ``user_id``.

    Returns:
        200 with the analysis dict, 403 for foreign datasets, 404 when the
        dataset does not exist, 400 on malformed data, 500 otherwise.
    """
    current_user = get_jwt_identity()
    dataset = db.get_dataset_info(dataset_id)
    # get_dataset_info returns None when no row matches; without this guard
    # dataset.get(...) below would raise AttributeError on unknown ids.
    if dataset is None:
        return jsonify({"error": "Dataset not found"}), 404
    if dataset.get("user_id") != int(current_user):
        return jsonify({"error": "Unauthorized access to dataset"}), 403
    dataset_content = db.get_dataset_content(dataset_id)
    try:
        return jsonify(stat_gen.get_interactional_analysis(dataset_content)), 200
    except ValueError as e:
        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
    except Exception as e:
        print(traceback.format_exc())
        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
# NOTE(review): legacy in-memory filter endpoints from before the
# per-dataset/JWT refactor, kept commented out for reference only.
# @app.route("/filter/query", methods=["POST"])
# def filter_query():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# data = request.get_json(silent=True) or {}
# if "query" not in data:
# return jsonify(stat_obj.df.to_dict(orient="records")), 200
# query = data["query"]
# filtered_df = stat_obj.filter_by_query(query)
# return jsonify(filtered_df), 200
# @app.route("/filter/time", methods=["POST"])
# def filter_time():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# data = request.get_json(silent=True)
# if not data:
# return jsonify({"error": "Invalid or missing JSON body"}), 400
# if "start" not in data or "end" not in data:
# return jsonify({"error": "Please include both start and end dates"}), 400
# try:
# start = pd.to_datetime(data["start"], utc=True)
# end = pd.to_datetime(data["end"], utc=True)
# filtered_df = stat_obj.set_time_range(start, end)
# return jsonify(filtered_df), 200
# except Exception:
# return jsonify({"error": "Invalid datetime format"}), 400
# @app.route("/filter/sources", methods=["POST"])
# def filter_sources():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# data = request.get_json(silent=True)
# if not data:
# return jsonify({"error": "Invalid or missing JSON body"}), 400
# if "sources" not in data:
# return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400
# try:
# filtered_df = stat_obj.filter_data_sources(data["sources"])
# return jsonify(filtered_df), 200
# except ValueError:
# return jsonify({"error": "Please enable at least one data source"}), 400
# except Exception as e:
# return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500
# @app.route("/filter/reset", methods=["GET"])
# def reset_dataset():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# try:
# stat_obj.reset_dataset()
# return jsonify({"success": "Dataset successfully reset"})
# except Exception as e:
# print(traceback.format_exc())
# return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
if __name__ == "__main__":
    # Development entry point only; debug mode must not be used in production.
    app.run(debug=True)

# ---- second file: StatGen analysis module (server/analysis) ----
import datetime

import nltk
import pandas as pd
from nltk.corpus import stopwords

from server.analysis.cultural import CulturalAnalysis
from server.analysis.emotional import EmotionalAnalysis
from server.analysis.interactional import InteractionAnalysis
from server.analysis.linguistic import LinguisticAnalysis
from server.analysis.temporal import TemporalAnalysis

# Forum/domain-specific noise words excluded from word-frequency and n-gram
# analysis, in addition to the standard NLTK English stopword list.
DOMAIN_STOPWORDS = {
    "www",
    "https",
    "http",
    "boards",
    "boardsie",
    "comment",
    "comments",
    "discussion",
    "thread",
    "post",
    "posts",
    "would",
    "get",
    "one",
}

# Fetch the stopword corpus at import time (no-op when already downloaded).
nltk.download("stopwords")
EXCLUDE_WORDS = set(stopwords.words("english")) | DOMAIN_STOPWORDS
class StatGen:
    """Stateless statistics generator for social-media event data.

    Each public method takes an already-prepared events DataFrame and returns
    a JSON-serializable dict; no dataset state is kept on the instance, so a
    single StatGen can serve analyses for many datasets concurrently.
    """

    def __init__(self) -> None:
        # Analyzers hold no per-dataset state; the word-level ones share the
        # combined NLTK + domain stopword set.
        self.temporal_analysis = TemporalAnalysis()
        self.emotional_analysis = EmotionalAnalysis()
        self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS)
        self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS)
        self.cultural_analysis = CulturalAnalysis()

    def get_time_analysis(self, df: pd.DataFrame) -> dict:
        """Activity-over-time statistics."""
        return {
            "events_per_day": self.temporal_analysis.posts_per_day(df),
            "weekday_hour_heatmap": self.temporal_analysis.heatmap(df),
        }

    def get_content_analysis(self, df: pd.DataFrame) -> dict:
        """Linguistic and emotional statistics over event content."""
        return {
            "word_frequencies": self.linguistic_analysis.word_frequencies(df),
            "common_two_phrases": self.linguistic_analysis.ngrams(df),
            "common_three_phrases": self.linguistic_analysis.ngrams(df, n=3),
            "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(df),
            "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(df),
        }

    def get_user_analysis(self, df: pd.DataFrame) -> dict:
        """Per-user activity statistics plus the user interaction graph."""
        return {
            "top_users": self.interaction_analysis.top_users(df),
            "users": self.interaction_analysis.per_user_analysis(df),
            "interaction_graph": self.interaction_analysis.interaction_graph(df),
        }

    def get_interactional_analysis(self, df: pd.DataFrame) -> dict:
        """Thread-structure statistics (depth, length by emotion)."""
        return {
            "average_thread_depth": self.interaction_analysis.average_thread_depth(df),
            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(df),
        }

    def get_cultural_analysis(self, df: pd.DataFrame) -> dict:
        """Cultural markers and per-entity emotion statistics."""
        return {
            "identity_markers": self.cultural_analysis.get_identity_markers(df),
            "stance_markers": self.cultural_analysis.get_stance_markers(df),
            "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(df),
        }

    def summary(self, df: pd.DataFrame) -> dict:
        """High-level summary of an events DataFrame.

        Expects columns "type" ("post"/"comment"), "author", "source" and
        "dt" (timezone-aware datetimes) — presumably added by upstream data
        preparation; TODO confirm against the loader. An empty DataFrame will
        raise on the time_range computation.
        """
        total_posts = int((df["type"] == "post").sum())
        total_comments = int((df["type"] == "comment").sum())
        events_per_user = df.groupby("author").size()
        return {
            "total_events": int(len(df)),
            "total_posts": total_posts,
            "total_comments": total_comments,
            "unique_users": int(events_per_user.count()),
            # max(..., 1) guards against division by zero for comment-only data.
            "comments_per_post": round(total_comments / max(total_posts, 1), 2),
            # Share of authors with exactly one event.
            "lurker_ratio": round((events_per_user == 1).mean(), 2),
            "time_range": {
                "start": int(df["dt"].min().timestamp()),
                "end": int(df["dt"].max().timestamp()),
            },
            "sources": df["source"].dropna().unique().tolist(),
        }