Compare commits

..

3 Commits

9 changed files with 422 additions and 396 deletions

View File

@@ -64,6 +64,7 @@ class PostgresConnector:
query = """ query = """
INSERT INTO events ( INSERT INTO events (
dataset_id, dataset_id,
type,
parent_id, parent_id,
author, author,
content, content,
@@ -87,7 +88,7 @@ class PostgresConnector:
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
%s, %s, %s, %s %s, %s, %s, %s, %s
) )
""" """
@@ -96,6 +97,7 @@ class PostgresConnector:
for _, row in event_data.iterrows(): for _, row in event_data.iterrows():
values.append(( values.append((
dataset_id, dataset_id,
row["type"],
row["parent_id"], row["parent_id"],
row["author"], row["author"],
row["content"], row["content"],
@@ -121,14 +123,15 @@ class PostgresConnector:
execute_batch(cursor, query, values) execute_batch(cursor, query, values)
self.connection.commit() self.connection.commit()
def get_dataset_by_id(self, dataset_id: int) -> pd.DataFrame: def get_dataset_content(self, dataset_id: int) -> pd.DataFrame:
query = "SELECT * FROM events WHERE dataset_id = %s" query = "SELECT * FROM events WHERE dataset_id = %s"
result = self.execute(query, (dataset_id,), fetch=True) result = self.execute(query, (dataset_id,), fetch=True)
return pd.DataFrame(result) return pd.DataFrame(result)
def get_datasets_for_user(self, user_id: int) -> list: def get_dataset_info(self, dataset_id: int) -> dict:
query = "SELECT * FROM datasets WHERE user_id = %s" query = "SELECT * FROM datasets WHERE id = %s"
return self.execute(query, (user_id,), fetch=True) result = self.execute(query, (dataset_id,), fetch=True)
return result[0] if result else None
def close(self): def close(self):
if self.connection: if self.connection:

View File

@@ -20,6 +20,7 @@ CREATE TABLE events (
/* Required Fields */ /* Required Fields */
id SERIAL PRIMARY KEY, id SERIAL PRIMARY KEY,
dataset_id INTEGER NOT NULL, dataset_id INTEGER NOT NULL,
type VARCHAR(255) NOT NULL,
author VARCHAR(255) NOT NULL, author VARCHAR(255) NOT NULL,
content TEXT NOT NULL, content TEXT NOT NULL,

View File

@@ -6,13 +6,12 @@ from typing import Any
class CulturalAnalysis: class CulturalAnalysis:
def __init__(self, df: pd.DataFrame, content_col: str = "content", topic_col: str = "topic"): def __init__(self, content_col: str = "content", topic_col: str = "topic"):
self.df = df
self.content_col = content_col self.content_col = content_col
self.topic_col = topic_col self.topic_col = topic_col
def get_identity_markers(self): def get_identity_markers(self, original_df: pd.DataFrame) -> dict[str, Any]:
df = self.df.copy() df = original_df.copy()
s = df[self.content_col].fillna("").astype(str).str.lower() s = df[self.content_col].fillna("").astype(str).str.lower()
in_group_words = {"we", "us", "our", "ourselves"} in_group_words = {"we", "us", "our", "ourselves"}
@@ -60,8 +59,8 @@ class CulturalAnalysis:
return result return result
def get_stance_markers(self) -> dict[str, Any]: def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
s = self.df[self.content_col].fillna("").astype(str) s = df[self.content_col].fillna("").astype(str)
hedges = { hedges = {
"maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem", "maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem",
@@ -104,13 +103,11 @@ class CulturalAnalysis:
"permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3), "permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3),
} }
def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]: def get_avg_emotions_per_entity(self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]:
if "entities" not in self.df.columns: if "entities" not in df.columns:
return {"entity_emotion_avg": {}} return {"entity_emotion_avg": {}}
df = self.df
emotion_cols = [c for c in df.columns if c.startswith("emotion_")] emotion_cols = [c for c in df.columns if c.startswith("emotion_")]
entity_counter = Counter() entity_counter = Counter()
for row in df["entities"].dropna(): for row in df["entities"].dropna():

View File

@@ -1,18 +1,15 @@
import pandas as pd import pandas as pd
class EmotionalAnalysis: class EmotionalAnalysis:
def __init__(self, df: pd.DataFrame): def avg_emotion_by_topic(self, df: pd.DataFrame) -> dict:
self.df = df
def avg_emotion_by_topic(self) -> dict:
emotion_cols = [ emotion_cols = [
col for col in self.df.columns col for col in df.columns
if col.startswith("emotion_") if col.startswith("emotion_")
] ]
counts = ( counts = (
self.df[ df[
(self.df["topic"] != "Misc") (df["topic"] != "Misc")
] ]
.groupby("topic") .groupby("topic")
.size() .size()
@@ -20,8 +17,8 @@ class EmotionalAnalysis:
) )
avg_emotion_by_topic = ( avg_emotion_by_topic = (
self.df[ df[
(self.df["topic"] != "Misc") (df["topic"] != "Misc")
] ]
.groupby("topic")[emotion_cols] .groupby("topic")[emotion_cols]
.mean() .mean()

View File

@@ -5,8 +5,7 @@ from collections import Counter
class InteractionAnalysis: class InteractionAnalysis:
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): def __init__(self, word_exclusions: set[str]):
self.df = df
self.word_exclusions = word_exclusions self.word_exclusions = word_exclusions
def _tokenize(self, text: str): def _tokenize(self, text: str):
@@ -14,9 +13,9 @@ class InteractionAnalysis:
return [t for t in tokens if t not in self.word_exclusions] return [t for t in tokens if t not in self.word_exclusions]
def _vocab_richness_per_user( def _vocab_richness_per_user(
self, min_words: int = 20, top_most_used_words: int = 100 self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100
) -> list: ) -> list:
df = self.df.copy() df = df.copy()
df["content"] = df["content"].fillna("").astype(str).str.lower() df["content"] = df["content"].fillna("").astype(str).str.lower()
df["tokens"] = df["content"].apply(self._tokenize) df["tokens"] = df["content"].apply(self._tokenize)
@@ -58,10 +57,8 @@ class InteractionAnalysis:
return rows return rows
def top_users(self) -> list: def top_users(self, df: pd.DataFrame) -> list:
counts = ( counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)
self.df.groupby(["author", "source"]).size().sort_values(ascending=False)
)
top_users = [ top_users = [
{"author": author, "source": source, "count": int(count)} {"author": author, "source": source, "count": int(count)}
@@ -70,14 +67,14 @@ class InteractionAnalysis:
return top_users return top_users
def per_user_analysis(self) -> dict: def per_user_analysis(self, df: pd.DataFrame) -> dict:
per_user = self.df.groupby(["author", "type"]).size().unstack(fill_value=0) per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
emotion_cols = [col for col in self.df.columns if col.startswith("emotion_")] emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
avg_emotions_by_author = {} avg_emotions_by_author = {}
if emotion_cols: if emotion_cols:
avg_emotions = self.df.groupby("author")[emotion_cols].mean().fillna(0.0) avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
avg_emotions_by_author = { avg_emotions_by_author = {
author: {emotion: float(score) for emotion, score in row.items()} author: {emotion: float(score) for emotion, score in row.items()}
for author, row in avg_emotions.iterrows() for author, row in avg_emotions.iterrows()
@@ -97,7 +94,7 @@ class InteractionAnalysis:
per_user = per_user.sort_values("comment_post_ratio", ascending=True) per_user = per_user.sort_values("comment_post_ratio", ascending=True)
per_user_records = per_user.reset_index().to_dict(orient="records") per_user_records = per_user.reset_index().to_dict(orient="records")
vocab_rows = self._vocab_richness_per_user() vocab_rows = self._vocab_richness_per_user(df)
vocab_by_author = {row["author"]: row for row in vocab_rows} vocab_by_author = {row["author"]: row for row in vocab_rows}
# merge vocab richness + per_user information # merge vocab richness + per_user information
@@ -112,7 +109,14 @@ class InteractionAnalysis:
"comment_post_ratio": float(row.get("comment_post_ratio", 0)), "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
"comment_share": float(row.get("comment_share", 0)), "comment_share": float(row.get("comment_share", 0)),
"avg_emotions": avg_emotions_by_author.get(author, {}), "avg_emotions": avg_emotions_by_author.get(author, {}),
"vocab": vocab_by_author.get(author, {"vocab_richness": 0, "avg_words_per_event": 0, "top_words": []}), "vocab": vocab_by_author.get(
author,
{
"vocab_richness": 0,
"avg_words_per_event": 0,
"top_words": [],
},
),
} }
) )
@@ -120,13 +124,13 @@ class InteractionAnalysis:
return merged_users return merged_users
def interaction_graph(self): def interaction_graph(self, df: pd.DataFrame):
interactions = {a: {} for a in self.df["author"].dropna().unique()} interactions = {a: {} for a in df["author"].dropna().unique()}
# reply_to refers to the comment id, this allows us to map comment ids to usernames # reply_to refers to the comment id, this allows us to map comment ids to usernames
id_to_author = self.df.set_index("id")["author"].to_dict() id_to_author = df.set_index("id")["author"].to_dict()
for _, row in self.df.iterrows(): for _, row in df.iterrows():
a = row["author"] a = row["author"]
reply_id = row["reply_to"] reply_id = row["reply_to"]
@@ -141,10 +145,10 @@ class InteractionAnalysis:
return interactions return interactions
def average_thread_depth(self): def average_thread_depth(self, df: pd.DataFrame):
depths = [] depths = []
id_to_reply = self.df.set_index("id")["reply_to"].to_dict() id_to_reply = df.set_index("id")["reply_to"].to_dict()
for _, row in self.df.iterrows(): for _, row in df.iterrows():
depth = 0 depth = 0
current_id = row["id"] current_id = row["id"]
@@ -163,16 +167,16 @@ class InteractionAnalysis:
return round(sum(depths) / len(depths), 2) return round(sum(depths) / len(depths), 2)
def average_thread_length_by_emotion(self): def average_thread_length_by_emotion(self, df: pd.DataFrame):
emotion_exclusions = {"emotion_neutral", "emotion_surprise"} emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
emotion_cols = [ emotion_cols = [
c c
for c in self.df.columns for c in df.columns
if c.startswith("emotion_") and c not in emotion_exclusions if c.startswith("emotion_") and c not in emotion_exclusions
] ]
id_to_reply = self.df.set_index("id")["reply_to"].to_dict() id_to_reply = df.set_index("id")["reply_to"].to_dict()
length_cache = {} length_cache = {}
def thread_length_from(start_id): def thread_length_from(start_id):
@@ -211,7 +215,7 @@ class InteractionAnalysis:
emotion_to_lengths = {} emotion_to_lengths = {}
# Fill NaNs in emotion cols to avoid max() issues # Fill NaNs in emotion cols to avoid max() issues
emo_df = self.df[["id"] + emotion_cols].copy() emo_df = df[["id"] + emotion_cols].copy()
emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0) emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
for _, row in emo_df.iterrows(): for _, row in emo_df.iterrows():

View File

@@ -4,9 +4,9 @@ import re
from collections import Counter from collections import Counter
from itertools import islice from itertools import islice
class LinguisticAnalysis: class LinguisticAnalysis:
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): def __init__(self, word_exclusions: set[str]):
self.df = df
self.word_exclusions = word_exclusions self.word_exclusions = word_exclusions
def _tokenize(self, text: str): def _tokenize(self, text: str):
@@ -14,29 +14,20 @@ class LinguisticAnalysis:
return [t for t in tokens if t not in self.word_exclusions] return [t for t in tokens if t not in self.word_exclusions]
def _clean_text(self, text: str) -> str: def _clean_text(self, text: str) -> str:
text = re.sub(r"http\S+", "", text) # remove URLs text = re.sub(r"http\S+", "", text) # remove URLs
text = re.sub(r"www\S+", "", text) text = re.sub(r"www\S+", "", text)
text = re.sub(r"&\w+;", "", text) # remove HTML entities text = re.sub(r"&\w+;", "", text) # remove HTML entities
text = re.sub(r"\bamp\b", "", text) # remove stray amp text = re.sub(r"\bamp\b", "", text) # remove stray amp
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text) text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
return text return text
def word_frequencies(self, limit: int = 100) -> dict: def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
texts = ( texts = df["content"].dropna().astype(str).str.lower()
self.df["content"]
.dropna()
.astype(str)
.str.lower()
)
words = [] words = []
for text in texts: for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text) tokens = re.findall(r"\b[a-z]{3,}\b", text)
words.extend( words.extend(w for w in tokens if w not in self.word_exclusions)
w for w in tokens
if w not in self.word_exclusions
)
counts = Counter(words) counts = Counter(words)
@@ -49,15 +40,15 @@ class LinguisticAnalysis:
return word_frequencies.to_dict(orient="records") return word_frequencies.to_dict(orient="records")
def ngrams(self, n=2, limit=100): def ngrams(self, df: pd.DataFrame, n=2, limit=100):
texts = self.df["content"].dropna().astype(str).apply(self._clean_text).str.lower() texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
all_ngrams = [] all_ngrams = []
for text in texts: for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text) tokens = re.findall(r"\b[a-z]{3,}\b", text)
# stop word removal causes strange behaviors in ngrams # stop word removal causes strange behaviors in ngrams
#tokens = [w for w in tokens if w not in self.word_exclusions] # tokens = [w for w in tokens if w not in self.word_exclusions]
ngrams = zip(*(islice(tokens, i, None) for i in range(n))) ngrams = zip(*(islice(tokens, i, None) for i in range(n)))
all_ngrams.extend([" ".join(ng) for ng in ngrams]) all_ngrams.extend([" ".join(ng) for ng in ngrams])

View File

@@ -1,16 +1,14 @@
import pandas as pd import pandas as pd
class TemporalAnalysis:
def __init__(self, df: pd.DataFrame):
self.df = df
def avg_reply_time_per_emotion(self) -> dict: class TemporalAnalysis:
df = self.df.copy() def avg_reply_time_per_emotion(self, df: pd.DataFrame) -> list[dict]:
df = df.copy()
replies = df[ replies = df[
(df["type"] == "comment") & (df["type"] == "comment")
(df["reply_to"].notna()) & & (df["reply_to"].notna())
(df["reply_to"] != "") & (df["reply_to"] != "")
] ]
id_to_time = df.set_index("id")["dt"].to_dict() id_to_time = df.set_index("id")["dt"].to_dict()
@@ -25,42 +23,45 @@ class TemporalAnalysis:
return (row["dt"] - parent_time).total_seconds() return (row["dt"] - parent_time).total_seconds()
replies["reply_time"] = replies.apply(compute_reply_time, axis=1) replies["reply_time"] = replies.apply(compute_reply_time, axis=1)
emotion_cols = [col for col in df.columns if col.startswith("emotion_") and col not in ("emotion_neutral", "emotion_surprise")] emotion_cols = [
col
for col in df.columns
if col.startswith("emotion_")
and col not in ("emotion_neutral", "emotion_surprise")
]
replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1) replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1)
grouped = ( grouped = (
replies replies.groupby("dominant_emotion")["reply_time"]
.groupby("dominant_emotion")["reply_time"]
.agg(["mean", "count"]) .agg(["mean", "count"])
.reset_index() .reset_index()
) )
return grouped.to_dict(orient="records") return grouped.to_dict(orient="records")
def posts_per_day(self) -> dict: def posts_per_day(self, df: pd.DataFrame) -> list[dict]:
per_day = ( per_day = df.groupby("date").size().reset_index(name="count")
self.df.groupby("date")
.size()
.reset_index(name="count")
)
return per_day.to_dict(orient="records") return per_day.to_dict(orient="records")
def heatmap(self) -> dict: def heatmap(self, df: pd.DataFrame) -> list[dict]:
weekday_order = [ weekday_order = [
"Monday", "Tuesday", "Wednesday", "Monday",
"Thursday", "Friday", "Saturday", "Sunday" "Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday",
] ]
self.df["weekday"] = pd.Categorical( df = df.copy()
self.df["weekday"], df["weekday"] = pd.Categorical(
categories=weekday_order, df["weekday"], categories=weekday_order, ordered=True
ordered=True
) )
heatmap = ( heatmap = (
self.df df.groupby(["weekday", "hour"], observed=True)
.groupby(["weekday", "hour"], observed=True)
.size() .size()
.unstack(fill_value=0) .unstack(fill_value=0)
.reindex(columns=range(24), fill_value=0) .reindex(columns=range(24), fill_value=0)

View File

@@ -8,7 +8,7 @@ from flask_jwt_extended import (
JWTManager, JWTManager,
create_access_token, create_access_token,
jwt_required, jwt_required,
get_jwt_identity get_jwt_identity,
) )
from server.stat_gen import StatGen from server.stat_gen import StatGen
@@ -27,7 +27,9 @@ db = PostgresConnector()
load_dotenv() load_dotenv()
frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173") frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173")
jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this") jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this")
jwt_access_token_expires = int(os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)) # Default to 20 minutes jwt_access_token_expires = int(
os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)
) # Default to 20 minutes
# Flask Configuration # Flask Configuration
CORS(app, resources={r"/*": {"origins": frontend_url}}) CORS(app, resources={r"/*": {"origins": frontend_url}})
@@ -38,18 +40,19 @@ bcrypt = Bcrypt(app)
jwt = JWTManager(app) jwt = JWTManager(app)
auth_manager = AuthManager(db, bcrypt) auth_manager = AuthManager(db, bcrypt)
# Global State stat_gen = StatGen()
# posts_df = pd.read_json('small.jsonl', lines=True)
# with open("topic_buckets.json", "r", encoding="utf-8") as f:
# domain_topics = json.load(f)
# stat_obj = StatGen(posts_df, domain_topics)
stat_obj = None
@app.route('/register', methods=['POST'])
@app.route("/register", methods=["POST"])
def register_user(): def register_user():
data = request.get_json() data = request.get_json()
if not data or "username" not in data or "email" not in data or "password" not in data: if (
not data
or "username" not in data
or "email" not in data
or "password" not in data
):
return jsonify({"error": "Missing username, email, or password"}), 400 return jsonify({"error": "Missing username, email, or password"}), 400
username = data["username"] username = data["username"]
@@ -67,7 +70,8 @@ def register_user():
print(f"Registered new user: {username}") print(f"Registered new user: {username}")
return jsonify({"message": f"User '{username}' registered successfully"}), 200 return jsonify({"message": f"User '{username}' registered successfully"}), 200
@app.route('/login', methods=['POST'])
@app.route("/login", methods=["POST"])
def login_user(): def login_user():
data = request.get_json() data = request.get_json()
@@ -80,7 +84,7 @@ def login_user():
try: try:
user = auth_manager.authenticate_user(username, password) user = auth_manager.authenticate_user(username, password)
if user: if user:
access_token = create_access_token(identity=str(user['id'])) access_token = create_access_token(identity=str(user["id"]))
return jsonify({"access_token": access_token}), 200 return jsonify({"access_token": access_token}), 200
else: else:
return jsonify({"error": "Invalid username or password"}), 401 return jsonify({"error": "Invalid username or password"}), 401
@@ -88,18 +92,18 @@ def login_user():
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/profile", methods=["GET"]) @app.route("/profile", methods=["GET"])
@jwt_required() @jwt_required()
def profile(): def profile():
current_user = get_jwt_identity() current_user = get_jwt_identity()
return jsonify( return jsonify(
message="Access granted", message="Access granted", user=auth_manager.get_user_by_id(current_user)
user=auth_manager.get_user_by_id(current_user)
), 200 ), 200
@app.route('/upload', methods=['POST']) @app.route("/upload", methods=["POST"])
@jwt_required() @jwt_required()
def upload_data(): def upload_data():
if "posts" not in request.files or "topics" not in request.files: if "posts" not in request.files or "topics" not in request.files:
@@ -111,8 +115,12 @@ def upload_data():
if post_file.filename == "" or topic_file == "": if post_file.filename == "" or topic_file == "":
return jsonify({"error": "Empty filename"}), 400 return jsonify({"error": "Empty filename"}), 400
if not post_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'): if not post_file.filename.endswith(".jsonl") or not topic_file.filename.endswith(
return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400 ".json"
):
return jsonify(
{"error": "Invalid file type. Only .jsonl and .json files are allowed."}
), 400
try: try:
current_user = get_jwt_identity() current_user = get_jwt_identity()
@@ -122,167 +130,226 @@ def upload_data():
processor = DatasetProcessor(posts_df, topics) processor = DatasetProcessor(posts_df, topics)
enriched_df = processor.enrich() enriched_df = processor.enrich()
dataset_id = db.save_dataset_info(current_user, f"dataset_{current_user}", topics) dataset_id = db.save_dataset_info(
current_user, f"dataset_{current_user}", topics
)
db.save_dataset_content(dataset_id, enriched_df) db.save_dataset_content(dataset_id, enriched_df)
return jsonify({"message": "File uploaded successfully", "event_count": len(enriched_df)}), 200 return jsonify(
{"message": "File uploaded successfully", "event_count": len(enriched_df), "dataset_id": dataset_id}
), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400 return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
except Exception as e: except Exception as e:
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/dataset/<int:dataset_id>', methods=['GET'])
@app.route("/dataset/<int:dataset_id>", methods=["GET"])
@jwt_required()
def get_dataset(dataset_id): def get_dataset(dataset_id):
if stat_obj is None: current_user = get_jwt_identity()
return jsonify({"error": "No data uploaded"}), 400 dataset = db.get_dataset_info(dataset_id)
return stat_obj.df.to_json(orient="records"), 200, {"Content-Type": "application/json"} if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
@app.route('/stats/content', methods=['GET']) dataset_content = db.get_dataset_content(dataset_id)
def word_frequencies():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
if dataset_content.empty:
return jsonify({"error": "Dataset content not found"}), 404
return jsonify(dataset_content.to_dict(orient="records")), 200
@app.route("/dataset/<int:dataset_id>/content", methods=["GET"])
@jwt_required()
def content_endpoint(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_content_analysis()), 200 return jsonify(stat_gen.get_content_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/stats/summary', methods=["GET"])
def get_summary(): @app.route("/dataset/<int:dataset_id>/summary", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_summary(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.summary()), 200 return jsonify(stat_gen.summary(dataset_content)), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/time", methods=["GET"])
def get_time_analysis():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.get_time_analysis()), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/user", methods=["GET"])
def get_user_analysis(): @app.route("/dataset/<int:dataset_id>/time", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_time_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_user_analysis()), 200 return jsonify(stat_gen.get_time_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/cultural", methods=["GET"])
def get_cultural_analysis(): @app.route("/dataset/<int:dataset_id>/user", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_user_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_cultural_analysis()), 200 return jsonify(stat_gen.get_user_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/interaction", methods=["GET"])
def get_interaction_analysis(): @app.route("/dataset/<int:dataset_id>/cultural", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_cultural_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_interactional_analysis()), 200 return jsonify(stat_gen.get_cultural_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/filter/query', methods=["POST"])
def filter_query():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
@app.route("/dataset/<int:dataset_id>/interaction", methods=["GET"])
@jwt_required()
def get_interaction_analysis(dataset_id):
    """Return interaction (thread-structure) analysis for one dataset.

    The caller must be authenticated and must own the dataset; ownership is
    checked against the dataset row's ``user_id``.

    Returns:
        200 with the analysis dict, 403 for foreign datasets, 404 when the
        dataset does not exist, 400 on malformed data, 500 otherwise.
    """
    current_user = get_jwt_identity()
    dataset = db.get_dataset_info(dataset_id)
    # get_dataset_info returns None when no row matches; without this guard
    # dataset.get(...) below would raise AttributeError on unknown ids.
    if dataset is None:
        return jsonify({"error": "Dataset not found"}), 404
    if dataset.get("user_id") != int(current_user):
        return jsonify({"error": "Unauthorized access to dataset"}), 403
    dataset_content = db.get_dataset_content(dataset_id)
    try:
        return jsonify(stat_gen.get_interactional_analysis(dataset_content)), 200
    except ValueError as e:
        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
    except Exception as e:
        print(traceback.format_exc())
        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
# NOTE(review): legacy in-memory filter endpoints from before the
# per-dataset/JWT refactor, kept commented out for reference only.
# @app.route("/filter/query", methods=["POST"])
# def filter_query():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# data = request.get_json(silent=True) or {}
# if "query" not in data:
# return jsonify(stat_obj.df.to_dict(orient="records")), 200
# query = data["query"]
# filtered_df = stat_obj.filter_by_query(query)
# return jsonify(filtered_df), 200
# @app.route("/filter/time", methods=["POST"])
# def filter_time():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# data = request.get_json(silent=True)
# if not data:
# return jsonify({"error": "Invalid or missing JSON body"}), 400
# if "start" not in data or "end" not in data:
# return jsonify({"error": "Please include both start and end dates"}), 400
# try:
# start = pd.to_datetime(data["start"], utc=True)
# end = pd.to_datetime(data["end"], utc=True)
# filtered_df = stat_obj.set_time_range(start, end)
# return jsonify(filtered_df), 200
# except Exception:
# return jsonify({"error": "Invalid datetime format"}), 400
# @app.route("/filter/sources", methods=["POST"])
# def filter_sources():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# data = request.get_json(silent=True)
# if not data:
# return jsonify({"error": "Invalid or missing JSON body"}), 400
# if "sources" not in data:
# return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400
# try:
# filtered_df = stat_obj.filter_data_sources(data["sources"])
# return jsonify(filtered_df), 200
# except ValueError:
# return jsonify({"error": "Please enable at least one data source"}), 400
# except Exception as e:
# return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500
# @app.route("/filter/reset", methods=["GET"])
# def reset_dataset():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# try:
# stat_obj.reset_dataset()
# return jsonify({"success": "Dataset successfully reset"})
# except Exception as e:
# print(traceback.format_exc())
# return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
if __name__ == "__main__":
    # Development entry point only; debug mode must not be used in production.
    app.run(debug=True)

# ---- second file: StatGen analysis module (server/analysis) ----
import datetime

import nltk
import pandas as pd
from nltk.corpus import stopwords

from server.analysis.cultural import CulturalAnalysis
from server.analysis.emotional import EmotionalAnalysis
from server.analysis.interactional import InteractionAnalysis
from server.analysis.linguistic import LinguisticAnalysis
from server.analysis.temporal import TemporalAnalysis

# Forum/domain-specific noise words excluded from word-frequency and n-gram
# analysis, in addition to the standard NLTK English stopword list.
DOMAIN_STOPWORDS = {
    "www",
    "https",
    "http",
    "boards",
    "boardsie",
    "comment",
    "comments",
    "discussion",
    "thread",
    "post",
    "posts",
    "would",
    "get",
    "one",
}

# Fetch the stopword corpus at import time (no-op when already downloaded).
nltk.download("stopwords")
EXCLUDE_WORDS = set(stopwords.words("english")) | DOMAIN_STOPWORDS
class StatGen:
    """Stateless statistics generator for social-media event data.

    Each public method takes an already-prepared events DataFrame and returns
    a JSON-serializable dict; no dataset state is kept on the instance, so a
    single StatGen can serve analyses for many datasets concurrently.
    """

    def __init__(self) -> None:
        # Analyzers hold no per-dataset state; the word-level ones share the
        # combined NLTK + domain stopword set.
        self.temporal_analysis = TemporalAnalysis()
        self.emotional_analysis = EmotionalAnalysis()
        self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS)
        self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS)
        self.cultural_analysis = CulturalAnalysis()

    def get_time_analysis(self, df: pd.DataFrame) -> dict:
        """Activity-over-time statistics."""
        return {
            "events_per_day": self.temporal_analysis.posts_per_day(df),
            "weekday_hour_heatmap": self.temporal_analysis.heatmap(df),
        }

    def get_content_analysis(self, df: pd.DataFrame) -> dict:
        """Linguistic and emotional statistics over event content."""
        return {
            "word_frequencies": self.linguistic_analysis.word_frequencies(df),
            "common_two_phrases": self.linguistic_analysis.ngrams(df),
            "common_three_phrases": self.linguistic_analysis.ngrams(df, n=3),
            "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(df),
            "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(df),
        }

    def get_user_analysis(self, df: pd.DataFrame) -> dict:
        """Per-user activity statistics plus the user interaction graph."""
        return {
            "top_users": self.interaction_analysis.top_users(df),
            "users": self.interaction_analysis.per_user_analysis(df),
            "interaction_graph": self.interaction_analysis.interaction_graph(df),
        }

    def get_interactional_analysis(self, df: pd.DataFrame) -> dict:
        """Thread-structure statistics (depth, length by emotion)."""
        return {
            "average_thread_depth": self.interaction_analysis.average_thread_depth(df),
            "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(df),
        }

    def get_cultural_analysis(self, df: pd.DataFrame) -> dict:
        """Cultural markers and per-entity emotion statistics."""
        return {
            "identity_markers": self.cultural_analysis.get_identity_markers(df),
            "stance_markers": self.cultural_analysis.get_stance_markers(df),
            "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(df),
        }

    def summary(self, df: pd.DataFrame) -> dict:
        """High-level summary of an events DataFrame.

        Expects columns "type" ("post"/"comment"), "author", "source" and
        "dt" (timezone-aware datetimes) — presumably added by upstream data
        preparation; TODO confirm against the loader. An empty DataFrame will
        raise on the time_range computation.
        """
        total_posts = int((df["type"] == "post").sum())
        total_comments = int((df["type"] == "comment").sum())
        events_per_user = df.groupby("author").size()
        return {
            "total_events": int(len(df)),
            "total_posts": total_posts,
            "total_comments": total_comments,
            "unique_users": int(events_per_user.count()),
            # max(..., 1) guards against division by zero for comment-only data.
            "comments_per_post": round(total_comments / max(total_posts, 1), 2),
            # Share of authors with exactly one event.
            "lurker_ratio": round((events_per_user == 1).mean(), 2),
            "time_range": {
                "start": int(df["dt"].min().timestamp()),
                "end": int(df["dt"].max().timestamp()),
            },
            "sources": df["source"].dropna().unique().tolist(),
        }