Compare commits

...

13 Commits

11 changed files with 411 additions and 21 deletions

52
db/database.py Normal file
View File

@@ -0,0 +1,52 @@
import os
import psycopg2
from psycopg2.extras import RealDictCursor
class PostgresConnector:
    """
    Simple PostgreSQL connector (single connection).

    Connection parameters come from POSTGRES_* environment variables with
    local-development defaults. autocommit is off, so execute/executemany
    manage the transaction explicitly.
    """

    def __init__(self):
        # NOTE(review): env vars are read at construction time — make sure
        # load_dotenv() has already run before instantiating this class.
        self.connection = psycopg2.connect(
            host=os.getenv("POSTGRES_HOST", "localhost"),
            port=os.getenv("POSTGRES_PORT", 5432),
            user=os.getenv("POSTGRES_USER", "postgres"),
            password=os.getenv("POSTGRES_PASSWORD", "postgres"),
            database=os.getenv("POSTGRES_DB", "postgres"),
        )
        self.connection.autocommit = False

    def execute(self, query, params=None, fetch=False):
        """Run one statement.

        With fetch=True, returns all rows as a list of dicts; otherwise
        returns None. The transaction is committed in both cases (with
        autocommit off even a SELECT opens a transaction, and the original
        fetch path left it open indefinitely). On error the transaction is
        rolled back so the connection is not stuck in the
        "current transaction is aborted" state, then the error is re-raised.
        """
        try:
            with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
                cursor.execute(query, params)
                rows = cursor.fetchall() if fetch else None
            self.connection.commit()
            return rows
        except Exception:
            self.connection.rollback()
            raise

    def executemany(self, query, param_list):
        """Run one statement once per parameter tuple, committing at the end.

        Rolls back and re-raises on error (see execute()).
        """
        try:
            with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
                cursor.executemany(query, param_list)
            self.connection.commit()
        except Exception:
            self.connection.rollback()
            raise

    def save_user(self, username, email, password_hash):
        """Insert a new user row; raises on duplicate username/email (UNIQUE)."""
        query = """
        INSERT INTO users (username, email, password_hash)
        VALUES (%s, %s, %s)
        """
        self.execute(query, (username, email, password_hash))

    def get_user_by_username(self, username) -> dict:
        """Return the user row for *username* as a dict, or None if absent."""
        query = "SELECT id, username, email, password_hash FROM users WHERE username = %s"
        result = self.execute(query, (username,), fetch=True)
        return result[0] if result else None

    def get_user_by_email(self, email) -> dict:
        """Return the user row for *email* as a dict, or None if absent."""
        query = "SELECT id, username, email, password_hash FROM users WHERE email = %s"
        result = self.execute(query, (email,), fetch=True)
        return result[0] if result else None

    def close(self):
        """Close the underlying connection (idempotent if already closed)."""
        if self.connection:
            self.connection.close()

35
db/schema.sql Normal file
View File

@@ -0,0 +1,35 @@
-- Application users (credentials stored as a bcrypt password_hash).
CREATE TABLE users (
    id SERIAL PRIMARY KEY,
    username VARCHAR(255) NOT NULL UNIQUE,
    email VARCHAR(255) NOT NULL UNIQUE,
    password_hash VARCHAR(255) NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- Imported posts.
CREATE TABLE posts (
    id SERIAL PRIMARY KEY,
    author VARCHAR(255) NOT NULL,
    title VARCHAR(255) NOT NULL,
    content TEXT NOT NULL,
    created_at TIMESTAMP NOT NULL,
    source VARCHAR(255) NOT NULL
);

-- Access grants: which users may see which posts.
-- Must be created AFTER posts: the original script created it first, so the
-- FOREIGN KEY to posts(id) failed with 'relation "posts" does not exist'.
CREATE TABLE has_access (
    id SERIAL PRIMARY KEY,
    user_id INTEGER NOT NULL,
    post_id INTEGER NOT NULL,
    FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE,
    FOREIGN KEY (post_id) REFERENCES posts(id) ON DELETE CASCADE
);

-- Comments on posts. reply_to appears to hold the parent comment's external
-- id (VARCHAR, unlike the integer ids here) — confirm against the importer.
CREATE TABLE comments (
    id SERIAL PRIMARY KEY,
    post_id INTEGER NOT NULL,
    author VARCHAR(255) NOT NULL,
    content TEXT NOT NULL,
    created_at TIMESTAMP NOT NULL,
    reply_to VARCHAR(255),
    source VARCHAR(255) NOT NULL,
    FOREIGN KEY (post_id) REFERENCES posts(id) ON DELETE CASCADE
);

15
docker-compose.yml Normal file
View File

@@ -0,0 +1,15 @@
services:
  postgres:
    image: postgres:16
    container_name: postgres_db
    restart: unless-stopped
    env_file:
      - .env
    ports:
      - "5432:5432"
    volumes:
      # Bind mounts: data persisted to the host directory, and schema.sql is
      # applied automatically by the postgres image on FIRST init only
      # (when the data directory is empty).
      - ./db/postgres_vol:/var/lib/postgresql/data
      - ./db/schema.sql:/docker-entrypoint-initdb.d/schema.sql
# Removed the top-level `volumes: postgres_data:` declaration — it was never
# referenced by any service (the data dir is a bind mount above).

View File

@@ -34,7 +34,7 @@ function ApiToGraphData(apiData: InteractionGraph) {
}
const InteractionStats = (props: { data: UserAnalysisResponse }) => {
const UserStats = (props: { data: UserAnalysisResponse }) => {
const graphData = ApiToGraphData(props.data.interaction_graph);
return (
@@ -44,7 +44,7 @@ const InteractionStats = (props: { data: UserAnalysisResponse }) => {
This graph visualizes interactions between users based on comments and replies.
Nodes represent users, and edges represent interactions (e.g., comments or replies) between them.
</p>
<div style={{ height: "600px", border: "1px solid #ccc", borderRadius: 8, marginTop: 16 }}>
<div>
<ForceGraph3D
graphData={graphData}
nodeAutoColorBy="id"
@@ -58,4 +58,4 @@ const InteractionStats = (props: { data: UserAnalysisResponse }) => {
);
}
export default InteractionStats;
export default UserStats;

View File

@@ -3,7 +3,7 @@ import axios from "axios";
import StatsStyling from "../styles/stats_styling";
import SummaryStats from "../components/SummaryStats";
import EmotionalStats from "../components/EmotionalStats";
import InteractionStats from "../components/InteractionStats";
import InteractionStats from "../components/UserStats";
import {
type SummaryResponse,

View File

@@ -1,10 +1,12 @@
beautifulsoup4==4.14.3
Flask==3.1.2
Flask==3.1.3
flask_cors==6.0.2
google_api_python_client==2.188.0
keybert==0.9.0
nltk==3.9.2
pandas==3.0.0
numpy==2.4.2
pandas==3.0.1
psycopg2==2.9.11
psycopg2_binary==2.9.11
python-dotenv==1.2.1
Requests==2.32.5
sentence_transformers==5.2.2

View File

@@ -124,3 +124,85 @@ class InteractionAnalysis:
interactions[a][b] = interactions[a].get(b, 0) + 1
return interactions
def average_thread_depth(self):
depths = []
id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
for _, row in self.df.iterrows():
depth = 0
current_id = row["id"]
while True:
reply_to = id_to_reply.get(current_id)
if pd.isna(reply_to) or reply_to == "":
break
depth += 1
current_id = reply_to
depths.append(depth)
if not depths:
return 0
return round(sum(depths) / len(depths), 2)
def average_thread_length_by_emotion(self):
    """Average reply-thread length grouped by each message's dominant emotion.

    For every row, the thread length is the number of messages on the path
    from that row up through its reply_to ancestors (the row itself counts
    as 1). Each row's length is bucketed under its highest-scoring
    emotion_* column (neutral and surprise excluded), and the per-emotion
    averages are returned rounded to 2 decimals.
    """
    emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
    emotion_cols = [
        c for c in self.df.columns
        if c.startswith("emotion_") and c not in emotion_exclusions
    ]
    # id -> parent id (NaN/"" for roots); one dict lookup per hop.
    id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
    # Memoizes chain length per starting id so shared ancestor chains are
    # only walked once.
    length_cache = {}
    def thread_length_from(start_id):
        if start_id in length_cache:
            return length_cache[start_id]
        seen = set()
        length = 1
        current = start_id
        while True:
            if current in seen:
                # infinite loop shouldn't happen, but just in case
                break
            seen.add(current)
            reply_to = id_to_reply.get(current)
            # Root reached: parent missing, NaN, or empty string.
            if reply_to is None or (isinstance(reply_to, float) and pd.isna(reply_to)) or reply_to == "":
                break
            length += 1
            current = reply_to
            if current in length_cache:
                # `current` was just counted as +1, so add the cached tail
                # minus that already-counted node.
                length += (length_cache[current] - 1)
                break
        length_cache[start_id] = length
        return length
    emotion_to_lengths = {}
    # Fill NaNs in emotion cols to avoid max() issues
    emo_df = self.df[["id"] + emotion_cols].copy()
    emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
    for _, row in emo_df.iterrows():
        msg_id = row["id"]
        length = thread_length_from(msg_id)
        emotions = {c: row[c] for c in emotion_cols}
        # Dominant emotion = column with the highest score for this row.
        dominant = max(emotions, key=emotions.get)
        emotion_to_lengths.setdefault(dominant, []).append(length)
    return {
        emotion: round(sum(lengths) / len(lengths), 2)
        for emotion, lengths in emotion_to_lengths.items()
    }

View File

@@ -9,6 +9,10 @@ class LinguisticAnalysis:
self.df = df
self.word_exclusions = word_exclusions
def _tokenize(self, text: str):
tokens = re.findall(r"\b[a-z]{3,}\b", text)
return [t for t in tokens if t not in self.word_exclusions]
def _clean_text(self, text: str) -> str:
text = re.sub(r"http\S+", "", text) # remove URLs
text = re.sub(r"www\S+", "", text)
@@ -66,3 +70,44 @@ class LinguisticAnalysis:
.head(limit)
.to_dict(orient="records")
)
def identity_markers(self):
df = self.df.copy()
df["content"] = df["content"].fillna("").astype(str).str.lower()
in_group_words = {"we", "us", "our", "ourselves"}
out_group_words = {"they", "them", "their", "themselves"}
emotion_exclusions = [
"emotion_neutral",
"emotion_surprise"
]
emotion_cols = [
col for col in self.df.columns
if col.startswith("emotion_") and col not in emotion_exclusions
]
in_count = 0
out_count = 0
in_emotions = {e: 0 for e in emotion_cols}
out_emotions = {e: 0 for e in emotion_cols}
total = 0
for post in df:
text = post["content"]
tokens = re.findall(r"\b[a-z]{2,}\b", text)
total += len(tokens)
in_count += sum(t in in_group_words for t in tokens)
out_count += sum(t in out_group_words for t in tokens)
emotions = post[emotion_cols]
print(emotions)
return {
"in_group_usage": in_count,
"out_group_usage": out_count,
"in_group_ratio": round(in_count / max(total, 1), 5),
"out_group_ratio": round(out_count / max(total, 1), 5),
}

View File

@@ -1,21 +1,102 @@
import os
from dotenv import load_dotenv
from flask import Flask, jsonify, request
from flask_cors import CORS
from flask_bcrypt import Bcrypt
from flask_jwt_extended import (
JWTManager,
create_access_token,
jwt_required,
get_jwt_identity
)
from server.stat_gen import StatGen
from db.database import PostgresConnector
from server.auth import AuthManager
import pandas as pd
import traceback
import json
# --- Application setup -------------------------------------------------------
# Load .env FIRST: PostgresConnector reads POSTGRES_* variables when it is
# constructed, so the original order (connect, then load_dotenv) silently
# ignored the .env database settings and connected with defaults.
load_dotenv()

frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173")
jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this")
jwt_access_token_expires = int(os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200))  # Default to 20 minutes

app = Flask(__name__)
db = PostgresConnector()

# Flask Configuration — single CORS registration (the code previously called
# CORS twice, once with a hard-coded localhost origin).
CORS(app, resources={r"/*": {"origins": frontend_url}})
app.config["JWT_SECRET_KEY"] = jwt_secret_key
app.config["JWT_ACCESS_TOKEN_EXPIRES"] = jwt_access_token_expires

bcrypt = Bcrypt(app)
jwt = JWTManager(app)
auth_manager = AuthManager(db, bcrypt)

# Global State: populated by the /upload route; None until data is uploaded.
stat_obj = None
@app.route('/register', methods=['POST'])
def register_user():
    """Create a new account from a JSON body containing username/email/password."""
    payload = request.get_json()
    required = ("username", "email", "password")
    if not payload or any(field not in payload for field in required):
        return jsonify({"error": "Missing username, email, or password"}), 400
    username, email, password = (payload[field] for field in required)
    try:
        auth_manager.register_user(username, email, password)
    except ValueError as e:
        # Duplicate username/email surfaces as a client error.
        return jsonify({"error": str(e)}), 400
    except Exception as e:
        print(traceback.format_exc())
        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
    print(f"Registered new user: {username}")
    return jsonify({"message": f"User '{username}' registered successfully"}), 200
@app.route('/login', methods=['POST'])
def login_user():
    """Verify credentials and return a JWT access token on success."""
    payload = request.get_json()
    if not payload or "username" not in payload or "password" not in payload:
        return jsonify({"error": "Missing username or password"}), 400
    try:
        user = auth_manager.authenticate_user(payload["username"], payload["password"])
        if not user:
            return jsonify({"error": "Invalid username or password"}), 401
        token = create_access_token(identity=str(user['id']))
        return jsonify({"access_token": token}), 200
    except Exception as e:
        print(traceback.format_exc())
        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/profile", methods=["GET"])
@jwt_required()
def profile():
    """Return the authenticated user's record; user id comes from the JWT."""
    user_record = auth_manager.get_user_by_id(get_jwt_identity())
    return jsonify(message="Access granted", user=user_record), 200
@app.route('/upload', methods=['POST'])
def upload_data():
@@ -55,7 +136,7 @@ def word_frequencies():
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.content_analysis()), 200
return jsonify(stat_obj.get_content_analysis()), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
@@ -80,7 +161,7 @@ def get_time_analysis():
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.time_analysis()), 200
return jsonify(stat_obj.get_time_analysis()), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
@@ -93,7 +174,33 @@ def get_user_analysis():
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.user_analysis()), 200
return jsonify(stat_obj.get_user_analysis()), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/cultural", methods=["GET"])
def get_cultural_analysis():
    """Serve cultural-analysis stats; 400 until data has been uploaded."""
    if stat_obj is None:
        return jsonify({"error": "No data uploaded"}), 400
    try:
        return jsonify(stat_obj.get_cultural_analysis()), 200
    except ValueError as e:
        # Bad/incomplete uploaded data is the caller's problem.
        return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
    except Exception as e:
        print(traceback.format_exc())
        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/interaction", methods=["GET"])
def get_interaction_analysis():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.get_interactional_analysis()), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:

29
server/auth.py Normal file
View File

@@ -0,0 +1,29 @@
from db.database import PostgresConnector
from flask_bcrypt import Bcrypt
class AuthManager:
    """User registration and authentication backed by Postgres + bcrypt."""

    def __init__(self, db: PostgresConnector, bcrypt: Bcrypt):
        self.db = db
        self.bcrypt = bcrypt

    def register_user(self, username, email, password):
        """Create a user; raises ValueError if the email or username is taken.

        Duplicate checks run BEFORE hashing so the deliberately slow bcrypt
        hash isn't computed for requests that will be rejected anyway.
        NOTE(review): check-then-insert is racy under concurrent requests;
        the UNIQUE constraints in the schema are the real guard, and a race
        will surface as a database IntegrityError rather than a ValueError.
        """
        if self.db.get_user_by_email(email):
            raise ValueError("Email already registered")
        if self.db.get_user_by_username(username):
            raise ValueError("Username already taken")
        hashed_password = self.bcrypt.generate_password_hash(password).decode("utf-8")
        self.db.save_user(username, email, hashed_password)

    def authenticate_user(self, username, password):
        """Return the user row if credentials are valid, else None."""
        user = self.db.get_user_by_username(username)
        if user and self.bcrypt.check_password_hash(user['password_hash'], password):
            return user
        return None

    def get_user_by_id(self, user_id):
        """Fetch id/username/email for *user_id*; None if not found."""
        query = "SELECT id, username, email FROM users WHERE id = %s"
        result = self.db.execute(query, (user_id,), fetch=True)
        return result[0] if result else None

View File

@@ -62,13 +62,18 @@ class StatGen:
self.nlp.add_ner_cols()
## Public
def time_analysis(self) -> pd.DataFrame:
# topics over time
# emotions over time
def get_time_analysis(self) -> dict:
    """Bundle temporal stats into one response dict.

    Annotation fixed: the method returns a dict (like its sibling
    get_* methods), not a pd.DataFrame as previously annotated.
    """
    return {
        "events_per_day": self.temporal_analysis.posts_per_day(),
        "weekday_hour_heatmap": self.temporal_analysis.heatmap()
    }
def content_analysis(self) -> dict:
# average topic duration
def get_content_analysis(self) -> dict:
return {
"word_frequencies": self.linguistic_analysis.word_frequencies(),
"common_two_phrases": self.linguistic_analysis.ngrams(),
@@ -77,13 +82,31 @@ class StatGen:
"reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
}
def user_analysis(self) -> dict:
# average emotion per user
# average chain length
def get_user_analysis(self) -> dict:
    """Bundle per-user interaction stats into a single response dict."""
    ia = self.interaction_analysis
    return {
        "top_users": ia.top_users(),
        "users": ia.per_user_analysis(),
        "interaction_graph": ia.interaction_graph(),
    }
# average / max thread depth
# high engagement threads based on volume
def get_interactional_analysis(self) -> dict:
    """Thread-structure stats: average depth and average length by emotion."""
    ia = self.interaction_analysis
    return {
        "average_thread_depth": ia.average_thread_depth(),
        "average_thread_length_by_emotion": ia.average_thread_length_by_emotion(),
    }
# detect community jargon
# in-group and out-group linguistic markers
def get_cultural_analysis(self) -> dict:
    """Cultural stats: in-group vs out-group identity-marker usage."""
    markers = self.linguistic_analysis.identity_markers()
    return {"identity_markers": markers}
def summary(self) -> dict:
total_posts = (self.df["type"] == "post").sum()
total_comments = (self.df["type"] == "comment").sum()