Compare commits

...

13 Commits

11 changed files with 411 additions and 21 deletions

52
db/database.py Normal file
View File

@@ -0,0 +1,52 @@
import os
import psycopg2
from psycopg2.extras import RealDictCursor
class PostgresConnector:
    """
    Simple PostgreSQL connector (single connection).

    Connection parameters are read from POSTGRES_* environment variables,
    with local-development defaults. Not thread-safe: one shared connection.
    """
    def __init__(self):
        self.connection = psycopg2.connect(
            host=os.getenv("POSTGRES_HOST", "localhost"),
            port=os.getenv("POSTGRES_PORT", 5432),
            user=os.getenv("POSTGRES_USER", "postgres"),
            password=os.getenv("POSTGRES_PASSWORD", "postgres"),
            database=os.getenv("POSTGRES_DB", "postgres"),
        )
        # Explicit transaction management; every statement must commit.
        self.connection.autocommit = False

    def execute(self, query, params=None, fetch=False):
        """Run a single parameterized statement.

        Returns the fetched rows (list of dicts) when fetch=True, else None.
        Fix: the original returned fetched rows WITHOUT committing, leaving the
        read transaction open (holding a snapshot/locks), and never rolled back
        on error, which left the connection in the "current transaction is
        aborted" state for all subsequent calls.
        """
        try:
            with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
                cursor.execute(query, params)
                rows = cursor.fetchall() if fetch else None
            self.connection.commit()
            return rows
        except Exception:
            # Reset the failed transaction so the connection stays usable.
            self.connection.rollback()
            raise

    def executemany(self, query, param_list):
        """Run one statement for each parameter tuple, in a single transaction."""
        try:
            with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
                cursor.executemany(query, param_list)
            self.connection.commit()
        except Exception:
            self.connection.rollback()
            raise

    def save_user(self, username, email, password_hash):
        """Insert a new user row (password must already be hashed)."""
        query = """
            INSERT INTO users (username, email, password_hash)
            VALUES (%s, %s, %s)
        """
        self.execute(query, (username, email, password_hash))

    def get_user_by_username(self, username) -> "dict | None":
        """Return the full user row for *username*, or None if absent."""
        query = "SELECT id, username, email, password_hash FROM users WHERE username = %s"
        result = self.execute(query, (username,), fetch=True)
        return result[0] if result else None

    def get_user_by_email(self, email) -> "dict | None":
        """Return the full user row for *email*, or None if absent."""
        query = "SELECT id, username, email, password_hash FROM users WHERE email = %s"
        result = self.execute(query, (email,), fetch=True)
        return result[0] if result else None

    def close(self):
        """Close the underlying connection (idempotent enough for shutdown)."""
        if self.connection:
            self.connection.close()

35
db/schema.sql Normal file
View File

@@ -0,0 +1,35 @@
-- Users must exist before any table that references them.
CREATE TABLE users (
    id SERIAL PRIMARY KEY,
    username VARCHAR(255) NOT NULL UNIQUE,
    email VARCHAR(255) NOT NULL UNIQUE,
    password_hash VARCHAR(255) NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE posts (
    id SERIAL PRIMARY KEY,
    author VARCHAR(255) NOT NULL,
    title VARCHAR(255) NOT NULL,
    content TEXT NOT NULL,
    created_at TIMESTAMP NOT NULL,
    source VARCHAR(255) NOT NULL
);

-- Join table granting users access to posts.
-- Must be created AFTER posts: the original script declared this table
-- first, so FOREIGN KEY (post_id) REFERENCES posts(id) failed with
-- "relation posts does not exist" on a fresh database init.
CREATE TABLE has_access (
    id SERIAL PRIMARY KEY,
    user_id INTEGER NOT NULL,
    post_id INTEGER NOT NULL,
    FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE,
    FOREIGN KEY (post_id) REFERENCES posts(id) ON DELETE CASCADE
);

CREATE TABLE comments (
    id SERIAL PRIMARY KEY,
    post_id INTEGER NOT NULL,
    author VARCHAR(255) NOT NULL,
    content TEXT NOT NULL,
    created_at TIMESTAMP NOT NULL,
    reply_to VARCHAR(255),
    source VARCHAR(255) NOT NULL,
    FOREIGN KEY (post_id) REFERENCES posts(id) ON DELETE CASCADE
);

15
docker-compose.yml Normal file
View File

@@ -0,0 +1,15 @@
services:
  postgres:
    image: postgres:16
    container_name: postgres_db
    restart: unless-stopped
    env_file:
      - .env
    ports:
      - "5432:5432"
    volumes:
      # Bind mounts: persistent data dir + one-time schema init script
      # (docker-entrypoint-initdb.d runs only on an empty data directory).
      - ./db/postgres_vol:/var/lib/postgresql/data
      - ./db/schema.sql:/docker-entrypoint-initdb.d/schema.sql
# Removed the top-level `volumes: postgres_data:` declaration — it was never
# referenced; the service persists via the ./db/postgres_vol bind mount above.

View File

@@ -34,7 +34,7 @@ function ApiToGraphData(apiData: InteractionGraph) {
} }
const InteractionStats = (props: { data: UserAnalysisResponse }) => { const UserStats = (props: { data: UserAnalysisResponse }) => {
const graphData = ApiToGraphData(props.data.interaction_graph); const graphData = ApiToGraphData(props.data.interaction_graph);
return ( return (
@@ -44,7 +44,7 @@ const InteractionStats = (props: { data: UserAnalysisResponse }) => {
This graph visualizes interactions between users based on comments and replies. This graph visualizes interactions between users based on comments and replies.
Nodes represent users, and edges represent interactions (e.g., comments or replies) between them. Nodes represent users, and edges represent interactions (e.g., comments or replies) between them.
</p> </p>
<div style={{ height: "600px", border: "1px solid #ccc", borderRadius: 8, marginTop: 16 }}> <div>
<ForceGraph3D <ForceGraph3D
graphData={graphData} graphData={graphData}
nodeAutoColorBy="id" nodeAutoColorBy="id"
@@ -58,4 +58,4 @@ const InteractionStats = (props: { data: UserAnalysisResponse }) => {
); );
} }
export default InteractionStats; export default UserStats;

View File

@@ -3,7 +3,7 @@ import axios from "axios";
import StatsStyling from "../styles/stats_styling"; import StatsStyling from "../styles/stats_styling";
import SummaryStats from "../components/SummaryStats"; import SummaryStats from "../components/SummaryStats";
import EmotionalStats from "../components/EmotionalStats"; import EmotionalStats from "../components/EmotionalStats";
import InteractionStats from "../components/InteractionStats"; import InteractionStats from "../components/UserStats";
import { import {
type SummaryResponse, type SummaryResponse,

View File

@@ -1,10 +1,12 @@
beautifulsoup4==4.14.3 beautifulsoup4==4.14.3
Flask==3.1.2 Flask==3.1.3
flask_cors==6.0.2 flask_cors==6.0.2
google_api_python_client==2.188.0 google_api_python_client==2.188.0
keybert==0.9.0
nltk==3.9.2 nltk==3.9.2
pandas==3.0.0 numpy==2.4.2
pandas==3.0.1
psycopg2==2.9.11
psycopg2_binary==2.9.11
python-dotenv==1.2.1 python-dotenv==1.2.1
Requests==2.32.5 Requests==2.32.5
sentence_transformers==5.2.2 sentence_transformers==5.2.2

View File

@@ -124,3 +124,85 @@ class InteractionAnalysis:
interactions[a][b] = interactions[a].get(b, 0) + 1 interactions[a][b] = interactions[a].get(b, 0) + 1
return interactions return interactions
def average_thread_depth(self):
    """Return the mean reply-chain depth over all rows, rounded to 2 dp.

    Depth of a row = number of hops following ``reply_to`` links up to a
    root (a row with no parent has depth 0). Returns 0 for an empty frame.

    Fix: the original walk had no cycle protection (unlike the sibling
    ``average_thread_length_by_emotion``) and would loop forever on cyclic
    ``reply_to`` data; a ``seen`` set now bounds the walk.
    """
    id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
    depths = []
    for _, row in self.df.iterrows():
        depth = 0
        current_id = row["id"]
        seen = set()  # guard against reply_to cycles in dirty data
        while True:
            if current_id in seen:
                break
            seen.add(current_id)
            reply_to = id_to_reply.get(current_id)
            # Missing parent (None), NaN, or empty string all mark a root.
            if pd.isna(reply_to) or reply_to == "":
                break
            depth += 1
            current_id = reply_to
        depths.append(depth)
    if not depths:
        return 0
    return round(sum(depths) / len(depths), 2)
def average_thread_length_by_emotion(self):
    """Mean reply-chain length grouped by each message's dominant emotion.

    "Length" counts the message itself plus every ancestor reached by
    following ``reply_to`` links (a root message has length 1). Neutral and
    surprise columns are excluded when picking the dominant emotion.
    """
    emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
    emotion_cols = [
        c for c in self.df.columns
        if c.startswith("emotion_") and c not in emotion_exclusions
    ]
    # id -> parent id (NaN/"" for roots); one dict lookup per hop.
    id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
    length_cache = {}
    def thread_length_from(start_id):
        # Walk up the reply chain, memoising full-chain lengths so shared
        # suffixes are traversed only once.
        if start_id in length_cache:
            return length_cache[start_id]
        seen = set()
        length = 1
        current = start_id
        while True:
            if current in seen:
                # infinite loop shouldn't happen, but just in case
                break
            seen.add(current)
            reply_to = id_to_reply.get(current)
            if reply_to is None or (isinstance(reply_to, float) and pd.isna(reply_to)) or reply_to == "":
                break
            length += 1
            current = reply_to
            if current in length_cache:
                # Splice in the cached remainder; the -1 avoids counting the
                # node we just stepped onto twice.
                length += (length_cache[current] - 1)
                break
        length_cache[start_id] = length
        return length
    emotion_to_lengths = {}
    # Fill NaNs in emotion cols to avoid max() issues
    emo_df = self.df[["id"] + emotion_cols].copy()
    emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
    for _, row in emo_df.iterrows():
        msg_id = row["id"]
        length = thread_length_from(msg_id)
        # Dominant emotion = highest-scoring non-excluded emotion column.
        emotions = {c: row[c] for c in emotion_cols}
        dominant = max(emotions, key=emotions.get)
        emotion_to_lengths.setdefault(dominant, []).append(length)
    return {
        emotion: round(sum(lengths) / len(lengths), 2)
        for emotion, lengths in emotion_to_lengths.items()
    }

View File

@@ -9,6 +9,10 @@ class LinguisticAnalysis:
self.df = df self.df = df
self.word_exclusions = word_exclusions self.word_exclusions = word_exclusions
def _tokenize(self, text: str):
tokens = re.findall(r"\b[a-z]{3,}\b", text)
return [t for t in tokens if t not in self.word_exclusions]
def _clean_text(self, text: str) -> str: def _clean_text(self, text: str) -> str:
text = re.sub(r"http\S+", "", text) # remove URLs text = re.sub(r"http\S+", "", text) # remove URLs
text = re.sub(r"www\S+", "", text) text = re.sub(r"www\S+", "", text)
@@ -66,3 +70,44 @@ class LinguisticAnalysis:
.head(limit) .head(limit)
.to_dict(orient="records") .to_dict(orient="records")
) )
def identity_markers(self):
    """Count in-group ("we/us/our") vs out-group ("they/them/their") pronoun
    usage across all content.

    Returns raw counts plus each count as a ratio of the total token count
    (ratios rounded to 5 dp; an empty corpus yields 0 ratios, not a crash).

    Fixes: the original iterated ``for post in df:``, which yields COLUMN
    NAMES (strings), so ``post["content"]`` raised TypeError on first use;
    it also carried a leftover debug ``print`` and per-emotion accumulators
    that were never returned — both removed.
    """
    df = self.df.copy()
    # Normalise: missing content -> "", everything lowercased for matching.
    df["content"] = df["content"].fillna("").astype(str).str.lower()
    in_group_words = {"we", "us", "our", "ourselves"}
    out_group_words = {"they", "them", "their", "themselves"}
    in_count = 0
    out_count = 0
    total = 0
    for _, post in df.iterrows():
        # 2+ letter words so "we"/"us" are captured.
        tokens = re.findall(r"\b[a-z]{2,}\b", post["content"])
        total += len(tokens)
        in_count += sum(t in in_group_words for t in tokens)
        out_count += sum(t in out_group_words for t in tokens)
    return {
        "in_group_usage": in_count,
        "out_group_usage": out_count,
        # max(total, 1) guards against division by zero on empty input.
        "in_group_ratio": round(in_count / max(total, 1), 5),
        "out_group_ratio": round(out_count / max(total, 1), 5),
    }

View File

@@ -1,21 +1,102 @@
import os
from dotenv import load_dotenv
from flask import Flask, jsonify, request from flask import Flask, jsonify, request
from flask_cors import CORS from flask_cors import CORS
from flask_bcrypt import Bcrypt
from flask_jwt_extended import (
JWTManager,
create_access_token,
jwt_required,
get_jwt_identity
)
from server.stat_gen import StatGen from server.stat_gen import StatGen
from db.database import PostgresConnector
from server.auth import AuthManager
import pandas as pd import pandas as pd
import traceback import traceback
import json import json
app = Flask(__name__) app = Flask(__name__)
db = PostgresConnector()
# Allow for CORS from localhost:5173 # Env Variables
CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}}) load_dotenv()
frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173")
jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this")
jwt_access_token_expires = int(os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)) # Default to 20 minutes
# Flask Configuration
CORS(app, resources={r"/*": {"origins": frontend_url}})
app.config["JWT_SECRET_KEY"] = jwt_secret_key
app.config["JWT_ACCESS_TOKEN_EXPIRES"] = jwt_access_token_expires
bcrypt = Bcrypt(app)
jwt = JWTManager(app)
auth_manager = AuthManager(db, bcrypt)
# Global State # Global State
posts_df = pd.read_json('small.jsonl', lines=True) # posts_df = pd.read_json('small.jsonl', lines=True)
with open("topic_buckets.json", "r", encoding="utf-8") as f: # with open("topic_buckets.json", "r", encoding="utf-8") as f:
domain_topics = json.load(f) # domain_topics = json.load(f)
stat_obj = StatGen(posts_df, domain_topics) # stat_obj = StatGen(posts_df, domain_topics)
stat_obj = None
@app.route('/register', methods=['POST'])
def register_user():
    """Create a new account from a JSON body with username/email/password."""
    payload = request.get_json()
    required = ("username", "email", "password")
    if not payload or any(field not in payload for field in required):
        return jsonify({"error": "Missing username, email, or password"}), 400
    username = payload["username"]
    email = payload["email"]
    password = payload["password"]
    try:
        auth_manager.register_user(username, email, password)
    except ValueError as e:
        # Duplicate username/email surfaces as a client error.
        return jsonify({"error": str(e)}), 400
    except Exception as e:
        print(traceback.format_exc())
        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
    print(f"Registered new user: {username}")
    return jsonify({"message": f"User '{username}' registered successfully"}), 200
@app.route('/login', methods=['POST'])
def login_user():
    """Verify credentials and issue a JWT access token on success."""
    payload = request.get_json()
    if not payload or "username" not in payload or "password" not in payload:
        return jsonify({"error": "Missing username or password"}), 400
    try:
        user = auth_manager.authenticate_user(payload["username"], payload["password"])
        if not user:
            return jsonify({"error": "Invalid username or password"}), 401
        # JWT identity must be a string; use the user's id.
        token = create_access_token(identity=str(user['id']))
        return jsonify({"access_token": token}), 200
    except Exception as e:
        print(traceback.format_exc())
        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/profile", methods=["GET"])
@jwt_required()
def profile():
current_user = get_jwt_identity()
return jsonify(
message="Access granted",
user=auth_manager.get_user_by_id(current_user)
), 200
@app.route('/upload', methods=['POST']) @app.route('/upload', methods=['POST'])
def upload_data(): def upload_data():
@@ -55,7 +136,7 @@ def word_frequencies():
return jsonify({"error": "No data uploaded"}), 400 return jsonify({"error": "No data uploaded"}), 400
try: try:
return jsonify(stat_obj.content_analysis()), 200 return jsonify(stat_obj.get_content_analysis()), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
@@ -80,7 +161,7 @@ def get_time_analysis():
return jsonify({"error": "No data uploaded"}), 400 return jsonify({"error": "No data uploaded"}), 400
try: try:
return jsonify(stat_obj.time_analysis()), 200 return jsonify(stat_obj.get_time_analysis()), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
@@ -93,7 +174,33 @@ def get_user_analysis():
return jsonify({"error": "No data uploaded"}), 400 return jsonify({"error": "No data uploaded"}), 400
try: try:
return jsonify(stat_obj.user_analysis()), 200 return jsonify(stat_obj.get_user_analysis()), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/cultural", methods=["GET"])
def get_cultural_analysis():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.get_cultural_analysis()), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/interaction", methods=["GET"])
def get_interaction_analysis():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.get_interactional_analysis()), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:

29
server/auth.py Normal file
View File

@@ -0,0 +1,29 @@
from db.database import PostgresConnector
from flask_bcrypt import Bcrypt
class AuthManager:
    """User registration and authentication over PostgresConnector + Bcrypt."""

    # Annotations are strings so the class is importable without evaluating
    # the project types at class-creation time.
    def __init__(self, db: "PostgresConnector", bcrypt: "Bcrypt"):
        self.db = db
        self.bcrypt = bcrypt

    def register_user(self, username, email, password):
        """Create a user; raises ValueError if the email or username is taken.

        Fix: uniqueness is checked BEFORE hashing — bcrypt is deliberately
        slow, so the original paid the hashing cost even for registrations
        that were about to be rejected.
        """
        if self.db.get_user_by_email(email):
            raise ValueError("Email already registered")
        if self.db.get_user_by_username(username):
            raise ValueError("Username already taken")
        hashed_password = self.bcrypt.generate_password_hash(password).decode("utf-8")
        # NOTE(review): check-then-insert races under concurrent requests;
        # the users table's UNIQUE constraints are the real safety net.
        self.db.save_user(username, email, hashed_password)

    def authenticate_user(self, username, password):
        """Return the user row when the password matches, else None."""
        user = self.db.get_user_by_username(username)
        if user and self.bcrypt.check_password_hash(user['password_hash'], password):
            return user
        return None

    def get_user_by_id(self, user_id):
        """Fetch id/username/email for *user_id*, or None if absent.

        Deliberately excludes password_hash — this feeds API responses.
        """
        query = "SELECT id, username, email FROM users WHERE id = %s"
        result = self.db.execute(query, (user_id,), fetch=True)
        return result[0] if result else None

View File

@@ -62,13 +62,18 @@ class StatGen:
self.nlp.add_ner_cols() self.nlp.add_ner_cols()
## Public ## Public
def time_analysis(self) -> pd.DataFrame:
# topics over time
# emotions over time
def get_time_analysis(self) -> pd.DataFrame:
return { return {
"events_per_day": self.temporal_analysis.posts_per_day(), "events_per_day": self.temporal_analysis.posts_per_day(),
"weekday_hour_heatmap": self.temporal_analysis.heatmap() "weekday_hour_heatmap": self.temporal_analysis.heatmap()
} }
def content_analysis(self) -> dict: # average topic duration
def get_content_analysis(self) -> dict:
return { return {
"word_frequencies": self.linguistic_analysis.word_frequencies(), "word_frequencies": self.linguistic_analysis.word_frequencies(),
"common_two_phrases": self.linguistic_analysis.ngrams(), "common_two_phrases": self.linguistic_analysis.ngrams(),
@@ -77,13 +82,31 @@ class StatGen:
"reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion() "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
} }
def user_analysis(self) -> dict: # average emotion per user
# average chain length
def get_user_analysis(self) -> dict:
return { return {
"top_users": self.interaction_analysis.top_users(), "top_users": self.interaction_analysis.top_users(),
"users": self.interaction_analysis.per_user_analysis(), "users": self.interaction_analysis.per_user_analysis(),
"interaction_graph": self.interaction_analysis.interaction_graph() "interaction_graph": self.interaction_analysis.interaction_graph()
} }
# average / max thread depth
# high engagement threads based on volume
def get_interactional_analysis(self) -> dict:
    """Interaction-structure stats: thread depth and thread length by emotion."""
    analysis = self.interaction_analysis
    return {
        "average_thread_depth": analysis.average_thread_depth(),
        "average_thread_length_by_emotion": analysis.average_thread_length_by_emotion(),
    }
# detect community jargon
# in-group and out-group linguistic markers
def get_cultural_analysis(self) -> dict:
    """Cultural stats: in-group/out-group identity-marker usage."""
    markers = self.linguistic_analysis.identity_markers()
    return {"identity_markers": markers}
def summary(self) -> dict: def summary(self) -> dict:
total_posts = (self.df["type"] == "post").sum() total_posts = (self.df["type"] == "post").sum()
total_comments = (self.df["type"] == "comment").sum() total_comments = (self.df["type"] == "comment").sum()