Storage of user data and datasets in PostgreSQL #2

Merged
dylan merged 19 commits from feat/database-integration into main 2026-03-01 16:47:25 +00:00
14 changed files with 774 additions and 387 deletions

2
.gitignore vendored
View File

@@ -9,3 +9,5 @@ __pycache__/
# React App Vite # React App Vite
node_modules/ node_modules/
dist/ dist/
*.sh

138
db/database.py Normal file
View File

@@ -0,0 +1,138 @@
import os
import psycopg2
import pandas as pd
from psycopg2.extras import RealDictCursor
from psycopg2.extras import execute_batch, Json
class PostgresConnector:
    """
    Simple PostgreSQL connector (single connection).

    Connection parameters are read from the POSTGRES_* environment
    variables, with local-development defaults. The connection runs with
    autocommit disabled, so every write helper commits explicitly and
    rolls back on failure to avoid leaving the connection in an
    aborted-transaction state.
    """

    def __init__(self):
        self.connection = psycopg2.connect(
            host=os.getenv("POSTGRES_HOST", "localhost"),
            port=os.getenv("POSTGRES_PORT", 5432),
            user=os.getenv("POSTGRES_USER", "postgres"),
            password=os.getenv("POSTGRES_PASSWORD", "postgres"),
            database=os.getenv("POSTGRES_DB", "postgres"),
        )
        self.connection.autocommit = False

    def execute(self, query, params=None, fetch=False) -> list | None:
        """Run a single statement and commit.

        Returns the rows as a list of dicts when ``fetch`` is True,
        otherwise ``None``.

        BUG FIX vs. original: the old code returned ``fetchall()`` before
        reaching ``commit()``, so with autocommit off any
        ``INSERT ... RETURNING`` executed with ``fetch=True`` (e.g.
        ``save_dataset_info``) was silently never committed. We now
        always commit, and roll back on error so a failed statement does
        not poison subsequent queries on this connection.
        """
        try:
            with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
                cursor.execute(query, params)
                rows = cursor.fetchall() if fetch else None
            self.connection.commit()
            return rows
        except Exception:
            self.connection.rollback()
            raise

    def executemany(self, query, param_list) -> None:
        """Run the statement once per parameter tuple, then commit.

        Rolls back on any failure. (Original annotated ``-> list`` but
        never returned one.)
        """
        try:
            with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
                cursor.executemany(query, param_list)
            self.connection.commit()
        except Exception:
            self.connection.rollback()
            raise

    ## User Management Methods
    def save_user(self, username, email, password_hash):
        """Insert a new user row; raises on duplicate username/email."""
        query = """
            INSERT INTO users (username, email, password_hash)
            VALUES (%s, %s, %s)
        """
        self.execute(query, (username, email, password_hash))

    def get_user_by_username(self, username) -> dict | None:
        """Return the user row as a dict, or None if not found."""
        query = "SELECT id, username, email, password_hash FROM users WHERE username = %s"
        result = self.execute(query, (username,), fetch=True)
        return result[0] if result else None

    def get_user_by_email(self, email) -> dict | None:
        """Return the user row as a dict, or None if not found."""
        query = "SELECT id, username, email, password_hash FROM users WHERE email = %s"
        result = self.execute(query, (email,), fetch=True)
        return result[0] if result else None

    # Dataset Management Methods
    def save_dataset_info(self, user_id: int, dataset_name: str, topics: dict) -> int | None:
        """Insert a dataset record and return its generated id.

        ``topics`` is stored as JSONB via psycopg2's Json adapter.
        """
        query = """
            INSERT INTO datasets (user_id, name, topics)
            VALUES (%s, %s, %s)
            RETURNING id
        """
        result = self.execute(query, (user_id, dataset_name, Json(topics)), fetch=True)
        return result[0]["id"] if result else None

    def save_dataset_content(self, dataset_id: int, event_data: pd.DataFrame):
        """Bulk-insert the enriched event rows belonging to ``dataset_id``.

        Uses ``execute_batch`` to cut round-trips versus plain
        ``executemany``. Optional columns are read with ``row.get`` so
        missing values become NULL; ``ner_entities`` is wrapped in Json
        to hit the JSONB column.
        """
        query = """
            INSERT INTO events (
                dataset_id,
                type,
                parent_id,
                author,
                content,
                timestamp,
                date,
                dt,
                hour,
                weekday,
                reply_to,
                source,
                topic,
                topic_confidence,
                ner_entities,
                emotion_anger,
                emotion_disgust,
                emotion_fear,
                emotion_joy,
                emotion_sadness
            )
            VALUES (
                %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s
            )
        """
        values = []
        for _, row in event_data.iterrows():
            values.append((
                dataset_id,
                row["type"],
                row["parent_id"],
                row["author"],
                row["content"],
                row["timestamp"],
                row["date"],
                row["dt"],
                row["hour"],
                row["weekday"],
                row.get("reply_to"),
                row["source"],
                row.get("topic"),
                row.get("topic_confidence"),
                Json(row["ner_entities"]) if row.get("ner_entities") else None,
                row.get("emotion_anger"),
                row.get("emotion_disgust"),
                row.get("emotion_fear"),
                row.get("emotion_joy"),
                row.get("emotion_sadness"),
            ))
        try:
            with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
                execute_batch(cursor, query, values)
            self.connection.commit()
        except Exception:
            # Roll back so a partially-applied batch is not half-committed
            # and the connection stays usable.
            self.connection.rollback()
            raise

    def get_dataset_content(self, dataset_id: int) -> pd.DataFrame:
        """Return every event of the dataset as a DataFrame (may be empty)."""
        query = "SELECT * FROM events WHERE dataset_id = %s"
        result = self.execute(query, (dataset_id,), fetch=True)
        return pd.DataFrame(result)

    def get_dataset_info(self, dataset_id: int) -> dict | None:
        """Return the dataset metadata row as a dict, or None if not found."""
        query = "SELECT * FROM datasets WHERE id = %s"
        result = self.execute(query, (dataset_id,), fetch=True)
        return result[0] if result else None

    def close(self):
        """Close the underlying connection; safe to call once."""
        if self.connection:
            self.connection.close()

51
db/schema.sql Normal file
View File

@@ -0,0 +1,51 @@
-- Application accounts. username and email are both unique login handles.
CREATE TABLE users (
    id SERIAL PRIMARY KEY,
    username VARCHAR(255) NOT NULL UNIQUE,
    email VARCHAR(255) NOT NULL UNIQUE,
    password_hash VARCHAR(255) NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- One uploaded dataset per row; topics holds the uploaded topic-bucket JSON.
CREATE TABLE datasets (
    id SERIAL PRIMARY KEY,
    user_id INTEGER NOT NULL,
    name VARCHAR(255) NOT NULL,
    description TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    topics JSONB,
    FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE
);

-- Enriched posts/comments belonging to a dataset.
CREATE TABLE events (
    /* Required Fields */
    id SERIAL PRIMARY KEY,
    dataset_id INTEGER NOT NULL,
    type VARCHAR(255) NOT NULL,
    author VARCHAR(255) NOT NULL,
    content TEXT NOT NULL,
    timestamp BIGINT NOT NULL,
    date DATE NOT NULL,
    dt TIMESTAMP NOT NULL,
    hour INTEGER NOT NULL,
    weekday VARCHAR(255) NOT NULL,
    /* Comments and Replies */
    parent_id VARCHAR(255),
    reply_to VARCHAR(255),
    source VARCHAR(255) NOT NULL,
    /* NLP Fields */
    topic VARCHAR(255),
    topic_confidence FLOAT,
    ner_entities JSONB,
    emotion_anger FLOAT,
    emotion_disgust FLOAT,
    emotion_fear FLOAT,
    emotion_joy FLOAT,
    emotion_sadness FLOAT,
    FOREIGN KEY (dataset_id) REFERENCES datasets(id) ON DELETE CASCADE
);

-- PostgreSQL does not index FK columns automatically; every content read
-- filters events by dataset_id, and dataset listings filter by user_id,
-- so index both to avoid sequential scans as data grows.
CREATE INDEX idx_events_dataset_id ON events (dataset_id);
CREATE INDEX idx_datasets_user_id ON datasets (user_id);

15
docker-compose.yml Normal file
View File

@@ -0,0 +1,15 @@
# Local development PostgreSQL. Credentials/DB name come from .env
# (POSTGRES_USER / POSTGRES_PASSWORD / POSTGRES_DB).
services:
  postgres:
    image: postgres:16
    container_name: postgres_db
    restart: unless-stopped
    env_file:
      - .env
    ports:
      - "5432:5432"
    volumes:
      # FIX: the named volume postgres_data was declared but never used —
      # the data dir was bind-mounted to ./db/postgres_vol instead. Use the
      # named volume so the declaration is meaningful and data survives
      # container recreation without polluting the repo tree.
      - postgres_data:/var/lib/postgresql/data
      # schema.sql runs automatically on FIRST initialization only.
      - ./db/schema.sql:/docker-entrypoint-initdb.d/schema.sql

volumes:
  postgres_data:

View File

@@ -1,10 +1,12 @@
beautifulsoup4==4.14.3 beautifulsoup4==4.14.3
Flask==3.1.2 Flask==3.1.3
flask_cors==6.0.2 flask_cors==6.0.2
google_api_python_client==2.188.0 google_api_python_client==2.188.0
keybert==0.9.0
nltk==3.9.2 nltk==3.9.2
pandas==3.0.0 numpy==2.4.2
pandas==3.0.1
psycopg2==2.9.11
psycopg2_binary==2.9.11
python-dotenv==1.2.1 python-dotenv==1.2.1
Requests==2.32.5 Requests==2.32.5
sentence_transformers==5.2.2 sentence_transformers==5.2.2

View File

@@ -6,13 +6,12 @@ from typing import Any
class CulturalAnalysis: class CulturalAnalysis:
def __init__(self, df: pd.DataFrame, content_col: str = "content", topic_col: str = "topic"): def __init__(self, content_col: str = "content", topic_col: str = "topic"):
self.df = df
self.content_col = content_col self.content_col = content_col
self.topic_col = topic_col self.topic_col = topic_col
def get_identity_markers(self): def get_identity_markers(self, original_df: pd.DataFrame) -> dict[str, Any]:
df = self.df.copy() df = original_df.copy()
s = df[self.content_col].fillna("").astype(str).str.lower() s = df[self.content_col].fillna("").astype(str).str.lower()
in_group_words = {"we", "us", "our", "ourselves"} in_group_words = {"we", "us", "our", "ourselves"}
@@ -60,8 +59,8 @@ class CulturalAnalysis:
return result return result
def get_stance_markers(self) -> dict[str, Any]: def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
s = self.df[self.content_col].fillna("").astype(str) s = df[self.content_col].fillna("").astype(str)
hedges = { hedges = {
"maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem", "maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem",
@@ -104,13 +103,11 @@ class CulturalAnalysis:
"permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3), "permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3),
} }
def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]: def get_avg_emotions_per_entity(self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]:
if "entities" not in self.df.columns: if "entities" not in df.columns:
return {"entity_emotion_avg": {}} return {"entity_emotion_avg": {}}
df = self.df
emotion_cols = [c for c in df.columns if c.startswith("emotion_")] emotion_cols = [c for c in df.columns if c.startswith("emotion_")]
entity_counter = Counter() entity_counter = Counter()
for row in df["entities"].dropna(): for row in df["entities"].dropna():

View File

@@ -1,18 +1,15 @@
import pandas as pd import pandas as pd
class EmotionalAnalysis: class EmotionalAnalysis:
def __init__(self, df: pd.DataFrame): def avg_emotion_by_topic(self, df: pd.DataFrame) -> dict:
self.df = df
def avg_emotion_by_topic(self) -> dict:
emotion_cols = [ emotion_cols = [
col for col in self.df.columns col for col in df.columns
if col.startswith("emotion_") if col.startswith("emotion_")
] ]
counts = ( counts = (
self.df[ df[
(self.df["topic"] != "Misc") (df["topic"] != "Misc")
] ]
.groupby("topic") .groupby("topic")
.size() .size()
@@ -20,8 +17,8 @@ class EmotionalAnalysis:
) )
avg_emotion_by_topic = ( avg_emotion_by_topic = (
self.df[ df[
(self.df["topic"] != "Misc") (df["topic"] != "Misc")
] ]
.groupby("topic")[emotion_cols] .groupby("topic")[emotion_cols]
.mean() .mean()

View File

@@ -5,8 +5,7 @@ from collections import Counter
class InteractionAnalysis: class InteractionAnalysis:
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): def __init__(self, word_exclusions: set[str]):
self.df = df
self.word_exclusions = word_exclusions self.word_exclusions = word_exclusions
def _tokenize(self, text: str): def _tokenize(self, text: str):
@@ -14,9 +13,9 @@ class InteractionAnalysis:
return [t for t in tokens if t not in self.word_exclusions] return [t for t in tokens if t not in self.word_exclusions]
def _vocab_richness_per_user( def _vocab_richness_per_user(
self, min_words: int = 20, top_most_used_words: int = 100 self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100
) -> list: ) -> list:
df = self.df.copy() df = df.copy()
df["content"] = df["content"].fillna("").astype(str).str.lower() df["content"] = df["content"].fillna("").astype(str).str.lower()
df["tokens"] = df["content"].apply(self._tokenize) df["tokens"] = df["content"].apply(self._tokenize)
@@ -58,10 +57,8 @@ class InteractionAnalysis:
return rows return rows
def top_users(self) -> list: def top_users(self, df: pd.DataFrame) -> list:
counts = ( counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)
self.df.groupby(["author", "source"]).size().sort_values(ascending=False)
)
top_users = [ top_users = [
{"author": author, "source": source, "count": int(count)} {"author": author, "source": source, "count": int(count)}
@@ -70,14 +67,14 @@ class InteractionAnalysis:
return top_users return top_users
def per_user_analysis(self) -> dict: def per_user_analysis(self, df: pd.DataFrame) -> dict:
per_user = self.df.groupby(["author", "type"]).size().unstack(fill_value=0) per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
emotion_cols = [col for col in self.df.columns if col.startswith("emotion_")] emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
avg_emotions_by_author = {} avg_emotions_by_author = {}
if emotion_cols: if emotion_cols:
avg_emotions = self.df.groupby("author")[emotion_cols].mean().fillna(0.0) avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
avg_emotions_by_author = { avg_emotions_by_author = {
author: {emotion: float(score) for emotion, score in row.items()} author: {emotion: float(score) for emotion, score in row.items()}
for author, row in avg_emotions.iterrows() for author, row in avg_emotions.iterrows()
@@ -97,7 +94,7 @@ class InteractionAnalysis:
per_user = per_user.sort_values("comment_post_ratio", ascending=True) per_user = per_user.sort_values("comment_post_ratio", ascending=True)
per_user_records = per_user.reset_index().to_dict(orient="records") per_user_records = per_user.reset_index().to_dict(orient="records")
vocab_rows = self._vocab_richness_per_user() vocab_rows = self._vocab_richness_per_user(df)
vocab_by_author = {row["author"]: row for row in vocab_rows} vocab_by_author = {row["author"]: row for row in vocab_rows}
# merge vocab richness + per_user information # merge vocab richness + per_user information
@@ -112,7 +109,14 @@ class InteractionAnalysis:
"comment_post_ratio": float(row.get("comment_post_ratio", 0)), "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
"comment_share": float(row.get("comment_share", 0)), "comment_share": float(row.get("comment_share", 0)),
"avg_emotions": avg_emotions_by_author.get(author, {}), "avg_emotions": avg_emotions_by_author.get(author, {}),
"vocab": vocab_by_author.get(author, {"vocab_richness": 0, "avg_words_per_event": 0, "top_words": []}), "vocab": vocab_by_author.get(
author,
{
"vocab_richness": 0,
"avg_words_per_event": 0,
"top_words": [],
},
),
} }
) )
@@ -120,13 +124,13 @@ class InteractionAnalysis:
return merged_users return merged_users
def interaction_graph(self): def interaction_graph(self, df: pd.DataFrame):
interactions = {a: {} for a in self.df["author"].dropna().unique()} interactions = {a: {} for a in df["author"].dropna().unique()}
# reply_to refers to the comment id, this allows us to map comment ids to usernames # reply_to refers to the comment id, this allows us to map comment ids to usernames
id_to_author = self.df.set_index("id")["author"].to_dict() id_to_author = df.set_index("id")["author"].to_dict()
for _, row in self.df.iterrows(): for _, row in df.iterrows():
a = row["author"] a = row["author"]
reply_id = row["reply_to"] reply_id = row["reply_to"]
@@ -141,10 +145,10 @@ class InteractionAnalysis:
return interactions return interactions
def average_thread_depth(self): def average_thread_depth(self, df: pd.DataFrame):
depths = [] depths = []
id_to_reply = self.df.set_index("id")["reply_to"].to_dict() id_to_reply = df.set_index("id")["reply_to"].to_dict()
for _, row in self.df.iterrows(): for _, row in df.iterrows():
depth = 0 depth = 0
current_id = row["id"] current_id = row["id"]
@@ -163,16 +167,16 @@ class InteractionAnalysis:
return round(sum(depths) / len(depths), 2) return round(sum(depths) / len(depths), 2)
def average_thread_length_by_emotion(self): def average_thread_length_by_emotion(self, df: pd.DataFrame):
emotion_exclusions = {"emotion_neutral", "emotion_surprise"} emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
emotion_cols = [ emotion_cols = [
c c
for c in self.df.columns for c in df.columns
if c.startswith("emotion_") and c not in emotion_exclusions if c.startswith("emotion_") and c not in emotion_exclusions
] ]
id_to_reply = self.df.set_index("id")["reply_to"].to_dict() id_to_reply = df.set_index("id")["reply_to"].to_dict()
length_cache = {} length_cache = {}
def thread_length_from(start_id): def thread_length_from(start_id):
@@ -211,7 +215,7 @@ class InteractionAnalysis:
emotion_to_lengths = {} emotion_to_lengths = {}
# Fill NaNs in emotion cols to avoid max() issues # Fill NaNs in emotion cols to avoid max() issues
emo_df = self.df[["id"] + emotion_cols].copy() emo_df = df[["id"] + emotion_cols].copy()
emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0) emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
for _, row in emo_df.iterrows(): for _, row in emo_df.iterrows():

View File

@@ -4,9 +4,9 @@ import re
from collections import Counter from collections import Counter
from itertools import islice from itertools import islice
class LinguisticAnalysis: class LinguisticAnalysis:
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): def __init__(self, word_exclusions: set[str]):
self.df = df
self.word_exclusions = word_exclusions self.word_exclusions = word_exclusions
def _tokenize(self, text: str): def _tokenize(self, text: str):
@@ -21,22 +21,13 @@ class LinguisticAnalysis:
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text) text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
return text return text
def word_frequencies(self, limit: int = 100) -> dict: def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
texts = ( texts = df["content"].dropna().astype(str).str.lower()
self.df["content"]
.dropna()
.astype(str)
.str.lower()
)
words = [] words = []
for text in texts: for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text) tokens = re.findall(r"\b[a-z]{3,}\b", text)
words.extend( words.extend(w for w in tokens if w not in self.word_exclusions)
w for w in tokens
if w not in self.word_exclusions
)
counts = Counter(words) counts = Counter(words)
@@ -49,8 +40,8 @@ class LinguisticAnalysis:
return word_frequencies.to_dict(orient="records") return word_frequencies.to_dict(orient="records")
def ngrams(self, n=2, limit=100): def ngrams(self, df: pd.DataFrame, n=2, limit=100):
texts = self.df["content"].dropna().astype(str).apply(self._clean_text).str.lower() texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
all_ngrams = [] all_ngrams = []
for text in texts: for text in texts:

View File

@@ -1,16 +1,14 @@
import pandas as pd import pandas as pd
class TemporalAnalysis:
def __init__(self, df: pd.DataFrame):
self.df = df
def avg_reply_time_per_emotion(self) -> dict: class TemporalAnalysis:
df = self.df.copy() def avg_reply_time_per_emotion(self, df: pd.DataFrame) -> list[dict]:
df = df.copy()
replies = df[ replies = df[
(df["type"] == "comment") & (df["type"] == "comment")
(df["reply_to"].notna()) & & (df["reply_to"].notna())
(df["reply_to"] != "") & (df["reply_to"] != "")
] ]
id_to_time = df.set_index("id")["dt"].to_dict() id_to_time = df.set_index("id")["dt"].to_dict()
@@ -25,42 +23,45 @@ class TemporalAnalysis:
return (row["dt"] - parent_time).total_seconds() return (row["dt"] - parent_time).total_seconds()
replies["reply_time"] = replies.apply(compute_reply_time, axis=1) replies["reply_time"] = replies.apply(compute_reply_time, axis=1)
emotion_cols = [col for col in df.columns if col.startswith("emotion_") and col not in ("emotion_neutral", "emotion_surprise")] emotion_cols = [
col
for col in df.columns
if col.startswith("emotion_")
and col not in ("emotion_neutral", "emotion_surprise")
]
replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1) replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1)
grouped = ( grouped = (
replies replies.groupby("dominant_emotion")["reply_time"]
.groupby("dominant_emotion")["reply_time"]
.agg(["mean", "count"]) .agg(["mean", "count"])
.reset_index() .reset_index()
) )
return grouped.to_dict(orient="records") return grouped.to_dict(orient="records")
def posts_per_day(self) -> dict: def posts_per_day(self, df: pd.DataFrame) -> list[dict]:
per_day = ( per_day = df.groupby("date").size().reset_index(name="count")
self.df.groupby("date")
.size()
.reset_index(name="count")
)
return per_day.to_dict(orient="records") return per_day.to_dict(orient="records")
def heatmap(self) -> dict: def heatmap(self, df: pd.DataFrame) -> list[dict]:
weekday_order = [ weekday_order = [
"Monday", "Tuesday", "Wednesday", "Monday",
"Thursday", "Friday", "Saturday", "Sunday" "Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday",
] ]
self.df["weekday"] = pd.Categorical( df = df.copy()
self.df["weekday"], df["weekday"] = pd.Categorical(
categories=weekday_order, df["weekday"], categories=weekday_order, ordered=True
ordered=True
) )
heatmap = ( heatmap = (
self.df df.groupby(["weekday", "hour"], observed=True)
.groupby(["weekday", "hour"], observed=True)
.size() .size()
.unstack(fill_value=0) .unstack(fill_value=0)
.reindex(columns=range(24), fill_value=0) .reindex(columns=range(24), fill_value=0)

View File

@@ -1,23 +1,110 @@
import os
from dotenv import load_dotenv
from flask import Flask, jsonify, request from flask import Flask, jsonify, request
from flask_cors import CORS from flask_cors import CORS
from flask_bcrypt import Bcrypt
from flask_jwt_extended import (
JWTManager,
create_access_token,
jwt_required,
get_jwt_identity,
)
from server.stat_gen import StatGen from server.stat_gen import StatGen
from server.dataset_processor import DatasetProcessor
from db.database import PostgresConnector
from server.auth import AuthManager
import pandas as pd import pandas as pd
import traceback import traceback
import json import json
app = Flask(__name__) app = Flask(__name__)
db = PostgresConnector()
# Allow for CORS from localhost:5173 # Env Variables
CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}}) load_dotenv()
frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173")
jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this")
jwt_access_token_expires = int(
os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)
) # Default to 20 minutes
# Global State # Flask Configuration
posts_df = pd.read_json('small.jsonl', lines=True) CORS(app, resources={r"/*": {"origins": frontend_url}})
with open("topic_buckets.json", "r", encoding="utf-8") as f: app.config["JWT_SECRET_KEY"] = jwt_secret_key
domain_topics = json.load(f) app.config["JWT_ACCESS_TOKEN_EXPIRES"] = jwt_access_token_expires
stat_obj = StatGen(posts_df, domain_topics)
@app.route('/upload', methods=['POST']) bcrypt = Bcrypt(app)
jwt = JWTManager(app)
auth_manager = AuthManager(db, bcrypt)
stat_gen = StatGen()
@app.route("/register", methods=["POST"])
def register_user():
data = request.get_json()
if (
not data
or "username" not in data
or "email" not in data
or "password" not in data
):
return jsonify({"error": "Missing username, email, or password"}), 400
username = data["username"]
email = data["email"]
password = data["password"]
try:
auth_manager.register_user(username, email, password)
except ValueError as e:
return jsonify({"error": str(e)}), 400
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
print(f"Registered new user: {username}")
return jsonify({"message": f"User '{username}' registered successfully"}), 200
@app.route("/login", methods=["POST"])
def login_user():
data = request.get_json()
if not data or "username" not in data or "password" not in data:
return jsonify({"error": "Missing username or password"}), 400
username = data["username"]
password = data["password"]
try:
user = auth_manager.authenticate_user(username, password)
if user:
access_token = create_access_token(identity=str(user["id"]))
return jsonify({"access_token": access_token}), 200
else:
return jsonify({"error": "Invalid username or password"}), 401
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/profile", methods=["GET"])
@jwt_required()
def profile():
current_user = get_jwt_identity()
return jsonify(
message="Access granted", user=auth_manager.get_user_by_id(current_user)
), 200
@app.route("/upload", methods=["POST"])
@jwt_required()
def upload_data(): def upload_data():
if "posts" not in request.files or "topics" not in request.files: if "posts" not in request.files or "topics" not in request.files:
return jsonify({"error": "Missing required files or form data"}), 400 return jsonify({"error": "Missing required files or form data"}), 400
@@ -28,172 +115,241 @@ def upload_data():
if post_file.filename == "" or topic_file == "": if post_file.filename == "" or topic_file == "":
return jsonify({"error": "Empty filename"}), 400 return jsonify({"error": "Empty filename"}), 400
if not post_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'): if not post_file.filename.endswith(".jsonl") or not topic_file.filename.endswith(
return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400 ".json"
):
return jsonify(
{"error": "Invalid file type. Only .jsonl and .json files are allowed."}
), 400
try: try:
global stat_obj current_user = get_jwt_identity()
posts_df = pd.read_json(post_file, lines=True) posts_df = pd.read_json(post_file, lines=True, convert_dates=False)
stat_obj = StatGen(posts_df, json.load(topic_file)) topics = json.load(topic_file)
return jsonify({"message": "File uploaded successfully", "event_count": len(stat_obj.df)}), 200
processor = DatasetProcessor(posts_df, topics)
enriched_df = processor.enrich()
dataset_id = db.save_dataset_info(
current_user, f"dataset_{current_user}", topics
)
db.save_dataset_content(dataset_id, enriched_df)
return jsonify(
{"message": "File uploaded successfully", "event_count": len(enriched_df), "dataset_id": dataset_id}
), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400 return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
except Exception as e: except Exception as e:
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/dataset', methods=['GET'])
def get_dataset():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
return stat_obj.df.to_json(orient="records"), 200, {"Content-Type": "application/json"} @app.route("/dataset/<int:dataset_id>", methods=["GET"])
@jwt_required()
def get_dataset(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
@app.route('/stats/content', methods=['GET']) if dataset.get("user_id") != int(current_user):
def word_frequencies(): return jsonify({"error": "Unauthorized access to dataset"}), 403
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
dataset_content = db.get_dataset_content(dataset_id)
if dataset_content.empty:
return jsonify({"error": "Dataset content not found"}), 404
return jsonify(dataset_content.to_dict(orient="records")), 200
@app.route("/dataset/<int:dataset_id>/content", methods=["GET"])
@jwt_required()
def content_endpoint(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_content_analysis()), 200 return jsonify(stat_gen.get_content_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/stats/summary', methods=["GET"])
def get_summary(): @app.route("/dataset/<int:dataset_id>/summary", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_summary(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.summary()), 200 return jsonify(stat_gen.summary(dataset_content)), 200
except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e:
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/time", methods=["GET"])
def get_time_analysis():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
try:
return jsonify(stat_obj.get_time_analysis()), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/user", methods=["GET"])
def get_user_analysis(): @app.route("/dataset/<int:dataset_id>/time", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_time_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_user_analysis()), 200 return jsonify(stat_gen.get_time_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/cultural", methods=["GET"])
def get_cultural_analysis(): @app.route("/dataset/<int:dataset_id>/user", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_user_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_cultural_analysis()), 200 return jsonify(stat_gen.get_user_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route("/stats/interaction", methods=["GET"])
def get_interaction_analysis(): @app.route("/dataset/<int:dataset_id>/cultural", methods=["GET"])
if stat_obj is None: @jwt_required()
return jsonify({"error": "No data uploaded"}), 400 def get_cultural_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if dataset.get("user_id") != int(current_user):
return jsonify({"error": "Unauthorized access to dataset"}), 403
dataset_content = db.get_dataset_content(dataset_id)
try: try:
return jsonify(stat_obj.get_interactional_analysis()), 200 return jsonify(stat_gen.get_cultural_analysis(dataset_content)), 200
except ValueError as e: except ValueError as e:
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400 return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/filter/query', methods=["POST"])
def filter_query():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
data = request.get_json(silent=True) or {} @app.route("/dataset/<int:dataset_id>/interaction", methods=["GET"])
@jwt_required()
def get_interaction_analysis(dataset_id):
current_user = get_jwt_identity()
dataset = db.get_dataset_info(dataset_id)
if "query" not in data: if dataset.get("user_id") != int(current_user):
return jsonify(stat_obj.df.to_dict(orient="records")), 200 return jsonify({"error": "Unauthorized access to dataset"}), 403
query = data["query"] dataset_content = db.get_dataset_content(dataset_id)
filtered_df = stat_obj.filter_by_query(query)
return jsonify(filtered_df), 200
@app.route('/filter/time', methods=["POST"])
def filter_time():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
data = request.get_json(silent=True)
if not data:
return jsonify({"error": "Invalid or missing JSON body"}), 400
if "start" not in data or "end" not in data:
return jsonify({"error": "Please include both start and end dates"}), 400
try: try:
start = pd.to_datetime(data["start"], utc=True) return jsonify(stat_gen.get_interactional_analysis(dataset_content)), 200
end = pd.to_datetime(data["end"], utc=True) except ValueError as e:
filtered_df = stat_obj.set_time_range(start, end) return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
return jsonify(filtered_df), 200
except Exception:
return jsonify({"error": "Invalid datetime format"}), 400
@app.route('/filter/sources', methods=["POST"])
def filter_sources():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
data = request.get_json(silent=True)
if not data:
return jsonify({"error": "Invalid or missing JSON body"}), 400
if "sources" not in data:
return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400
try:
filtered_df = stat_obj.filter_data_sources(data["sources"])
return jsonify(filtered_df), 200
except ValueError:
return jsonify({"error": "Please enable at least one data source"}), 400
except Exception as e:
return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500
@app.route('/filter/reset', methods=["GET"])
def reset_dataset():
if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400
try:
stat_obj.reset_dataset()
return jsonify({"success": "Dataset successfully reset"})
except Exception as e: except Exception as e:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
# @app.route("/filter/query", methods=["POST"])
# def filter_query():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# data = request.get_json(silent=True) or {}
# if "query" not in data:
# return jsonify(stat_obj.df.to_dict(orient="records")), 200
# query = data["query"]
# filtered_df = stat_obj.filter_by_query(query)
# return jsonify(filtered_df), 200
# @app.route("/filter/time", methods=["POST"])
# def filter_time():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# data = request.get_json(silent=True)
# if not data:
# return jsonify({"error": "Invalid or missing JSON body"}), 400
# if "start" not in data or "end" not in data:
# return jsonify({"error": "Please include both start and end dates"}), 400
# try:
# start = pd.to_datetime(data["start"], utc=True)
# end = pd.to_datetime(data["end"], utc=True)
# filtered_df = stat_obj.set_time_range(start, end)
# return jsonify(filtered_df), 200
# except Exception:
# return jsonify({"error": "Invalid datetime format"}), 400
# @app.route("/filter/sources", methods=["POST"])
# def filter_sources():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# data = request.get_json(silent=True)
# if not data:
# return jsonify({"error": "Invalid or missing JSON body"}), 400
# if "sources" not in data:
# return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400
# try:
# filtered_df = stat_obj.filter_data_sources(data["sources"])
# return jsonify(filtered_df), 200
# except ValueError:
# return jsonify({"error": "Please enable at least one data source"}), 400
# except Exception as e:
# return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500
# @app.route("/filter/reset", methods=["GET"])
# def reset_dataset():
# if stat_obj is None:
# return jsonify({"error": "No data uploaded"}), 400
# try:
# stat_obj.reset_dataset()
# return jsonify({"success": "Dataset successfully reset"})
# except Exception as e:
# print(traceback.format_exc())
# return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
if __name__ == "__main__": if __name__ == "__main__":
app.run(debug=True) app.run(debug=True)

29
server/auth.py Normal file
View File

@@ -0,0 +1,29 @@
from db.database import PostgresConnector
from flask_bcrypt import Bcrypt
class AuthManager:
    """User registration and credential verification on top of the DB layer."""

    def __init__(self, db: "PostgresConnector", bcrypt: "Bcrypt"):
        self.db = db          # persistence layer (user lookups / inserts)
        self.bcrypt = bcrypt  # password hashing backend

    def register_user(self, username, email, password):
        """Create a new user with a bcrypt-hashed password.

        Raises:
            ValueError: if the email or the username is already registered.
        """
        # Validate uniqueness BEFORE hashing: bcrypt is deliberately slow,
        # so don't pay its cost for a registration that will be rejected.
        if self.db.get_user_by_email(email):
            raise ValueError("Email already registered")
        if self.db.get_user_by_username(username):
            raise ValueError("Username already taken")
        hashed_password = self.bcrypt.generate_password_hash(password).decode("utf-8")
        self.db.save_user(username, email, hashed_password)

    def authenticate_user(self, username, password):
        """Return the stored user record if credentials match, else None."""
        user = self.db.get_user_by_username(username)
        if user and self.bcrypt.check_password_hash(user["password_hash"], password):
            return user
        return None

    def get_user_by_id(self, user_id):
        """Fetch a user's public fields (no password hash) by primary key, or None."""
        query = "SELECT id, username, email FROM users WHERE id = %s"
        result = self.db.execute(query, (user_id,), fetch=True)
        return result[0] if result else None

View File

@@ -0,0 +1,39 @@
import pandas as pd
from server.analysis.nlp import NLP
class DatasetProcessor:
    """Flatten a posts DataFrame (with nested comment lists) into one
    event-per-row frame, then enrich it with time and NLP features."""

    def __init__(self, df, topics):
        self.df = self._explode_comments(df)
        self.topics = topics
        self.nlp = NLP(self.df, "title", "content", self.topics)

    def _explode_comments(self, df) -> pd.DataFrame:
        """Return posts and their comments as a single flat frame.

        Posts get type='post' / parent_id=None; comments get type='comment'
        with parent_id taken from the comment's own post_id field.
        """
        exploded = df[["id", "comments"]].explode("comments")
        keep = exploded["comments"].map(lambda c: isinstance(c, dict))
        comments = pd.json_normalize(exploded.loc[keep, "comments"])
        comments["type"] = "comment"
        comments["parent_id"] = comments.get("post_id")

        posts = df.drop(columns=["comments"])
        posts["type"] = "post"
        posts["parent_id"] = None

        merged = pd.concat([posts, comments])
        return merged.drop(columns=["post_id"], errors="ignore")

    def enrich(self) -> pd.DataFrame:
        """Add time-derived and NLP-derived columns in place; return the frame."""
        self.df["timestamp"] = pd.to_numeric(self.df["timestamp"], errors="raise")
        self.df["date"] = pd.to_datetime(self.df["timestamp"], unit="s").dt.date
        stamps = pd.to_datetime(self.df["timestamp"], unit="s", utc=True)
        self.df["dt"] = stamps
        self.df["hour"] = stamps.dt.hour
        self.df["weekday"] = stamps.dt.day_name()
        self.nlp.add_emotion_cols()
        self.nlp.add_topic_col()
        self.nlp.add_ner_cols()
        return self.df

View File

@@ -1,170 +1,135 @@
import pandas as pd
import datetime import datetime
import nltk
import nltk
import pandas as pd
from nltk.corpus import stopwords from nltk.corpus import stopwords
from server.analysis.nlp import NLP
from server.analysis.temporal import TemporalAnalysis from server.analysis.cultural import CulturalAnalysis
from server.analysis.emotional import EmotionalAnalysis from server.analysis.emotional import EmotionalAnalysis
from server.analysis.interactional import InteractionAnalysis from server.analysis.interactional import InteractionAnalysis
from server.analysis.linguistic import LinguisticAnalysis from server.analysis.linguistic import LinguisticAnalysis
from server.analysis.cultural import CulturalAnalysis from server.analysis.temporal import TemporalAnalysis
DOMAIN_STOPWORDS = { DOMAIN_STOPWORDS = {
"www", "https", "http", "www",
"boards", "boardsie", "https",
"comment", "comments", "http",
"discussion", "thread", "boards",
"post", "posts", "boardsie",
"would", "get", "one" "comment",
"comments",
"discussion",
"thread",
"post",
"posts",
"would",
"get",
"one",
} }
nltk.download('stopwords') nltk.download("stopwords")
EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS EXCLUDE_WORDS = set(stopwords.words("english")) | DOMAIN_STOPWORDS
class StatGen: class StatGen:
def __init__(self, df: pd.DataFrame, domain_topics: dict) -> None: def __init__(self) -> None:
comments_df = df[["id", "comments"]].explode("comments") self.temporal_analysis = TemporalAnalysis()
comments_df = comments_df[comments_df["comments"].apply(lambda x: isinstance(x, dict))] self.emotional_analysis = EmotionalAnalysis()
comments_df = pd.json_normalize(comments_df["comments"]) self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS)
self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS)
self.cultural_analysis = CulturalAnalysis()
posts_df = df.drop(columns=["comments"]) def get_time_analysis(self, df: pd.DataFrame) -> dict:
posts_df["type"] = "post"
posts_df["parent_id"] = None
comments_df["type"] = "comment"
comments_df["parent_id"] = comments_df.get("post_id")
self.domain_topics = domain_topics
self.df = pd.concat([posts_df, comments_df])
self.df.drop(columns=["post_id"], inplace=True, errors="ignore")
self.nlp = NLP(self.df, "title", "content", domain_topics)
self.nlp.add_emotion_cols()
self.nlp.add_topic_col()
self.nlp.add_ner_cols()
self._add_time_cols(self.df)
self.temporal_analysis = TemporalAnalysis(self.df)
self.emotional_analysis = EmotionalAnalysis(self.df)
self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
self.cultural_analysis = CulturalAnalysis(self.df)
self.original_df = self.df.copy(deep=True)
## Private Methods
def _add_time_cols(self, df: pd.DataFrame) -> None:
df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')
df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date
df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
df["hour"] = df["dt"].dt.hour
df["weekday"] = df["dt"].dt.day_name()
## Public
# topics over time
# emotions over time
def get_time_analysis(self) -> dict:
return { return {
"events_per_day": self.temporal_analysis.posts_per_day(), "events_per_day": self.temporal_analysis.posts_per_day(df),
"weekday_hour_heatmap": self.temporal_analysis.heatmap() "weekday_hour_heatmap": self.temporal_analysis.heatmap(df),
} }
# average topic duration def get_content_analysis(self, df: pd.DataFrame) -> dict:
def get_content_analysis(self) -> dict:
return { return {
"word_frequencies": self.linguistic_analysis.word_frequencies(), "word_frequencies": self.linguistic_analysis.word_frequencies(df),
"common_two_phrases": self.linguistic_analysis.ngrams(), "common_two_phrases": self.linguistic_analysis.ngrams(df),
"common_three_phrases": self.linguistic_analysis.ngrams(n=3), "common_three_phrases": self.linguistic_analysis.ngrams(df, n=3),
"average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(), "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(df),
"reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion() "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(df),
} }
# average emotion per user def get_user_analysis(self, df: pd.DataFrame) -> dict:
# average chain length
def get_user_analysis(self) -> dict:
return { return {
"top_users": self.interaction_analysis.top_users(), "top_users": self.interaction_analysis.top_users(df),
"users": self.interaction_analysis.per_user_analysis() "users": self.interaction_analysis.per_user_analysis(df),
"interaction_graph": self.interaction_analysis.interaction_graph(df),
} }
# average / max thread depth def get_interactional_analysis(self, df: pd.DataFrame) -> dict:
# high engagment threads based on volume
def get_interactional_analysis(self) -> dict:
return { return {
"average_thread_depth": self.interaction_analysis.average_thread_depth(), "average_thread_depth": self.interaction_analysis.average_thread_depth(df),
"average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(), "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(df),
"interaction_graph": self.interaction_analysis.interaction_graph()
} }
# detect community jargon def get_cultural_analysis(self, df: pd.DataFrame) -> dict:
# in-group and out-group linguistic markers
def get_cultural_analysis(self) -> dict:
return { return {
"identity_markers": self.cultural_analysis.get_identity_markers(), "identity_markers": self.cultural_analysis.get_identity_markers(df),
"stance_markers": self.cultural_analysis.get_stance_markers(), "stance_markers": self.cultural_analysis.get_stance_markers(df),
"entity_salience": self.cultural_analysis.get_avg_emotions_per_entity() "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(df),
} }
def summary(self) -> dict: def summary(self, df: pd.DataFrame) -> dict:
total_posts = (self.df["type"] == "post").sum() total_posts = (df["type"] == "post").sum()
total_comments = (self.df["type"] == "comment").sum() total_comments = (df["type"] == "comment").sum()
events_per_user = df.groupby("author").size()
events_per_user = self.df.groupby("author").size()
return { return {
"total_events": int(len(self.df)), "total_events": int(len(df)),
"total_posts": int(total_posts), "total_posts": int(total_posts),
"total_comments": int(total_comments), "total_comments": int(total_comments),
"unique_users": int(events_per_user.count()), "unique_users": int(events_per_user.count()),
"comments_per_post": round(total_comments / max(total_posts, 1), 2), "comments_per_post": round(total_comments / max(total_posts, 1), 2),
"lurker_ratio": round((events_per_user == 1).mean(), 2), "lurker_ratio": round((events_per_user == 1).mean(), 2),
"time_range": { "time_range": {
"start": int(self.df["dt"].min().timestamp()), "start": int(df["dt"].min().timestamp()),
"end": int(self.df["dt"].max().timestamp()) "end": int(df["dt"].max().timestamp()),
}, },
"sources": self.df["source"].dropna().unique().tolist() "sources": df["source"].dropna().unique().tolist(),
} }
def filter_by_query(self, search_query: str) -> dict: # def filter_by_query(self, df: pd.DataFrame, search_query: str) -> dict:
self.df = self.df[ # filtered_df = df[df["content"].str.contains(search_query, na=False)]
self.df["content"].str.contains(search_query)
]
return { # return {
"rows": len(self.df), # "rows": len(filtered_df),
"data": self.df.to_dict(orient="records") # "data": filtered_df.to_dict(orient="records"),
} # }
def set_time_range(self, start: datetime.datetime, end: datetime.datetime) -> dict: # def set_time_range(
self.df = self.df[ # self,
(self.df["dt"] >= start) & # original_df: pd.DataFrame,
(self.df["dt"] <= end) # start: datetime.datetime,
] # end: datetime.datetime,
# ) -> dict:
# df = self._prepare_df(original_df)
# filtered_df = df[(df["dt"] >= start) & (df["dt"] <= end)]
return { # return {
"rows": len(self.df), # "rows": len(filtered_df),
"data": self.df.to_dict(orient="records") # "data": filtered_df.to_dict(orient="records"),
} # }
""" # def filter_data_sources(
Input is a hash map (source_name: str -> enabled: bool) # self, original_df: pd.DataFrame, data_sources: dict
""" # ) -> dict:
def filter_data_sources(self, data_sources: dict) -> dict: # df = self._prepare_df(original_df)
enabled_sources = [src for src, enabled in data_sources.items() if enabled] # enabled_sources = [src for src, enabled in data_sources.items() if enabled]
if not enabled_sources: # if not enabled_sources:
raise ValueError("Please choose at least one data source") # raise ValueError("Please choose at least one data source")
self.df = self.df[self.df["source"].isin(enabled_sources)] # filtered_df = df[df["source"].isin(enabled_sources)]
return { # return {
"rows": len(self.df), # "rows": len(filtered_df),
"data": self.df.to_dict(orient="records") # "data": filtered_df.to_dict(orient="records"),
} # }
def reset_dataset(self) -> None:
self.df = self.original_df.copy(deep=True)
# def reset_dataset(self, original_df: pd.DataFrame) -> pd.DataFrame:
# return self._prepare_df(original_df)